# Mark Huang <mlhuang@cs.princeton.edu>
# Copyright (C) 2005 The Trustees of Princeton University
#
-# $Id: pl_mop.sh,v 1.2 2005/11/03 17:23:25 mlhuang Exp $
+# $Id$
#
PATH=/sbin:/usr/sbin:$PATH
+# Parse PLC configuration
+if [ -r /etc/planetlab/plc_config ] ; then
+ . /etc/planetlab/plc_config
+else
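+ # default slice prefix when /etc/planetlab/plc_config is unreadable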
+ PLC_SLICE_PREFIX="pl"
+fi
+
PIDFILE=/var/run/pl_mop.pid
# Record PID
if [ -f $PIDFILE ] ; then
if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
- logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
- exit 1
+ logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
+ exit 1
fi
fi
echo $$ > $PIDFILE
shopt -s nullglob
for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
- slice=$(basename ${file%*.conf})
- if grep -q "$slice:\!\!" /etc/shadow ; then
- sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
- fi
+ slice=$(basename ${file%*.conf})
+ if grep -q "$slice:\!\!" /etc/shadow ; then
+ sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
+ fi
done
}
# keep essential services running
restart_services() {
- for service in sshd pl_sshd pl_mom pl_nm proper ; do
- echo "* Checking $service"
- status=$(service $service status)
- if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
- echo "* Restarting $service"
- service $service start
- fi
+ for service in sshd pl_sshd swapmon nm proper ; do
+ echo "* Checking $service"
+ status=$(service $service status)
+ if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
+ echo "* Restarting $service"
+ service $service start
+ fi
done
}
echo "* Checking netflow"
echo "sudo /sbin/service netflow restart" | su - pl_netflow
if [ $? -ne 0 ] ; then
- echo "* Restarting netflow"
- service netflow-init start
- vserver pl_netflow start
- echo "sudo /sbin/service netflow restart" | su - pl_netflow
+ echo "* Restarting netflow"
+ service netflow-init start
+ vserver pl_netflow start
+ echo "sudo /sbin/service netflow restart" | su - pl_netflow
fi
}
-# keep pl_conf running
-restart_pl_conf() {
- echo "* Checking pl_conf"
- vserver pl_conf exec /sbin/service pl_conf status >/dev/null 2>&1
- if [ $? -ne 0 ] ; then
- echo "* Restarting pl_conf"
- vserver pl_conf restart
+# GPG keys are installed in /etc/pki/rpm-gpg both by the Boot Manager
+# during initial installation and by PlanetLabConf during daily
+# updates. NodeUpdate imports the keys into the RPM database before
+# running yum daily. vserver-reference copies and imports the keys
+# into the reference images and system slices daily. The only parts of
+# this process that are actually necessary are the Boot Manager and
+# vserver-reference. However, we do not want to force a re-install of
+# all nodes, and we do not want to force an update of
+# vserver-reference, so in the meantime, PlanetLabConf and NodeUpdate
+# take care of getting the keys installed and imported in /, and this
+# script takes care of getting them installed in the reference images
+# and system slices, until we can get a new vserver-reference image
+# pushed out.
+update_vserver_reference() {
+ echo "* Updating VServer reference"
+
+ shopt -s nullglob
+
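+ # reference image, cached pre-built images, and ${PLC_SLICE_PREFIX}_* system slices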
+ VROOTS="/vservers/vserver-reference /vservers/.vcache/* /vservers/${PLC_SLICE_PREFIX}_*"
+
+ # Copy configuration files from host to slices
+ for file in \
+ /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \
+ /etc/planetlab/plc_config* /etc/planetlab/php/* \
+ /etc/pki/rpm-gpg/* ; do
+ if [ -r $file ] ; then
+ for vroot in $VROOTS ; do
+ install -D -m 644 $file $vroot/$file
+ done
+ fi
+ done
+
+ # (Re)install GPG signing keys
+ if [ -d /etc/pki/rpm-gpg ] ; then
+ for vroot in $VROOTS ; do
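+ # drop previously imported keys before importing the current set;
+ # || : swallows failures (e.g. no gpg-pubkey package installed yet)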
+ chroot $vroot rpm --allmatches -e gpg-pubkey || :
+ chroot $vroot rpm --import /etc/pki/rpm-gpg/* || :
+ done
fi
-}
+}
# kill all the processes running in slice contexts
vkillall() {
# unmounts all the /proc and /dev/pts mounts in each vserver
tries=10
while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
- tries=$(($tries -1))
- # arizona_stork seems to generate some weird mount points of the form
- # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
- # /vservers/arizona_stork/tmp/0.886421543959
- awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
+ tries=$(($tries -1))
+ # arizona_stork seems to generate some weird mount points of the form
+ # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
+ # /vservers/arizona_stork/tmp/0.886421543959
+ awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
done
}
mkdir -p /vservers/.vtmp
tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
if [ $? -eq 0 ] ; then
- rm -f $tmp
- return 0
+ rm -f $tmp
+ return 0
fi
# kill all processes running in slice contexts
# stop vcached
pidfile=/var/run/vcached.pid
if [ -r "$pidfile" ] ; then
- kill $(cat $pidfile)
+ kill $(cat $pidfile)
fi
touch $pidfile
# unmounts /vservers
if umount /vservers ; then
# install expect if necessary
- if ! rpm -q expect ; then
- yum -y install expect
- fi
+ if ! rpm -q expect ; then
+ yum -y install expect
+ fi
# tell expect to hit the 'y' key every time fsck asks
- expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
+ expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
# blow away the vserver cache
- rm -rf /vservers/.vcache/*
+ rm -rf /vservers/.vcache/*
- # re-mount /vservers
- mount /vservers
+
+ # XXX re-mount /vservers
+ # mount /vservers
+ # shutdown instead to avoid clearing disk quotas
+ shutdown -r now "/vservers filesystem repaired, rebooting"
else
- echo "Unable to unmount /vservers!" >&2
+ echo "Unable to unmount /vservers!" >&2
fi
# allow vcached to run again
grep " \[priv\]" |
sort | uniq -c |
while read instances sshd slice priv ; do
- # kill all old instances
- if [ $instances -gt 10 ] ; then
- ps -C sshd -o pid=,start_time=,command= |
- grep "$slice \[priv\]" |
- while read pid start_time command ; do
- start_time=$(date -d "$start_time" +%s)
- min=$(date -d "6 hours ago" +%s)
- if [ $start_time -lt $min ] ; then
- echo "* Killing $slice sshd pid $pid"
- kill -9 $pid
- fi
- done
- fi
+ # kill all old instances
+ if [ $instances -gt 10 ] ; then
+ ps -C sshd -o pid=,start_time=,command= |
+ grep "$slice \[priv\]" |
+ while read pid start_time command ; do
+ start_time=$(date -d "$start_time" +%s)
+ min=$(date -d "6 hours ago" +%s)
+ if [ $start_time -lt $min ] ; then
+ echo "* Killing $slice sshd pid $pid"
+ kill -9 $pid
+ fi
+ done
+ fi
done
}
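+# kill NM (nm.py) instances found running inside a slice context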
+kill_nm_inslice(){
+ pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
+ for pid in $pids ; do
+ line=$(vps aux | grep $pid)
+ echo NM found in slice. Killing PID $pid
+ echo $line
+ kill -9 $pid
+ done
+}
+
+kill_nonroot_nm(){
+ # For whatever reason, some NMs fail to change context after fork and chcontext. Kill them.
+ pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+ for pid in $pids ; do
+ line=$(ps aux | grep $pid)
+ echo NM found not belonging to root. Killing PID $pid
+ echo $line
+ kill -9 $pid
+ done
+}
+
+kill_multi_nm(){
+ # If more than one NM is running as root, kill them all and restart NM.
+ pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+ i=0
+ for pid in $pids ; do
+ i=$((i + 1))
+ done
+ if [ $i -gt 1 ] ; then
+ # stop nm
+ echo "More than 1 NM found belonging to root. Restarting NM."
+ /etc/init.d/nm stop
+ pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+ for pid in $pids ; do
+ kill -9 $pid
+ done
+ /etc/init.d/nm start
+ fi
+}
# XXX kill zombie slices
# XXX reboot if boot state changes
+run kill_nonroot_nm
+
+run kill_nm_inslice
+
+run kill_multi_nm
run fix_vservers
run restart_services
-run restart_pl_conf
-
run restart_netflow
run kill_duplicate_ssh
+
+run update_vserver_reference
+