X-Git-Url: http://git.onelab.eu/?p=mom.git;a=blobdiff_plain;f=pl_mop.sh;h=49222770d418f13607c09126939cd276521c18cc;hp=16dba73dd6c2dc6ff760e3a478500b6c33b8ee41;hb=a7ab53cbaf3037b67fbda98fed3887c905d93e96;hpb=f070bb7d9b88cea7fdd164afd52169b04cbaf18d diff --git a/pl_mop.sh b/pl_mop.sh index 16dba73..4922277 100755 --- a/pl_mop.sh +++ b/pl_mop.sh @@ -5,8 +5,6 @@ # Mark Huang # Copyright (C) 2005 The Trustees of Princeton University # -# $Id$ -# PATH=/sbin:/usr/sbin:$PATH @@ -22,8 +20,8 @@ PIDFILE=/var/run/pl_mop.pid # Record PID if [ -f $PIDFILE ] ; then if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then - logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running" - exit 1 + logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running" + exit 1 fi fi echo $$ > $PIDFILE @@ -44,89 +42,34 @@ fix_etc_shadow() { shopt -s nullglob for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do - slice=$(basename ${file%*.conf}) - if grep -q "$slice:\!\!" /etc/shadow ; then - sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow - fi + slice=$(basename ${file%*.conf}) + if grep -q "$slice:\!\!" /etc/shadow ; then + sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow + fi done } # keep essential services running restart_services() { - for service in sshd pl_sshd swapmon nm proper ; do - echo "* Checking $service" - status=$(service $service status) - if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then - echo "* Restarting $service" - service $service start - fi + for service in sshd pl_sshd swapmon nm fprobe-ulog codemux; do + chkconfig --list $service | grep -q 3:on || continue + echo "* Checking $service" + status=$(service $service status) + if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then + echo "* Restarting $service" + service $service start + fi done } -# keep netflow running -restart_netflow() { - echo "* Checking netflow" - echo "sudo /sbin/service netflow restart" | su - pl_netflow - if [ $? -ne 0 ] ; then - echo "* Restarting netflow" - service netflow-init start - vserver pl_netflow start - echo "sudo /sbin/service netflow restart" | su - pl_netflow - fi -} - -# GPG keys are installed in /etc/pki/rpm-gpg by both the Boot Manager -# during initial installation, and by PlanetLabConf during daily -# updates. NodeUpdate imports the keys into the RPM database before -# running yum daily. vserver-reference copies and imports the keys -# into the reference images and system slices daily. The only parts of -# this process that are actually necessary, are the Boot Manager and -# vserver-reference. However, we do not want to force a re-install of -# all nodes, and we do not want to force an update of -# vserver-reference, so in the meantime, PlanetLabConf and NodeUpdate -# take care of getting the keys installed and imported in /, and this -# script takes care of getting them installed in the reference images -# and system slices, until we can get a new vserver-reference image -# pushed out. -update_vserver_reference() { - echo "* Updating VServer reference" - - shopt -s nullglob - - VROOTS="/vservers/vserver-reference /vservers/.vcache/* /vservers/${PLC_SLICE_PREFIX}_*" - - # Copy configuration files from host to slices - for file in \ - /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \ - /etc/planetlab/plc_config* /etc/planetlab/php/* \ - /etc/pki/rpm-gpg/* ; do - if [ -r $file ] ; then - for vroot in $VROOTS ; do - install -D -m 644 $file $vroot/$file - done - fi - done - - # (Re)install GPG signing keys - if [ -d /etc/pki/rpm-gpg ] ; then - for vroot in $VROOTS ; do - chroot $vroot rpm --allmatches -e gpg-pubkey || : - chroot $vroot rpm --import /etc/pki/rpm-gpg/* || : - done - fi -} - # kill all the processes running in slice contexts vkillall() { vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }' # unmounts all the /proc and /dev/pts mounts in each vserver tries=10 while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do - tries=$(($tries -1)) - # arizona_stork seems to generate some weird mount points of the form - # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be - # /vservers/arizona_stork/tmp/0.886421543959 - awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts + tries=$(($tries -1)) + awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts done } @@ -139,8 +82,8 @@ fix_vservers() { mkdir -p /vservers/.vtmp tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX) if [ $? -eq 0 ] ; then - rm -f $tmp - return 0 + rm -f $tmp + return 0 fi # kill all processes running in slice contexts @@ -149,30 +92,30 @@ fix_vservers() { # stop vcached pidfile=/var/run/vcached.pid if [ -r "$pidfile" ] ; then - kill $(cat $pidfile) + kill $(cat $pidfile) fi touch $pidfile # unmounts /vservers if umount /vservers ; then # install expect if necessary - if ! rpm -q expect ; then - yum -y install expect - fi + if ! rpm -q expect ; then + yum -y install expect + fi # tell expect to hit the 'y' key every time fsck asks - expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "?" { send "y\r"; exp_continue }' + expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "?" { send "y\r"; exp_continue }' # blow away the vserver cache - rm -rf /vservers/.vcache/* + rm -rf /vservers/.vcache/* # XXX re-mount /vservers - # mount /vservers + # mount /vservers - # shutdown instead to avoid clearing disk quotas - shutdown -r now "/vservers filesystem repaired, rebooting" + # shutdown instead to avoid clearing disk quotas + shutdown -r now "/vservers filesystem repaired, rebooting" else - echo "Unable to unmount /vservers!" >&2 + echo "Unable to unmount /vservers!" >&2 fi # allow vcached to run again @@ -192,12 +135,12 @@ kill_duplicate_ssh() { ps -C sshd -o pid=,start_time=,command= | grep "$slice \[priv\]" | while read pid start_time command ; do - start_time=$(date -d "$start_time" +%s) - min=$(date -d "6 hours ago" +%s) - if [ $start_time -lt $min ] ; then - echo "* Killing $slice sshd pid $pid" - kill -9 $pid - fi + start_time=$(date -d "$start_time" +%s) + min=$(date -d "6 hours ago" +%s) + if [ $start_time -lt $min ] ; then + echo "* Killing $slice sshd pid $pid" + kill -9 $pid + fi done fi done @@ -209,7 +152,7 @@ kill_nm_inslice(){ line=$(vps aux | grep $pid) echo NM found in slice. Killing PID $pid echo $line - kill -9 $pid + vkill -9 $pid done } @@ -227,24 +170,64 @@ kill_nonroot_nm(){ kill_multi_nm(){ # if there is more than one nm running around, kill them, then nm restart pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}') - i=0 - for pid in $pids ; do - i=$[$i+1] - done - if [ $i -gt 1 ] ; then - # stop nm - echo "More than 1 NM found belonging to root. Restarting NM." - /etc/init.d/nm stop - pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}') - for pid in $pids ; do - kill -9 $pid - done - /etc/init.d/nm start - fi + i=0 + for pid in $pids ; do + i=$[$i+1] + done + if [ $i -gt 1 ] ; then + # stop nm + echo "More than 1 NM found belonging to root. Restarting NM." + /etc/init.d/nm stop + pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}') + for pid in $pids ; do + kill -9 $pid + done + /etc/init.d/nm start + fi } + +fix_rpm() { + echo "* Checking for stuck rpm processes" + + rpm_count=`pgrep -f "rpm" | wc -l` + + if [[ $rpm_count -ge 6 ]]; then + echo "* $rpm_count rpm processes found" + + # kill rpm processes, attempt up to 10 times and then give up + try_count=0 + rpm_count=`pgrep "rpm|yum" | wc -l` + while [[ $rpm_count -gt 0 ]]; do + echo "* killing rpm/yum processes" + killall -9 rpm rpmd rpmq rpmk yum + sleep 1 + rpm_count=`pgrep "rpm|yum" | wc -l` + try_count=`expr $try_count + 1` + if [[ $try_count -ge 10 ]]; then + echo "* failed to kill rpm processes" + return + fi + done + + # remove lock files + echo "* deleting rpm lock files" + rm -f /var/lib/rpm/__* + + # rebuild rpm database + echo "* rebuilding rpm database" + rpm --rebuilddb + + echo "* rpm repair sequence complete" + + fi +} + # XXX kill zombie slices -# XXX reboot if boot state changes +run restart_services + +run fix_rpm + run kill_nonroot_nm run kill_nm_inslice @@ -255,11 +238,4 @@ run fix_vservers run fix_etc_shadow -run restart_services - -run restart_netflow - run kill_duplicate_ssh - -run update_vserver_reference -