X-Git-Url: http://git.onelab.eu/?p=mom.git;a=blobdiff_plain;f=pl_mop.sh;h=49222770d418f13607c09126939cd276521c18cc;hp=e40b872d31615b3ee48379ab621cd99265ff9394;hb=a7ab53cbaf3037b67fbda98fed3887c905d93e96;hpb=10c81030eb73f2cd7d8ac83565ec14932cf6f7a5

diff --git a/pl_mop.sh b/pl_mop.sh
index e40b872..4922277 100755
--- a/pl_mop.sh
+++ b/pl_mop.sh
@@ -5,18 +5,23 @@
 # Mark Huang <mlhuang@cs.princeton.edu>
 # Copyright (C) 2005 The Trustees of Princeton University
 #
-# $Id: pl_mop.sh,v 1.3 2005/12/01 23:38:58 mlhuang Exp $
-#
 
 PATH=/sbin:/usr/sbin:$PATH
 
+# Source the PLC configuration if readable; otherwise default the slice prefix
+if [ -r /etc/planetlab/plc_config ] ; then
+    . /etc/planetlab/plc_config
+else
+    PLC_SLICE_PREFIX="pl"
+fi
+
 PIDFILE=/var/run/pl_mop.pid
 
 # Record PID
 if [ -f $PIDFILE ] ; then
     if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
-	logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
-	exit 1
+        logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
+        exit 1
     fi
 fi
 echo $$ > $PIDFILE
@@ -37,58 +42,34 @@ fix_etc_shadow() {
 
     shopt -s nullglob
     for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
-	slice=$(basename ${file%*.conf})
-	if grep -q "$slice:\!\!" /etc/shadow ; then
-	    sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
-	fi
+        slice=$(basename ${file%*.conf})
+        if grep -q "$slice:\!\!" /etc/shadow ; then
+            sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
+        fi
     done
 }
 
 # keep essential services running
 restart_services() {
-    for service in sshd pl_sshd pl_mom pl_nm proper ; do
-	echo "* Checking $service"
-	status=$(service $service status)
-	if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
-	    echo "* Restarting $service"
-	    service $service start
-	fi
+    for service in sshd pl_sshd swapmon nm fprobe-ulog codemux; do
+	chkconfig --list $service | grep -q 3:on || continue
+        echo "* Checking $service"
+        status=$(service $service status)
+        if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
+            echo "* Restarting $service"
+            service $service start
+        fi
     done
 }
 
-# keep netflow running
-restart_netflow() {
-    echo "* Checking netflow"
-    echo "sudo /sbin/service netflow restart" | su - pl_netflow
-    if [ $? -ne 0 ] ; then
-	echo "* Restarting netflow"
-	service netflow-init start
-	vserver pl_netflow start
-	echo "sudo /sbin/service netflow restart" | su - pl_netflow
-    fi
-}
-
-# keep pl_conf running
-restart_pl_conf() {
-    echo "* Checking pl_conf"
-    vserver pl_conf exec /sbin/service pl_conf status >/dev/null 2>&1
-    if [ $? -ne 0 ] ; then
-	echo "* Restarting pl_conf"
-	vserver pl_conf restart
-    fi
-}
-
 # kill all the processes running in slice contexts
 vkillall() {
     vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
     # unmounts all the /proc and /dev/pts mounts in each vserver
     tries=10
     while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
-	tries=$(($tries -1))
-	# arizona_stork seems to generate some weird mount points of the form
-	# /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
-	# /vservers/arizona_stork/tmp/0.886421543959
-	awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
+        tries=$(($tries -1))
+        awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
     done
 }   
 
@@ -101,8 +82,8 @@ fix_vservers() {
     mkdir -p /vservers/.vtmp
     tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
     if [ $? -eq 0 ] ; then
-	rm -f $tmp
-	return 0
+        rm -f $tmp
+        return 0
     fi
 
     # kill all processes running in slice contexts
@@ -111,30 +92,30 @@ fix_vservers() {
     # stop vcached
     pidfile=/var/run/vcached.pid
     if [ -r "$pidfile" ] ; then
-	kill $(cat $pidfile)
+        kill $(cat $pidfile)
     fi
     touch $pidfile
 
     # unmounts /vservers
     if umount /vservers ; then
         # install expect if necessary
-	if ! rpm -q expect ; then
-	    yum -y install expect
-	fi
+        if ! rpm -q expect ; then
+            yum -y install expect
+        fi
 
         # tell expect to hit the 'y' key every time fsck asks
-	expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
+        expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
 
         # blow away the vserver cache
-	rm -rf /vservers/.vcache/*
+        rm -rf /vservers/.vcache/*
 
         # XXX re-mount /vservers
-	# mount /vservers
+        # mount /vservers
 
-	# shutdown instead to avoid clearing disk quotas
-	shutdown -r now "/vservers filesystem repaired, rebooting"
+        # shutdown instead to avoid clearing disk quotas
+        shutdown -r now "/vservers filesystem repaired, rebooting"
     else
-	echo "Unable to unmount /vservers!" >&2
+        echo "Unable to unmount /vservers!" >&2
     fi
 
     # allow vcached to run again
@@ -149,34 +130,112 @@ kill_duplicate_ssh() {
     grep " \[priv\]" |
     sort | uniq -c |
     while read instances sshd slice priv ; do
-	# kill all old instances
-	if [ $instances -gt 10 ] ; then
-	    ps -C sshd -o pid=,start_time=,command= |
-	    grep "$slice \[priv\]" |
-	    while read pid start_time command ; do
-		start_time=$(date -d "$start_time" +%s)
-		min=$(date -d "6 hours ago" +%s)
-		if [ $start_time -lt $min ] ; then
-		    echo "* Killing $slice sshd pid $pid"
-		    kill -9 $pid
-		fi
-	    done
-	fi
+    # once a slice has more than 10 sshd instances, kill those older than 6 hours
+    if [ $instances -gt 10 ] ; then
+        ps -C sshd -o pid=,start_time=,command= |
+        grep "$slice \[priv\]" |
+        while read pid start_time command ; do
+            start_time=$(date -d "$start_time" +%s)
+            min=$(date -d "6 hours ago" +%s)
+            if [ $start_time -lt $min ] ; then
+                echo "* Killing $slice sshd pid $pid"
+                kill -9 $pid
+            fi
+        done
+    fi
     done
 }
 
-# XXX kill zombie slices
+kill_nm_inslice(){
+    pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
+    for pid in $pids ; do
+        line=$(vps aux | grep $pid)
+        echo NM found in slice. Killing PID $pid
+        echo $line
+        vkill -9 $pid
+    done
+}
 
-# XXX reboot if boot state changes
+kill_nonroot_nm(){
+    # For whatever reason, some NMs, after fork and chcontext, don't actually chcontext.  Kill them.
+    pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+    for pid in $pids ; do
+        line=$(ps aux | grep $pid)
+        echo NM found not belonging to root. Killing PID $pid
+        echo $line
+        kill -9 $pid
+    done
+}
 
-run fix_vservers
+kill_multi_nm(){
+    # If more than one root-owned NM is running, stop NM, kill any leftovers, then start NM again.
+    pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+    i=0
+    for pid in $pids ; do
+        i=$[$i+1]
+    done
+    if [ $i -gt 1 ] ; then
+        # stop nm
+        echo "More than 1 NM found belonging to root.  Restarting NM."
+        /etc/init.d/nm stop 
+        pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+        for pid in $pids ; do
+            kill -9 $pid
+        done
+        /etc/init.d/nm start
+    fi
+}
 
-run fix_etc_shadow
+fix_rpm() {
+    echo "* Checking for stuck rpm processes"
+
+    rpm_count=`pgrep -f "rpm" | wc -l`
+
+    if [[ $rpm_count -ge 6 ]]; then
+        echo "* $rpm_count rpm processes found"
+
+        # kill rpm/yum processes; attempt up to 10 times, then give up
+        try_count=0
+        rpm_count=`pgrep "rpm|yum" | wc -l`
+        while [[ $rpm_count -gt 0 ]]; do
+	    echo "* killing rpm/yum processes"
+            killall -9 rpm rpmd rpmq rpmk yum
+            sleep 1
+            rpm_count=`pgrep "rpm|yum" | wc -l`
+            try_count=`expr $try_count + 1`
+            if [[ $try_count -ge 10 ]]; then
+                echo "* failed to kill rpm processes"
+                return
+            fi
+        done
+
+        # remove lock files
+        echo "* deleting rpm lock files"
+        rm -f /var/lib/rpm/__*
+
+        # rebuild rpm database
+        echo "* rebuilding rpm database"
+        rpm --rebuilddb
+
+        echo "* rpm repair sequence complete"
+
+    fi
+}
+
+# XXX kill zombie slices
 
 run restart_services
 
-run restart_pl_conf
+run fix_rpm
+
+run kill_nonroot_nm
+
+run kill_nm_inslice
 
-run restart_netflow
+run kill_multi_nm
+
+run fix_vservers
+
+run fix_etc_shadow
 
 run kill_duplicate_ssh