From f070bb7d9b88cea7fdd164afd52169b04cbaf18d Mon Sep 17 00:00:00 2001
From: Faiyaz Ahmed <faiyaza@cs.princeton.edu>
Date: Tue, 4 Dec 2007 20:50:44 +0000
Subject: [PATCH] Kills and restarts NM when too many found running.

---
 pl_mom.spec |   2 +-
 pl_mop.sh   | 155 +++++++++++++++++++++++++++++++---------------------
 2 files changed, 95 insertions(+), 62 deletions(-)

diff --git a/pl_mom.spec b/pl_mom.spec
index 14fae3c..1cc2a1c 100644
--- a/pl_mom.spec
+++ b/pl_mom.spec
@@ -1,5 +1,5 @@
 %define name pl_mom
-%define version 2.2
+%define version 2.3
 %define release 01%{?pldistro:.%{pldistro}}%{?date:.%{date}}
  
 Summary: PlanetLab node monitoring tools
diff --git a/pl_mop.sh b/pl_mop.sh
index 0471513..16dba73 100755
--- a/pl_mop.sh
+++ b/pl_mop.sh
@@ -22,8 +22,8 @@ PIDFILE=/var/run/pl_mop.pid
 # Record PID
 if [ -f $PIDFILE ] ; then
     if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
-	logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
-	exit 1
+    logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
+    exit 1
     fi
 fi
 echo $$ > $PIDFILE
@@ -44,22 +44,22 @@ fix_etc_shadow() {
 
     shopt -s nullglob
     for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
-	slice=$(basename ${file%*.conf})
-	if grep -q "$slice:\!\!" /etc/shadow ; then
-	    sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
-	fi
+    slice=$(basename ${file%*.conf})
+    if grep -q "$slice:\!\!" /etc/shadow ; then
+        sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
+    fi
     done
 }
 
 # keep essential services running
 restart_services() {
     for service in sshd pl_sshd swapmon nm proper ; do
-	echo "* Checking $service"
-	status=$(service $service status)
-	if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
-	    echo "* Restarting $service"
-	    service $service start
-	fi
+        echo "* Checking $service"
+        status=$(service $service status)   # exit code is tested on the next line
+        if [ $? -ne 0 ] || echo "$status" | grep -q stopped ; then
+            echo "* Restarting $service"
+            service $service start
+        fi
     done
 }
 
@@ -68,10 +68,10 @@ restart_netflow() {
     echo "* Checking netflow"
     echo "sudo /sbin/service netflow restart" | su - pl_netflow
     if [ $? -ne 0 ] ; then
-	echo "* Restarting netflow"
-	service netflow-init start
-	vserver pl_netflow start
-	echo "sudo /sbin/service netflow restart" | su - pl_netflow
+    echo "* Restarting netflow"
+    service netflow-init start
+    vserver pl_netflow start
+    echo "sudo /sbin/service netflow restart" | su - pl_netflow
     fi
 }
 
@@ -97,22 +97,22 @@ update_vserver_reference() {
 
     # Copy configuration files from host to slices
     for file in \
-	/etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \
-	/etc/planetlab/plc_config* /etc/planetlab/php/* \
-	/etc/pki/rpm-gpg/* ; do
+    /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \
+    /etc/planetlab/plc_config* /etc/planetlab/php/* \
+    /etc/pki/rpm-gpg/* ; do
       if [ -r $file ] ; then
-	  for vroot in $VROOTS ; do
-	      install -D -m 644 $file $vroot/$file
-	  done
+      for vroot in $VROOTS ; do
+          install -D -m 644 $file $vroot/$file
+      done
       fi
     done
 
     # (Re)install GPG signing keys
     if [ -d /etc/pki/rpm-gpg ] ; then
-	for vroot in $VROOTS ; do
-	    chroot $vroot rpm --allmatches -e gpg-pubkey || :
-	    chroot $vroot rpm --import /etc/pki/rpm-gpg/* || :
-	done
+    for vroot in $VROOTS ; do
+        chroot $vroot rpm --allmatches -e gpg-pubkey || :
+        chroot $vroot rpm --import /etc/pki/rpm-gpg/* || :
+    done
     fi
 }    
 
@@ -122,11 +122,11 @@ vkillall() {
     # unmounts all the /proc and /dev/pts mounts in each vserver
     tries=10
     while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
-	tries=$(($tries -1))
-	# arizona_stork seems to generate some weird mount points of the form
-	# /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
-	# /vservers/arizona_stork/tmp/0.886421543959
-	awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
+    tries=$(($tries -1))
+    # arizona_stork seems to generate some weird mount points of the form
+    # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
+    # /vservers/arizona_stork/tmp/0.886421543959
+    awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
     done
 }   
 
@@ -139,8 +139,8 @@ fix_vservers() {
     mkdir -p /vservers/.vtmp
     tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
     if [ $? -eq 0 ] ; then
-	rm -f $tmp
-	return 0
+    rm -f $tmp
+    return 0
     fi
 
     # kill all processes running in slice contexts
@@ -149,30 +149,30 @@ fix_vservers() {
     # stop vcached
     pidfile=/var/run/vcached.pid
     if [ -r "$pidfile" ] ; then
-	kill $(cat $pidfile)
+    kill $(cat $pidfile)
     fi
     touch $pidfile
 
     # unmounts /vservers
     if umount /vservers ; then
         # install expect if necessary
-	if ! rpm -q expect ; then
-	    yum -y install expect
-	fi
+    if ! rpm -q expect ; then
+        yum -y install expect
+    fi
 
         # tell expect to hit the 'y' key every time fsck asks
-	expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
+    expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
 
         # blow away the vserver cache
-	rm -rf /vservers/.vcache/*
+    rm -rf /vservers/.vcache/*
 
         # XXX re-mount /vservers
-	# mount /vservers
+    # mount /vservers
 
-	# shutdown instead to avoid clearing disk quotas
-	shutdown -r now "/vservers filesystem repaired, rebooting"
+    # shutdown instead to avoid clearing disk quotas
+    shutdown -r now "/vservers filesystem repaired, rebooting"
     else
-	echo "Unable to unmount /vservers!" >&2
+    echo "Unable to unmount /vservers!" >&2
     fi
 
     # allow vcached to run again
@@ -187,38 +187,70 @@ kill_duplicate_ssh() {
     grep " \[priv\]" |
     sort | uniq -c |
     while read instances sshd slice priv ; do
-	# kill all old instances
-	if [ $instances -gt 10 ] ; then
-	    ps -C sshd -o pid=,start_time=,command= |
-	    grep "$slice \[priv\]" |
-	    while read pid start_time command ; do
-		start_time=$(date -d "$start_time" +%s)
-		min=$(date -d "6 hours ago" +%s)
-		if [ $start_time -lt $min ] ; then
-		    echo "* Killing $slice sshd pid $pid"
-		    kill -9 $pid
-		fi
-	    done
-	fi
+    # kill all old instances
+    if [ $instances -gt 10 ] ; then
+        ps -C sshd -o pid=,start_time=,command= |
+        grep "$slice \[priv\]" |
+        while read pid start_time command ; do
+        start_time=$(date -d "$start_time" +%s)
+        min=$(date -d "6 hours ago" +%s)
+        if [ $start_time -lt $min ] ; then
+            echo "* Killing $slice sshd pid $pid"
+            kill -9 $pid
+        fi
+        done
+    fi
     done
 }
 
 kill_nm_inslice(){
     pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
     for pid in $pids ; do
-		line=$(vps aux | grep $pid)
-		echo Killing PID $pid
-		echo $line
-		kill -9 $pid
-	done
+        line=$(vps aux | awk -v pid="$pid" '$2 == pid')   # exact PID match on column 2
+        echo NM found in slice. Killing PID $pid
+        echo "$line"
+        kill -9 $pid
+    done
+}
+
+kill_nonroot_nm(){
+    # For whatever reason, some NMs fork but never chcontext.  Any nm.py that ps lists under a non-root user is killed.
+    pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+    for pid in $pids ; do
+        line=$(ps aux | awk -v pid="$pid" '$2 == pid')   # exact PID match on column 2
+        echo NM found not belonging to root. Killing PID $pid
+        echo "$line"
+        kill -9 $pid
+    done
 }
 
+kill_multi_nm(){
+    # If more than one root-owned nm.py is running: stop the nm service,
+    # kill -9 whatever nm.py processes are still listed, then start nm again.
+    local pid pids
+    pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+    # word-split the PID list into this function's positional parameters; $# is the count
+    set -- $pids
+    if [ $# -gt 1 ] ; then
+        echo "More than 1 NM found belonging to root.  Restarting NM."
+        /etc/init.d/nm stop
+        # re-scan after the stop so only processes that are still listed get signalled
+        pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+        for pid in $pids ; do
+            kill -9 $pid
+        done
+        /etc/init.d/nm start
+    fi
+}
 # XXX kill zombie slices
 
 # XXX reboot if boot state changes
+run kill_nonroot_nm
 
 run kill_nm_inslice
 
+run kill_multi_nm
+
 run fix_vservers
 
 run fix_etc_shadow
@@ -230,3 +262,4 @@ run restart_netflow
 run kill_duplicate_ssh
 
 run update_vserver_reference
+
-- 
2.43.0