Setting tag mom-2.3-5

[mom.git] / pl_mop.sh
diff --git a/pl_mop.sh b/pl_mop.sh

index 16dba73..4922277 100755 (executable)
--- a/pl_mop.sh
+++ b/pl_mop.sh
@@ -5,8 +5,6 @@
  # Mark Huang <mlhuang@cs.princeton.edu>
  # Copyright (C) 2005 The Trustees of Princeton University
  #
-# $Id$
-#
  
  PATH=/sbin:/usr/sbin:$PATH
  
@@ -22,8 +20,8 @@ PIDFILE=/var/run/pl_mop.pid
  # Record PID
  if [ -f $PIDFILE ] ; then
      if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
-    logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
-    exit 1
+        logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
+        exit 1
      fi
  fi
  echo $$ > $PIDFILE
@@ -44,89 +42,34 @@ fix_etc_shadow() {
  
      shopt -s nullglob
      for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
-    slice=$(basename ${file%*.conf})
-    if grep -q "$slice:\!\!" /etc/shadow ; then
-        sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
-    fi
+        slice=$(basename ${file%*.conf})
+        if grep -q "$slice:\!\!" /etc/shadow ; then
+            sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
+        fi
      done
  }
  
  # keep essential services running
  restart_services() {
-    for service in sshd pl_sshd swapmon nm proper ; do
-    echo "* Checking $service"
-    status=$(service $service status)
-    if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
-        echo "* Restarting $service"
-        service $service start
-    fi
+    for service in sshd pl_sshd swapmon nm fprobe-ulog codemux; do
+       chkconfig --list $service | grep -q 3:on || continue
+        echo "* Checking $service"
+        status=$(service $service status)
+        if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
+            echo "* Restarting $service"
+            service $service start
+        fi
      done
  }
  
-# keep netflow running
-restart_netflow() {
-    echo "* Checking netflow"
-    echo "sudo /sbin/service netflow restart" | su - pl_netflow
-    if [ $? -ne 0 ] ; then
-    echo "* Restarting netflow"
-    service netflow-init start
-    vserver pl_netflow start
-    echo "sudo /sbin/service netflow restart" | su - pl_netflow
-    fi
-}
-
-# GPG keys are installed in /etc/pki/rpm-gpg by both the Boot Manager
-# during initial installation, and by PlanetLabConf during daily
-# updates. NodeUpdate imports the keys into the RPM database before
-# running yum daily. vserver-reference copies and imports the keys
-# into the reference images and system slices daily. The only parts of
-# this process that are actually necessary, are the Boot Manager and
-# vserver-reference. However, we do not want to force a re-install of
-# all nodes, and we do not want to force an update of
-# vserver-reference, so in the meantime, PlanetLabConf and NodeUpdate
-# take care of getting the keys installed and imported in /, and this
-# script takes care of getting them installed in the reference images
-# and system slices, until we can get a new vserver-reference image
-# pushed out.
-update_vserver_reference() {
-    echo "* Updating VServer reference"
-
-    shopt -s nullglob
-
-    VROOTS="/vservers/vserver-reference /vservers/.vcache/* /vservers/${PLC_SLICE_PREFIX}_*"
-
-    # Copy configuration files from host to slices
-    for file in \
-    /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \
-    /etc/planetlab/plc_config* /etc/planetlab/php/* \
-    /etc/pki/rpm-gpg/* ; do
-      if [ -r $file ] ; then
-      for vroot in $VROOTS ; do
-          install -D -m 644 $file $vroot/$file
-      done
-      fi
-    done
-
-    # (Re)install GPG signing keys
-    if [ -d /etc/pki/rpm-gpg ] ; then
-    for vroot in $VROOTS ; do
-        chroot $vroot rpm --allmatches -e gpg-pubkey || :
-        chroot $vroot rpm --import /etc/pki/rpm-gpg/* || :
-    done
-    fi
-}    
-
  # kill all the processes running in slice contexts
  vkillall() {
      vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
      # unmounts all the /proc and /dev/pts mounts in each vserver
      tries=10
      while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
-    tries=$(($tries -1))
-    # arizona_stork seems to generate some weird mount points of the form
-    # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
-    # /vservers/arizona_stork/tmp/0.886421543959
-    awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
+        tries=$(($tries -1))  # one attempt used; the loop gives up when this reaches 0
+        awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts  # sub() cuts a "\040(deleted)" tail off the mount point before umount
      done
  }   
  
@@ -139,8 +82,8 @@ fix_vservers() {
      mkdir -p /vservers/.vtmp
      tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
      if [ $? -eq 0 ] ; then
-    rm -f $tmp
-    return 0
+        rm -f $tmp
+        return 0
      fi
  
      # kill all processes running in slice contexts
@@ -149,30 +92,30 @@ fix_vservers() {
      # stop vcached
      pidfile=/var/run/vcached.pid
      if [ -r "$pidfile" ] ; then
-    kill $(cat $pidfile)
+        kill $(cat $pidfile)
      fi
      touch $pidfile
  
      # unmounts /vservers
      if umount /vservers ; then
          # install expect if necessary
-    if ! rpm -q expect ; then
-        yum -y install expect
-    fi
+        if ! rpm -q expect ; then
+            yum -y install expect
+        fi
  
          # tell expect to hit the 'y' key every time fsck asks
-    expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
+        expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
  
          # blow away the vserver cache
-    rm -rf /vservers/.vcache/*
+        rm -rf /vservers/.vcache/*
  
          # XXX re-mount /vservers
-    # mount /vservers
+        # mount /vservers
  
-    # shutdown instead to avoid clearing disk quotas
-    shutdown -r now "/vservers filesystem repaired, rebooting"
+        # shutdown instead to avoid clearing disk quotas
+        shutdown -r now "/vservers filesystem repaired, rebooting"
      else
-    echo "Unable to unmount /vservers!" >&2
+        echo "Unable to unmount /vservers!" >&2
      fi
  
      # allow vcached to run again
@@ -192,12 +135,12 @@ kill_duplicate_ssh() {
          ps -C sshd -o pid=,start_time=,command= |
          grep "$slice \[priv\]" |
          while read pid start_time command ; do
-        start_time=$(date -d "$start_time" +%s)
-        min=$(date -d "6 hours ago" +%s)
-        if [ $start_time -lt $min ] ; then
-            echo "* Killing $slice sshd pid $pid"
-            kill -9 $pid
-        fi
+            start_time=$(date -d "$start_time" +%s)
+            min=$(date -d "6 hours ago" +%s)
+            if [ $start_time -lt $min ] ; then
+                echo "* Killing $slice sshd pid $pid"
+                kill -9 $pid
+            fi
          done
      fi
      done
@@ -209,7 +152,7 @@ kill_nm_inslice(){
          line=$(vps aux | grep $pid)
          echo NM found in slice. Killing PID $pid
          echo $line
-        kill -9 $pid
+        vkill -9 $pid
      done
  }
  
@@ -227,24 +170,64 @@ kill_nonroot_nm(){
  kill_multi_nm(){
      # if there is more than one nm running around, kill them, then nm restart
      pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
-       i=0
-       for pid in $pids ; do
-               i=$[$i+1]
-       done
-       if [ $i -gt 1 ] ; then
-               # stop nm
-               echo "More than 1 NM found belonging to root.  Restarting NM."
-               /etc/init.d/nm stop 
-       pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
-               for pid in $pids ; do
-                       kill -9 $pid
-               done
-               /etc/init.d/nm start
-       fi
+    i=0
+    for pid in $pids ; do
+        i=$[$i+1]
+    done
+    if [ $i -gt 1 ] ; then
+        # stop nm
+        echo "More than 1 NM found belonging to root.  Restarting NM."
+        /etc/init.d/nm stop 
+        pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+        for pid in $pids ; do
+            kill -9 $pid
+        done
+        /etc/init.d/nm start
+    fi
  }
+
+# Recover from stuck rpm/yum processes: when six or more processes have "rpm"
+# in their command line, kill rpm/yum (at most 10 attempts), delete the
+# /var/lib/rpm/__* lock files and rebuild the rpm database.
+fix_rpm() {
+    echo "* Checking for stuck rpm processes"
+    # NOTE(review): -f matches "rpm" anywhere in a command line; may overcount
+    rpm_count=$(pgrep -f "rpm" | wc -l)
+    if [[ $rpm_count -ge 6 ]]; then
+        echo "* $rpm_count rpm processes found"
+        # kill rpm processes, attempt up to 10 times and then give up.
+        # Poll with -x for exactly the names killall targets; a substring
+        # match on other process names could never drop to zero.
+        local procs='rpm|rpmd|rpmq|rpmk|yum'
+        try_count=0
+        rpm_count=$(pgrep -x "$procs" | wc -l)
+        while [[ $rpm_count -gt 0 ]]; do
+            if [[ $try_count -ge 10 ]]; then
+                echo "* failed to kill rpm processes"
+                return
+            fi
+            echo "* killing rpm/yum processes"
+            killall -9 rpm rpmd rpmq rpmk yum
+            sleep 1
+            rpm_count=$(pgrep -x "$procs" | wc -l)
+            try_count=$((try_count + 1))
+        done
+        # remove lock files
+        echo "* deleting rpm lock files"
+        rm -f /var/lib/rpm/__*
+        # rebuild rpm database
+        echo "* rebuilding rpm database"
+        rpm --rebuilddb
+        echo "* rpm repair sequence complete"
+    fi
+}
+
  # XXX kill zombie slices
  
-# XXX reboot if boot state changes
+run restart_services
+
+run fix_rpm
+
  run kill_nonroot_nm
  
  run kill_nm_inslice
@@ -255,11 +238,4 @@ run fix_vservers
  
  run fix_etc_shadow
  
-run restart_services
-
-run restart_netflow
-
  run kill_duplicate_ssh
-
-run update_vserver_reference
-