# Mark Huang <mlhuang@cs.princeton.edu>
# Copyright (C) 2005 The Trustees of Princeton University
#
-# $Id: pl_mop.sh,v 1.2 2005/11/03 17:23:25 mlhuang Exp $
+# $Id$
#
PATH=/sbin:/usr/sbin:$PATH
+# Parse PLC configuration
+if [ -r /etc/planetlab/plc_config ] ; then
+ . /etc/planetlab/plc_config
+else
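+ # default slice prefix when /etc/planetlab/plc_config is unreadable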
+ PLC_SLICE_PREFIX="pl"
+fi
+
PIDFILE=/var/run/pl_mop.pid
# Record PID
if [ -f $PIDFILE ] ; then
if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
- logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
- exit 1
+ logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
+ exit 1
fi
fi
echo $$ > $PIDFILE
shopt -s nullglob
for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
- slice=$(basename ${file%*.conf})
- if grep -q "$slice:\!\!" /etc/shadow ; then
- sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
- fi
+ slice=$(basename ${file%*.conf})
+ if grep -q "$slice:\!\!" /etc/shadow ; then
+ sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
+ fi
done
}
# keep essential services running
restart_services() {
- for service in sshd pl_sshd pl_mom pl_nm proper ; do
- echo "* Checking $service"
- status=$(service $service status)
- if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
- echo "* Restarting $service"
- service $service start
- fi
+ for service in sshd pl_sshd swapmon nm proper ; do
+ echo "* Checking $service"
+ status=$(service $service status)
+ if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
+ echo "* Restarting $service"
+ service $service start
+ fi
done
}
echo "* Checking netflow"
echo "sudo /sbin/service netflow restart" | su - pl_netflow
if [ $? -ne 0 ] ; then
- echo "* Restarting netflow"
- service netflow-init start
- vserver pl_netflow start
- echo "sudo /sbin/service netflow restart" | su - pl_netflow
+ echo "* Restarting netflow"
+ service netflow-init start
+ vserver pl_netflow start
+ echo "sudo /sbin/service netflow restart" | su - pl_netflow
fi
}
-# keep pl_conf running
-restart_pl_conf() {
- echo "* Checking pl_conf"
- vserver pl_conf exec /sbin/service pl_conf status >/dev/null 2>&1
- if [ $? -ne 0 ] ; then
- echo "* Restarting pl_conf"
- vserver pl_conf restart
+# GPG keys are installed in /etc/pki/rpm-gpg both by the Boot Manager
+# during initial installation and by PlanetLabConf during daily
+# updates. NodeUpdate imports the keys into the RPM database before
+# running yum daily. vserver-reference copies and imports the keys
+# into the reference images and system slices daily. The only parts of
+# this process that are actually necessary are the Boot Manager and
+# vserver-reference. However, we do not want to force a re-install of
+# all nodes, and we do not want to force an update of
+# vserver-reference, so in the meantime, PlanetLabConf and NodeUpdate
+# take care of getting the keys installed and imported in /, and this
+# script takes care of getting them installed in the reference images
+# and system slices, until we can get a new vserver-reference image
+# pushed out.
+update_vserver_reference() {
+ echo "* Updating VServer reference"
+
+ shopt -s nullglob
+
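+ # reference image, cached pre-built images, and ${PLC_SLICE_PREFIX}_* system slices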
+ VROOTS="/vservers/vserver-reference /vservers/.vcache/* /vservers/${PLC_SLICE_PREFIX}_*"
+
+ # Copy configuration files from host to slices
+ for file in \
+ /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \
+ /etc/planetlab/plc_config* /etc/planetlab/php/* \
+ /etc/pki/rpm-gpg/* ; do
+ if [ -r $file ] ; then
+ for vroot in $VROOTS ; do
+ install -D -m 644 $file $vroot/$file
+ done
+ fi
+ done
+
+ # (Re)install GPG signing keys
+ if [ -d /etc/pki/rpm-gpg ] ; then
+ for vroot in $VROOTS ; do
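+ # drop previously imported keys before importing the current set;
+ # || : swallows failures (e.g. no gpg-pubkey package installed yet)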
+ chroot $vroot rpm --allmatches -e gpg-pubkey || :
+ chroot $vroot rpm --import /etc/pki/rpm-gpg/* || :
+ done
fi
-}
+}
# kill all the processes running in slice contexts
vkillall() {
# unmounts all the /proc and /dev/pts mounts in each vserver
tries=10
while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
- tries=$(($tries -1))
- # arizona_stork seems to generate some weird mount points of the form
- # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
- # /vservers/arizona_stork/tmp/0.886421543959
- awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
+ tries=$(($tries -1))
+ # arizona_stork seems to generate some weird mount points of the form
+ # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
+ # /vservers/arizona_stork/tmp/0.886421543959
+ awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
done
}
mkdir -p /vservers/.vtmp
tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
if [ $? -eq 0 ] ; then
- rm -f $tmp
- return 0
+ rm -f $tmp
+ return 0
fi
# kill all processes running in slice contexts
# stop vcached
pidfile=/var/run/vcached.pid
if [ -r "$pidfile" ] ; then
- kill $(cat $pidfile)
+ kill $(cat $pidfile)
fi
touch $pidfile
# unmounts /vservers
if umount /vservers ; then
# install expect if necessary
- if ! rpm -q expect ; then
- yum -y install expect
- fi
+ if ! rpm -q expect ; then
+ yum -y install expect
+ fi
# tell expect to hit the 'y' key every time fsck asks
- expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
+ expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
# blow away the vserver cache
- rm -rf /vservers/.vcache/*
+ rm -rf /vservers/.vcache/*
- # re-mount /vservers
- mount /vservers
+
+ # XXX re-mount /vservers
+ # mount /vservers
+ # shutdown instead to avoid clearing disk quotas
+ shutdown -r now "/vservers filesystem repaired, rebooting"
else
- echo "Unable to unmount /vservers!" >&2
+ echo "Unable to unmount /vservers!" >&2
fi
# allow vcached to run again
grep " \[priv\]" |
sort | uniq -c |
while read instances sshd slice priv ; do
- # kill all old instances
- if [ $instances -gt 10 ] ; then
- ps -C sshd -o pid=,start_time=,command= |
- grep "$slice \[priv\]" |
- while read pid start_time command ; do
- start_time=$(date -d "$start_time" +%s)
- min=$(date -d "6 hours ago" +%s)
- if [ $start_time -lt $min ] ; then
- echo "* Killing $slice sshd pid $pid"
- kill -9 $pid
- fi
- done
- fi
+ # kill all old instances
+ if [ $instances -gt 10 ] ; then
+ ps -C sshd -o pid=,start_time=,command= |
+ grep "$slice \[priv\]" |
+ while read pid start_time command ; do
+ start_time=$(date -d "$start_time" +%s)
+ min=$(date -d "6 hours ago" +%s)
+ if [ $start_time -lt $min ] ; then
+ echo "* Killing $slice sshd pid $pid"
+ kill -9 $pid
+ fi
+ done
+ fi
done
}
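+# kill NM (nm.py) instances found running inside a slice context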
+kill_nm_inslice(){
+ pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
+ for pid in $pids ; do
+ line=$(vps aux | grep $pid)
+ echo NM found in slice. Killing PID $pid
+ echo $line
+ kill -9 $pid
+ done
+}
+
+kill_nonroot_nm(){
+ # For whatever reason, some NMs fail to change context after fork and chcontext. Kill them.
+ pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+ for pid in $pids ; do
+ line=$(ps aux | grep $pid)
+ echo NM found not belonging to root. Killing PID $pid
+ echo $line
+ kill -9 $pid
+ done
+}
+
+kill_multi_nm(){
+ # If more than one NM is running as root, kill them all and restart NM.
+ pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+ i=0
+ for pid in $pids ; do
+ i=$((i + 1))
+ done
+ if [ $i -gt 1 ] ; then
+ # stop nm
+ echo "More than 1 NM found belonging to root. Restarting NM."
+ /etc/init.d/nm stop
+ pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
+ for pid in $pids ; do
+ kill -9 $pid
+ done
+ /etc/init.d/nm start
+ fi
+}
# XXX kill zombie slices
# XXX reboot if boot state changes
+run kill_nonroot_nm
+
+run kill_nm_inslice
+
+run kill_multi_nm
run fix_vservers
run restart_services
-run restart_pl_conf
-
run restart_netflow
run kill_duplicate_ssh
+
+run update_vserver_reference
+