pl_mop.sh

   1 #!/bin/bash
   2 #
   3 # Runs once a day to "fix" nodes in various ways
   4 #
   5 # Mark Huang <mlhuang@cs.princeton.edu>
   6 # Copyright (C) 2005 The Trustees of Princeton University
   7 #
   8
   9 PATH=/sbin:/usr/sbin:$PATH
  10
  11 # Parse PLC configuration
  12 if [ -r /etc/planetlab/plc_config ] ; then
  13     . /etc/planetlab/plc_config
  14 else
  15     PLC_SLICE_PREFIX="pl"
  16 fi
  17
  18 PIDFILE=/var/run/pl_mop.pid
  19
  20 # Record PID
  21 if [ -f $PIDFILE ] ; then
  22     if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
  23         logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
  24         exit 1
  25     fi
  26 fi
  27 echo $$ > $PIDFILE
  28
  29 # Clean up stale lock files
  30 trap "rm -f $PIDFILE" EXIT
  31
  32 # Run a command and log its output to syslog
  33 run() {
  34     eval $* 2>&1 | logger -p info -t "pl_mom: $1"
  35 }
  36
  37 # OpenSSH server 3.8 and above refuse login for "locked"
  38 # accounts. Replace "!!" with "*" in /etc/shadow for all VServer
  39 # accounts.
  40 fix_etc_shadow() {
  41     echo "* Fixing /etc/shadow"
  42
  43     shopt -s nullglob
  44     for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
  45         slice=$(basename ${file%*.conf})
  46         if grep -q "$slice:\!\!" /etc/shadow ; then
  47             sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
  48         fi
  49     done
  50 }
  51
  52 # keep essential services running
  53 restart_services() {
  54     for service in sshd pl_sshd swapmon nm fprobe-ulog codemux; do
  55         chkconfig --list $service | grep -q 3:on || continue
  56         echo "* Checking $service"
  57         status=$(service $service status)
  58         if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
  59             echo "* Restarting $service"
  60             service $service start
  61         fi
  62     done
  63 }
  64
  65 # kill all the processes running in slice contexts
  66 vkillall() {
  67     vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
  68     # unmounts all the /proc and /dev/pts mounts in each vserver
  69     tries=10
  70     while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
  71         tries=$(($tries -1))
  72         awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
  73     done
  74 }
  75
  76 # /vservers gets re-mounted read-only by the kernel if an ext3 journal
  77 # transaction aborts
  78 fix_vservers() {
  79     echo "* Fixing /vservers"
  80
  81     # test to see if /vservers is mounted read-only
  82     mkdir -p /vservers/.vtmp
  83     tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
  84     if [ $? -eq 0 ] ; then
  85         rm -f $tmp
  86         return 0
  87     fi
  88
  89     # kill all processes running in slice contexts
  90     vkillall
  91
  92     # stop vcached
  93     pidfile=/var/run/vcached.pid
  94     if [ -r "$pidfile" ] ; then
  95         kill $(cat $pidfile)
  96     fi
  97     touch $pidfile
  98
  99     # unmounts /vservers
 100     if umount /vservers ; then
 101         # install expect if necessary
 102         if ! rpm -q expect ; then
 103             yum -y install expect
 104         fi
 105
 106         # tell expect to hit the 'y' key every time fsck asks
 107         expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
 108
 109         # blow away the vserver cache
 110         rm -rf /vservers/.vcache/*
 111
 112         # XXX re-mount /vservers
 113         # mount /vservers
 114
 115         # shutdown instead to avoid clearing disk quotas
 116         shutdown -r now "/vservers filesystem repaired, rebooting"
 117     else
 118         echo "Unable to unmount /vservers!" >&2
 119     fi
 120
 121     # allow vcached to run again
 122     rm -f $pidfile
 123 }
 124
 125 kill_duplicate_ssh() {
 126     echo "* Killing stale duplicate SSH instances"
 127
 128     # count the number of SSH instances started by each slice
 129     ps -C sshd -o command= |
 130     grep " \[priv\]" |
 131     sort | uniq -c |
 132     while read instances sshd slice priv ; do
 133     # kill all old instances
 134     if [ $instances -gt 10 ] ; then
 135         ps -C sshd -o pid=,start_time=,command= |
 136         grep "$slice \[priv\]" |
 137         while read pid start_time command ; do
 138             start_time=$(date -d "$start_time" +%s)
 139             min=$(date -d "6 hours ago" +%s)
 140             if [ $start_time -lt $min ] ; then
 141                 echo "* Killing $slice sshd pid $pid"
 142                 kill -9 $pid
 143             fi
 144         done
 145     fi
 146     done
 147 }
 148
 149 kill_nm_inslice(){
 150     pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
 151     for pid in $pids ; do
 152         line=$(vps aux | grep $pid)
 153         echo NM found in slice. Killing PID $pid
 154         echo $line
 155         vkill -9 $pid
 156     done
 157 }
 158
 159 kill_nonroot_nm(){
 160     # For whatever reason, Some NM's, after fork and chcontext...don't chcontext.  Kill them.
 161     pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
 162     for pid in $pids ; do
 163         line=$(ps aux | grep $pid)
 164         echo NM found not belonging to root. Killing PID $pid
 165         echo $line
 166         kill -9 $pid
 167     done
 168 }
 169
 170 kill_multi_nm(){
 171     # if there is more than one nm running around, kill them, then nm restart
 172     pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
 173     i=0
 174     for pid in $pids ; do
 175         i=$[$i+1]
 176     done
 177     if [ $i -gt 1 ] ; then
 178         # stop nm
 179         echo "More than 1 NM found belonging to root.  Restarting NM."
 180         /etc/init.d/nm stop
 181         pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
 182         for pid in $pids ; do
 183             kill -9 $pid
 184         done
 185         /etc/init.d/nm start
 186     fi
 187 }
 188
 189 fix_rpm() {
 190     echo "* Checking for stuck rpm processes"
 191
 192     rpm_count=`pgrep -f "rpm" | wc -l`
 193
 194     if [[ $rpm_count -ge 6 ]]; then
 195         echo "* $rpm_count rpm processes found"
 196
 197         # kill rpm processes, attempt up to 10 times and then give up
 198         try_count=0
 199         rpm_count=`pgrep "rpm|yum" | wc -l`
 200         while [[ $rpm_count -gt 0 ]]; do
 201             echo "* killing rpm/yum processes"
 202             killall -9 rpm rpmd rpmq rpmk yum
 203             sleep 1
 204             rpm_count=`pgrep "rpm|yum" | wc -l`
 205             try_count=`expr $try_count + 1`
 206             if [[ $try_count -ge 10 ]]; then
 207                 echo "* failed to kill rpm processes"
 208                 return
 209             fi
 210         done
 211
 212         # remove lock files
 213         echo "* deleting rpm lock files"
 214         rm -f /var/lib/rpm/__*
 215
 216         # rebuild rpm database
 217         echo "* rebuilding rpm database"
 218         rpm --rebuilddb
 219
 220         echo "* rpm repair sequence complete"
 221
 222     fi
 223 }
 224
 225 # XXX kill zombie slices
 226
 227 run restart_services
 228
 229 run fix_rpm
 230
 231 run kill_nonroot_nm
 232
 233 run kill_nm_inslice
 234
 235 run kill_multi_nm
 236
 237 run fix_vservers
 238
 239 run fix_etc_shadow
 240
 241 run kill_duplicate_ssh