pl_mop.sh

   1 #!/bin/bash
   2 #
   3 # Runs once a day to "fix" nodes in various ways
   4 #
   5 # Mark Huang <mlhuang@cs.princeton.edu>
   6 # Copyright (C) 2005 The Trustees of Princeton University
   7 #
   8 # $Id: pl_mop.sh,v 1.5 2006/03/06 20:40:33 mlhuang Exp $
   9 #
  10
  11 PATH=/sbin:/usr/sbin:$PATH
  12
  13 PIDFILE=/var/run/pl_mop.pid
  14
  15 # Record PID
  16 if [ -f $PIDFILE ] ; then
  17     if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
  18         logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
  19         exit 1
  20     fi
  21 fi
  22 echo $$ > $PIDFILE
  23
  24 # Clean up stale lock files
  25 trap "rm -f $PIDFILE" EXIT
  26
  27 # Run a command and log its output to syslog
  28 run() {
  29     eval $* 2>&1 | logger -p info -t "pl_mom: $1"
  30 }
  31
  32 # OpenSSH server 3.8 and above refuse login for "locked"
  33 # accounts. Replace "!!" with "*" in /etc/shadow for all VServer
  34 # accounts.
  35 fix_etc_shadow() {
  36     echo "* Fixing /etc/shadow"
  37
  38     shopt -s nullglob
  39     for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
  40         slice=$(basename ${file%*.conf})
  41         if grep -q "$slice:\!\!" /etc/shadow ; then
  42             sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
  43         fi
  44     done
  45 }
  46
  47 # keep essential services running
  48 restart_services() {
  49     for service in sshd pl_sshd swapmon pl_nm proper ; do
  50         echo "* Checking $service"
  51         status=$(service $service status)
  52         if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
  53             echo "* Restarting $service"
  54             service $service start
  55         fi
  56     done
  57 }
  58
  59 # keep netflow running
  60 restart_netflow() {
  61     echo "* Checking netflow"
  62     echo "sudo /sbin/service netflow restart" | su - pl_netflow
  63     if [ $? -ne 0 ] ; then
  64         echo "* Restarting netflow"
  65         service netflow-init start
  66         vserver pl_netflow start
  67         echo "sudo /sbin/service netflow restart" | su - pl_netflow
  68     fi
  69 }
  70
  71 # keep pl_conf running
  72 restart_pl_conf() {
  73     echo "* Checking pl_conf"
  74     vserver pl_conf exec /sbin/service pl_conf status >/dev/null 2>&1
  75     if [ $? -ne 0 ] ; then
  76         echo "* Restarting pl_conf"
  77         vserver pl_conf stop
  78         vserver pl_conf start
  79     fi
  80 }
  81
  82 # kill all the processes running in slice contexts
  83 vkillall() {
  84     vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
  85     # unmounts all the /proc and /dev/pts mounts in each vserver
  86     tries=10
  87     while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
  88         tries=$(($tries -1))
  89         # arizona_stork seems to generate some weird mount points of the form
  90         # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
  91         # /vservers/arizona_stork/tmp/0.886421543959
  92         awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
  93     done
  94 }
  95
  96 # /vservers gets re-mounted read-only by the kernel if an ext3 journal
  97 # transaction aborts
  98 fix_vservers() {
  99     echo "* Fixing /vservers"
 100
 101     # test to see if /vservers is mounted read-only
 102     mkdir -p /vservers/.vtmp
 103     tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
 104     if [ $? -eq 0 ] ; then
 105         rm -f $tmp
 106         return 0
 107     fi
 108
 109     # kill all processes running in slice contexts
 110     vkillall
 111
 112     # stop vcached
 113     pidfile=/var/run/vcached.pid
 114     if [ -r "$pidfile" ] ; then
 115         kill $(cat $pidfile)
 116     fi
 117     touch $pidfile
 118
 119     # unmounts /vservers
 120     if umount /vservers ; then
 121         # install expect if necessary
 122         if ! rpm -q expect ; then
 123             yum -y install expect
 124         fi
 125
 126         # tell expect to hit the 'y' key every time fsck asks
 127         expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
 128
 129         # blow away the vserver cache
 130         rm -rf /vservers/.vcache/*
 131
 132         # XXX re-mount /vservers
 133         # mount /vservers
 134
 135         # shutdown instead to avoid clearing disk quotas
 136         shutdown -r now "/vservers filesystem repaired, rebooting"
 137     else
 138         echo "Unable to unmount /vservers!" >&2
 139     fi
 140
 141     # allow vcached to run again
 142     rm -f $pidfile
 143 }
 144
 145 kill_duplicate_ssh() {
 146     echo "* Killing stale duplicate SSH instances"
 147
 148     # count the number of SSH instances started by each slice
 149     ps -C sshd -o command= |
 150     grep " \[priv\]" |
 151     sort | uniq -c |
 152     while read instances sshd slice priv ; do
 153         # kill all old instances
 154         if [ $instances -gt 10 ] ; then
 155             ps -C sshd -o pid=,start_time=,command= |
 156             grep "$slice \[priv\]" |
 157             while read pid start_time command ; do
 158                 start_time=$(date -d "$start_time" +%s)
 159                 min=$(date -d "6 hours ago" +%s)
 160                 if [ $start_time -lt $min ] ; then
 161                     echo "* Killing $slice sshd pid $pid"
 162                     kill -9 $pid
 163                 fi
 164             done
 165         fi
 166     done
 167 }
 168
 169 # XXX kill zombie slices
 170
 171 # XXX reboot if boot state changes
 172
 173 run fix_vservers
 174
 175 run fix_etc_shadow
 176
 177 run restart_services
 178
 179 run restart_pl_conf
 180
 181 run restart_netflow
 182
 183 run kill_duplicate_ssh