pl_mop.sh

   1 #!/bin/bash
   2 #
   3 # Runs once a day to "fix" nodes in various ways
   4 #
   5 # Mark Huang <mlhuang@cs.princeton.edu>
   6 # Copyright (C) 2005 The Trustees of Princeton University
   7 #
   8 # $Id$
   9 #
  10
  11 PATH=/sbin:/usr/sbin:$PATH
  12
  13 # Parse PLC configuration
  14 if [ -r /etc/planetlab/plc_config ] ; then
  15     . /etc/planetlab/plc_config
  16 else
  17     PLC_SLICE_PREFIX="pl"
  18 fi
  19
  20 PIDFILE=/var/run/pl_mop.pid
  21
  22 # Record PID
  23 if [ -f $PIDFILE ] ; then
  24     if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
  25     logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
  26     exit 1
  27     fi
  28 fi
  29 echo $$ > $PIDFILE
  30
  31 # Clean up stale lock files
  32 trap "rm -f $PIDFILE" EXIT
  33
  34 # Run a command and log its output to syslog
  35 run() {
  36     eval $* 2>&1 | logger -p info -t "pl_mom: $1"
  37 }
  38
  39 # OpenSSH server 3.8 and above refuse login for "locked"
  40 # accounts. Replace "!!" with "*" in /etc/shadow for all VServer
  41 # accounts.
  42 fix_etc_shadow() {
  43     echo "* Fixing /etc/shadow"
  44
  45     shopt -s nullglob
  46     for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
  47     slice=$(basename ${file%*.conf})
  48     if grep -q "$slice:\!\!" /etc/shadow ; then
  49         sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
  50     fi
  51     done
  52 }
  53
  54 # keep essential services running
  55 restart_services() {
  56     for service in sshd pl_sshd swapmon nm proper ; do
  57     echo "* Checking $service"
  58     status=$(service $service status)
  59     if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
  60         echo "* Restarting $service"
  61         service $service start
  62     fi
  63     done
  64 }
  65
  66 # keep netflow running
  67 restart_netflow() {
  68     echo "* Checking netflow"
  69     echo "sudo /sbin/service netflow restart" | su - pl_netflow
  70     if [ $? -ne 0 ] ; then
  71     echo "* Restarting netflow"
  72     service netflow-init start
  73     vserver pl_netflow start
  74     echo "sudo /sbin/service netflow restart" | su - pl_netflow
  75     fi
  76 }
  77
  78 # GPG keys are installed in /etc/pki/rpm-gpg by both the Boot Manager
  79 # during initial installation, and by PlanetLabConf during daily
  80 # updates. NodeUpdate imports the keys into the RPM database before
  81 # running yum daily. vserver-reference copies and imports the keys
  82 # into the reference images and system slices daily. The only parts of
  83 # this process that are actually necessary, are the Boot Manager and
  84 # vserver-reference. However, we do not want to force a re-install of
  85 # all nodes, and we do not want to force an update of
  86 # vserver-reference, so in the meantime, PlanetLabConf and NodeUpdate
  87 # take care of getting the keys installed and imported in /, and this
  88 # script takes care of getting them installed in the reference images
  89 # and system slices, until we can get a new vserver-reference image
  90 # pushed out.
  91 update_vserver_reference() {
  92     echo "* Updating VServer reference"
  93
  94     shopt -s nullglob
  95
  96     VROOTS="/vservers/vserver-reference /vservers/.vcache/* /vservers/${PLC_SLICE_PREFIX}_*"
  97
  98     # Copy configuration files from host to slices
  99     for file in \
 100     /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \
 101     /etc/planetlab/plc_config* /etc/planetlab/php/* \
 102     /etc/pki/rpm-gpg/* ; do
 103       if [ -r $file ] ; then
 104       for vroot in $VROOTS ; do
 105           install -D -m 644 $file $vroot/$file
 106       done
 107       fi
 108     done
 109
 110     # (Re)install GPG signing keys
 111     if [ -d /etc/pki/rpm-gpg ] ; then
 112     for vroot in $VROOTS ; do
 113         chroot $vroot rpm --allmatches -e gpg-pubkey || :
 114         chroot $vroot rpm --import /etc/pki/rpm-gpg/* || :
 115     done
 116     fi
 117 }
 118
 119 # kill all the processes running in slice contexts
 120 vkillall() {
 121     vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
 122     # unmounts all the /proc and /dev/pts mounts in each vserver
 123     tries=10
 124     while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
 125     tries=$(($tries -1))
 126     # arizona_stork seems to generate some weird mount points of the form
 127     # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
 128     # /vservers/arizona_stork/tmp/0.886421543959
 129     awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
 130     done
 131 }
 132
 133 # /vservers gets re-mounted read-only by the kernel if an ext3 journal
 134 # transaction aborts
 135 fix_vservers() {
 136     echo "* Fixing /vservers"
 137
 138     # test to see if /vservers is mounted read-only
 139     mkdir -p /vservers/.vtmp
 140     tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
 141     if [ $? -eq 0 ] ; then
 142     rm -f $tmp
 143     return 0
 144     fi
 145
 146     # kill all processes running in slice contexts
 147     vkillall
 148
 149     # stop vcached
 150     pidfile=/var/run/vcached.pid
 151     if [ -r "$pidfile" ] ; then
 152     kill $(cat $pidfile)
 153     fi
 154     touch $pidfile
 155
 156     # unmounts /vservers
 157     if umount /vservers ; then
 158         # install expect if necessary
 159     if ! rpm -q expect ; then
 160         yum -y install expect
 161     fi
 162
 163         # tell expect to hit the 'y' key every time fsck asks
 164     expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
 165
 166         # blow away the vserver cache
 167     rm -rf /vservers/.vcache/*
 168
 169         # XXX re-mount /vservers
 170     # mount /vservers
 171
 172     # shutdown instead to avoid clearing disk quotas
 173     shutdown -r now "/vservers filesystem repaired, rebooting"
 174     else
 175     echo "Unable to unmount /vservers!" >&2
 176     fi
 177
 178     # allow vcached to run again
 179     rm -f $pidfile
 180 }
 181
 182 kill_duplicate_ssh() {
 183     echo "* Killing stale duplicate SSH instances"
 184
 185     # count the number of SSH instances started by each slice
 186     ps -C sshd -o command= |
 187     grep " \[priv\]" |
 188     sort | uniq -c |
 189     while read instances sshd slice priv ; do
 190     # kill all old instances
 191     if [ $instances -gt 10 ] ; then
 192         ps -C sshd -o pid=,start_time=,command= |
 193         grep "$slice \[priv\]" |
 194         while read pid start_time command ; do
 195         start_time=$(date -d "$start_time" +%s)
 196         min=$(date -d "6 hours ago" +%s)
 197         if [ $start_time -lt $min ] ; then
 198             echo "* Killing $slice sshd pid $pid"
 199             kill -9 $pid
 200         fi
 201         done
 202     fi
 203     done
 204 }
 205
 206 kill_nm_inslice(){
 207     pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
 208     for pid in $pids ; do
 209         line=$(vps aux | grep $pid)
 210         echo NM found in slice. Killing PID $pid
 211         echo $line
 212         kill -9 $pid
 213     done
 214 }
 215
 216 kill_nonroot_nm(){
 217     # For whatever reason, Some NM's, after fork and chcontext...don't chcontext.  Kill them.
 218     pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
 219     for pid in $pids ; do
 220         line=$(ps aux | grep $pid)
 221         echo NM found not belonging to root. Killing PID $pid
 222         echo $line
 223         kill -9 $pid
 224     done
 225 }
 226
 227 kill_multi_nm(){
 228     # if there is more than one nm running around, kill them, then nm restart
 229     pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
 230         i=0
 231         for pid in $pids ; do
 232                 i=$[$i+1]
 233         done
 234         if [ $i -gt 1 ] ; then
 235                 # stop nm
 236                 echo "More than 1 NM found belonging to root.  Restarting NM."
 237                 /etc/init.d/nm stop
 238         pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
 239                 for pid in $pids ; do
 240                         kill -9 $pid
 241                 done
 242                 /etc/init.d/nm start
 243         fi
 244 }
 245 # XXX kill zombie slices
 246
 247 # XXX reboot if boot state changes
 248 run kill_nonroot_nm
 249
 250 run kill_nm_inslice
 251
 252 run kill_multi_nm
 253
 254 run fix_vservers
 255
 256 run fix_etc_shadow
 257
 258 run restart_services
 259
 260 run restart_netflow
 261
 262 run kill_duplicate_ssh
 263
 264 run update_vserver_reference
 265