3 # Runs once a day to "fix" nodes in various ways
5 # Mark Huang <mlhuang@cs.princeton.edu>
6 # Copyright (C) 2005 The Trustees of Princeton University
8 # $Id: pl_mop.sh,v 1.4 2006/01/26 19:26:20 mlhuang Exp $
# Make sure the sbin directories are searched first so that the
# service/vserver administration tools resolve without full paths.
PATH="/sbin:/usr/sbin:${PATH}"

# Lock file recording the PID of the running instance; used below to
# prevent two copies of this script from running at once.
PIDFILE="/var/run/pl_mop.pid"
# Single-instance guard: if a previous run left a PID file and that PID
# still responds to `kill -0`, another pl_mop is already running -- log
# the fact to syslog.
# NOTE(review): the exit statement and closing fi's of this guard are on
# lines not visible in this chunk; presumably the script bails out here.
if [ -f $PIDFILE ] ; then
if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
# Clean up stale lock files
trap "rm -f $PIDFILE" EXIT
27 # Run a command and log its output to syslog
29 eval $* 2>&1 | logger -p info -t "pl_mom: $1"
# OpenSSH server 3.8 and above refuse login for "locked"
# accounts. Replace "!!" with "*" in /etc/shadow for all VServer
echo "* Fixing /etc/shadow"

# Iterate over every per-slice vserver config plus the two admin
# accounts; the slice name is the config filename minus its .conf suffix.
for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
slice=$(basename ${file%*.conf})
# A password field of "!!" means "locked"; rewrite it to "*" so newer
# sshd versions still permit key-based logins for the slice account.
if grep -q "$slice:\!\!" /etc/shadow ; then
sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
# NOTE(review): the closing fi/done of this loop are on lines not
# visible in this chunk.
# keep essential services running
# For each critical daemon, query its init-script status and (re)start
# it when the status command fails or reports "stopped".
for service in sshd pl_sshd pl_mom pl_nm proper ; do
echo "* Checking $service"
status=$(service $service status)
if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
echo "* Restarting $service"
service $service start
# NOTE(review): the closing fi/done of this loop are on lines not
# visible in this chunk.
# keep netflow running
echo "* Checking netflow"
# Try restarting netflow from inside the pl_netflow account; if that
# fails, bring up the netflow init service and the pl_netflow vserver
# first, then retry the restart.
echo "sudo /sbin/service netflow restart" | su - pl_netflow
if [ $? -ne 0 ] ; then
echo "* Restarting netflow"
service netflow-init start
vserver pl_netflow start
echo "sudo /sbin/service netflow restart" | su - pl_netflow
# keep pl_conf running
echo "* Checking pl_conf"
# Probe pl_conf's status inside its vserver; a non-zero exit means it
# needs to be restarted.
vserver pl_conf exec /sbin/service pl_conf status >/dev/null 2>&1
if [ $? -ne 0 ] ; then
echo "* Restarting pl_conf"
# NOTE(review): the restart command and closing fi are on lines not
# visible in this chunk.
# kill all the processes running in slice contexts
# vps prints the vserver context id in column 2; any context id above 1
# belongs to a slice, so SIGKILL the process inside its own context.
vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
# unmounts all the /proc and /dev/pts mounts in each vserver
# Retry until no /vservers/ mounts remain or the retry budget runs out.
# NOTE(review): $tries is initialized/decremented on lines not visible
# in this chunk.
while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
# arizona_stork seems to generate some weird mount points of the form
# /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
# /vservers/arizona_stork/tmp/0.886421543959
# Strip the "\040(deleted)" suffix before handing the path to umount.
awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
# /vservers gets re-mounted read-only by the kernel if an ext3 journal
echo "* Fixing /vservers"

# test to see if /vservers is mounted read-only
mkdir -p /vservers/.vtmp
tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
if [ $? -eq 0 ] ; then
# mktemp succeeded, so the filesystem is writable.
# NOTE(review): the cleanup of $tmp and this branch's closing fi are on
# lines not visible in this chunk.
# kill all processes running in slice contexts
# Stop vcached (if its PID file is readable) so it does not repopulate
# /vservers while the filesystem is being repaired.
pidfile=/var/run/vcached.pid
if [ -r "$pidfile" ] ; then
# Unmount the volume, fsck it while auto-answering "y" to every prompt
# via expect, clear the vserver cache, and reboot into a clean state.
if umount /vservers ; then
# install expect if necessary
if ! rpm -q expect ; then
yum -y install expect
# tell expect to hit the 'y' key every time fsck asks
expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
# blow away the vserver cache
rm -rf /vservers/.vcache/*
# XXX re-mount /vservers
# shutdown instead to avoid clearing disk quotas
shutdown -r now "/vservers filesystem repaired, rebooting"
# NOTE(review): the line below is presumably the else branch of the
# `if umount` above; the else/fi keywords are not visible in this chunk.
echo "Unable to unmount /vservers!" >&2
# allow vcached to run again
# Reap leaked sshd sessions: any slice holding more than 10 sshd
# processes has instances older than six hours killed off.
kill_duplicate_ssh() {
echo "* Killing stale duplicate SSH instances"

# count the number of SSH instances started by each slice
ps -C sshd -o command= |
# NOTE(review): the intermediate pipeline stages that produce the
# per-slice counts read below (e.g. sort | uniq -c) are on lines not
# visible in this chunk.
while read instances sshd slice priv ; do
# kill all old instances
if [ $instances -gt 10 ] ; then
# List every privileged sshd for this slice with its PID and start
# time, and compare each start time against a six-hour-ago cutoff.
ps -C sshd -o pid=,start_time=,command= |
grep "$slice \[priv\]" |
while read pid start_time command ; do
# Convert the process start time and the cutoff to epoch seconds.
start_time=$(date -d "$start_time" +%s)
min=$(date -d "6 hours ago" +%s)
if [ $start_time -lt $min ] ; then
echo "* Killing $slice sshd pid $pid"
# NOTE(review): the kill command and the closing fi/done/} of this
# function are on lines not visible in this chunk.
169 # XXX kill zombie slices
171 # XXX reboot if boot state changes
183 run kill_duplicate_ssh