3 # Runs once a day to "fix" nodes in various ways
5 # Mark Huang <mlhuang@cs.princeton.edu>
6 # Copyright (C) 2005 The Trustees of Princeton University
# Prepend sbin dirs so admin tools (chkconfig, service, shutdown, ...)
# resolve even under cron's minimal default PATH.
9 PATH=/sbin:/usr/sbin:$PATH
11 # Parse PLC configuration
# Source site-wide PLC settings when present.
# NOTE(review): the matching `fi` is not visible in this excerpt (lines elided).
12 if [ -r /etc/planetlab/plc_config ] ; then
13 . /etc/planetlab/plc_config
18 PIDFILE=/var/run/pl_mop.pid
# Single-instance guard: `kill -0` probes whether the recorded PID is still
# alive without sending a signal; if so, log and presumably bail out --
# the exit/fi lines are elided from this excerpt.
21 if [ -f $PIDFILE ] ; then
22 if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
23 logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
29 # Clean up stale lock files
# Remove the PID file on any exit path so a crash can't wedge future runs.
30 trap "rm -f $PIDFILE" EXIT
32 # Run a command and log its output to syslog
# NOTE(review): this looks like the body of a `run()` helper (invoked near the
# bottom of the file); the enclosing function definition line is elided here.
# `eval $*` re-parses the arguments, so callers' original quoting is lost;
# $1 (the task name) becomes the syslog tag suffix.
34 eval $* 2>&1 | logger -p info -t "pl_mom: $1"
37 # OpenSSH server 3.8 and above refuse login for "locked"
38 # accounts. Replace "!!" with "*" in /etc/shadow for all VServer
41 echo "* Fixing /etc/shadow"
# For every slice config (plus the two admin slices), unlock the matching
# /etc/shadow entry by rewriting the "!!" password field to "*".
44 for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
# Strip directory and trailing ".conf" to recover the slice/account name.
45 slice=$(basename ${file%*.conf})
46 if grep -q "$slice:\!\!" /etc/shadow ; then
# In-place sed preserves the rest of the shadow entry via \1.
47 sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
52 # keep essential services running
# Restart any essential service that is enabled for runlevel 3 but not
# currently running.
54 for service in sshd pl_sshd swapmon nm fprobe-ulog codemux; do
# Skip services not configured "on" in runlevel 3.
55 chkconfig --list $service | grep -q 3:on || continue
56 echo "* Checking $service"
57 status=$(service $service status)
# Either a nonzero exit from `service status` or a "stopped" message
# counts as down.
58 if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then
59 echo "* Restarting $service"
60 service $service start
65 # kill all the processes running in slice contexts
# vps prints the vserver context in column 2; any context > 1 (a slice,
# not the host/monitor contexts) gets SIGKILLed via vkill.
67 vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
68 # unmounts all the /proc and /dev/pts mounts in each vserver
# Retry while any /vservers/ mount remains; NOTE(review): $tries is
# presumably initialized/decremented on lines elided from this excerpt.
70 while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
# \040 is the octal escape /proc/mounts uses for spaces in mount points;
# strip it and everything after, then unmount field $2 (the mount point).
72 awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
76 # /vservers gets re-mounted read-only by the kernel if an ext3 journal
79 echo "* Fixing /vservers"
81 # test to see if /vservers is mounted read-only
82 mkdir -p /vservers/.vtmp
# If mktemp succeeds the filesystem is writable and no repair is needed;
# the success branch's body is largely elided from this excerpt.
83 tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
84 if [ $? -eq 0 ] ; then
89 # kill all processes running in slice contexts
# Stop vcached so it doesn't repopulate the cache mid-repair; the PID-file
# handling continues on elided lines.
93 pidfile=/var/run/vcached.pid
94 if [ -r "$pidfile" ] ; then
# Repair path: unmount, fsck with automatic "y" answers, then reboot.
100 if umount /vservers ; then
101 # install expect if necessary
102 if ! rpm -q expect ; then
103 yum -y install expect
106 # tell expect to hit the 'y' key every time fsck asks
# 1-hour timeout; exp_continue keeps answering "y" for every prompt.
107 expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'
109 # blow away the vserver cache
110 rm -rf /vservers/.vcache/*
112 # XXX re-mount /vservers
115 # shutdown instead to avoid clearing disk quotas
116 shutdown -r now "/vservers filesystem repaired, rebooting"
118 echo "Unable to unmount /vservers!" >&2
121 # allow vcached to run again
125 kill_duplicate_ssh() {
126 echo "* Killing stale duplicate SSH instances"
128 # count the number of SSH instances started by each slice
# NOTE(review): the pipeline continues on elided lines (presumably a
# sort | uniq -c stage) before feeding the read loop below.
129 ps -C sshd -o command= |
132 while read instances sshd slice priv ; do
133 # kill all old instances
# More than 10 per-slice "[priv]" daemons is treated as a leak.
134 if [ $instances -gt 10 ] ; then
135 ps -C sshd -o pid=,start_time=,command= |
136 grep "$slice \[priv\]" |
137 while read pid start_time command ; do
# Convert start time to epoch seconds; anything older than 6 hours
# is considered stale and gets killed (kill line elided).
138 start_time=$(date -d "$start_time" +%s)
139 min=$(date -d "6 hours ago" +%s)
140 if [ $start_time -lt $min ] ; then
141 echo "* Killing $slice sshd pid $pid"
# Node Manager (nm.py) must run only in the root context; find copies
# running inside slice contexts (vps aux shifts the command to column 14).
150 pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
151 for pid in $pids ; do
# NOTE(review): grep on a bare pid can match other columns too; the actual
# kill and loop terminator are elided from this excerpt.
152 line=$(vps aux | grep $pid)
153 echo NM found in slice. Killing PID $pid
160 # For whatever reason, Some NM's, after fork and chcontext...don't chcontext. Kill them.
# Same check against plain ps (command in column 12): nm.py owned by a
# non-root user should not exist.
161 pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
162 for pid in $pids ; do
163 line=$(ps aux | grep $pid)
164 echo NM found not belonging to root. Killing PID $pid
171 # if there is more than one nm running around, kill them, then nm restart
# Count root-owned nm.py processes; NOTE(review): the counter $i is
# incremented on lines elided from this excerpt.
172 pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
174 for pid in $pids ; do
# More than one root NM means a botched restart; kill them all and restart.
177 if [ $i -gt 1 ] ; then
179 echo "More than 1 NM found belonging to root. Restarting NM."
# Re-collect PIDs (the set may have changed) and, on elided lines,
# kill each before restarting the service.
181 pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
182 for pid in $pids ; do
190 echo "* Checking for stuck rpm processes"
# pgrep -f matches full command lines (so this can also count children of
# this very pipeline); >= 6 matches is the "stuck" heuristic.
192 rpm_count=`pgrep -f "rpm" | wc -l`
194 if [[ $rpm_count -ge 6 ]]; then
195 echo "* $rpm_count rpm processes found"
197 # kill rpm processes, attempt up to 10 times and then give up
# NOTE(review): $try_count is presumably initialized on an elided line.
199 rpm_count=`pgrep "rpm|yum" | wc -l`
200 while [[ $rpm_count -gt 0 ]]; do
201 echo "* killing rpm/yum processes"
202 killall -9 rpm rpmd rpmq rpmk yum
204 rpm_count=`pgrep "rpm|yum" | wc -l`
205 try_count=`expr $try_count + 1`
206 if [[ $try_count -ge 10 ]]; then
207 echo "* failed to kill rpm processes"
# Once the killers are gone: clear stale BerkeleyDB lock files and
# rebuild the rpm database (rebuild command itself is elided).
213 echo "* deleting rpm lock files"
214 rm -f /var/lib/rpm/__*
216 # rebuild rpm database
217 echo "* rebuilding rpm database"
220 echo "* rpm repair sequence complete"
225 # XXX kill zombie slices
# Main sequence: each maintenance task is invoked through the `run` helper
# so its output is tagged and logged to syslog. NOTE(review): the other
# `run <task>` invocations appear to be elided from this excerpt.
241 run kill_duplicate_ssh