#!/bin/bash
#
# Runs once a day to "fix" nodes in various ways
#
# Mark Huang <mlhuang@cs.princeton.edu>
# Copyright (C) 2005 The Trustees of Princeton University
PATH=/sbin:/usr/sbin:$PATH
# Parse PLC configuration
if [ -r /etc/planetlab/plc_config ] ; then
    . /etc/planetlab/plc_config
else
    PLC_SLICE_PREFIX="pl"
fi

PIDFILE=/var/run/pl_mop.pid
# Bail out if another instance is still running
if [ -f $PIDFILE ] ; then
    if kill -0 `cat $PIDFILE` >/dev/null 2>&1 ; then
        logger -p info -t pl_mom "$0 (`cat $PIDFILE`) already running"
        exit 1
    fi
fi
echo $$ > $PIDFILE

# Clean up stale lock files
trap "rm -f $PIDFILE" EXIT
# Run a command and log its output to syslog
run() {
    eval $* 2>&1 | logger -p info -t "pl_mom: $1"
}
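# e.g. "run fix_etc_shadow" runs the function below and logs its
# output with the syslog tag "pl_mom: fix_etc_shadow"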
# OpenSSH server 3.8 and above refuses login for "locked"
# accounts. Replace "!!" with "*" in /etc/shadow for all VServer
# accounts.
fix_etc_shadow() {
    echo "* Fixing /etc/shadow"
    shopt -s nullglob
    for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
        slice=$(basename ${file%.conf})
        if grep -q "$slice:\!\!" /etc/shadow ; then
            sed -i -e "s/$slice:\!\!:\(.*\)/$slice:*:\1/" /etc/shadow
        fi
    done
}
# keep essential services running
restart_services() {
    for service in sshd pl_sshd swapmon nm proper ; do
        echo "* Checking $service"
        status=$(service $service status)
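        # test both the exit status and the output text, presumably
        # because some init scripts report a stopped service only one
        # way or the other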
        if [ $? -ne 0 ] || echo "$status" | grep -q stopped ; then
            echo "* Restarting $service"
            service $service start
        fi
    done
}
# keep netflow running
restart_netflow() {
    echo "* Checking netflow"
    echo "sudo /sbin/service netflow restart" | su - pl_netflow
    if [ $? -ne 0 ] ; then
        echo "* Restarting netflow"
        service netflow-init start
        vserver pl_netflow start
        echo "sudo /sbin/service netflow restart" | su - pl_netflow
    fi
}
# GPG keys are installed in /etc/pki/rpm-gpg by both the Boot Manager
# during initial installation, and by PlanetLabConf during daily
# updates. NodeUpdate imports the keys into the RPM database before
# running yum daily. vserver-reference copies and imports the keys
# into the reference images and system slices daily. The only parts
# of this process that are actually necessary are the Boot Manager
# and vserver-reference. However, we do not want to force a
# re-install of all nodes, and we do not want to force an update of
# vserver-reference, so in the meantime, PlanetLabConf and NodeUpdate
# take care of getting the keys installed and imported in /, and this
# script takes care of getting them installed in the reference images
# and system slices, until we can get a new vserver-reference image
# pushed out.
update_vserver_reference() {
    echo "* Updating VServer reference"

    shopt -s nullglob

    VROOTS="/vservers/vserver-reference /vservers/.vcache/* /vservers/${PLC_SLICE_PREFIX}_*"

    # Copy configuration files from host to slices
    for file in \
        /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \
        /etc/planetlab/plc_config* /etc/planetlab/php/* \
        /etc/pki/rpm-gpg/* ; do
        if [ -r $file ] ; then
            for vroot in $VROOTS ; do
                install -D -m 644 $file $vroot/$file
            done
        fi
    done
    # (Re)install GPG signing keys
    if [ -d /etc/pki/rpm-gpg ] ; then
        for vroot in $VROOTS ; do
            # erase any existing pubkey packages first so stale keys
            # do not linger, then import the current set
            chroot $vroot rpm --allmatches -e gpg-pubkey || :
            chroot $vroot rpm --import /etc/pki/rpm-gpg/* || :
        done
    fi
}
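# Spot-check (not part of the daily run): list the pubkey packages a
# reference image ended up with, e.g.
#   chroot /vservers/vserver-reference rpm -q gpg-pubkey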
# kill all the processes running in slice contexts
vkillall() {
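    # vps reports each process with its vserver context ID in column
    # 2; the int($2) > 1 test spares contexts 0 (the root server) and
    # 1 (the spectator context), and everything else is sent SIGKILL
    # via vkill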
    vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'

    # unmount all the /proc and /dev/pts mounts in each vserver;
    # busy mounts may need several passes, so retry up to 10 times
    tries=10
    while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
        tries=$(($tries - 1))
        # some slices leave stale mount points whose names carry a
        # "\040(deleted)" suffix in /proc/mounts; strip it before unmounting
        awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
    done
}
# /vservers gets re-mounted read-only by the kernel if an ext3
# journal transaction aborts
fix_vservers() {
    echo "* Fixing /vservers"

    # test to see if /vservers is mounted read-only
    mkdir -p /vservers/.vtmp
    tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX)
    if [ $? -eq 0 ] ; then
        # the write succeeded, so /vservers is read-write; nothing to do
        rm -f $tmp
        return 0
    fi
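    # (mktemp is the probe here: creating a file fails with EROFS
    # once the filesystem has been remounted read-only)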
    # kill all processes running in slice contexts
    vkillall

    # stop vcached and keep it from starting up again while we repair
    # the filesystem
    pidfile=/var/run/vcached.pid
    if [ -r "$pidfile" ] ; then
        kill $(cat $pidfile)
    fi
    touch $pidfile
    # unmount /vservers so that fsck can run against it
    if umount /vservers ; then
        # install expect if necessary
        if ! rpm -q expect ; then
            yum -y install expect
        fi

        # tell expect to hit the 'y' key every time fsck asks
        expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "<y>?" { send "y\r"; exp_continue }'

        # blow away the vserver cache
        rm -rf /vservers/.vcache/*

        # XXX re-mount /vservers

        # shutdown instead to avoid clearing disk quotas
        shutdown -r now "/vservers filesystem repaired, rebooting"
    else
        echo "Unable to unmount /vservers!" >&2
    fi

    # allow vcached to run again
    rm -f $pidfile
}
kill_duplicate_ssh() {
    echo "* Killing stale duplicate SSH instances"

    # count the number of SSH instances started by each slice
    ps -C sshd -o command= |
    grep " \[priv\]" |
    sort | uniq -c |
    while read instances sshd slice priv ; do
        # kill all old instances
        if [ $instances -gt 10 ] ; then
            ps -C sshd -o pid=,start_time=,command= |
            grep "$slice \[priv\]" |
            while read pid start_time command ; do
                start_time=$(date -d "$start_time" +%s)
                min=$(date -d "6 hours ago" +%s)
                if [ $start_time -lt $min ] ; then
                    echo "* Killing $slice sshd pid $pid"
                    kill -9 $pid
                fi
            done
        fi
    done
}
# kill any NodeManager instance found running inside a slice context
kill_nm_inslice() {
    pids=$(vps aux | awk '$1 != "root" && $14 == "/usr/share/NodeManager/nm.py" {print $2}')
    for pid in $pids ; do
        line=$(vps aux | grep $pid)
        echo "NM found in slice. Killing PID $pid"
        echo "$line"
        vkill -9 $pid
    done
}
# For whatever reason, some NMs, after fork and chcontext, never
# actually change context. Kill them.
kill_nonroot_nm() {
    pids=$(ps aux | awk '$1 != "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
    for pid in $pids ; do
        line=$(ps aux | grep $pid)
        echo "NM found not belonging to root. Killing PID $pid"
        echo "$line"
        kill -9 $pid
    done
}
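# Note the asymmetry with kill_nm_inslice above: that function uses
# vps/vkill to reach processes hidden inside other vserver contexts,
# while this one uses plain ps/kill because these strays are still
# visible in the root context.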
# if more than one NM is running, kill them all, then restart NM
restart_nm() {
    pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
    i=0
    for pid in $pids ; do
        i=$(($i + 1))
    done
    if [ $i -gt 1 ] ; then
        echo "More than 1 NM found belonging to root. Restarting NM."
        service nm stop
        pids=$(ps aux | awk '$1 == "root" && $12 == "/usr/share/NodeManager/nm.py" {print $2}')
        for pid in $pids ; do
            kill -9 $pid
        done
        service nm restart
    fi
}
# XXX kill zombie slices

# XXX reboot if boot state changes
run fix_vservers

run fix_etc_shadow

run restart_services

run restart_netflow

run kill_nm_inslice

run kill_nonroot_nm

run restart_nm

run kill_duplicate_ssh

run update_vserver_reference