From 67cc76870b2f956d6655a129839431b96f5f7f11 Mon Sep 17 00:00:00 2001
From: Mark Huang
Date: Tue, 11 Oct 2005 17:34:57 +0000
Subject: [PATCH] Runs once a day to "fix" nodes in various ways

---
 pl_mop.cron |  10 +++
 pl_mop.sh   | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 222 insertions(+)
 create mode 100644 pl_mop.cron
 create mode 100755 pl_mop.sh

diff --git a/pl_mop.cron b/pl_mop.cron
new file mode 100644
index 0000000..6f924f1
--- /dev/null
+++ b/pl_mop.cron
@@ -0,0 +1,10 @@
+#
+# Runs once a day to "fix" nodes in various ways
+#
+# Mark Huang
+# Copyright (C) 2005 The Trustees of Princeton University
+#
+# $Id: plb.cron,v 1.1 2005/09/27 23:55:02 mlh-pl_rock Exp $
+#
+
+@M@ @H@ * * * root /usr/local/planetlab/bin/pl_mop.sh
diff --git a/pl_mop.sh b/pl_mop.sh
new file mode 100755
index 0000000..d80ce64
--- /dev/null
+++ b/pl_mop.sh
@@ -0,0 +1,212 @@
+#!/bin/bash
+#
+# Runs once a day to "fix" nodes in various ways
+#
+# Mark Huang
+# Copyright (C) 2005 The Trustees of Princeton University
+#
+# $Id$
+#
+
+PATH=/sbin:/usr/sbin:$PATH
+
+PIDFILE=/var/run/pl_mop.pid
+
+# Refuse to start while the PID recorded by a previous run is still
+# alive; otherwise record our own PID
+if [ -f "$PIDFILE" ] ; then
+    oldpid=$(cat "$PIDFILE")
+    if kill -0 "$oldpid" >/dev/null 2>&1 ; then
+        logger -p info -t pl_mom "$0 ($oldpid) already running"
+        exit 1
+    fi
+fi
+echo $$ > "$PIDFILE"
+
+# Remove the PID file again when the script exits
+trap "rm -f $PIDFILE" EXIT
+
+# Run a command and log its output (stdout and stderr) to syslog
+#   $1    - command to run (here: one of the functions below); it is
+#           also appended to the syslog tag
+#   $2... - arguments handed to the command
+run() {
+    "$@" 2>&1 | logger -p info -t "pl_mom: $1"
+}
+
+# OpenSSH server 3.8 and above refuse login for "locked"
+# accounts. Replace "!!" with "*" in /etc/shadow for all VServer
+# accounts.
+fix_etc_shadow() {
+    local file slice
+
+    echo "* Fixing /etc/shadow"
+
+    # a glob without matches expands to nothing instead of itself
+    shopt -s nullglob
+    for file in /etc/vservers/*.conf pl_admin.conf site_admin.conf ; do
+        slice=$(basename "${file%.conf}")
+        # anchor at the start of the line so that an account whose name
+        # merely ends in $slice is left alone
+        if grep -q "^$slice:!!" /etc/shadow ; then
+            sed -i -e "s/^$slice:!!:\(.*\)/$slice:*:\1/" /etc/shadow
+        fi
+    done
+}
+
+# keep essential services running: start every service whose status
+# command fails or reports "stopped"
+restart_services() {
+    local service status
+
+    for service in autofs sshd pl_sshd pl_mom pl_nm pl_conf proper ; do
+        echo "* Checking $service"
+        status=$(service "$service" status)
+        if [ $? -ne 0 ] || echo "$status" | grep -q stopped ; then
+            echo "* Restarting $service"
+            service "$service" start
+        fi
+    done
+}
+
+# keep netflow running: restart it as the pl_netflow user and, if that
+# fails, bring up netflow-init and the pl_netflow vserver before
+# retrying
+restart_netflow() {
+    echo "* Checking netflow"
+    echo "sudo /sbin/service netflow restart" | su - pl_netflow
+    if [ $? -ne 0 ] ; then
+        echo "* Restarting netflow"
+        service netflow-init start
+        vserver pl_netflow start
+        echo "sudo /sbin/service netflow restart" | su - pl_netflow
+    fi
+}
+
+# kill all the processes running in slice contexts
+vkillall() {
+    local tries=10
+
+    # for every "vps -A" row whose second column is greater than 1, send
+    # signal 9 to the first column via vkill -c <second column>
+    vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }'
+
+    # unmounts all the /proc and /dev/pts mounts in each vserver,
+    # giving up after 10 passes over /proc/mounts
+    while grep -q /vservers/ /proc/mounts && [ $tries -gt 0 ] ; do
+        tries=$((tries - 1))
+        # arizona_stork seems to generate some weird mount points of the form
+        # /vservers/arizona_stork/tmp/0.886421543959\040(deleted) that should be
+        # /vservers/arizona_stork/tmp/0.886421543959
+        awk '(/vservers\//) { sub(/\\040.*$/, ""); print "Unmounting " $2; system("umount " $2); }' /proc/mounts
+    done
+}
+
+# /vservers gets re-mounted read-only by the kernel if an ext3 journal
+# transaction aborts
+fix_vservers() {
+    local tmp
+    local pidfile=/var/run/vcached.pid
+
+    echo "* Fixing /vservers"
+
+    # test to see if /vservers is mounted read-only: if a scratch file
+    # can be created there is nothing to repair
+    mkdir -p /vservers/.vtmp
+    if tmp=$(mktemp /vservers/.vtmp/fixit.XXXXXX) ; then
+        rm -f "$tmp"
+        return 0
+    fi
+
+    # kill all processes running in slice contexts
+    vkillall
+
+    # stop the key automounter
+    service autofs stop
+
+    # stop vcached and leave a PID file behind while we work (it is
+    # removed again further down to allow vcached to run)
+    if [ -r "$pidfile" ] ; then
+        kill $(cat "$pidfile")
+    fi
+    touch "$pidfile"
+
+    # unmounts /vservers
+    if umount /vservers ; then
+        # install expect if necessary
+        if ! rpm -q expect ; then
+            yum -y install expect
+        fi
+
+        # tell expect to hit the 'y' key every time fsck asks
+        expect -c 'set timeout 3600; spawn fsck /dev/mapper/planetlab-vservers; expect "?" { send "y\r"; exp_continue }'
+
+        # re-mount /vservers
+        if mount /vservers ; then
+            # blow away the vserver cache; this has to wait until the
+            # filesystem is mounted again, otherwise the glob only sees
+            # the bare mount point directory
+            rm -rf /vservers/.vcache/*
+        else
+            echo "Unable to re-mount /vservers!" >&2
+        fi
+    else
+        echo "Unable to unmount /vservers!" >&2
+    fi
+
+    # allow vcached to run again
+    rm -f "$pidfile"
+
+    # restart the key automounter
+    service autofs start
+}
+
+# kill privileged sshd instances that are older than 6 hours for every
+# slice that has more than 10 of them
+kill_duplicate_ssh() {
+    local instances sshd slice priv
+    local pid dow mon day clock year command
+    local started min
+
+    echo "* Killing stale duplicate SSH instances"
+
+    # processes started before this point in time are considered old
+    min=$(date -d "6 hours ago" +%s)
+
+    # count the number of SSH instances started by each slice
+    ps -C sshd -o command= |
+    grep " \[priv\]" |
+    sort | uniq -c |
+    while read instances sshd slice priv ; do
+        # kill all old instances
+        if [ "$instances" -gt 10 ] ; then
+            # lstart always prints the full start date in five fields
+            # (e.g. "Tue Oct 11 17:34:57 2005"); start_time switches
+            # between HH:MM, MmmDD and a bare year, which date -d
+            # cannot convert reliably
+            ps -C sshd -o pid=,lstart=,command= |
+            grep " $slice \[priv\]" |
+            while read pid dow mon day clock year command ; do
+                started=$(date -d "$dow $mon $day $clock $year" +%s) || continue
+                if [ "$started" -lt "$min" ] ; then
+                    echo "* Killing $slice sshd pid $pid"
+                    kill -9 "$pid"
+                fi
+            done
+        fi
+    done
+}
+
+# XXX kill zombie slices
+
+# XXX reboot if boot state changes
+
+run fix_vservers
+
+run fix_etc_shadow
+
+run restart_services
+
+run restart_netflow
+
+run kill_duplicate_ssh
-- 
2.43.0