From dd301eafa4f964ec3dfc28bda85c3576ac9ee634 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz <soltesz@cs.princeton.edu> Date: Sun, 22 May 2011 01:37:13 +0000 Subject: [PATCH] Several updates to policy, repair, and automate script. Add a check for the number of sshkeys added to the current agent Separate policy.py and repair.py for failboot nodes --- automate-default.sh | 17 +++ commands/policy.py | 10 -- commands/repair.py | 124 +++++++++++++++++++ monitor/scanapi.py | 18 +-- statistics/functions.r | 268 +++++++++++++++++++++++++---------------- 5 files changed, 311 insertions(+), 126 deletions(-) create mode 100755 commands/repair.py diff --git a/automate-default.sh b/automate-default.sh index a51144a..db53fd4 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -11,6 +11,16 @@ set -e DATE=`date +%Y-%m-%d-%T` MONITOR_PID="${MONITOR_SCRIPT_ROOT}/SKIP" +function send_mail () +{ + subject=$1 + body=$2 + mail -s "$subject" $exception_email <<EOF +$body +EOF +} + + echo "#######################################"; echo "Running Monitor at $DATE"; echo "######################################" echo "Performing API test" API=$(${MONITOR_SCRIPT_ROOT}/tools/testapi.py) @@ -62,6 +72,13 @@ fi #TODO: should add a call to ssh-add -l to check if the keys are loaded or not. source ${MONITOR_SCRIPT_ROOT}/agent.sh +# CHECK AGENT IS UP AND RUNNING +count=$( ssh-add -l | wc -l ) +if [ $count -lt 3 ] ; then + send_mail "ssh-agent is not up and running."
"Add keys before monitoring can continue" + exit +fi + ${MONITOR_SCRIPT_ROOT}/commands/syncwithplc.py $DATE || : service plc restart monitor diff --git a/commands/policy.py b/commands/policy.py index 7f8c5a2..392746f 100755 --- a/commands/policy.py +++ b/commands/policy.py @@ -182,16 +182,6 @@ def main(hostnames, sitenames): sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name) print "send message for host %s PCU Failure" % host - if nodehist.status == 'failboot' and \ - changed_greaterthan(nodehist.last_changed, 0.25) and \ - not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): - # send down node notice - # delay 0.5 days before retrying... - - print "send message for host %s bootmanager_restore" % host - sitehist.runBootManager(host) - # sitehist.sendMessage('retry_bootman', hostname=host) - if nodehist.status == 'down' and \ changed_greaterthan(nodehist.last_changed, 2): if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5): diff --git a/commands/repair.py b/commands/repair.py new file mode 100755 index 0000000..1a0d8ab --- /dev/null +++ b/commands/repair.py @@ -0,0 +1,124 @@ +#!/usr/bin/python + +# This script tries to repair nodes whose recorded status is 'failboot'. +# The logic was moved here from policy.py so that repair can be run as a +# separate command. +# +# For each known hostname (optionally narrowed by config.site/config.node): +# * skip hosts missing from plccache.plcdb_hn2lb, +# * skip hosts with an unexpired blacklist record, +# * if the node has been 'failboot' long enough and no recent +# 'bootmanager_restore' action is found (or --force is given), +# run the boot manager on it again.
+# + +import os +import time +import traceback +import sys +from optparse import OptionParser + +from monitor import config +from monitor import parser as parsermodule +from monitor.common import * +from monitor.const import MINUP +from monitor.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.database.info.model import * +from monitor.database.info.interface import * + +from monitor.query import verify,query_to_dict,node_select + +def main(hostnames, config): + # run the boot manager on every eligible failboot host in hostnames + i = 1 + node_count = 1 + print "failboot-repair" + for i,host in enumerate(hostnames): + try: + lb = plccache.plcdb_hn2lb[host] + except Exception: # not bare: lets KeyboardInterrupt reach the handler in __main__ + print "unknown host in plcdb_hn2lb %s" % host + email_exception("%s %s" % (i,host)) + continue + + nodeblack = BlacklistRecord.get_by(hostname=host) + + if nodeblack and not nodeblack.expired(): + print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() ) + continue + + sitehist = SiteInterface.get_or_make(loginbase=lb) + + recent_actions = sitehist.getRecentActions(hostname=host) + + nodehist = HistoryNodeRecord.findby_or_create(hostname=host) + + print "%s %s %s" % (i, nodehist.hostname, nodehist.status) + + if nodehist.status == 'failboot' and \ + changed_greaterthan(nodehist.last_changed, 0.25) and \ + ( not found_between(recent_actions, 'bootmanager_restore', 0.5, 0) \ + or config.force ): + # node is still in failboot: run the boot manager again + # delay 0.5 days before retrying...
+ print "send message for host %s bootmanager_restore" % host + sitehist.runBootManager(host) + + node_count = node_count + 1 + print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') + sys.stdout.flush() + session.flush() + + session.flush() + return + + +if __name__ == "__main__": + parser = parsermodule.getParser(['nodesets']) + parser.set_defaults(rins=False, + reboot=False, + force=False, + nosetup=False, + verbose=False, + quiet=False,) + + parser.add_option("", "--force", dest="force", action="store_true", + help="Force action regardless of previous actions/logs.") + parser.add_option("", "--rins", dest="rins", action="store_true", + help="Set the boot_state to 'rins' for all nodes.") + parser.add_option("", "--reboot", dest="reboot", action="store_true", + help="Actively try to reboot the nodes, keeping a log of actions.") + + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + + fbquery = HistoryNodeRecord.query.all() + hostnames = [ n.hostname for n in fbquery ] + + if config.site: + # TODO: replace with calls to local db. the api fails so often that + # these calls should be regarded as unreliable. + l_nodes = plccache.GetNodesBySite(config.site) + filter_hostnames = [ n['hostname'] for n in l_nodes ] + + hostnames = filter(lambda x: x in filter_hostnames, hostnames) + + if config.node: + hostnames = [ config.node ] + + try: + main(hostnames, config) + session.flush() + except KeyboardInterrupt: + print "Killed by interrupt" + session.flush() + sys.exit(0) + except: + email_exception() + traceback.print_exc() # print_exc() prints the traceback itself and returns None + print "fail all..." diff --git a/monitor/scanapi.py b/monitor/scanapi.py index 84bb6e0..eef53aa 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -184,10 +184,10 @@ class ScanNodeInternal(ScanInterface): # commands at once.
values = {} nmap = command.CMD() - print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename - (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) - (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) - (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + print "nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename + (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) + (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) + (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) # NOTE: an empty / error value for oval, will still work. values['port_status'] = {} (o1,continue_probe) = nmap_port_status(oval1) @@ -249,8 +249,8 @@ class ScanNodeInternal(ScanInterface): echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' echo ' "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1 ; fi ; fi`'",' - echo ' "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 30 rpm -q NodeManager ; fi`'",' - echo ' "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 45 rpm -q -a ; fi`'",' + echo ' "rpm_version":"''",' + echo ' "rpm_versions":"''",' echo ' "md5sums":"'`md5sum /etc/yum.conf /etc/yum.myplc.d/myplc.repo /etc/yum.myplc.d/stock.repo | awk '{print $1}'`'",' echo ' "md5sum_yum":"'`grep -v -E "^#" /etc/yum.myplc.d/myplc.repo | md5sum`'",' echo ' "nada":"'``'",' @@ -529,13 +529,13 @@ class ScanPCU(ScanInterface): traceback.print_exc() continue_probe = False - if b_except or not continue_probe: return (None, None, None) + if 
b_except or not continue_probe: return (None, None) #### RUN NMAP ############################### if continue_probe: nmap = command.CMD() - print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']) - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) + print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats']) + (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats'])) # NOTE: an empty / error value for oval, will still work. (values['port_status'], continue_probe) = nmap_port_status(oval) else: diff --git a/statistics/functions.r b/statistics/functions.r index e02ee67..8548e41 100644 --- a/statistics/functions.r +++ b/statistics/functions.r @@ -96,7 +96,7 @@ slices_4 <- function (x, components=FALSE) a<-(m+d+c*r+b+p); } - return (a/5*5); + return (a/5*5); # I know. 
Preserved for clarity and consistency with earlier examples } index_of_bin <- function (h, value) @@ -194,112 +194,6 @@ year_hist <- function (t, year, from, to, max, type="week", title="Histogram for return (h); } -year_hist_unique <- function (t, year, from, to, max, type="week", title="Histogram for Tickets in") -{ - dates <-seq(as.Date(from), as.Date(to), type) - months <- format(dates, "%b-%d") - hbreaks<-unclass(as.POSIXct(dates)) - - rows <- NULL - for ( d in hbreaks ) - { - d_end <- d+60*60*24 - t_sub <- t[which(t$start > d & t$start <= d_end),] - rows <- rbind(rows, c('start'=d, 'reboots'=length(unique(t_sub$hostname))) ) - } - rows <- data.frame(rows) - - if ( max == 0 ) { - max = max(rows$reboots) - } - main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots)) - print(main); - barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0) - #plot(h, ylim=c(0,max), main=main, axes=FALSE) - axis(1, labels=months, at=seq(1,length(hbreaks))) - axis(2) - abline(mean(rows$reboots), 0, col='grey') - #qqnorm(h$counts) - #qqline(h$counts) - return (rows); -} - -year_hist_unique_recent <- function (t, year, from, to, max, blocks=c(1,3,7,14,30), type="week", title="Histogram for Tickets in") -{ - dates <-seq(as.Date(from), as.Date(to), type) - months <- format(dates, "%b-%d") - hbreaks<-unclass(as.POSIXct(dates)) - - rows <- NULL - - - for ( d in hbreaks ) - { - # initialize row for this iteration - row <- NULL - row[as.character(0)] <- 0 - for ( block in blocks ) { - row[as.character(block)] <- 0 - } - - # find the range : d plus a day - d_end <- d+60*60*24 - # find unique hosts in this day range - t_sub <- t[which(t$start > d & t$start <= d_end),] - unique_hosts <- unique(t_sub$hostname) - if (length(unique_hosts) == 0 ) { - rows <- rbind(rows, c('start'=d, row)) - next - } - - #print(sprintf("unique_hosts: %s\n", unique_hosts)); - print(sprintf("unique_hosts: %s\n", length(unique_hosts))); - - for ( host in as.character(unique_hosts) ) - { - 
found <- 0 - for ( block in blocks ) - { - #print(sprintf("date: %s, block: -%s, %s\n", d, block, host)); - #print(sprintf("row: %s\n", row)); - # find the range : 'block' days ago to 'd' - d_back <- d - 60*60*24 * block - t_back_sub <- t[which(t$start > d_back & t$start <= d),] - u <- unique(t_back_sub$hostname) - if ( length(u[u==host]) >= 1) - { - # add to block_count and go to next host. - found <- 1 - i <- as.character(block) - row[i] <- row[i] + 1 - break - } - } - if ( found == 0 ) - { - # no range found - row['0'] <- row['0'] + 1 - } - } - rows <- rbind(rows, c('start'=d, row)) - } - - rows <- data.frame(rows) - - if ( max == 0 ) { - max = max(rows['0']) - } - #main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots)) - #print(main); - #barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0) - ##plot(h, ylim=c(0,max), main=main, axes=FALSE) - #axis(1, labels=months, at=seq(1,length(hbreaks))) - #axis(2) - #abline(mean(rows$reboots), 0, col='grey') - #qqnorm(h$counts) - #qqline(h$counts) - return (rows); -} source("myImagePlot.R") reboot_image <- function (t, year, from, to, max=0, type="week", title="") @@ -397,6 +291,17 @@ add_timestamp <- function (t) return (t); } +convert_datestr <- function (t, format) +{ + t$start <- c(0) # assign new column with zero value initially + for ( i in 1:length(t$Date) ) + { + tstamp <-unclass(as.POSIXct(strptime(t$Date[i], format)))[1] + t$start[i] <- tstamp + } + return (t); +} + abline_at_date <- function (date, col='black', lty=1, format="%Y-%m-%d", height=0) { ts <-unclass(as.POSIXct(date, format=format, origin="1970-01-01"))[1] @@ -420,3 +325,152 @@ lowess_smooth <- function (x, y, delta=(60*60*24), f=0.02) a<-lowess(x, y, delta=delta, f=f) return (a); } + +in_list <- function ( str, str_list ) +{ + for ( f in str_list ) + { + if ( str == f ) + { + return (TRUE); + } + } + return (FALSE); +} + +col2hex <- function (colorname, alpha=1) +{ + hex = "FFFFFFFF"; + c_rgb <- col2rgb(colorname) + 
c_rgb <- c_rgb / 255 + hex <- rgb(c_rgb[1,1], c_rgb[2,1], c_rgb[3,1], alpha) + return (hex); +} + +printf <- function (...) +{ + return(print(sprintf(...))); +} + +time_graph_setup <- function (from, to) +{ + # x-axis limits: timestamps of 'from' and 'to' (parsed as %Y/%m/%d) + xlim <- c(tstamp(from, format="%Y/%m/%d"), tstamp(to, format="%Y/%m/%d")) + + begin_date <- as.Date(from) + end_date <- as.Date(to) + + begin_day <- as.numeric(format(begin_date, "%j")) + end_day <- as.numeric(format(end_date, "%j")) + print(begin_day) + + date_days <-seq(as.Date(from), as.Date(to), 'day') + date_weeks <-seq(as.Date(from), as.Date(to), 'week') + date_months <-seq(as.Date(from), as.Date(to), 'month') + date_years <-seq(as.Date(from), as.Date(to), 'year') + + day_str <- format(date_days, "%a") + day_ts <- unclass(as.POSIXct(date_days)) + + week_str <- format(date_weeks, "%W") + week_ts <- unclass(as.POSIXct(date_weeks)) + + month_str <- format(date_months, "%b") + month_ts <- unclass(as.POSIXct(date_months)) + + year_str <- format(date_years, "%Y") + year_ts <- unclass(as.POSIXct(date_years)) + print(year_ts) + year_ts_before <- year_ts + + l <- length(year_ts) + print(l) + if ( l == 1 ) { + # center year between begin_day and end_day + print("one year!") + year_ts[1] <- (xlim[1] + xlim[2]) / 2.0 + } else + { + print("multitple years!") + # center first year between start day and last day of that year.
+ print(year_ts) + year_ts[1] <- year_ts[1] + ((365 - begin_day)/2.0)*60*60*24 + print(year_ts) + year_ts[l] <- year_ts[l] + ( -begin_day + end_day/2.0)*60*60*24 + print(year_ts) + if ( l > 2 ) { + year_ts <- c(year_ts[1], year_ts[seq(2,l-1)] + (180 - begin_day)*60*60*24, year_ts[l]) + } + print(year_ts) + } + print(year_ts - year_ts_before) + + return (list(xlim=xlim, day_str=day_str, day_ts=day_ts, + week_str=week_str, week_ts=week_ts, + month_str=month_str, month_ts=month_ts, + year_str=year_str, year_ts=year_ts)) +} + +planetlab_releases <- function (height) +{ + h = height + tstamp_20040412 <-abline_at_date("2004-04-12", col='white', lty=0, height=h) + tstamp_20041112 <-abline_at_date("2004-11-12", col='white', lty=3, height=h) + tstamp_20050301 <-abline_at_date("2005-03-01", col='grey60', lty=3, height=h) + tstamp_20050615 <-abline_at_date("2005-06-15", col='white', lty=0, height=h) + tstamp_20051001 <-abline_at_date("2005-10-01", col='grey60', lty=3, height=h) + tstamp_20060519 <-abline_at_date("2006-05-19", col='grey60', lty=3, height=h) + tstamp_20070228 <-abline_at_date("2007-02-28", col='grey60', lty=3, height=h) + tstamp_20070501 <-abline_at_date("2007-05-01", col='white', lty=0, height=h) + tstamp_20071021 <-abline_at_date("2007-10-21", col='grey60', lty=3, height=h) + tstamp_20080601 <-abline_at_date("2008-06-01", col='grey60', lty=3, height=h) + tstamp_20080815 <-abline_at_date("2008-08-15", col='white', lty=0, height=h) + tstamp_20090501 <-abline_at_date("2009-05-01", col='grey60', lty=3, height=h) + tstamp_20100201 <-abline_at_date("2010-02-01", col='white', lty=0, height=h) + tstamp_20100628 <-abline_at_date("2010-06-28", col='white', lty=3, height=h) + tstamp_20110222 <-abline_at_date("2011-02-22", col='grey60', lty=3, height=h) + # I think 5.0 was released 02/22/2011...
not 03-09 + + text(x=c(tstamp_20040412, + tstamp_20041112, + tstamp_20050301, + tstamp_20050615, + tstamp_20051001, + tstamp_20060519, + tstamp_20070228, + tstamp_20071021, + tstamp_20080601, + tstamp_20090501, + tstamp_20100628, + tstamp_20110222), + y=c(h-h*0.05), + #labels=c('Release', '3.0', '3.1', '', '3.2', '3.3', '4.0', '4.1', '4.2', '4.3')) + labels=c('', '', '3.1', '', '3.2', '3.3', '4.0', '4.1', '4.2', '4.3', '', '5.0')) + text(x=c(tstamp_20050301), y=c(h), labels=c("Releases")) +} + +plc_releases <- function (height) +{ + h = height + tstamp_pre <-abline_at_date("2004-10-01", col='grey60', lty=3, height=h) + tstamp_3_1 <-abline_at_date("2005-03-01", col='grey60', lty=3, height=h) + tstamp_3_2 <-abline_at_date("2005-10-01", col='grey60', lty=3, height=h) + tstamp_3_3 <-abline_at_date("2006-05-19", col='grey60', lty=3, height=h) + tstamp_4_0 <-abline_at_date("2007-02-28", col='grey60', lty=3, height=h) + tstamp_4_1 <-abline_at_date("2007-10-21", col='grey60', lty=3, height=h) + tstamp_4_2 <-abline_at_date("2008-06-01", col='grey60', lty=3, height=h) + tstamp_4_3 <-abline_at_date("2009-05-01", col='grey60', lty=3, height=h) + tstamp_5_0 <-abline_at_date("2011-02-22", col='grey60', lty=3, height=h) + + text(x=c(tstamp_3_1, + tstamp_3_2, + tstamp_3_3, + tstamp_4_0, + tstamp_4_1, + tstamp_4_2, + tstamp_4_3, + tstamp_5_0), + y=c(h-h*0.05), + labels=c('3.1', '3.2', '3.3', '4.0', '4.1', '4.2', '4.3', '5.0')) + text(x=c(tstamp_pre), y=c(h), labels=c("Releases")) +} -- 2.47.0