--- /dev/null
+#!/usr/bin/python
+
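+# This script is used to manipulate the operational state of nodes in
+# different node groups. These are basically set operations on nodes via the
+# PLC api.
+#
+# NOTE(review): the description in this header looks inherited from a
+# nodegroup-oriented script. The code below walks a list of hostnames and
+# asks the site interface to run the boot manager on nodes whose recorded
+# status is 'failboot' -- confirm and update this header accordingly.
+#
+# Take the ng (nodegroup) name as an argument....
+# optionally,
+# * get a list of nodes in the given nodegroup.
+# * set some or all in the set to rins.
+# * restart them all.
+# * do something else to them all.
+#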
+
+import os
+import time
+import traceback
+import sys
+from optparse import OptionParser
+
+from monitor import config
+from monitor import parser as parsermodule
+from monitor.common import *
+from monitor.const import MINUP
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+from monitor.query import verify,query_to_dict,node_select
+
+def main(hostnames, config):
+ # commands:
+ i = 1
+ node_count = 1
+ print "failboot-repair"
+ for i,host in enumerate(hostnames):
+ try:
+ lb = plccache.plcdb_hn2lb[host]
+ except:
+ print "unknown host in plcdb_hn2lb %s" % host
+ email_exception("%s %s" % (i,host))
+ continue
+
+ nodeblack = BlacklistRecord.get_by(hostname=host)
+
+ if nodeblack and not nodeblack.expired():
+ print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() )
+ continue
+
+ sitehist = SiteInterface.get_or_make(loginbase=lb)
+
+ recent_actions = sitehist.getRecentActions(hostname=host)
+
+ nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
+
+ print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
+
+ if nodehist.status == 'failboot' and \
+ changed_greaterthan(nodehist.last_changed, 0.25) and \
+ ( not found_between(recent_actions, 'bootmanager_restore', 0.5, 0) \
+ or config.force ):
+ # send down node notice
+ # delay 0.5 days before retrying...
+ print "send message for host %s bootmanager_restore" % host
+ sitehist.runBootManager(host)
+
+ node_count = node_count + 1
+ print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+ sys.stdout.flush()
+ session.flush()
+
+ session.flush()
+ return
+
+
+if __name__ == "__main__":
+ parser = parsermodule.getParser(['nodesets'])
+ parser.set_defaults(rins=False,
+ reboot=False,
+ force=False,
+ nosetup=False,
+ verbose=False,
+ quiet=False,)
+
+ parser.add_option("", "--force", dest="force", action="store_true",
+ help="Force action regardless of previous actions/logs.")
+ parser.add_option("", "--rins", dest="rins", action="store_true",
+ help="Set the boot_state to 'rins' for all nodes.")
+ parser.add_option("", "--reboot", dest="reboot", action="store_true",
+ help="Actively try to reboot the nodes, keeping a log of actions.")
+
+ parser.add_option("", "--verbose", dest="verbose", action="store_true",
+ help="Extra debug output messages.")
+
+ parser = parsermodule.getParser(['defaults'], parser)
+ config = parsermodule.parse_args(parser)
+
+ fbquery = HistoryNodeRecord.query.all()
+ hostnames = [ n.hostname for n in fbquery ]
+
+ if config.site:
+ # TODO: replace with calls to local db. the api fails so often that
+ # these calls should be regarded as unreliable.
+ l_nodes = plccache.GetNodesBySite(config.site)
+ filter_hostnames = [ n['hostname'] for n in l_nodes ]
+
+ hostnames = filter(lambda x: x in filter_hostnames, hostnames)
+
+ if config.node:
+ hostnames = [ config.node ]
+
+ try:
+ main(hostnames, config)
+ session.flush()
+ except KeyboardInterrupt:
+ print "Killed by interrupt"
+ session.flush()
+ sys.exit(0)
+ except:
+ email_exception()
+ print traceback.print_exc();
+ print "fail all..."
# commands at once.
values = {}
nmap = command.CMD()
- print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
- (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
- (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
- (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
+ print "nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename
+ (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename)
+ (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename)
+ (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename)
# NOTE: an empty / error value for oval, will still work.
values['port_status'] = {}
(o1,continue_probe) = nmap_port_status(oval1)
echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
echo ' "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1 ; fi ; fi`'",'
- echo ' "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 30 rpm -q NodeManager ; fi`'",'
- echo ' "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 45 rpm -q -a ; fi`'",'
+ echo ' "rpm_version":"''",'
+ echo ' "rpm_versions":"''",'
echo ' "md5sums":"'`md5sum /etc/yum.conf /etc/yum.myplc.d/myplc.repo /etc/yum.myplc.d/stock.repo | awk '{print $1}'`'",'
echo ' "md5sum_yum":"'`grep -v -E "^#" /etc/yum.myplc.d/myplc.repo | md5sum`'",'
echo ' "nada":"'``'",'
traceback.print_exc()
continue_probe = False
- if b_except or not continue_probe: return (None, None, None)
+ if b_except or not continue_probe: return (None, None)
#### RUN NMAP ###############################
if continue_probe:
nmap = command.CMD()
- print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])
- (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
+ print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats'])
+ (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats']))
# NOTE: an empty / error value for oval, will still work.
(values['port_status'], continue_probe) = nmap_port_status(oval)
else:
a<-(m+d+c*r+b+p);
}
- return (a/5*5);
+ return (a/5*5); # I know. Preserved for clarity and consistency with earlier examples
}
index_of_bin <- function (h, value)
return (h);
}
-year_hist_unique <- function (t, year, from, to, max, type="week", title="Histogram for Tickets in")
-{
- dates <-seq(as.Date(from), as.Date(to), type)
- months <- format(dates, "%b-%d")
- hbreaks<-unclass(as.POSIXct(dates))
-
- rows <- NULL
- for ( d in hbreaks )
- {
- d_end <- d+60*60*24
- t_sub <- t[which(t$start > d & t$start <= d_end),]
- rows <- rbind(rows, c('start'=d, 'reboots'=length(unique(t_sub$hostname))) )
- }
- rows <- data.frame(rows)
-
- if ( max == 0 ) {
- max = max(rows$reboots)
- }
- main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots))
- print(main);
- barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0)
- #plot(h, ylim=c(0,max), main=main, axes=FALSE)
- axis(1, labels=months, at=seq(1,length(hbreaks)))
- axis(2)
- abline(mean(rows$reboots), 0, col='grey')
- #qqnorm(h$counts)
- #qqline(h$counts)
- return (rows);
-}
-
-year_hist_unique_recent <- function (t, year, from, to, max, blocks=c(1,3,7,14,30), type="week", title="Histogram for Tickets in")
-{
- dates <-seq(as.Date(from), as.Date(to), type)
- months <- format(dates, "%b-%d")
- hbreaks<-unclass(as.POSIXct(dates))
-
- rows <- NULL
-
-
- for ( d in hbreaks )
- {
- # initialize row for this iteration
- row <- NULL
- row[as.character(0)] <- 0
- for ( block in blocks ) {
- row[as.character(block)] <- 0
- }
-
- # find the range : d plus a day
- d_end <- d+60*60*24
- # find unique hosts in this day range
- t_sub <- t[which(t$start > d & t$start <= d_end),]
- unique_hosts <- unique(t_sub$hostname)
- if (length(unique_hosts) == 0 ) {
- rows <- rbind(rows, c('start'=d, row))
- next
- }
-
- #print(sprintf("unique_hosts: %s\n", unique_hosts));
- print(sprintf("unique_hosts: %s\n", length(unique_hosts)));
-
- for ( host in as.character(unique_hosts) )
- {
- found <- 0
- for ( block in blocks )
- {
- #print(sprintf("date: %s, block: -%s, %s\n", d, block, host));
- #print(sprintf("row: %s\n", row));
- # find the range : 'block' days ago to 'd'
- d_back <- d - 60*60*24 * block
- t_back_sub <- t[which(t$start > d_back & t$start <= d),]
- u <- unique(t_back_sub$hostname)
- if ( length(u[u==host]) >= 1)
- {
- # add to block_count and go to next host.
- found <- 1
- i <- as.character(block)
- row[i] <- row[i] + 1
- break
- }
- }
- if ( found == 0 )
- {
- # no range found
- row['0'] <- row['0'] + 1
- }
- }
- rows <- rbind(rows, c('start'=d, row))
- }
-
- rows <- data.frame(rows)
-
- if ( max == 0 ) {
- max = max(rows['0'])
- }
- #main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots))
- #print(main);
- #barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0)
- ##plot(h, ylim=c(0,max), main=main, axes=FALSE)
- #axis(1, labels=months, at=seq(1,length(hbreaks)))
- #axis(2)
- #abline(mean(rows$reboots), 0, col='grey')
- #qqnorm(h$counts)
- #qqline(h$counts)
- return (rows);
-}
source("myImagePlot.R")
reboot_image <- function (t, year, from, to, max=0, type="week", title="")
return (t);
}
+convert_datestr <- function (t, format)
+{
+    # Add a numeric 'start' column to 't' holding the result of parsing
+    # t$Date with the strptime() pattern 'format' and converting it via
+    # as.POSIXct().
+    #
+    # The conversion is done on the whole column at once. Besides being
+    # shorter than an element-wise loop, this also handles an empty 't':
+    # a '1:length(t$Date)' loop would iterate over c(1, 0) in that case.
+    t$start <- as.numeric(as.POSIXct(strptime(t$Date, format)))
+    return (t);
+}
+
abline_at_date <- function (date, col='black', lty=1, format="%Y-%m-%d", height=0)
{
ts <-unclass(as.POSIXct(date, format=format, origin="1970-01-01"))[1]
a<-lowess(x, y, delta=delta, f=f)
return (a);
}
+
+in_list <- function ( str, str_list )
+{
+    # Report whether 'str' compares equal (==) to any element of 'str_list'.
+    # The scan stops at the first match; an empty 'str_list' yields FALSE.
+    found <- FALSE
+    for ( candidate in str_list )
+    {
+        if ( str == candidate )
+        {
+            found <- TRUE
+            break
+        }
+    }
+    return (found);
+}
+
+col2hex <- function (colorname, alpha=1)
+{
+    # Turn a colour name into a hex colour string with the given alpha.
+    # col2rgb() channel values are divided by 255 because rgb() is called
+    # here with its default maximum colour value.
+    # Only the first column of the col2rgb() result is used.
+    channels <- col2rgb(colorname) / 255
+    return (rgb(channels[1,1], channels[2,1], channels[3,1], alpha));
+}
+
+printf <- function (...)
+{
+    # C-style convenience wrapper: format the arguments with sprintf() and
+    # hand the resulting string to print(); print()'s value is returned.
+    formatted <- sprintf(...)
+    return(print(formatted));
+}
+
+time_graph_setup <- function (from, to)
+{
+    # Pre-compute axis data for a time-series plot covering 'from'..'to'.
+    #
+    # Returns a list with:
+    #   xlim                 - c(tstamp(from), tstamp(to))
+    #   day_str / day_ts     - labels ("%a") and timestamps, one per day
+    #   week_str / week_ts   - labels ("%W") and timestamps, one per week
+    #   month_str / month_ts - labels ("%b") and timestamps, one per month
+    #   year_str / year_ts   - labels ("%Y") and timestamps, one per year;
+    #                          year_ts is shifted so that each label sits
+    #                          roughly in the middle of the visible part of
+    #                          its year.
+    # Several intermediate values are print()ed for debugging.
+
+    # find 'type' range of days
+    xlim <- c(tstamp(from, format="%Y/%m/%d"), tstamp(to, format="%Y/%m/%d"))
+
+    begin_date <- as.Date(from)
+    end_date <- as.Date(to)
+
+    # day-of-year (1..366) of both end points
+    begin_day <- as.numeric(format(begin_date, "%j"))
+    end_day <- as.numeric(format(end_date, "%j"))
+    print(begin_day)
+
+    date_days <-seq(as.Date(from), as.Date(to), 'day')
+    date_weeks <-seq(as.Date(from), as.Date(to), 'week')
+    date_months <-seq(as.Date(from), as.Date(to), 'month')
+    date_years <-seq(as.Date(from), as.Date(to), 'year')
+
+    # Each label vector is derived from the same sequence as its timestamp
+    # vector, so labels and positions always have matching lengths.
+    day_str <- format(date_days, "%a")
+    day_ts <- unclass(as.POSIXct(date_days))
+
+    week_str <- format(date_weeks, "%W")
+    week_ts <- unclass(as.POSIXct(date_weeks))
+
+    month_str <- format(date_months, "%b")
+    month_ts <- unclass(as.POSIXct(date_months))
+
+    year_str <- format(date_years, "%Y")
+    year_ts <- unclass(as.POSIXct(date_years))
+    print(year_ts)
+    year_ts_before <- year_ts
+
+    l <- length(year_ts)
+    print(l)
+    if ( l == 1 ) {
+        # center year between begin_day and end_day
+        print("one year!")
+        year_ts[1] <- (xlim[1] + xlim[2]) / 2.0
+    } else
+    {
+        print("multitple years!")
+        # center first year between start day and last day of that year.
+        print(year_ts)
+        year_ts[1] <- year_ts[1] + ((365 - begin_day)/2.0)*60*60*24
+        print(year_ts)
+        # last year: step back to the start of the year, then forward to
+        # the middle of the part of it that is shown.
+        year_ts[l] <- year_ts[l] + ( -begin_day + end_day/2.0)*60*60*24
+        print(year_ts)
+        if ( l > 2 ) {
+            # years in between are shown in full: place the label mid-year
+            year_ts <- c(year_ts[1], year_ts[seq(2,l-1)] + (180 - begin_day)*60*60*24, year_ts[l])
+        }
+        print(year_ts)
+    }
+    print(year_ts - year_ts_before)
+
+    return (list(xlim=xlim, day_str=day_str, day_ts=day_ts,
+        week_str=week_str, week_ts=week_ts,
+        month_str=month_str, month_ts=month_ts,
+        year_str=year_str, year_ts=year_ts))
+}
+
+planetlab_releases <- function (height)
+{
+    # Mark PlanetLab release dates on the current plot via abline_at_date()
+    # and caption a subset of them just below 'height'.
+    #
+    # The three vectors below are parallel: one entry per marker, listed in
+    # the order in which the markers are drawn.
+    dates <- c("2004-04-12", "2004-11-12", "2005-03-01", "2005-06-15", "2005-10-01",
+               "2006-05-19", "2007-02-28", "2007-05-01", "2007-10-21", "2008-06-01",
+               "2008-08-15", "2009-05-01", "2010-02-01", "2010-06-28", "2011-02-22")
+    # I think 5.0 was released 02/22/2011... not 03-09
+    cols  <- c('white',  'white',  'grey60', 'white',  'grey60',
+               'grey60', 'grey60', 'white',  'grey60', 'grey60',
+               'white',  'grey60', 'white',  'white',  'grey60')
+    ltys  <- c(0, 3, 3, 0, 3,
+               3, 3, 0, 3, 3,
+               0, 3, 0, 3, 3)
+
+    stamps <- vector("list", length(dates))
+    for ( k in seq_along(dates) )
+    {
+        stamps[[k]] <- abline_at_date(dates[k], col=cols[k], lty=ltys[k], height=height)
+    }
+
+    # Markers 8, 11 and 13 (2007-05-01, 2008-08-15, 2010-02-01) get no
+    # caption slot; the remaining twelve line up with the labels below.
+    captioned <- c(1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 14, 15)
+    text(x=unlist(stamps[captioned]),
+         y=c(height-height*0.05),
+         labels=c('', '', '3.1', '', '3.2', '3.3', '4.0', '4.1', '4.2', '4.3', '', '5.0'))
+    # heading, placed at the 2005-03-01 marker
+    text(x=c(stamps[[3]]), y=c(height), labels=c("Releases"))
+}
+
+plc_releases <- function (height)
+{
+    # Mark PLC release dates on the current plot via abline_at_date() (all
+    # in 'grey60', line type 3) and caption them just below 'height'.
+    #
+    # The first date carries no version caption; it only anchors the
+    # "Releases" heading.
+    dates <- c("2004-10-01",
+               "2005-03-01", "2005-10-01", "2006-05-19", "2007-02-28",
+               "2007-10-21", "2008-06-01", "2009-05-01", "2011-02-22")
+
+    stamps <- vector("list", length(dates))
+    for ( k in seq_along(dates) )
+    {
+        stamps[[k]] <- abline_at_date(dates[k], col='grey60', lty=3, height=height)
+    }
+
+    text(x=unlist(stamps[2:9]),
+         y=c(height-height*0.05),
+         labels=c('3.1', '3.2', '3.3', '4.0', '4.1', '4.2', '4.3', '5.0'))
+    text(x=c(stamps[[1]]), y=c(height), labels=c("Releases"))
+}