From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Mon, 19 May 2008 17:52:56 +0000 (+0000)
Subject: mass commit
X-Git-Tag: Monitor-1.0-4~3
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=8424072ea9faa9afaee496c039e3f626b5b36e41;p=monitor.git

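mass commit

Summary of the changes in this patch:
- automate_pl03.sh: write the script's pid to $HOME/monitor/SKIP at start,
  exit early when that file already exists, and remove it at the end.
- findbadpcu.py: add a --pcuid option to check a single PCU; the nodelist
  filename now defaults to None instead of "".
- monitor.py: drop the threaded monitor loop; add reboot(hostname), which
  runs the Merge/RT/Diagnose/Action stages for one host. main() is now a
  no-op.
- nodegroups.py: use the loop variable for each host when adding/deleting
  (was config.node); adding now requires a nodegroup.
- racadm.py: import socket, os and popen2; add a 'state' argument and -s
  option for the serveraction; print runcmd errors instead of logging them.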
---

diff --git a/automate_pl03.sh b/automate_pl03.sh
index 4a07326..82f25dc 100755
--- a/automate_pl03.sh
+++ b/automate_pl03.sh
@@ -4,6 +4,16 @@ set -e
 cd $HOME/monitor/
 DATE=`date +%Y-%m-%d-%T`
 
+if [ -f "$HOME/monitor/SKIP" ] ; then
+	echo "SKIPPING Monitor"
+	# TODO: should be possible to kill the old version if 
+	# desired and prevent lingering instances of automate.
+	#./kill.cmd.sh `cat $HOME/monitor/SKIP`
+	exit
+else
+	echo $$ > "$HOME/monitor/SKIP"
+	trap 'rm -f "$HOME/monitor/SKIP"' EXIT  # release the lock even when 'set -e' aborts the run
+fi
 #########################
 # 1. FINDBAD NODES 
 rm -f pdb/production.findbad2.pkl
@@ -40,3 +50,5 @@ cp pdb/production.findbadpcus2.pkl pdb/production.findbadpcus.pkl
 for f in findbad act_all findbadpcus l_plcnodes; do 
 	cp pdb/production.$f.pkl archive-pdb/`date +%F`.production.$f.pkl
 done
+
+rm -f $HOME/monitor/SKIP
diff --git a/findbadpcu.py b/findbadpcu.py
index 2179d3e..2900b65 100755
--- a/findbadpcu.py
+++ b/findbadpcu.py
@@ -27,14 +27,17 @@ import signal
 from config import config
 from optparse import OptionParser
 parser = OptionParser()
-parser.set_defaults(filename="", 
+parser.set_defaults(filename=None, 
 					increment=False, 
+					pcuid=None,
 					dbname="findbadpcus", 
 					cachenodes=False,
 					refresh=False,
 					)
 parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", 
 					help="Provide the input file for the node list")
+parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
+					help="Provide the id for a single pcu")
 parser.add_option("", "--cachenodes", action="store_true",
 					help="Cache node lookup from PLC")
 parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
@@ -321,14 +324,17 @@ def main():
 		# update global round number to force refreshes across all nodes
 		externalState['round'] += 1
 
-	if config.filename == "":
+	# PCU id source: the --nodelist file wins over --pcuid; with neither, ask PLC for all PCUs.
+	if config.filename is None and config.pcuid is None:
 		print "Calling API GetPCUs() : refresh(%s)" % config.refresh
 		l_pcus = soltesz.if_cached_else_refresh(1, 
 								config.refresh, "pculist", lambda : plc.GetPCUs())
 		l_pcus  = [pcu['pcu_id'] for pcu in l_pcus]
-	else:
+	elif config.filename is not None:
 		l_pcus = config.getListFromFile(config.filename)
 		l_pcus = [int(pcu) for pcu in l_pcus]
+	else:
+		l_pcus = [int(config.pcuid)]
 
 	checkAndRecordState(l_pcus, cohash)
 
diff --git a/monitor.py b/monitor.py
index ddc3722..d876dc3 100644
--- a/monitor.py
+++ b/monitor.py
@@ -2,224 +2,53 @@
 #
 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
 # 
-# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 # Stephen Soltesz <soltesz@cs.princeton.edu>
 #
 # $Id: monitor.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $
 
-import sys
-import os
-import getopt 
-import thread
-from threading import *
-import time
-import logging
-import Queue
-from sets import Set
-# Global config options
-from config import config
-# daemonize and *pid
-from util.process import * 
-
-# Comon DB
-import comon
-# RT tickets
-import rt
-# Correlates input with policy to form actions
-import policy
 import soltesz
-import plc
-
-# Log to what 
-LOG="./monitor.log"
-
-# Time to refresh DB and remove unused entries
-RTSLEEP=7200 #2hrs
-# Time between policy enforce/update
-#POLSLEEP=43200 #12hrs
-POLSLEEP=10
-
-# Global list of all running threads.  Any threads added to 
-# list will be monitored.
-runningthreads = {}
-# Seconds between checking threads
-WATCHSLEEP = 10
- 
-# Set up Logging
-logger = logging.getLogger("monitor")
-logger.setLevel(logging.DEBUG)
-fh = logging.FileHandler(LOG, mode = 'a')
-fh.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
-fh.setFormatter(formatter)
-logger.addHandler(fh)
-
-
-"""
-Launches threads and adds them to the runningthreads global list.
-Assigns name for thread, starts.
-"""
-def startThread(fnct, name):
-		runningthreads[name] = fnct
-		runningthreads[name].setName(name)
-		try:
-			logger.info("Starting thread " + name)
-			runningthreads[name].start()
-		except Exception, err:
-			logger.error("Thread: " + name + " " + error)
-
-
-"""
-Watches threads and catches exceptions.  Each launched thread is
-watched and state is logged.
-"""
-class ThreadWatcher(Thread):
-	def __init__(self):
-		Thread.__init__(self)
-
-	def run(self):
-		while 1:
-			self.checkThreads()
-			time.sleep(WATCHSLEEP)
-
-	def checkThreads(self):
-		# Iterate through treads, compare with last running.
-	 	for thread in runningthreads.keys():
-			# If thread found dead, remove from queue
-			#print "found %s" % thread
-			if not runningthreads[thread].isAlive():
-				logger.error("***********Thread died: %s**********" %(thread))
-				del runningthreads[thread]
-		return len(runningthreads.keys())
-
-
-class Dummy(Thread):
-	def __init__(self):
-                Thread.__init__(self)
-
-	def run(self):
-		time.sleep(5)
-
-def dict_from_nodelist(nl):
-	d = {}
-	for host in nl:
-		h = host['hostname']
-		d[h] = host
-	return d
 
-"""
-Start threads, do some housekeeping, then daemonize.
-"""
-def main():
-	# Defaults
-	global status, logger
-	global config
-
-	#if not debug:
-        #	daemonize()
-        #	writepid("monitor")
-
-	config = config()
-	#config.parse_args()
-
-	logger.info('Monitor Started')
-	##########  VARIABLES   ########################################
-	# Nodes to check. Queue of all sick nodes.
-	toCheck = Queue.Queue()
-	# Nodes that are sick w/o tickets
-	sickNoTicket = Queue.Queue()
-	# Comon DB of all nodes
-	cdb = {}
-	# RT DB
-	tickets = {}
-	# Nodes we've emailed.
-	# host - > (type of email, time)
-	emailed = {}
+from monitor_policy import *
 
-	#########  GET NODES    ########################################
-	# TODO: get authoritative node list from PLC every PLCSLEEP seconds,
-	# 		feed this into Comon.
-	l_plcnodes = soltesz.if_cached_else(config.cachenodes, 
-								"l_plcnodes", 
-								lambda : plc.getNodes({'peer_id':None}))
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
 
-	s_plcnodes = Set([x['hostname'] for x in l_plcnodes])
+def reboot(hostname):
 
-	# List of nodes from a user-provided file.
-	if config.nodelist:
-		file = config.nodelist
-		nodelist = config.getListFromFile(file)
-		l_nodelist = []
-		print "Getting node info for hosts in: %s" % file
-		for nodename in nodelist:
-			if config.debug: print ".", ; sys.stdout.flush()
-			l_nodelist += plc.getNodes({'hostname': nodename, 'peer_id':None})
-		if config.debug: print ""
+	l_nodes = api.GetNodes(hostname)
+	if len(l_nodes) == 0:
+		raise Exception("No such host: %s" % hostname)
 	
-		s_usernodes = Set(nodelist)
-		# nodes from PLC and in the user list.
-		s_safe_usernodes   = s_plcnodes & s_usernodes
-		s_unsafe_usernodes = s_usernodes - s_plcnodes
-		if len(s_unsafe_usernodes) > 0 :
-			for node in s_unsafe_usernodes:
-				print "WARNING: User provided: %s but not found in PLC" % node
-
-		l_nodes = filter(lambda x: x['hostname'] in s_safe_usernodes,l_plcnodes)
-	else:
-		l_nodes = l_plcnodes
-
-	# Minus blacklisted ones..
 	l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
 	l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : [])
-	l_wl_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
-	# A handy dict of hostname-to-nodestruct mapping
-	d_allplc_nodes = dict_from_nodelist(l_wl_nodes)
-
-	#######  RT tickets    #########################################
-	t = soltesz.MyTimer()
-	ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
-	print "Getting tickets from RT took: %f sec" % t.diff() ; del t
 
-	# TODO: get input nodes from findbad database, pipe into toCheck
-	cm1 = read_findbad_db(d_allplc_nodes, toCheck)
+	# Drop blacklisted hosts; an empty result means this host is blacklisted.
+	l_nodes = [node for node in l_nodes if node['hostname'] not in l_blacklist]
+	if not l_nodes:
+		raise Exception("Host removed via blacklist: %s" % hostname)
+
+	# NOTE(review): presumably a cache-only lookup (fallback lambda yields None) -- confirm in soltesz.
+	ad_dbTickets = soltesz.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None)
+	if ad_dbTickets is None:
+		raise Exception("Could not find cached dbTickets")
+
+	# Run the stages in order: Merge -> RT -> Diagnose -> Action.  Each stage's
+	# run() result feeds the next; the classes are expected to come from the
+	# monitor_policy wildcard import.
+	merge = Merge([node['hostname'] for node in l_nodes])
+	record_list = merge.run()
+	rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
+	record_list = rt.run()
+	diag = Diagnose(record_list)
+	diagnose_out = diag.run()
+	action = Action(diagnose_out)
+	action.run()
+
+	return True
 
-	# Search for toCheck nodes in the RT db.
-	rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket, l_ticket_blacklist)
-	# 	Kind of a hack. Cleans the DB for stale entries and updates db.
-	#   (UNTESTED)
-	#	rt5 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket)
-	#	clean = Thread(target=rt5.cleanTickets)
-
-	startThread(rt1,"rt1")
-	#	startThread(rt5,"rt5")
-	#	startThread(clean,"cleanrt5")
-
-	# Actually digest the info and do something with it.
-	pol = policy.Policy(cm1, sickNoTicket, emailed)
-	# Start Sending Emails
-	startThread(pol, "policy")
-
-
-	tw = ThreadWatcher()
-	while True:
-		if tw.checkThreads() == 0:
-			break
-		time.sleep(WATCHSLEEP)
-
-	logger.info('Monitor Exitting')
-	#if not debug:
-	#	removepid("monitor")
+def main():
+	pass
 
-	# Store state of emails
-	#pol.emailedStore("WRITE")
-	soltesz.dbDump("ad_dbTickets")
-	sys.exit(0)
-	
 if __name__ == '__main__':
-	try:
-		main()
-	except KeyboardInterrupt:
-		print "Killed.  Exitting."
-		logger.info('Monitor Killed')
-		#soltesz.dbDump("ad_dbTickets")
-		sys.exit(0)
+	main()
diff --git a/nodegroups.py b/nodegroups.py
index 90ca183..430bb7b 100755
--- a/nodegroups.py
+++ b/nodegroups.py
@@ -94,15 +94,15 @@ if config.list:
 		print nodegroup_display(node, fb)
 		i += 1
 
-elif config.add:
+elif config.add and config.nodegroup:
 	for node in hostnames:
-		print "Adding %s to %s nodegroup" % (config.node, config.nodegroup)
-		api.AddNodeToNodeGroup(config.node, config.nodegroup)
+		print "Adding %s to %s nodegroup" % (node, config.nodegroup)
+		api.AddNodeToNodeGroup(node, config.nodegroup)
 
-elif config.delete:
+elif config.delete and config.nodegroup:
 	for node in hostnames:
-		print "Deleting %s from %s nodegroup" % (config.node, config.nodegroup)
-		api.DeleteNodeFromNodeGroup(config.node, config.nodegroup)
+		print "Deleting %s from %s nodegroup" % (node, config.nodegroup)
+		api.DeleteNodeFromNodeGroup(node, config.nodegroup)
 
 else:
 	print "no other options supported."
diff --git a/racadm.py b/racadm.py
index e627f10..8dec875 100755
--- a/racadm.py
+++ b/racadm.py
@@ -1,6 +1,10 @@
 #!/usr/bin/python
 
 import threading
+import socket
+import os
+import popen2
+#import logger
 
 def runcmd(command, args, username, password, timeout = None):
 
@@ -72,14 +76,14 @@ def runcmd(command, args, username, password, timeout = None):
 				out += "; output follows:\n" + data
 			raise Exception, out
 
-def racadm_reboot(host, username, password, dryrun):
+def racadm_reboot(host, username, password, dryrun, state="powercycle"):
 
 	ip = socket.gethostbyname(host)
 	try:
 		cmd = "/usr/sbin/racadm"
 		os.stat(cmd)
 		if not dryrun:
-			output = runcmd(cmd, ["-r %s -i serveraction powercycle" % ip],
+			output = runcmd(cmd, ["-r %s -i serveraction %s" % (ip, state)],
 				username, password)
 		else:
 			output = runcmd(cmd, ["-r %s -i getsysinfo" % ip],
@@ -89,17 +93,20 @@ def racadm_reboot(host, username, password, dryrun):
 		return 0
 
 	except Exception, err:
-		logger.debug("runcmd raised exception %s" % err)
+		#logger.debug("runcmd raised exception %s" % err)
+		print "runcmd raised exception %s" % err
 		return -1
 
 
 from optparse import OptionParser
 parser = OptionParser()
-parser.set_defaults(ip="", user="", password="")
+parser.set_defaults(ip="", user="", password="", state="powercycle")
 parser.add_option("-r", "", dest="ip", metavar="nodename.edu", 
 					help="A single node name to add to the nodegroup")
 parser.add_option("-u", "", dest="user", metavar="username",
 					help="")
+parser.add_option("-s", "", dest="state", metavar="powercycle",
+					help="")
 parser.add_option("-p", "", dest="password", metavar="password",
 					help="")
 (options, args) = parser.parse_args()
@@ -110,6 +117,6 @@ if __name__ == '__main__':
 		options.user is not "" and \
 		options.password is not "":
 
-		racadm_reboot(options.ip, options.user, options.password, False)
+		racadm_reboot(options.ip, options.user, options.password, False, options.state)
 	else:
 		parser.print_help()