AM nagios/plc2nagios.py
[monitor.git] / action.py
1 #!/usr/bin/python
2 #
3 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
4
5 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 # Stephen Soltesz <soltesz@cs.princeton.edu>
7 #
8 # $Id$
9
10 import sys
11 from threading import *
12 import time
13 import logging
14 import Queue
15 from sets import Set
16
17 # Global config options
18 from config import config
19 from optparse import OptionParser
# Command-line interface.  Every option falls back to the default set
# here when it is not given on the command line.
parser = OptionParser()
parser.set_defaults(
    nodelist=None,
    cachert=False,
    cachenodes=False,
    blacklist=None,
    ticketlist=None,
)

# Long options only; none of them has a short form.
parser.add_option("--nodelist", dest="nodelist",
                  help="Read nodes to act on from specified file")
parser.add_option("--cachert", dest="cachert", action="store_true",
                  help="Cache the RT database query")
parser.add_option("--cachenodes", dest="cachenodes", action="store_true",
                  help="Cache node lookup from PLC")
parser.add_option("--ticketlist", dest="ticketlist",
                  help="Whitelist all RT tickets in this file")
parser.add_option("--blacklist", dest="blacklist",
                  help="Blacklist all nodes in this file")

# NOTE: this rebinds the name `config` from the imported class to the
# instance wrapping the parser; the rest of the file uses the instance.
config = config(parser)
config.parse_args()
41
42 # daemonize and *pid
43 #from util.process import * 
44
45 # RT tickets
46 import rt
47 # Correlates input with policy to form actions
48 import policy
49 import soltesz
50 import plc
51
# Path of the log file (relative to the current working directory).
LOG = "./monitor.log"

# Seconds between DB refreshes that also remove unused entries.
RTSLEEP = 2 * 60 * 60  # 2hrs
# Seconds between policy enforce/update passes.
# The 12-hour setting (POLSLEEP = 43200) is currently disabled.
POLSLEEP = 10

# Registry of running threads, keyed by thread name.  Any thread added
# to it is monitored.
runningthreads = {}
# Seconds between two checks of the registered threads.
WATCHSLEEP = 10

# Logging: the "monitor" logger appends DEBUG-and-above records to LOG,
# one "<time> <level> <message>" line per record.
logger = logging.getLogger("monitor")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
fh = logging.FileHandler(LOG, mode='a')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
75
76
def startThread(fnct, name):
    """Register a thread under `name` in `runningthreads` and start it.

    fnct -- the thread object to launch (despite the parameter name it
            must provide setName() and start()).
    name -- key used in the `runningthreads` registry; also assigned as
            the thread's own name.

    A failure while starting is logged and not re-raised.  The thread
    stays in `runningthreads` in that case.
    """
    runningthreads[name] = fnct
    fnct.setName(name)
    try:
        logger.info("Starting thread " + name)
        fnct.start()
    except Exception:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so the same code parses on old and new Pythons.
        err = sys.exc_info()[1]
        # str() is required: an exception object cannot be concatenated
        # to a string directly.
        logger.error("Thread: " + name + " " + str(err))
89
90
class ThreadWatcher(Thread):
    """Watches the threads registered in `runningthreads`.

    Each scan logs an error for every registered thread that is no
    longer alive and drops it from the registry.
    """

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        """Scan the registry forever, sleeping WATCHSLEEP seconds between scans."""
        while True:
            self.checkThreads()
            time.sleep(WATCHSLEEP)

    def checkThreads(self):
        """Remove dead threads from `runningthreads`.

        Returns the number of threads still registered afterwards.
        """
        # Walk a snapshot of the names so entries can be deleted safely
        # while scanning.
        for name in list(runningthreads):
            if runningthreads[name].isAlive():
                continue
            # Thread found dead: report it and forget about it.
            logger.error("***********Thread died: %s**********" %(name))
            del runningthreads[name]
        return len(runningthreads)
113
114
class Dummy(Thread):
    """Placeholder thread whose only work is to sleep for five seconds."""

    # No __init__ override: construction is exactly Thread's own.

    def run(self):
        time.sleep(5)
121
def dict_from_nodelist(nl):
    """Index a sequence of node records by their 'hostname' entry.

    nl -- iterable of mappings, each having a 'hostname' key.

    Returns a dict mapping hostname -> record.  When a hostname occurs
    more than once the last record wins.  A record without 'hostname'
    raises KeyError.
    """
    return dict([(record['hostname'], record) for record in nl])
128
def main():
    """Select the nodes to act on, run the Action thread, wait for it, exit.

    Steps:
      1. Fetch the PLC node list through soltesz.if_cached_else.
      2. If --nodelist was given, keep only the listed nodes that PLC
         knows and print a warning for every listed node it does not.
      3. Drop nodes whose hostname is in the "l_blacklist" cache entry.
      4. Start policy.Action on the remaining hostnames and poll every
         WATCHSLEEP seconds until no registered thread is left.

    Always finishes through sys.exit(0).
    """
    logger.info('Action Started')
    print('Action Started')

    #########  GET NODES    ########################################
    logger.info('Get Nodes from PLC')
    print("getnode from plc")
    l_plcnodes = soltesz.if_cached_else(
        True,
        "l_plcnodes",
        lambda: plc.getNodes({'peer_id': None}))
    s_plcnodenames = Set([rec['hostname'] for rec in l_plcnodes])

    if not config.nodelist:
        # No user-provided file: act on everything PLC returned.
        l_nodes = l_plcnodes
    else:
        # Nodes named in the user-provided file.
        s_usernodes = Set(config.getListFromFile(config.nodelist))
        # SAFE nodes are both in PLC and in the user's list.
        s_safe_usernodes = s_plcnodenames & s_usernodes
        # UNSAFE nodes are in the list but unknown to PLC: warn and ignore.
        for hostname in s_usernodes - s_plcnodenames:
            print("WARNING: User provided: %s but not found in PLC" % hostname)
        l_nodes = [rec for rec in l_plcnodes
                   if rec['hostname'] in s_safe_usernodes]

    print("len of l_nodes: %d" % len(l_nodes))

    # Minus blacklisted ones..
    # The ticket blacklist is looked up but not used below; the call is
    # kept in case the cache lookup has side effects -- TODO confirm.
    l_ticket_blacklist = soltesz.if_cached_else(1, "l_ticket_blacklist", lambda: [])
    l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda: [])
    l_nodes = [rec for rec in l_nodes if rec['hostname'] not in l_blacklist]

    #######  Get RT tickets    #########################################
    # Disabled:
    #logger.info('Get Tickets from RT')
    #t = soltesz.MyTimer()
    #ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
    #print "Getting tickets from RT took: %f sec" % t.diff() ; del t

    ####### Action
    logger.info('Start Action thread')
    action = policy.Action([rec['hostname'] for rec in l_nodes])
    startThread(action, "action")

    # The watcher is never start()ed as a thread; only its scan method
    # is called from this loop.
    tw = ThreadWatcher()
    while tw.checkThreads() != 0:
        time.sleep(WATCHSLEEP)

    logger.info('Action Exitting')
    sys.exit(0)
196         
if __name__ == '__main__':
    # Script entry point: Ctrl-C is reported and logged, then the
    # process exits with status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("Killed.  Exitting.")
        logger.info('Action Killed')
        sys.exit(0)