Uses CoMon's ability to find 'upness' to email. Changed queueing between threads...
[monitor.git] / monitor.py
1 #!/usr/bin/python
2 #
3 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
4
5 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 #
7 # $Id: $
8
9 import sys
10 import os
11 import getopt 
12 import thread
13 from threading import *
14 import time
15 import logging
16 import Queue
17 # daemonize and *pid
18 from util.process import * 
19
20 # Comon DB
21 import comon
22 # RT tickets
23 import rt
24 # Correlates input with policy to form actions
25 import policy
26 # Email
27 import mailer
28 import emailTxt
# ---------------------------------------------------------------------
# Module-wide defaults
# ---------------------------------------------------------------------

# Debug flag; main() switches it on for -d / --debug.
debug = False

# File the "monitor" logger appends to.
LOG = "./monitor.log"

# Data file path.
DAT = "./monitor.dat"

# Email defaults: mail transfer agent host and sender address.
MTA = "localhost"
FROM = "support@planet-lab.org"

# Endpoint of the XML-RPC API.
XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'

# Intervals, in seconds.
COSLEEP = 300     # between comon refreshes (5 minutes)
RTSLEEP = 7200    # between DB refreshes that drop unused entries (2 hours)
# Between policy enforce/update passes.  A 12 hour value (43200) was
# previously used here and is currently replaced by 10 seconds.
POLSLEEP = 10
WATCHSLEEP = 10   # between liveness checks of the registered threads

# Registry of all running threads, keyed by thread name.  Any thread
# added here is monitored.
runningthreads = {}

# ---------------------------------------------------------------------
# Logging: everything from DEBUG upwards is appended to LOG.
# ---------------------------------------------------------------------
logger = logging.getLogger("monitor")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

fh = logging.FileHandler(LOG, mode='a')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

logger.addHandler(fh)
67
def usage():
    """
    Print the command-line help text to standard output.

    The text is filled in with the program name (sys.argv[0]) and the
    current value of the module-level `debug` flag.
    """
    # lstrip() drops the newline that follows the opening quotes.
    help_template = """
Usage: %s [OPTIONS]...

Options:
        -d, --debug             Enable debugging (default: %s)
        --status                Print memory usage statistics and exit
        -h, --help              This message
""".lstrip()
    print(help_template % (sys.argv[0], debug))
77
78
79 """
80 Launches threads and adds them to the runningthreads global list.
81 Assigns name for thread, starts.
82 """
def startThread(fnct, name):
    """
    Register a thread in the global `runningthreads` table, name it and
    start it.

    fnct -- the thread object to launch; it must provide setName() and
            start() (despite the parameter name it is not a bare function).
    name -- key under which the thread is stored in `runningthreads`;
            also assigned as the thread's name.

    An exception raised while starting is logged (with traceback) and
    swallowed, so one failing thread does not abort the caller.  The
    entry is left in `runningthreads` in that case.
    """
    runningthreads[name] = fnct
    fnct.setName(name)
    try:
        logger.info("Starting thread " + name)
        fnct.start()
    except Exception:
        # The previous handler referenced an undefined name and tried to
        # concatenate an exception onto a string, so it raised instead of
        # logging.  logger.exception() records the message at ERROR level
        # together with the active exception.
        logger.exception("Thread: %s", name)
91
92
93 """
94 Watches threads and catches exceptions.  Each launched thread is
95 watched and state is logged.
96 """
class ThreadWatcher(Thread):
    """
    Thread that periodically inspects every entry of the global
    `runningthreads` table, logs the ones that are no longer alive and
    removes them from the table.
    """

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        """Check the registered threads every WATCHSLEEP seconds, forever."""
        while True:
            self.checkThreads()
            time.sleep(WATCHSLEEP)

    def checkThreads(self):
        """Log and unregister each registered thread that has died."""
        # Walk a snapshot of the names so entries can be deleted while
        # looping; each thread object is still looked up live.
        for name in list(runningthreads.keys()):
            if runningthreads[name].isAlive():
                continue
            logger.error("Thread Died: %s" % name)
            del runningthreads[name]
113
114
class Dummy(Thread):
    """Placeholder thread whose run() only sleeps for five seconds."""

    def __init__(self):
        super(Dummy, self).__init__()

    def run(self):
        # No work is done; the thread simply ends after the pause.
        time.sleep(5)
121
122
123 """
124 Start threads, do some housekeeping, then daemonize.
125 """
def main():
    """
    Parse the command line, launch the monitoring threads, wait until
    both work queues are empty, then store the emailed state and exit.

    Exit paths:
      * option-parsing error: message and usage printed, sys.exit(1)
      * --status: sys.exit(0) without output
      * any other accepted option except -d/--debug: usage printed,
        sys.exit(0)
      * normal completion: os._exit(0)
    """
    global debug

    # ---- Command line -------------------------------------------------
    try:
        opts, leftover = getopt.getopt(sys.argv[1:], "dvf:s:ph",
                                       ["debug", "status", "help"])
    except getopt.GetoptError:
        # The active exception is fetched through sys.exc_info() so this
        # handler reads the same on every Python version.
        print("Error: " + sys.exc_info()[1].msg)
        usage()
        sys.exit(1)

    for opt, optval in opts:
        if opt in ("-d", "--debug"):
            debug = True
            continue
        # --status leaves quietly (its summary output is currently
        # disabled); every other option shows the help text first.
        if opt != "--status":
            usage()
        sys.exit(0)

    # Daemonizing and pid-file handling (daemonize / writepid /
    # removepid) are currently disabled.

    # ---- Start-up -----------------------------------------------------
    logger.info('Monitor Started')
    # Watch threads to see if they die.
    startThread(ThreadWatcher(), "Watcher")

    # Shared state handed to the workers.
    toCheck = Queue.Queue()       # nodes to check: queue of all sick nodes
    sickNoTicket = Queue.Queue()  # nodes that are sick w/o tickets
    cdb = {}                      # Comon DB of all nodes
    tickets = {}                  # RT DB
    emailed = {}                  # nodes we've emailed: host -> (type of email, time)

    # Five RT workers, all sharing the ticket table and both queues.
    rt_workers = [rt.RT(tickets, toCheck, sickNoTicket) for _ in range(5)]
    # Kind of a hack: the last worker's cleanTickets also runs in a
    # thread of its own to clean stale entries and update the DB.
    clean = Thread(target=rt_workers[-1].cleanTickets)
    # Comon poller, fed with the Comon DB and the to-check queue.
    cm1 = comon.Comon(cdb, toCheck)
    # Digests the collected info and acts on it.
    pol = policy.Policy(cm1, sickNoTicket, emailed)

    # Load emailed sites from last run.
    pol.emailedStore("LOAD")

    # ---- Launch -------------------------------------------------------
    for number, worker in enumerate(rt_workers):
        startThread(worker, "rt%d" % (number + 1))
    startThread(clean, "cleanrt5")
    startThread(cm1, "comon")

    # Give the threads time to initialise (a join would be cleaner)
    # before the policy thread starts sending emails.
    time.sleep(10)
    startThread(pol, "policy")

    # ---- Wait for both queues to drain ---------------------------------
    while not (sickNoTicket.empty() and toCheck.empty()):
        time.sleep(15)

    # ---- Shutdown -----------------------------------------------------
    pol.emailedStore("WRITE")
    logger.info('Monitor Exitted')
    # os._exit ends the process immediately, without waiting for the
    # threads that are still running.
    os._exit(0)
216         
if __name__ == '__main__':
    # Script entry point.  On Ctrl-C the interruption is reported on
    # stdout and in the log, then the process ends at once with status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("Killed.  Exitting.")
        logger.info('Monitor Killed')
        os._exit(0)