4b80d9a07094fed9c2a296448616b78357df7276
[monitor.git] / monitor.py
1 #!/usr/bin/python
2 #
3 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
4
5 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 #
7 # $Id: $
8
9 import sys
10 import os
11 import getopt 
12 import thread
13 from threading import *
14 import time
15 import logging
16 import Queue
17 # daemonize and *pid
18 from util.process import * 
19
20 # Comon DB
21 import comon
22 # RT tickets
23 import rt
24 # Correlates input with policy to form actions
25 import policy
26 # Email
27 import mailer
28 import emailTxt
# Defaults
# Debug flag.  main() switches it to True when -d/--debug is given, and
# usage() prints its current value as the documented default.
debug = False

# Path of the file the module-level logger appends to (see the
# FileHandler created below).
LOG="./monitor.log"

# Path of a data file.  Not referenced anywhere in this module --
# presumably persisted state used by another module; TODO confirm.
DAT="./monitor.dat"

# Email defaults.  None of these are read in this module; presumably they
# are consumed by the mailer/policy code -- verify against those modules.
# Mail transfer agent host.
MTA="localhost"
# Sender address.
FROM="support@planet-lab.org"
# Address templates with one %s placeholder each (presumably filled with a
# site identifier -- TODO confirm what the callers substitute).
TECHEMAIL="tech-%s@sites.planet-lab.org"
PIEMAIL="pi-%s@sites.planet-lab.org"

# API endpoint URL.  Not referenced in this module.
XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'

# Time between comon refresh, in seconds.  Not referenced in this module.
COSLEEP=300 #5mins
# Time to refresh DB and remove unused entries, in seconds.  Not
# referenced in this module.
RTSLEEP=7200 #2hrs
# Time between policy enforce/update, in seconds.  The 12-hour value is
# commented out and a 10-second value is active instead.
# NOTE(review): 10 looks like a testing value -- confirm before deploying.
#POLSLEEP=43200 #12hrs
POLSLEEP=10

# Global registry of all running threads, stored as a dict mapping thread
# name -> thread object.  startThread() adds entries and
# ThreadWatcher.checkThreads() removes the ones that have died, so any
# thread added here is monitored.
runningthreads = {}
# Seconds between checking threads (the pause between ThreadWatcher sweeps).
WATCHSLEEP = 10
60  
# Set up Logging
# A single module-level logger named "monitor" is shared by every function
# and class in this file.  It accepts everything from DEBUG upwards.
logger = logging.getLogger("monitor")
logger.setLevel(logging.DEBUG)
# Records are appended (mode 'a') to the file named by LOG, so earlier
# runs' output is kept.  The handler also passes everything from DEBUG up.
fh = logging.FileHandler(LOG, mode = 'a')
fh.setLevel(logging.DEBUG)
# Each record is written as "<timestamp> <level name> <message>".
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)
69
def usage():
    """Write the command-line help text to standard output.

    The script name (sys.argv[0]) and the current value of the
    module-level ``debug`` flag are substituted into the message.
    """
    template = """
Usage: %s [OPTIONS]...

Options:
        -d, --debug             Enable debugging (default: %s)
        --status                Print memory usage statistics and exit
        -h, --help              This message
"""
    message = template.lstrip() % (sys.argv[0], debug)
    # The extra newline reproduces the one a print statement appends.
    sys.stdout.write(message + "\n")
79
80
81 """
Launches a thread and registers it in the runningthreads global dict.
Assigns the thread its name, then starts it.
84 """
def startThread(fnct, name):
    """Register a thread under ``name`` and start it.

    The thread object is stored in the module-level ``runningthreads``
    dict before it is started, so it stays registered even when starting
    fails.

    Parameters:
        fnct -- the thread object to launch; it must provide setName()
                and start().
        name -- string used both as the registry key and as the thread's
                name.

    Any exception raised while starting is logged as an error and is not
    propagated to the caller.
    """
    runningthreads[name] = fnct
    fnct.setName(name)
    try:
        logger.info("Starting thread " + name)
        fnct.start()
    except Exception:
        # Fetch the active exception through sys.exc_info() so the handler
        # does not depend on a version-specific "except ..., name" form.
        # Passing it as a logging argument lets the logging module convert
        # it to text; the previous code referenced an undefined name here
        # and would itself have raised instead of logging.
        logger.error("Thread: %s %s", name, sys.exc_info()[1])
93
94
95 """
Watches the launched threads.  Each registered thread is checked
periodically; a thread found dead is logged and unregistered.
98 """
class ThreadWatcher(Thread):
    """Thread that periodically checks the registered threads for liveness.

    Each sweep inspects every entry of the module-level ``runningthreads``
    dict; a thread that is no longer alive is reported through the module
    logger and removed from the dict.
    """

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        """Sweep forever, pausing WATCHSLEEP seconds after each sweep."""
        while True:
            self.checkThreads()
            time.sleep(WATCHSLEEP)

    def checkThreads(self):
        """Log and unregister every registered thread that has stopped."""
        # Work from a snapshot of the names so that entries can be deleted
        # from the dict while the sweep is still in progress.
        for label in list(runningthreads):
            worker = runningthreads[label]
            if worker.isAlive():
                continue
            logger.error("Thread Died: %s" %(label))
            del runningthreads[label]
115
116
class Dummy(Thread):
    """Placeholder thread whose only work is to sleep for five seconds.

    No constructor is defined: the inherited Thread constructor is used
    with its default arguments.
    """

    def run(self):
        """Sleep for five seconds, then return (ending the thread)."""
        time.sleep(5)
123
124
125 """
Start threads, do some housekeeping, then wait for the work queues to drain.
(Daemonizing is currently disabled.)
127 """
def main():
    """Parse the command line, launch the monitor threads and wait for them.

    Steps, in order:
      1. Parse options.  -d/--debug sets the module-level ``debug`` flag;
         --status exits immediately with status 0; any other accepted
         option prints the usage text and exits with status 0; an option
         getopt rejects prints an error plus the usage text and exits
         with status 1.
      2. Start the thread watcher, five RT workers, the RT cleanup
         thread and the comon thread, then, after a fixed delay, the
         policy thread.
      3. Poll until both work queues are empty, report status, store the
         emailed state and terminate the process with os._exit(0).

    This function does not return.
    """
    global debug

    try:
        opts, _ = getopt.getopt(sys.argv[1:], "dvf:s:ph",
                                ["debug", "status", "help"])
    except getopt.GetoptError:
        sys.stdout.write("Error: " + sys.exc_info()[1].msg + "\n")
        usage()
        sys.exit(1)

    for flag, _ in opts:
        if flag in ("-d", "--debug"):
            debug = True
            continue
        # Every other option ends the program with status 0.  --status
        # exits silently (its summary output is not implemented here);
        # anything else shows the help text first.
        if flag != "--status":
            usage()
        sys.exit(0)

    # Daemonizing and pid-file handling are disabled for now:
    #if not debug:
    #       daemonize()
    #       writepid("monitor")

    # Init stuff.  Watch threads to see if they die.  Perhaps send email?
    logger.info('Monitor Started')
    startThread(ThreadWatcher(), "Watcher")

    # Shared state handed to the worker objects below.
    # Nodes to check: queue of all sick nodes.
    toCheck = Queue.Queue()
    # Nodes that are sick without tickets.
    sickNoTicket = Queue.Queue()
    # Comon DB of all nodes.
    cdb = {}
    # RT DB.
    tickets = {}
    # Nodes we've emailed: host -> (type of email, time).
    emailed = {}

    # RT ticket workers.  Per the original author's note they are event
    # based: hosts added to toCheck get queried.
    rtWorkers = [rt.RT(tickets, toCheck, sickNoTicket) for _ in range(5)]
    # Kind of a hack (original author's words): a plain thread running the
    # last worker's cleanTickets method, meant to clean stale DB entries.
    clean = Thread(target=rtWorkers[-1].cleanTickets)
    # Comon poller.
    cm1 = comon.Comon(cdb, toCheck)
    # Digests the collected info and acts on it.
    pol = policy.Policy(cm1, sickNoTicket, emailed)

    # Load emailed sites from the last run.
    pol.emailedStore("LOAD")

    # Start the RT workers as "rt1" .. "rt5", then the cleanup thread.
    for index, worker in enumerate(rtWorkers):
        startThread(worker, "rt%d" % (index + 1))
    startThread(clean, "cleanrt5")

    # Start the comon thread.
    startThread(cm1, "comon")

    # Give the threads a fixed ten seconds to initialise; no join or
    # handshake is performed.
    time.sleep(10)

    # Start sending emails.
    startThread(pol, "policy")

    # Wait until both work queues have been emptied.
    while not (sickNoTicket.empty() and toCheck.empty()):
        time.sleep(15)

    # NOTE(review): status() runs both before and after the state is
    # written, exactly as in the original -- confirm both calls are wanted.
    pol.status()

    # Store state of emails.
    pol.emailedStore("WRITE")

    # Email what we did.
    pol.status()

    logger.info('Monitor Exitted')
    #if not debug:
    #       removepid("monitor")
    os._exit(0)
227         
if __name__ == "__main__":
    # Script entry point: run the monitor until main() terminates the
    # process itself or the operator interrupts it from the keyboard.
    try:
        main()
    except KeyboardInterrupt:
        sys.stdout.write("Killed.  Exitting.\n")
        logger.info('Monitor Killed')
        # Terminate the process right away with status 0.
        os._exit(0)