*** empty log message ***
[monitor.git] / monitor.py
1 #!/usr/bin/python
2 #
3 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
4
5 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 #
7 # $Id: $
8
9 import sys
10 import os
11 import getopt 
12 import thread
13 from threading import *
14 import time
15 import logging
16 import Queue
17 # Global config options
18 import config
19 # daemonize and *pid
20 from util.process import * 
21
22 # Comon DB
23 import comon
24 # RT tickets
25 import rt
26 # Correlates input with policy to form actions
27 import policy
28 # Email
29 import mailer
30 import emailTxt
31
# File the monitor's log records are appended to.
LOG = "./monitor.log"

# Data file location.
DAT = "./monitor.dat"

# Email defaults: mail host, sender address, and per-site address templates.
# NOTE(review): the %s placeholder is presumably filled with a site
# identifier by the code that sends mail -- confirm against the caller.
MTA = "localhost"
FROM = "support@planet-lab.org"
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"

# PLC API endpoint.
XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'

# Seconds between comon refreshes (5 minutes).
COSLEEP = 300
# Seconds between DB refreshes that remove unused entries (2 hours).
RTSLEEP = 7200
# Seconds between policy enforce/update passes.  The 12-hour value
# (43200) is the commented-out alternative to the short interval below.
POLSLEEP = 10

# Global registry of running threads, keyed by thread name.  Any thread
# added here is monitored by the watcher thread.
runningthreads = {}
# Seconds between two passes of the thread watcher.
WATCHSLEEP = 10

# Logging: everything from DEBUG upwards is appended to LOG as
# "<time> <level> <message>" lines.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

fh = logging.FileHandler(LOG, mode='a')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

logger = logging.getLogger("monitor")
logger.setLevel(logging.DEBUG)
logger.addHandler(fh)
69
def usage():
    """Print the command-line help text to stdout.

    The text is filled in with the program name (sys.argv[0]) and the
    current value of the debug flag.
    """
    # The debug flag is stored on the config module (main() assigns
    # config.debug); this file defines no module-level `debug`, so the
    # flag is read from config.  The getattr default keeps the help text
    # printable even if config does not define the attribute.
    debug_default = getattr(config, "debug", False)
    text = """
Usage: %s [OPTIONS]...

Options:
        -d, --debug             Enable debugging (default: %s)
        --status                Print memory usage statistics and exit
        -h, --help              This message
""".lstrip() % (sys.argv[0], debug_default)
    print(text)
79
80
81 """
82 Launches threads and adds them to the runningthreads global list.
83 Assigns name for thread, starts.
84 """
def startThread(fnct, name):
    """Register a thread under `name` in runningthreads and start it.

    fnct -- an unstarted thread object; it must provide setName() and
            start().
    name -- key used in the global runningthreads dict; also assigned as
            the thread's name.

    An exception raised while starting the thread is logged as an error
    and not propagated.  The entry stays in runningthreads either way.
    """
    runningthreads[name] = fnct
    fnct.setName(name)
    try:
        logger.info("Starting thread " + name)
        fnct.start()
    except Exception as err:
        # Log the exception that was actually caught, converted to text
        # so the string concatenation cannot itself fail.
        logger.error("Thread: " + name + " " + str(err))
93
94
95 """
96 Watches threads and catches exceptions.  Each launched thread is
97 watched and state is logged.
98 """
class ThreadWatcher(Thread):
    """Thread that periodically checks the threads in runningthreads.

    Every WATCHSLEEP seconds each registered thread is checked; a thread
    that is no longer alive is reported in the log and dropped from the
    registry.
    """

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        """Check the registered threads forever, pausing between passes."""
        while True:
            self.checkThreads()
            time.sleep(WATCHSLEEP)

    def checkThreads(self):
        """Log and unregister every registered thread that has died."""
        # Python 2's dict.keys() returns a list copy, so entries can be
        # deleted from runningthreads while walking it.
        for name in runningthreads.keys():
            if runningthreads[name].isAlive():
                continue
            logger.error("***********Thread died: %s**********" % name)
            del runningthreads[name]
115
116
class Dummy(Thread):
    """Thread whose only work is to sleep for five seconds and finish."""

    def __init__(self):
        super(Dummy, self).__init__()

    def run(self):
        """Sleep for five seconds, then return (ending the thread)."""
        time.sleep(5)
123
124
125 """
126 Start threads, do some housekeeping, then daemonize.
127 """
def main():
    """Run the monitor: parse options, start the threads, wait, exit.

    Steps, in order:
      1. Parse the command line.  -d/--debug sets config.debug; --status
         exits immediately; any other option prints the usage text and
         exits.  An unparsable command line prints the error and the
         usage text and exits with status 1.
      2. Start the thread watcher, build the shared queues/dicts, and
         create the RT, comon and policy objects.
      3. Load the stored email state, start all threads, and sleep until
         both work queues are empty.
      4. Write the email state, report status, and end the process with
         os._exit(0).
    """
    # --- Command line -----------------------------------------------
    try:
        opts, _leftover = getopt.getopt(sys.argv[1:], "dvf:s:ph",
                                        ["debug", "status", "help"])
    except getopt.GetoptError as err:
        print("Error: " + err.msg)
        usage()
        sys.exit(1)

    for opt, _optval in opts:
        if opt in ("-d", "--debug"):
            config.debug = True
            print("Running in DEBUG mode:  NO EMAILS SENT AND NO SLICES SQUEEZED.")
        elif opt == "--status":
            # Printing a summary is not implemented; just exit.
            sys.exit(0)
        else:
            usage()
            sys.exit(0)

    # Daemonizing (daemonize() / writepid("monitor")) is currently
    # disabled in this script.

    # --- Start-up ---------------------------------------------------
    # Watch the other threads so that a dying thread is logged.
    logger.info('Monitor Started')
    startThread(ThreadWatcher(), "Watcher")

    # Shared state handed to the worker objects.
    toCheck = Queue.Queue()        # queue of all sick nodes to check
    sickNoTicket = Queue.Queue()   # sick nodes without tickets
    cdb = {}                       # comon DB of all nodes
    tickets = {}                   # RT DB
    emailed = {}                   # host -> (type of email, time)

    # Five RT workers ("rt1" .. "rt5") sharing the same tickets dict and
    # queues.  Per the original author's notes they are event based:
    # hosts added to toCheck are queried.
    rtWorkers = [("rt%d" % n, rt.RT(tickets, toCheck, sickNoTicket))
                 for n in range(1, 6)]

    # Extra thread running the last RT worker's cleanTickets method
    # (described by the original author as a hack that cleans stale DB
    # entries and updates the DB).
    cleaner = Thread(target=rtWorkers[-1][1].cleanTickets)

    # Comon poller, and the policy object that consumes its results.
    cm1 = comon.Comon(cdb, toCheck)
    pol = policy.Policy(cm1, sickNoTicket, emailed)

    # Load emailed sites from the previous run.
    pol.emailedStore("LOAD")

    # --- Launch -----------------------------------------------------
    for label, worker in rtWorkers:
        startThread(worker, label)
    startThread(cleaner, "cleanrt5")
    startThread(cm1, "comon")

    # Give the threads time to initialise (a fixed pause, not a join).
    time.sleep(10)

    # Start sending emails.
    startThread(pol, "policy")

    # --- Wait until both queues are empty -----------------------------
    while not (sickNoTicket.empty() and toCheck.empty()):
        time.sleep(15)

    # --- Shutdown ---------------------------------------------------
    pol.emailedStore("WRITE")   # store state of emails
    pol.status()                # email what we did

    logger.info('Monitor Exitted')
    # removepid("monitor") is disabled together with daemonizing.
    os._exit(0)
226         
if __name__ == "__main__":
    # Script entry point.  Ctrl-C is reported on stdout and in the log,
    # then the process is ended immediately with os._exit().
    # NOTE(review): presumably os._exit() is used so that still-running
    # worker threads cannot keep the process alive -- confirm.
    try:
        main()
    except KeyboardInterrupt:
        print("Killed.  Exitting.")
        logger.info("Monitor Killed")
        os._exit(0)