Initial Checkin.
[monitor.git] / monitor.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: $
7
8 import sys
9 import os
10 import getopt 
11 import thread
12 from threading import *
13 import time
14 import logging
15 import Queue
16 # daemonize and *pid
17 from util.process import * 
18
19 # Comon DB
20 import comon
21 # RT tickets
22 import rt
23 # Correlates input with policy to form actions
24 import policy
25 # Email
26 import mailer
27 import emailTxt
# Defaults
debug = False

# File the monitor log is written to
LOG = "./monitor.log"

# Email defaults
MTA = "localhost"
FROM = "support@planet-lab.org"

# API
XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'

# Seconds between comon refreshes (5 minutes)
COSLEEP = 5 * 60
# Seconds between DB refreshes that remove unused entries (2 hours)
RTSLEEP = 2 * 60 * 60
# Seconds between policy enforce/update passes.
# NOTE(review): the original carried a commented-out alternative of
# 43200 (12 hours); 10 looks like a testing value -- confirm before deploy.
POLSLEEP = 10

# Registry of all running threads, keyed by thread name.  Any thread added
# to this dict will be monitored.
runningthreads = {}
# Seconds between checks of the registered threads
WATCHSLEEP = 10
52  
# Set up logging: the "monitor" logger appends records of level DEBUG and
# above to the LOG file, formatted as "<time> <level> <message>".
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
fh = logging.FileHandler(LOG, mode='a')
fh.setFormatter(formatter)
fh.setLevel(logging.DEBUG)
logger = logging.getLogger("monitor")
logger.addHandler(fh)
logger.setLevel(logging.DEBUG)
61
def usage():
    """
    Write the command-line help text to standard output.

    The text is filled in with the program name (sys.argv[0]) and the
    current value of the module-level debug flag.
    """
    template = """
Usage: %s [OPTIONS]...

Options:
        -d, --debug             Enable debugging (default: %s)
        --status                Print memory usage statistics and exit
        -h, --help              This message
""".lstrip()
    # The template already ends in a newline; the extra one reproduces the
    # blank line a print statement would add after it.
    sys.stdout.write(template % (sys.argv[0], debug) + "\n")
71
72
def startThread(fnct, name):
        """
        Launch a thread and add it to the runningthreads global registry.

        fnct -- thread object to launch; setName() and start() are called on it
        name -- key used in runningthreads and the name assigned to the thread

        An exception raised while starting the thread is logged and not
        re-raised.  The thread stays registered in runningthreads either way.
        """
        runningthreads[name] = fnct
        fnct.setName(name)
        try:
                logger.info("Starting thread " + name)
                fnct.start()
        except Exception:
                # Fetch the active exception through sys.exc_info() so the
                # handler parses on every Python version, and convert it with
                # str() -- concatenating the exception object itself to a
                # string would raise TypeError.
                err = sys.exc_info()[1]
                logger.error("Thread: " + name + " " + str(err))
85
86
class ThreadWatcher(Thread):
        """
        Watches the threads registered in runningthreads.  Every WATCHSLEEP
        seconds each registered thread is checked; a thread found dead is
        logged and removed from the registry.
        """

        def run(self):
                # Sweep forever, pausing WATCHSLEEP seconds between sweeps.
                while True:
                        self.checkThreads()
                        time.sleep(WATCHSLEEP)

        def checkThreads(self):
                """Log and unregister every registered thread that is not alive."""
                # Walk a snapshot of the names: entries are deleted from
                # runningthreads while the sweep is in progress.
                for name in list(runningthreads.keys()):
                        if runningthreads[name].isAlive():
                                continue
                        # Dead thread: report it and drop it from the registry.
                        logger.error("Thread Died: %s" % (name))
                        del runningthreads[name]
107
108
class Dummy(Thread):
        """Thread whose run() sleeps for five seconds and then returns."""

        def run(self):
                nap_seconds = 5
                time.sleep(nap_seconds)
115
116
def main():
        """
        Parse the command line, start the monitor threads, then exit.

        Options handled: -d/--debug sets the module-level debug flag,
        --status exits immediately with status 0, and anything else prints
        the usage text and exits with status 0.  An option getopt rejects
        prints an error plus the usage text and exits with status 1.

        Threads are started in this order: the watcher, five RT threads
        ("rt1".."rt5"), the RT cleanup thread ("cleanrt5"), the comon thread
        ("cm1"), and -- once the shared bucket queue is empty -- the policy
        thread ("policy").  The process then sleeps for an hour and
        terminates through os._exit(0).
        """
        global debug

        try:
                longopts = ["debug", "status", "help"]
                # NOTE(review): the short-option string also accepts -v,
                # -f ARG, -s ARG and -p, which usage() does not document;
                # they fall through to the usage branch below.
                opts, _args = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
        except getopt.GetoptError:
                err = sys.exc_info()[1]
                sys.stdout.write("Error: " + err.msg + "\n")
                usage()
                sys.exit(1)

        for opt, _optval in opts:
                if opt in ("-d", "--debug"):
                        debug = True
                elif opt == "--status":
                        # Printing the summary is not implemented yet; just exit.
                        sys.exit(0)
                else:
                        usage()
                        sys.exit(0)

        # TODO: when not in debug mode, daemonize() and writepid("monitor")
        # (currently disabled).

        # Init stuff.  Watch Threads to see if they die.  Perhaps send email?
        logger.info('Monitor Started')
        startThread(ThreadWatcher(), "Watcher")

        # The meat of it.

        # Nodes to check
        bucket = Queue.Queue()
        # Comon DB of all nodes
        cdb = {}
        # RT DB
        tickets = {}

        # Get RT Tickets.
        # Event based.  Add to queue(bucket) and hosts are queried.
        rt_workers = [rt.RT(tickets, bucket) for _ in range(5)]
        # Kind of a hack. Cleans the DB for stale entries and updates db.
        # Runs cleanTickets of the last RT instance in its own thread.
        clean = Thread(target=rt_workers[-1].cleanTickets)
        # Poll Comon.  Refreshes Comon data every COSLEEP seconds
        cm1 = comon.Comon(cdb, bucket)

        # Start Threads
        for index, worker in enumerate(rt_workers):
                startThread(worker, "rt%d" % (index + 1))
        startThread(clean, "cleanrt5")
        # The comon thread needs its own name: registering it as "rt5" would
        # replace the fifth RT thread in runningthreads, leaving that thread
        # unwatched.
        startThread(cm1, "cm1")
        time.sleep(10)

        # Actually digest the info and do something with it.
        pol = policy.Policy(cm1, tickets)

        # Hold the policy thread back until the bucket queue has been drained.
        while not bucket.empty():
                time.sleep(3)

        startThread(pol, "policy")
        time.sleep(3600)

        logger.info('Monitor Exitted')
        # TODO: when not in debug mode, removepid("monitor") (currently disabled).
        # os._exit ends the process at once, including the threads started
        # above that are still running.
        os._exit(0)
197         
# Script entry point: run the monitor; on Ctrl-C report the interruption on
# standard output and in the log, then terminate the whole process.
if __name__ == '__main__':
        try:
                main()
        except KeyboardInterrupt:
                sys.stdout.write("Killed.  Exitting." + "\n")
                # os._exit() skips the normal interpreter shutdown and does
                # not flush stdio buffers, so flush explicitly or the message
                # can be lost when stdout is not a terminal.
                sys.stdout.flush()
                logger.info('Monitor Killed')
                os._exit(0)