Uses CoMon's ability to find 'upness' to email. Changed queueing between threads...
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import xml, xmlrpclib
18 import Queue
19
# Path of the pickle file in which Policy.emailedStore persists the record
# of emails already sent.
DAT = "./monitor.dat"

# Shared "monitor" logger used by everything in this module.
logger = logging.getLogger("monitor")

# Time to enforce policy.
# NOTE(review): presumably an interval in seconds between policy passes;
# nothing in this file reads it -- confirm against the caller.
POLSLEEP = 7200

# Days between emails (enforce 'squeeze' after this time).
SQUEEZE = 3
29 # IF:
30 #  no SSH, down.
31 #  bad disk, down
32 #  DNS, kinda down (sick)
33 #  clock, kinda down (sick)
34 #  Full disk, going to be down
35
36 # Actions:
37 #  Email
38 #  suspend slice creation
39 #  kill slices
40 class Policy(Thread):
41         def __init__(self, comonthread, sickNoTicket, emailed):
42                 self.cmn = comonthread
43                 # host - > (time of email, type of email)
44                 self.emailed = emailed 
45                 # all sick nodes w/o tickets
46                 self.sickNoTicket = sickNoTicket 
47                 Thread.__init__(self)
48         
49         #def getAllSick(self):
50         #       for bucket in self.cmn.comonbkts.keys():
51         #               for host in getattr(self.cmn, bucket):
52         #                       if host not in self.cursickw.keys():
53         #                               self.cursick.put(host)
54
55         '''
56         Acts on sick nodes
57         '''
58         def emailsick(self):
59                 # Get list of nodes in debug from PLC
60                 #dbgNodes = NodesDebug()
61
62                 node = self.sickNoTicket.get(block = True)
63                 # Get the login base    
64                 id = mailer.siteId(node)
65
66                 if not id: 
67                         logger.info("loginbase for %s not found" %node)
68                 elif node not in self.emailed.keys():
69                         # Email about Down.
70                         if node in self.cmn.down:
71                                 logger.debug("POLICY: Emailing (down) " + node)
72                                 self.emailed[node] = ("down", time.localtime())
73                                 msg = emailTxt.mailtxt.DOWN \
74                                         % {'hostname': node}
75                                 mailer.email(node + " down", msg, 
76                                 "tech-" + id + "@sites.planet-lab.org")
77                                 return  
78
79                         # Email about no SSH.
80                         if node in self.cmn.ssh:
81                                 logger.debug("POLICY: Emailing (ssh) " + node)
82                                 self.emailed[node] = ("ssh", time.localtime())
83                                 msg = emailTxt.mailtxt.SSH \
84                                         % {'hostname': node}
85                                 mailer.email(node + " down", msg, 
86                                 "tech-" + id + "@sites.planet-lab.org")
87                                 return 
88
89                         # Email about DNS
90                         if node in self.cmn.dns:
91                                 logger.debug("POLICY: Emailing (dns)" + node)
92                                 self.emailed[node] = ("dns", time.localtime())
93                                 msg = emailTxt.mailtxt.DNS \
94                                         % {'hostname': node}
95                                 mailer.email("Please update DNS used by " \
96                                 + node, msg, 
97                                 "tech-" + id + "@sites.planet-lab.org")
98                                 return 
99         
100
101         '''
102         Prints, logs, and emails status of up nodes, down nodes, and buckets.
103         '''
104         def status(self):
105                 return 0
106
107         '''
108         Store/Load state of emails.  When, where, what.
109         '''
110         def emailedStore(self, action):
111                 try:
112                         if action == "LOAD":
113                                 f = open(DAT, "r+")
114                                 logger.info("Found and reading " + DAT)
115                                 self.emailed.update(pickle.load(f))
116                         if action == "WRITE":
117                                 f = open(DAT, "w")
118                                 logger.info("Writing " + DAT)
119                                 pickle.dump(self.emailed, f)
120                         f.close()
121                 except Exception, err:
122                         logger.info("Problem with DAT, %s" %err)
123
124         def run(self):
125                 while 1:
126                         self.emailsick()
127
128 '''
129 Returns list of nodes in dbg as reported by PLC
130 '''
def NodesDebug():
    """Return the hostnames of nodes whose PLC boot_state is 'dbg'.

    Queries the XML-RPC server named by the module-level XMLRPC_SERVER
    using anonymous authentication, and logs how many nodes matched.
    XMLRPC_SERVER is only assigned when this file is run as a script.
    """
    plc = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
    auth = {'AuthMethod': "anonymous"}
    records = plc.AnonAdmGetNodes(auth, [], ['hostname', 'boot_state'])
    inDebug = [rec['hostname'] for rec in records
               if rec['boot_state'] == 'dbg']
    logger.info("%s nodes in debug according to PLC." % len(inDebug))
    return inDebug
140
141
142
143
def main():
    """Debug entry point: load the stored email state, print it, and exit.

    Sends the "monitor" logger's output (DEBUG and above, message text
    only) to a stream handler, builds a Policy that is never started,
    loads the pickled email record into it, prints the record, and
    terminates the process with status 0.
    """
    # Local import: at module level `os` is only bound inside the
    # __main__ guard, so importers calling main() would hit a NameError.
    import os

    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    #print NodesDebug()
    tmp = Queue.Queue()
    # Policy takes three arguments; start from an empty `emailed` mapping
    # and let emailedStore("LOAD") fill it in.
    a = Policy(None, tmp, {})
    a.emailedStore("LOAD")
    print(a.emailed)

    os._exit(0)
if __name__ == '__main__':
    # Script-only setup: both names below become module globals, and
    # NodesDebug() reads XMLRPC_SERVER from here.
    XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'
    import os

    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: report on stdout and in the log, then exit immediately.
        print("Killed.  Exitting.")
        logger.info('Monitor Killed')
        os._exit(0)