* Emails PI, then Slices if the node does not come up after a certain number of days.
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import xml, xmlrpclib
18 import Queue
19
20 #Hack to auth structure
21 import auth 
# Pickle file in which Policy.emailedStore keeps the record of sent emails.
DAT = "./monitor.dat"

# Shared logger for the monitor.
logger = logging.getLogger("monitor")

# Time to enforce policy (presumably seconds -- not used in this file, confirm
# against the importing module).
POLSLEEP = 7200

# Days between emails (enforce 'squeeze' after this time).
SQUEEZE = 3

# Recipient of the summary email sent by Policy.status.
SUMTO = "faiyaza@cs.princeton.edu"

# Address templates; the %s is filled with a site login base (tech, PI) or a
# slice name (slices).  PLC support is a fixed address.
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Escalation thresholds, compared against the number of days since a node was
# last emailed about: first the PI, then the slices.
PITHRESH = 3
SLICETHRESH = 5
42
43 # IF:
44 #  no SSH, down.
45 #  bad disk, down
46 #  DNS, kinda down (sick)
47 #  clock, kinda down (sick)
48 #  Full disk, going to be down
49
50 # Actions:
51 #  Email
52 #  suspend slice creation
53 #  kill slices
class Policy(Thread):
        """
        Policy engine thread.

        Repeatedly takes one sick node off the sickNoTicket queue, decides who
        should be told about it, sends the predefined email for the bucket the
        node is in, and records the email in self.emailed.  The record is
        written to DAT after every node.

        Escalation for a node that was emailed about before (bad-disk nodes
        excepted): the tech contact is always included; the PI is added once
        PITHRESH days have passed since the recorded email; from SLICETHRESH
        days on, the site's slices are added instead of the PI.
        """

        def __init__(self, comonthread, sickNoTicket, emailed):
                """
                comonthread  -- object exposing `comonbkts` (mapping keyed by
                                bucket name), one attribute per bucket name
                                holding the nodes in that bucket, and the
                                `filerw` and `dbg` node collections.
                sickNoTicket -- queue of sick nodes without a ticket; must
                                support get(block=True).
                emailed      -- dict shared with the caller, see below.
                """
                self.cmn = comonthread
                # host -> (bucket name, time.localtime() of the last email)
                self.emailed = emailed
                # all sick nodes w/o tickets
                self.sickNoTicket = sickNoTicket
                Thread.__init__(self)

        def _daysSince(self, stamp, now=None):
                """
                Return the number of calendar days from `stamp` to `now`.

                Both are time tuples (struct_time or a plain tuple starting
                with year, month, day); `now` defaults to time.localtime().
                Whole dates are subtracted rather than the day-of-month
                fields, so the result stays correct across month and year
                boundaries.
                """
                import datetime
                if now is None:
                        now = time.localtime()
                return (datetime.date(*now[:3]) - datetime.date(*stamp[:3])).days

        def _recipients(self, node, loginbase):
                """
                Return the list of addresses to email about `node`.

                loginbase -- site login base used to fill the address templates.
                """
                badDisk = node in self.cmn.filerw

                if node not in self.emailed:
                        # First email.  If disk is foobarred, PLC should check
                        # it; anything else goes to the site's tech contact.
                        if badDisk:
                                logger.info("Emailing PLC for " + node)
                                return [PLCEMAIL]
                        return [TECHEMAIL % loginbase]

                target = [TECHEMAIL % loginbase]
                if badDisk:
                        # Bad-disk nodes are never escalated to PI or slices.
                        return target

                # NOTE(review): actOnSick overwrites the stored timestamp on
                # every email, so this only grows while the node is not
                # emailed about again -- confirm against whatever feeds
                # sickNoTicket.
                days = self._daysSince(self.emailed[node][1])
                if days >= PITHRESH:
                        if days >= SLICETHRESH:
                                # ">=" so that exactly SLICETHRESH days does
                                # not fall between the PI and slice cases.
                                logger.info("Emailing slices for " + node)
                                # Email slices at site.
                                siteSlices = mailer.slices(loginbase) or []
                                target.extend([SLICEMAIL % name for name in siteSlices])
                        else:
                                logger.info("Emailing PI for " + node)
                                target.append(PIEMAIL % loginbase)
                return target

        def actOnSick(self):
                """
                Acts on one sick node.

                Blocks until a node is available on self.sickNoTicket.  Nodes
                whose login base cannot be found, and nodes in debug state,
                are only logged.  Otherwise the predefined message of the
                first bucket containing the node is sent to the computed
                recipients and the email is recorded in self.emailed.
                """
                node = self.sickNoTicket.get(block=True)
                # Get the login base
                loginbase = mailer.siteId(node)
                if not loginbase:
                        # Without a login base we do not know where to send mail.
                        logger.info("loginbase for %s not found" % node)
                        return

                # Checked before recipients are computed so that no
                # "Emailing ..." line is logged for a node that gets no email.
                if node in self.cmn.dbg:
                        logger.info("Node in dbg - " + node)
                        return

                target = self._recipients(node, loginbase)

                # Find the bucket the node is in and send appropriate email
                # to appropriate list of people.
                for bkt in self.cmn.comonbkts.keys():
                        if node in getattr(self.cmn, bkt):
                                # Send predefined message for that bucket.
                                logger.info("POLICY: Emailing (%s) %s - %s"
                                        % (bkt, node, target))
                                template = getattr(emailTxt.mailtxt, bkt)
                                sbj = template[0] % {'hostname': node}
                                msg = template[1] % {'hostname': node}
                                mailer.email(sbj, msg, target)
                                self.emailed[node] = (bkt, time.localtime())
                                return

        def status(self):
                """
                Emails SUMTO and logs a summary of every node in self.emailed:
                hostname, bucket and the hour:minute:second of the recorded
                email.
                """
                sub = "Monitor Summary"
                rows = ["%s\t(%s)\t%s:%s:%s\n"
                        % (node, bkt, stamp[3], stamp[4], stamp[5])
                        for (node, (bkt, stamp)) in self.emailed.items()]
                msg = "\nThe following nodes were acted upon:  \n\n" + "".join(rows)
                mailer.email(sub, msg, [SUMTO])
                logger.info(msg)

        def emailedStore(self, action):
                """
                Store/Load state of emails.  When, where, what.

                action -- "LOAD" merges the pickled record in DAT into
                          self.emailed; "WRITE" pickles self.emailed to DAT.
                          Any other value is logged and ignored.

                Never raises: every failure is logged at info level.
                """
                import sys
                try:
                        if action == "LOAD":
                                # SECURITY: pickle.load executes whatever the
                                # file tells it to; DAT must only ever be a
                                # file this program wrote itself.
                                f = open(DAT, "rb")
                                try:
                                        logger.info("Found and reading " + DAT)
                                        self.emailed.update(pickle.load(f))
                                finally:
                                        f.close()
                        elif action == "WRITE":
                                f = open(DAT, "wb")
                                try:
                                        logger.debug("Writing " + DAT)
                                        pickle.dump(self.emailed, f)
                                finally:
                                        f.close()
                        else:
                                logger.info("Problem with DAT, unknown action %s" % action)
                except Exception:
                        # sys.exc_info() instead of binding the exception in
                        # the except clause: this spelling parses under both
                        # Python 2 and Python 3.
                        logger.info("Problem with DAT, %s" % sys.exc_info()[1])

        def run(self):
                """Thread body: act on one sick node, save the record, repeat forever."""
                while True:
                        self.actOnSick()
                        self.emailedStore("WRITE")
def NodesDebug(server=None):
        """
        Returns list of nodes in dbg as reported by PLC.

        server -- URL of the PLC XML-RPC API.  Defaults to the module global
                  XMLRPC_SERVER, which this file only sets when it is run as
                  a script; code importing this module should pass the URL
                  explicitly.

        Queries the API anonymously for the hostname and boot_state of every
        node and returns the hostnames whose boot_state is 'dbg'.
        """
        if server is None:
                server = XMLRPC_SERVER
        api = xmlrpclib.Server(server, verbose=False)
        anon = {'AuthMethod': "anonymous"}
        allnodes = api.AnonAdmGetNodes(anon, [], ['hostname', 'boot_state'])
        dbgNodes = [node['hostname'] for node in allnodes
                    if node['boot_state'] == 'dbg']
        logger.info("%s nodes in debug according to PLC." % len(dbgNodes))
        return dbgNodes
178
179
180
181
def main():
        """
        Manual smoke test: send the monitor log to the console, print the
        login base mailer.siteId returns for a sample name, and exit.
        """
        # Imported here so main() does not depend on the `import os` that
        # only happens under the __main__ guard.
        import os
        import sys

        logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(ch)

        # siteId lives in the mailer module; the bare name is not defined here.
        print(mailer.siteId("princetoan"))

        # os._exit skips the normal interpreter shutdown, including flushing
        # stdio buffers, so flush explicitly or the line above can be lost
        # when stdout is not a terminal.
        sys.stdout.flush()
        os._exit(0)
if __name__ == "__main__":
        # Script entry point.  XMLRPC_SERVER and os are module globals that
        # exist only when the file is run directly.
        XMLRPC_SERVER = "https://www.planet-lab.org/PLCAPI/"
        import os
        try:
                main()
        except KeyboardInterrupt:
                # Ctrl-C: say so on the console and in the log, then leave
                # immediately without normal interpreter shutdown.
                print("Killed.  Exitting.")
                logger.info("Monitor Killed")
                os._exit(0)