Hosed this file on alfred when rins'ing. It's RO DB access so (hopefully) doesn't...
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.4 2006/11/14 19:20:13 faiyaza Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import reboot
20 import config
21
# Pickle file in which Policy.emailedStore() persists the record of
# actions/e-mails already taken per node.
DAT = "./monitor.dat"

logger = logging.getLogger("monitor")

# Time to enforce policy (presumably seconds between passes -- not
# referenced in this part of the file; TODO confirm).
POLSLEEP = 7200

# Where to email the summary
#SUMTO = "pupadm@lists.planet-lab.org"
SUMTO = "faiyaza@cs.princeton.edu"

# Address templates.  TECHEMAIL and PIEMAIL are filled in with a site's
# login base, SLICEMAIL with a slice name.
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Thresholds.  The values are in SECONDS (whole days multiplied by
# SPERDAY) because they are compared against time.time() differences.
SPERDAY = 86400
# Age of the last action after which the site PI is also e-mailed.
PITHRESH = 1 * SPERDAY
# Age of the last action after which the site's slices are e-mailed too.
SLICETHRESH = 5 * SPERDAY
# Days before attempting rins again
RINSTHRESH = 5 * SPERDAY

# Minimum number of nodes up before squeezing
MINUP = 2
46
47 # IF:
48 #  no SSH, down.
49 #  bad disk, down
50 #  DNS, kinda down (sick)
51 #  clock, kinda down (sick)
52 #  Full disk, going to be down
53
54 # Actions:
55 #  Email
56 #  suspend slice creation
57 #  kill slices
class Policy(Thread):
    """Policy-engine thread: pulls sick nodes off a queue and acts on them.

    Depending on which CoMon bucket a node is in and how long ago it was
    last acted on, the engine e-mails the site's tech contact, PI and
    slices, asks PLC to reinstall/reboot the node, and may "squeeze" the
    site (remove slice creation or suspend slices).

    Written to run on Python 2.6+ (uses ``except ... as`` and ``with``).
    """

    def __init__(self, comonthread, sickNoTicket, emailed):
        """Store the collaborators and initialise the thread.

        comonthread  -- object exposing the CoMon buckets as attributes
                        (``filerw``, ``dbg``, ``clock_drift``, ...) and a
                        ``comonbkts`` mapping whose keys are bucket names.
        sickNoTicket -- queue of sick nodes without tickets; consumed with
                        ``get(block=True)``.
        emailed      -- mapping host -> (type of action, time of action).
                        It is mutated in place and persisted by
                        emailedStore().
        """
        self.cmn = comonthread
        # host -> (type of email/action, time of email/action)
        self.emailed = emailed
        # all sick nodes w/o tickets
        self.sickNoTicket = sickNoTicket
        # Sites we've squeezed: loginbase -> (time, "creation" | "freeze")
        self.squeezed = {}
        Thread.__init__(self)

    def __actOnDebug(self, node):
        """Handle a node that is in dbg (as reported by CoMon).

        Sets the node's boot state to "rins" and reboots it, unless the
        last recorded action for the node was a "dbg" one no more than
        RINSTHRESH seconds ago.

        Returns whatever reboot.reboot(node) returns when a reboot was
        attempted (the caller treats a true value as "handled"), otherwise
        None.
        """
        if node not in self.emailed:
            # NOTE(review): a node with no previous record is left alone
            # here and falls through to the e-mail path in actOnSick().
            # This mirrors the original control flow; it may have been an
            # indentation slip -- confirm before changing, since the
            # alternative reinstalls nodes on first sight.
            return None

        if self.emailed[node][0] == "dbg":
            delta = time.time() - self.emailed[node][1]
            if delta <= RINSTHRESH:
                # Don't mess with node if under Thresh.  Return, move on.
                logger.info("POLICY:  %s in dbg, but acted on %s days ago"
                            % (node, delta // SPERDAY))
                return None

        logger.info("POLICY:  Node in dbg - " + node)
        plc.nodeBootState(node, "rins")
        # If it has a PCU
        return reboot.reboot(node)

    def __actOnFilerw(self, node):
        """Handle a node in the filerw bucket: e-mail PLC and record it."""
        target = [PLCEMAIL]
        logger.info("POLICY:  Emailing PLC for " + node)
        subject_tmpl, body_tmpl = emailTxt.mailtxt.filerw[0:2]
        fields = {'hostname': node}
        mailer.email(subject_tmpl % fields, body_tmpl % fields, target)
        self.emailed[node] = ("filerw", time.time())

    def actOnSick(self):
        """Take one sick node from the queue and apply the policy to it.

        Blocks until a node is available.  Steps, in order:
          1. Look up the site's login base; give up if it is unknown.
          2. filerw node never acted on before -> e-mail PLC and stop.
          3. dbg node -> try reinstall/reboot; stop if that reports success.
          4. If the node was acted on before (and is not in filerw or
             clock_drift): do nothing if that was under a day ago;
             otherwise escalate the recipient list (PI, then slices) and
             squeeze the site when too few of its nodes are up.
          5. E-mail the predefined message of the first CoMon bucket the
             node is found in and record the action.
        """
        # Grab a node from the queue (pushed by rt thread).
        node = self.sickNoTicket.get(block=True)
        # Get the login base
        loginbase = plc.siteId(node)

        # Without a login base we do not know where to send anything.
        if not loginbase:
            logger.info("POLICY:  loginbase for %s not found" % node)
            return

        # If first email, send to Tech
        target = [TECHEMAIL % loginbase]

        # If disk is foobarred, PLC should check it.
        if node in self.cmn.filerw and node not in self.emailed:
            self.__actOnFilerw(node)
            return

        # If in dbg, set to rins, then reboot.  If the PCU was contacted
        # we are done; otherwise fall through and send e-mail.
        if node in self.cmn.dbg:
            if self.__actOnDebug(node):
                return

        if (node in self.emailed
                and node not in self.cmn.filerw
                and node not in self.cmn.clock_drift):
            # If we emailed before, how long ago?
            delta = time.time() - self.emailed[node][1]
            if delta < SPERDAY:
                logger.info("POLICY:  already acted on %s today." % node)
                return

            logger.info("POLICY:  acted %s on %s days ago"
                        % (node, delta // SPERDAY))

            if PITHRESH <= delta < SLICETHRESH:
                # Past the PI threshold but not yet the slice threshold.
                target.append(PIEMAIL % loginbase)
                # remove slice creation if enough nodes arent up
                if not self.enoughUp(loginbase):
                    logger.info("POLICY:  Removing slice creation from %s"
                                % loginbase)
                    plc.removeSliceCreation(node)
                    self.squeezed[loginbase] = (time.time(), "creation")
            elif delta >= PITHRESH and delta >= SLICETHRESH:
                # Past both thresholds.  ">=" (the original used ">") so
                # that delta == SLICETHRESH is not silently skipped by
                # both branches.
                slices = plc.slices(loginbase)
                if slices:
                    # Email slices at site.
                    for slicename in slices:
                        target.append(SLICEMAIL % slicename)
                    if not self.enoughUp(loginbase):
                        plc.suspendSlices(node)
                        self.squeezed[loginbase] = (time.time(), "freeze")

        # Find the bucket the node is in and send appropriate email
        # to appropriate list of people.  Only the first match is used.
        for bkt in self.cmn.comonbkts.keys():
            if node in getattr(self.cmn, bkt):
                # Send predefined message for that bucket.
                logger.info("POLICY: Emailing (%s) %s - %s"
                            % (bkt, node, target))
                tmp = getattr(emailTxt.mailtxt, bkt)
                sbj = tmp[0] % {'hostname': node}
                msg = tmp[1] % {'hostname': node}
                mailer.email(sbj, msg, target)
                self.emailed[node] = (bkt, time.time())
                return

    def status(self):
        """E-mail and log a summary of today's actions and squeezed sites.

        "Today" is the current UTC calendar date.  The full
        (year, month, day) triple is compared; the original compared only
        the day of the month, which also matched entries from the same
        day number of earlier months.
        """
        sub = "Monitor Summary"
        today = time.gmtime()[:3]

        parts = ["\nThe following nodes were acted upon:  \n\n"]
        for node, (kind, date) in self.emailed.items():
            # Print only things acted on today.
            if time.gmtime(date)[:3] == today:
                parts.append("%s\t(%s)\t%s\n"
                             % (node, kind, time.ctime(date)))

        parts.append("\n\nThe following sites have been 'squeezed':\n\n")
        for loginbase, (date, kind) in self.squeezed.items():
            # Print only things acted on today.
            if time.gmtime(date)[:3] == today:
                parts.append("%s\t(%s)\t%s\n"
                             % (loginbase, kind, time.ctime(date)))

        msg = "".join(parts)
        mailer.email(sub, msg, [SUMTO])
        logger.info(msg)
        return

    def emailedStore(self, action):
        """Load or store the record of actions taken (self.emailed) in DAT.

        action -- "LOAD" merges the pickled mapping from DAT into
                  self.emailed; "WRITE" pickles self.emailed to DAT.  Any
                  other value is logged and ignored.

        Best effort: every failure (missing file, bad pickle, ...) is
        logged and swallowed so the policy loop keeps running.
        """
        try:
            if action == "LOAD":
                # Binary mode is what pickle expects; read-only is enough.
                # NOTE: unpickling can execute arbitrary code -- DAT must
                # only ever be written by this program.
                with open(DAT, "rb") as f:
                    logger.info("POLICY:  Found and reading " + DAT)
                    self.emailed.update(pickle.load(f))
            elif action == "WRITE":
                with open(DAT, "wb") as f:
                    pickle.dump(self.emailed, f)
            else:
                logger.info("POLICY:  Problem with DAT, unknown action %s"
                            % action)
        except Exception as err:
            logger.info("POLICY:  Problem with DAT, %s" % err)

    def enoughUp(self, loginbase):
        """Return True if at least MINUP of the site's nodes are up.

        A node counts as up when it appears in none of the CoMon buckets.
        Returns False when fewer than MINUP nodes are up, and also when
        the site has no nodes in the database (the original returned a
        bare None there, which callers already treated as false).
        """
        allsitenodes = plc.getSiteNodes(loginbase)
        if not allsitenodes:
            logger.info("Node not in db")
            return False

        # Get all sick nodes from comon
        sicknodes = set()
        for bucket in self.cmn.comonbkts.keys():
            sicknodes.update(getattr(self.cmn, bucket))

        # Site nodes that are in no bucket are considered up.
        numnodes = len([n for n in allsitenodes if n not in sicknodes])

        if numnodes < MINUP:
            logger.info(
                "POLICY:  site with %s has nodes %s up."
                % (loginbase, numnodes))
            return False
        return True

    def run(self):
        """Thread body: act on sick nodes forever, saving state each time."""
        while 1:
            self.actOnSick()
            self.emailedStore("WRITE")
251
252
def main():
    """Manual smoke test: print the slices of one site, then exit.

    Attaches a DEBUG-level stream handler to the module logger, prints
    plc.slices() for the site that owns alice.cs.princeton.edu and ends
    the process with os._exit(0).
    """
    # Imported here because the module only imports os under the __main__
    # guard; main() would otherwise fail when called from another module.
    import os
    import sys

    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    print(plc.slices(plc.siteId("alice.cs.princeton.edu")))
    # os._exit() skips the normal interpreter shutdown, so buffered output
    # has to be flushed by hand or the line above can be lost.
    sys.stdout.flush()
    os._exit(0)
# Script entry point: run main() and exit quietly on Ctrl-C.
if __name__ == '__main__':
    import os
    import sys
    import plc
    try:
        main()
    except KeyboardInterrupt:
        print("Killed.  Exitting.")
        logger.info('Monitor Killed')
        # os._exit() does not flush stdio buffers; do it explicitly so the
        # message above is not lost.
        sys.stdout.flush()
        os._exit(0)