Increase threshold to a week for slice creation, 2 weeks for suspension.
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.11 2007/04/06 16:16:54 faiyaza Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import reboot
20 import config
21
# Pickle file holding the record of which nodes were emailed, when and why.
DAT = "./monitor.dat"

logger = logging.getLogger("monitor")

# Policy enforcement interval, in seconds (2 hours).
POLSLEEP = 2 * 60 * 60

# Where to email the summary
SUMTO = "faiyaza@cs.princeton.edu"
# Address templates; the %s is filled with a site loginbase (or a slice name
# for SLICEMAIL).
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Thresholds.  All values are in SECONDS, written as <days> * SPERDAY.
SPERDAY = 24 * 60 * 60
# Age of the last action after which slice creation may be removed (7 days).
PITHRESH = SPERDAY * 7
# Age of the last action after which slices may be suspended (14 days).
SLICETHRESH = SPERDAY * 14
# Time to wait before attempting a reinstall ("rins") again (5 days).
RINSTHRESH = SPERDAY * 5

# Minimum number of nodes up before squeezing
MINUP = 2
45
46 # IF:
47 #  no SSH, down.
48 #  bad disk, down
49 #  DNS, kinda down (sick)
50 #  clock, kinda down (sick)
51 #  Full disk, going to be down
52
53 # Actions:
54 #  Email
55 #  suspend slice creation
56 #  kill slices
class Policy(Thread):
        """
        Policy engine thread.

        Repeatedly takes one sick node off the `sickNoTicket` queue and decides
        what to do about it: email the site contacts, set the node to
        reinstall and reboot it, remove slice creation from the site, or
        suspend the site's slices.  Every action taken is recorded in
        `emailed` and persisted to DAT after each pass.
        """

        def __init__(self, comonthread, sickNoTicket, emailed):
                """
                comonthread  -- object exposing the CoMon buckets as attributes
                                (read here: comonbkts, filerw, dbg, clock_drift).
                sickNoTicket -- queue of sick nodes w/o tickets; only
                                .get(block = True) is used.
                emailed      -- dict: host -> (type of action, time of action).
                """
                self.cmn = comonthread
                # host -> (type of email, time of email)
                self.emailed = emailed
                # all sick nodes w/o tickets
                self.sickNoTicket = sickNoTicket
                # Sites we've squeezed: loginbase -> (time, type of squeeze)
                self.squeezed = {}
                Thread.__init__(self)

        def __actOnDebug(self, node):
                """
                What to do when node is in dbg (as reported by CoMon).

                Sets the node's boot state to "rins" and reboots it, unless the
                last recorded action for the node was a "dbg" one taken no more
                than RINSTHRESH seconds ago.

                Returns whatever reboot.reboot(node) returns, or None when
                nothing was done.
                """
                # NOTE(review): a node with no entry in self.emailed is not
                # touched here; the caller then falls through to the normal
                # bucket email.  This mirrors the original nesting -- confirm
                # it is intended and not an indentation slip.
                if node not in self.emailed:
                        return

                # Check to see if we've done this before
                if self.emailed[node][0] == "dbg":
                        delta = time.time() - self.emailed[node][1]
                        if delta <= RINSTHRESH:
                                # Don't mess with node if under Thresh.
                                # Return, move on.
                                logger.info("POLICY:  %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
                                return

                logger.info("POLICY:  Node in dbg - " + node)
                plc.nodeBootState(node, "rins")
                # If it has a PCU
                return reboot.reboot(node)

        def __actOnFilerw(self, node):
                """
                What to do when node is in the filerw bucket (as reported by
                CoMon): email PLC and record the action.
                """
                target = [PLCEMAIL]
                logger.info("POLICY:  Emailing PLC for " + node)
                tmp = emailTxt.mailtxt.filerw
                sbj = tmp[0] % {'hostname': node}
                msg = tmp[1] % {'hostname': node}
                mailer.email(sbj, msg, target)
                self.emailed[node] = ("filerw", time.time())

        def __sliceAddresses(self, loginbase):
                """
                Returns the list of slice email addresses for the given site.
                """
                return [SLICEMAIL % name for name in plc.slices([loginbase])]

        def __squeezeSite(self, node, loginbase, target, kind):
                """
                Applies a squeeze to a site, emails `target`, and records it.

                kind -- "creation" (remove slice creation) or "freeze"
                        (suspend slices).
                """
                if kind == "creation":
                        logger.info("POLICY:  Removing slice creation from %s" % loginbase)
                        tmp = emailTxt.mailtxt.removedSliceCreation
                        action = plc.removeSliceCreation
                else:
                        logger.info("POLICY:  Suspending %s slices." % loginbase)
                        tmp = emailTxt.mailtxt.suspendSlices
                        action = plc.suspendSlices

                # Build the message before acting, so a bad template cannot
                # leave a site squeezed without notification.
                sbj = tmp[0]
                msg = tmp[1] % {'loginbase': loginbase}
                action([node])
                mailer.email(sbj, msg, target)

                now = time.time()
                self.squeezed[loginbase] = (now, kind)
                self.emailed[node] = (kind, now)
                logger.info("POLICY: Emailing (%s) %s - %s"\
                        %(kind, node, target))

        def actOnSick(self):
                """
                Acts on one sick node taken from the queue.

                Blocks until a node is available.  Escalation, based on the
                time since the last recorded action for the node:
                  - never acted on:        bucket email to tech contact
                  - less than a day:       nothing
                  - a day or more:         PI is added to the recipients
                  - PITHRESH..SLICETHRESH: remove slice creation if not
                                           enough nodes are up at the site
                  - SLICETHRESH or more:   suspend slices if not enough nodes
                                           are up at the site
                """
                # Grab a node from the queue (pushed by rt thread).
                node = self.sickNoTicket.get(block = True)
                # Get the login base
                loginbase = plc.siteId([node])

                # Princeton Backdoor
                if loginbase == "princeton":
                        return

                # Without a loginbase we don't know where to send a message.
                if not loginbase:
                        logger.info("POLICY:  loginbase for %s not found" %node)
                        return

                # If first email, send to Tech
                target = [TECHEMAIL % loginbase]

                # If disk is foobarred, PLC should check it.
                if (node in self.cmn.filerw) and (node not in self.emailed):
                        self.__actOnFilerw(node)
                        return

                # If in dbg, set to rins, then reboot.  A true result from
                # __actOnDebug ends the pass; otherwise fall through and email.
                if node in self.cmn.dbg:
                        if self.__actOnDebug(node):
                                return

                if (node in self.emailed) and \
                (node not in self.cmn.filerw) and \
                (node not in self.cmn.clock_drift):
                        # If we emailed before, how long ago?
                        delta = time.time() - self.emailed[node][1]
                        if delta < SPERDAY:
                                logger.info("POLICY:  already acted on %s today." % node)
                                return

                        logger.info("POLICY:  acted %s on %s days ago" % (node,
                        delta // SPERDAY))

                        # No luck with tech for at least a day: email PI too.
                        # (delta >= SPERDAY is guaranteed by the check above.)
                        target.append(PIEMAIL % loginbase)

                        if PITHRESH <= delta < SLICETHRESH:
                                # Remove slice creation if enough nodes aren't up
                                if not self.enoughUp(loginbase):
                                        target.extend(self.__sliceAddresses(loginbase))
                                        self.__squeezeSite(node, loginbase, target, "creation")
                                        return

                        elif delta >= PITHRESH and delta >= SLICETHRESH:
                                # Email slices at site.  These recipients are
                                # kept for the bucket email below even when
                                # the site is not frozen.
                                target.extend(self.__sliceAddresses(loginbase))
                                # If not enough up, freeze slices and email everyone.
                                if not self.enoughUp(loginbase):
                                        self.__squeezeSite(node, loginbase, target, "freeze")
                                        return

                # Find the bucket the node is in and send appropriate email
                # to appropriate list of people.
                for bkt in self.cmn.comonbkts.keys():
                        if node in getattr(self.cmn, bkt):
                                # Send predefined message for that bucket.
                                logger.info("POLICY: Emailing (%s) %s - %s"\
                                        %(bkt, node, target))
                                tmp = getattr(emailTxt.mailtxt, bkt)
                                sbj = tmp[0] % {'hostname': node}
                                msg = tmp[1] % {'hostname': node}
                                mailer.email(sbj, msg, target)
                                self.emailed[node] = (bkt , time.time())
                                return

        def status(self):
                """
                Emails (to SUMTO) and logs a summary of the nodes acted upon
                and the sites squeezed today (UTC).
                """
                sub = "Monitor Summary"
                # Compare (year, month, day), not just the day of the month,
                # so entries from the same day number of an earlier month are
                # not reported as today's.
                today = time.gmtime(time.time())[:3]

                parts = ["\nThe following nodes were acted upon:  \n\n"]
                for (node, (kind, date)) in self.emailed.items():
                        # Print only things acted on today.
                        if time.gmtime(date)[:3] == today:
                                parts.append("%s\t(%s)\t%s\n" %(node, kind, time.ctime(date)))

                parts.append("\n\nThe following sites have been 'squeezed':\n\n")
                for (loginbase, (date, kind)) in self.squeezed.items():
                        # Print only things acted on today.
                        if time.gmtime(date)[:3] == today:
                                parts.append("%s\t(%s)\t%s\n" %(loginbase, kind, time.ctime(date)))

                msg = "".join(parts)
                mailer.email(sub, msg, [SUMTO])
                logger.info(msg)
                return

        def emailedStore(self, action):
                """
                Store/Load state of emails.  When, where, what.

                action -- "LOAD" merges the pickled dict in DAT into
                          self.emailed; "WRITE" pickles self.emailed to DAT.
                          Anything else is logged and ignored.

                Never raises: any problem is logged and swallowed so the policy
                thread keeps running.
                """
                try:
                        if action == "LOAD":
                                # SECURITY: pickle.load executes whatever the
                                # file describes; DAT must only ever be written
                                # by this program.
                                with open(DAT, "rb") as f:
                                        logger.info("POLICY:  Found and reading " + DAT)
                                        self.emailed.update(pickle.load(f))
                        elif action == "WRITE":
                                with open(DAT, "wb") as f:
                                        pickle.dump(self.emailed, f)
                        else:
                                logger.info("POLICY:  Unknown emailedStore action %s" % action)
                except Exception as err:
                        logger.info("POLICY:  Problem with DAT, %s" %err)

        def enoughUp(self, loginbase):
                """
                Returns True if at least MINUP nodes are up at a site.

                A node counts as up when it is in none of the CoMon buckets.
                Returns False when the site has no nodes in the db, so callers
                treat such a site as not having enough nodes up.
                """
                allsitenodes = plc.getSiteNodes([loginbase])
                if not allsitenodes:
                        logger.info("Node not in db")
                        return False

                # Get all sick nodes from comon
                sicknodes = set()
                for bucket in self.cmn.comonbkts.keys():
                        sicknodes.update(getattr(self.cmn, bucket))

                # Diff.
                numnodes = len([host for host in allsitenodes if host not in sicknodes])

                if numnodes < MINUP:
                        logger.info(\
"POLICY:  site with %s has nodes %s up." %(loginbase, numnodes))
                        return False
                return True

        def run(self):
                """
                Thread body: act on sick nodes forever, saving state after
                each one.
                """
                while True:
                        self.actOnSick()
                        self.emailedStore("WRITE")
280
281
def main():
        """
        Debug entry point: logs to the console, prints the slices of the site
        that owns alice.cs.princeton.edu, then exits the process.
        """
        # Imported here so main() does not depend on the `import os` that is
        # only executed inside the `if __name__ == '__main__'` guard.
        import os

        logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)

        #print NodesDebug()
        #tmp = Queue.Queue()
        #a = Policy(None, tmp) 
        #a.emailedStore("LOAD")
        #print a.emailed

        print(plc.slices([plc.siteId(["alice.cs.princeton.edu"])]))
        # os._exit ends the process immediately.
        os._exit(0)
# Script entry point: run main() and exit quietly on Ctrl-C.
if __name__ == '__main__':
        import os
        import plc
        try:
                main()
        except KeyboardInterrupt:
                # Tell the operator, leave a trace in the log, then hard-exit.
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                os._exit(0)