2 # Copyright (c) 2004 The Trustees of Princeton University (Trustees).
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 # $Id: policy.py,v 1.12 2007/04/06 17:38:14 faiyaza Exp $
10 #from monitor import *
11 from threading import *
24 logger = logging.getLogger("monitor")
26 # Time to enforce policy
29 # Where to email the summary
30 SUMTO = "faiyaza@cs.princeton.edu"
31 TECHEMAIL="tech-%s@sites.planet-lab.org"
32 PIEMAIL="pi-%s@sites.planet-lab.org"
33 SLICEMAIL="%s@slices.planet-lab.org"
34 PLCEMAIL="support@planet-lab.org"
38 PITHRESH = 7 * SPERDAY
39 SLICETHRESH = 7 * SPERDAY
40 # Days before attempting rins again
41 RINSTHRESH = 5 * SPERDAY
43 # Days before calling the node dead.
44 DEADTHRESH = 30 * SPERDAY
45 # Minimum number of nodes up before squeezing
51 # DNS, kinda down (sick)
52 # clock, kinda down (sick)
53 # Full disk, going to be down
57 # suspend slice creation
62 def __init__(self, comonthread, sickNoTicket, emailed):
63 self.cmn = comonthread
64 # host -> (type of email, time of email)
65 self.emailed = emailed
66 # all sick nodes w/o tickets
68 self.sickNoTicket = sickNoTicket
69 # Actions taken on nodes.
70 # actionlogdb{node: [action, daysdown, date]}
72 # Actions taken on sites.
73 # sitelogdb{site: [action, daysdown, date]}
75 # sick nodes with no tickets
76 # sickdb{loginbase: [{hostname1: [buckets]}, {...}]}
81 def accumSickSites(self):
83 Take all sick nodes, find their sites, and put in
84 sickdb{loginbase: [{hostname1: [buckets]}, {...}]}
86 while self.sickNoTicket.empty() == False:
87 node = self.sickNoTicket.get(block = True)
89 for bkt in self.cmn.comonbkts.keys():
90 if (node in getattr(self.cmn, bkt)):
91 bkts.append("%s" % bkt)
92 self.sickdb[plc.siteId(node)] = {node: bkts}
95 def __actOnDebug(self, node):
97 If in debug, set the node to rins, reboot via PCU/POD
99 daysdown = self.cmn.codata[node]['sshstatus'] // (60*60*24)
100 logger.info("POLICY: Node %s in dbg. down for %s" %(node,daysdown))
101 plc.nodeBootState(node, "rins")
105 self.actionlogdb[node] = ['rins', daysdown, time.time()]
108 def __actOnDown(self, node):
110 If down (not debug), do the same as actOnDebug for now
112 self.__actOnDebug(node)
115 def __actOnFilerw(self, node):
117 Report to PLC when node needs disk checked.
120 logger.info("POLICY: Emailing PLC for " + node)
121 tmp = emailTxt.mailtxt.filerw
122 sbj = tmp[0] % {'hostname': node}
123 msg = tmp[1] % {'hostname': node}
124 mailer.email(sbj, msg, target)
125 self.actionlogdb[node] = ["filerw", None, time.time()]
128 def __actOnDNS(self, node):
133 def __policy(self, node, loginbase, bkt):
135 target = [TECHEMAIL % loginbase]
136 tmp = emailTxt.mailtxt.down
137 sbj = tmp[0] % {'hostname': node}
138 msg = tmp[1] % {'hostname': node, 'days': daysdown}
139 mailer.email(sbj, msg, target)
148 global TECHEMAIL, PIEMAIL
151 if loginbase == "princeton": return
153 # Send appropriate message for node if in appropriate bucket.
154 # If we know where to send a message
156 logger.info("POLICY: loginbase for %s not found" %node)
157 # And we didn't email already.
159 # If first email, send to Tech
160 target = [TECHEMAIL % loginbase]
162 # If disk is foobarred, PLC should check it.
163 if (node in self.cmn.filerw) and \
164 (node not in self.emailed.keys()):
165 self.__actOnFilerw(node)
168 # If in dbg, set to rins, then reboot. Inform PLC.
169 if (node in self.cmn.dbg):
170 self.__actOnDebug(node)
172 if (node in self.emailed.keys()) and \
173 (node not in self.cmn.filerw) and \
174 (node not in self.cmn.clock_drift):
175 # If we emailed before, how long ago?
176 delta = time.time() - self.emailed[node][1]
178 logger.info("POLICY: already acted on %s today." % node)
181 logger.info("POLICY: acted %s on %s days ago" % (node,
184 # If no luck with tech, email PI
185 if (delta >= SPERDAY):
186 target.append(PIEMAIL % loginbase)
188 if (delta >= 7 * SPERDAY):
189 # Remove slice creation if enough nodes aren't up
190 if not self.enoughUp(loginbase):
191 slices = plc.slices(loginbase)
194 target.append(SLICEMAIL % slice)
195 logger.info("POLICY: Removing slice creation from %s" % loginbase)
196 tmp = emailTxt.mailtxt.removedSliceCreation
198 msg = tmp[1] % {'loginbase': loginbase}
199 plc.removeSliceCreation(node)
200 mailer.email(sbj, msg, target)
201 self.squeezed[loginbase] = (time.time(), "creation")
202 self.emailed[node] = ("creation", time.time())
203 logger.info("POLICY: Emailing (%s) %s - %s"\
204 %("creation", node, target))
207 if (delta >= 14 * SPERDAY):
208 target.append(PIEMAIL % loginbase)
209 # Email slices at site.
210 slices = plc.slices([loginbase])
213 target.append(SLICEMAIL % slice)
214 # If not enough up, freeze slices and email everyone.
215 if not self.enoughUp(loginbase):
216 logger.info("POLICY: Suspending %s slices." % loginbase)
217 tmp = emailTxt.mailtxt.suspendSlices
219 msg = tmp[1] % {'loginbase': loginbase}
220 plc.suspendSlices([node])
221 self.squeezed[loginbase] = (time.time(), "freeze")
222 mailer.email(sbj, msg, target)
223 self.emailed[node] = ("freeze", time.time())
224 logger.info("POLICY: Emailing (%s) %s - %s"\
225 %("freeze", node, target))
229 # Find the bucket the node is in and send appropriate email
230 # to the appropriate list of people.
231 for bkt in self.cmn.comonbkts.keys():
232 if (node in getattr(self.cmn, bkt)):
233 # Send predefined message for that bucket.
234 logger.info("POLICY: Emailing (%s) %s - %s"\
235 %(bkt, node, target))
236 tmp = getattr(emailTxt.mailtxt, bkt)
237 sbj = tmp[0] % {'hostname': node}
238 msg = tmp[1] % {'hostname': node}
239 mailer.email(sbj, msg, target)
240 self.emailed[node] = (bkt , time.time())
245 Prints, logs, and emails status of up nodes, down nodes, and buckets.
248 sub = "Monitor Summary"
249 msg = "\nThe following nodes were acted upon: \n\n"
250 for (node, (type, date)) in self.emailed.items():
251 # Print only things acted on today.
252 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
253 msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
254 msg +="\n\nThe following sites have been 'squeezed':\n\n"
255 for (loginbase, (date, type)) in self.squeezed.items():
256 # Print only things acted on today.
257 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
258 msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
259 mailer.email(sub, msg, [SUMTO])
264 Store/Load state of emails. When, where, what.
266 def emailedStore(self, action):
270 logger.info("POLICY: Found and reading " + DAT)
271 self.emailed.update(pickle.load(f))
272 if action == "WRITE":
274 #logger.debug("Writing " + DAT)
275 pickle.dump(self.emailed, f)
277 except Exception, err:
278 logger.info("POLICY: Problem with DAT, %s" %err)
281 Returns True if more than MINUP nodes are up at a site.
283 def enoughUp(self, loginbase):
284 allsitenodes = plc.getSiteNodes([loginbase])
285 if len(allsitenodes) == 0:
286 logger.info("Node not in db")
289 numnodes = len(allsitenodes)
291 # Get all sick nodes from comon
292 for bucket in self.cmn.comonbkts.keys():
293 for host in getattr(self.cmn, bucket):
294 sicknodes.append(host)
296 for node in allsitenodes:
297 if node in sicknodes:
302 "POLICY: site with %s has nodes %s up." %(loginbase, numnodes))
311 self.accumSickSites()
313 #self.emailedStore("WRITE")
319 logger.setLevel(logging.DEBUG)
320 ch = logging.StreamHandler()
321 ch.setLevel(logging.DEBUG)
322 formatter = logging.Formatter('%(message)s')
323 ch.setFormatter(formatter)
324 logger.addHandler(ch)
328 #a = Policy(None, tmp)
329 #a.emailedStore("LOAD")
332 #print plc.slices([plc.siteId(["alice.cs.princeton.edu"])])
334 if __name__ == '__main__':
339 except KeyboardInterrupt:
340 print "Killed. Exitting."
341 logger.info('Monitor Killed')