2 # Copyright (c) 2004 The Trustees of Princeton University (Trustees).
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 # $Id: policy.py,v 1.7 2007/01/11 21:39:07 faiyaza Exp $
10 #from monitor import *
11 from threading import *
24 logger = logging.getLogger("monitor")
26 # Time to enforce policy
29 # Where to email the summary
30 SUMTO = "faiyaza@cs.princeton.edu"
31 TECHEMAIL="tech-%s@sites.planet-lab.org"
32 PIEMAIL="pi-%s@sites.planet-lab.org"
33 SLICEMAIL="%s@slices.planet-lab.org"
34 PLCEMAIL="support@planet-lab.org"
38 PITHRESH = 2 * SPERDAY
39 SLICETHRESH = 5 * SPERDAY
40 # Time to wait before attempting rins again (5 days, expressed in seconds)
41 RINSTHRESH = 5 * SPERDAY
43 # Minimum number of nodes up before squeezing
49 # DNS, kinda down (sick)
50 # clock, kinda down (sick)
51 # Full disk, going to be down
55 # suspend slice creation
58 def __init__(self, comonthread, sickNoTicket, emailed):
59 self.cmn = comonthread
60 # host -> (type of email, time of email)
61 self.emailed = emailed
62 # all sick nodes w/o tickets
63 self.sickNoTicket = sickNoTicket
64 # Sites we've squeezed.
70 What to do when node is in dbg (as reported by CoMon).
72 def __actOnDebug(self, node):
73 # Check to see if we've done this before
74 if (node in self.emailed.keys()):
75 if (self.emailed[node][0] == "dbg"):
76 delta = time.time() - self.emailed[node][1]
77 if (delta <= RINSTHRESH ):
78 # Don't mess with node if under Thresh.
80 logger.info("POLICY: %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
82 logger.info("POLICY: Node in dbg - " + node)
83 plc.nodeBootState(node, "rins")
85 return reboot.reboot(node)
88 What to do when node is in filerw (as reported by CoMon): email PLC.
90 def __actOnFilerw(self, node):
92 logger.info("POLICY: Emailing PLC for " + node)
93 tmp = emailTxt.mailtxt.filerw
94 sbj = tmp[0] % {'hostname': node}
95 msg = tmp[1] % {'hostname': node}
96 mailer.email(sbj, msg, target)
97 self.emailed[node] = ("filerw", time.time())
104 # Get list of nodes in debug from PLC
105 #dbgNodes = NodesDebug()
106 global TECHEMAIL, PIEMAIL
107 # Grab a node from the queue (pushed by rt thread).
108 node = self.sickNoTicket.get(block = True)
110 loginbase = plc.siteId(node)
113 if loginbase == "princeton": return
115 # Send appropriate message for node if in appropriate bucket.
116 # If we know where to send a message
118 logger.info("POLICY: loginbase for %s not found" %node)
119 # And we didn't email already.
121 # If first email, send to Tech
122 target = [TECHEMAIL % loginbase]
124 # If disk is foobarred, PLC should check it.
125 if (node in self.cmn.filerw) and \
126 (node not in self.emailed.keys()):
127 self.__actOnFilerw(node)
130 # If in dbg, set to rins, then reboot. Inform PLC.
131 if (node in self.cmn.dbg):
132 # If reboot failure via PCU, POD and send email
133 # if contacted PCU, return
134 if self.__actOnDebug(node): return
136 if (node in self.emailed.keys()) and \
137 (node not in self.cmn.filerw) and \
138 (node not in self.cmn.clock_drift):
139 # If we emailed before, how long ago?
140 delta = time.time() - self.emailed[node][1]
142 logger.info("POLICY: already acted on %s today." % node)
145 logger.info("POLICY: acted %s on %s days ago" % (node,
148 # If no luck with tech, email PI
150 target.append(PIEMAIL % loginbase)
152 # If more than PI thresh, but less than slicethresh
153 if (delta >= PITHRESH) and (delta < SLICETHRESH):
154 # Remove slice creation if enough nodes aren't up
155 if not self.enoughUp(loginbase):
156 slices = plc.slices(loginbase)
159 target.append(SLICEMAIL % slice)
160 logger.info("POLICY: Removing slice creation from %s" % loginbase)
161 tmp = emailTxt.mailtxt.removedSliceCreation
163 msg = tmp[1] % {'loginbase': loginbase}
164 plc.removeSliceCreation(node)
165 mailer.email(sbj, msg, target)
166 self.squeezed[loginbase] = (time.time(), "creation")
167 self.emailed[node] = ("creation", time.time())
170 # If more than PI thresh and slicethresh
171 if (delta >= PITHRESH) and (delta > SLICETHRESH):
172 target.append(PIEMAIL % loginbase)
173 # Email slices at site.
174 slices = plc.slices(loginbase)
177 target.append(SLICEMAIL % slice)
178 # If not enough up, freeze slices and email everyone.
179 if not self.enoughUp(loginbase):
180 logger.info("POLICY: Suspending %s slices." % loginbase)
181 tmp = emailTxt.mailtxt.suspendSlices
183 msg = tmp[1] % {'loginbase': loginbase}
184 plc.suspendSlices(node)
185 self.squeezed[loginbase] = (time.time(), "freeze")
186 mailer.email(sbj, msg, target)
187 self.emailed[node] = ("freeze", time.time())
190 # Find the bucket the node is in and send appropriate email
191 # to appropriate list of people.
192 for bkt in self.cmn.comonbkts.keys():
193 if (node in getattr(self.cmn, bkt)):
194 # Send predefined message for that bucket.
195 logger.info("POLICY: Emailing (%s) %s - %s"\
196 %(bkt, node, target))
197 tmp = getattr(emailTxt.mailtxt, bkt)
198 sbj = tmp[0] % {'hostname': node}
199 msg = tmp[1] % {'hostname': node}
200 mailer.email(sbj, msg, target)
201 self.emailed[node] = (bkt , time.time())
206 Prints, logs, and emails status of up nodes, down nodes, and buckets.
209 sub = "Monitor Summary"
210 msg = "\nThe following nodes were acted upon: \n\n"
211 for (node, (type, date)) in self.emailed.items():
212 # Print only things acted on today.
213 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
214 msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
215 msg +="\n\nThe following sites have been 'squeezed':\n\n"
216 for (loginbase, (date, type)) in self.squeezed.items():
217 # Print only things acted on today.
218 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
219 msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
220 mailer.email(sub, msg, [SUMTO])
225 Store/Load state of emails. When, where, what.
227 def emailedStore(self, action):
231 logger.info("POLICY: Found and reading " + DAT)
232 self.emailed.update(pickle.load(f))
233 if action == "WRITE":
235 #logger.debug("Writing " + DAT)
236 pickle.dump(self.emailed, f)
238 except Exception, err:
239 logger.info("POLICY: Problem with DAT, %s" %err)
242 Returns True if more than MINUP nodes are up at a site.
244 def enoughUp(self, loginbase):
245 allsitenodes = plc.getSiteNodes(loginbase)
246 if len(allsitenodes) == 0:
247 logger.info("Node not in db")
250 numnodes = len(allsitenodes)
252 # Get all sick nodes from comon
253 for bucket in self.cmn.comonbkts.keys():
254 for host in getattr(self.cmn, bucket):
255 sicknodes.append(host)
257 for node in allsitenodes:
258 if node in sicknodes:
263 "POLICY: site with %s has nodes %s up." %(loginbase, numnodes))
274 self.emailedStore("WRITE")
278 logger.setLevel(logging.DEBUG)
279 ch = logging.StreamHandler()
280 ch.setLevel(logging.DEBUG)
281 formatter = logging.Formatter('%(message)s')
282 ch.setFormatter(formatter)
283 logger.addHandler(ch)
287 #a = Policy(None, tmp)
288 #a.emailedStore("LOAD")
291 print plc.slices(plc.siteId("alice.cs.princeton.edu"))
293 if __name__ == '__main__':
298 except KeyboardInterrupt:
299 print "Killed. Exitting."
300 logger.info('Monitor Killed')