2 # Copyright (c) 2004 The Trustees of Princeton University (Trustees).
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 # $Id: policy.py,v 1.9 2007/01/17 19:46:40 faiyaza Exp $
10 #from monitor import *
11 from threading import *
# Module-level logger used by every "POLICY:" log line in this file.
# The name "monitor" is kept as-is so existing logging configuration for
# that name keeps applying.
logger = logging.getLogger("monitor")
26 # Time to enforce policy
# Where to email the monitor summary.
SUMTO = "faiyaza@cs.princeton.edu"
# Address templates.  "%s" is filled with a site's loginbase for the tech
# and PI lists, and with a slice name for the slice list.
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
# PLC support address.
PLCEMAIL = "support@planet-lab.org"
# Escalation thresholds.  All of them are in SECONDS (multiples of SPERDAY)
# because they are compared against differences of time.time() values.
#
# From PITHRESH up to SLICETHRESH since the last recorded action on a node,
# slice creation may be removed from the node's site.
PITHRESH = 2 * SPERDAY
# Past SLICETHRESH, the site's slices may be suspended.
SLICETHRESH = 5 * SPERDAY
# Minimum time before a node in dbg is set to "rins" and rebooted again.
RINSTHRESH = 5 * SPERDAY
43 # Minimum number of nodes up before squeezing
49 # DNS, kinda down (sick)
50 # clock, kinda down (sick)
51 # Full disk, going to be down
55 # suspend slice creation
def __init__(self, comonthread, sickNoTicket, emailed):
    """Store the data sources and the saved state the policy works on.

    comonthread  -- CoMon thread object; its bucket attributes (dbg,
                    filerw, clock_drift, ...) are consulted to classify
                    nodes.
    sickNoTicket -- queue of sick nodes without tickets; nodes are taken
                    from it with get(block=True).
    emailed      -- dict mapping hostname -> (type of action, time of
                    action); it is updated in place as actions are taken.
    """
    self.cmn = comonthread
    # host -> (type of email, time of email).  The type comes first: the
    # rest of the class reads the timestamp from index 1.
    self.emailed = emailed
    # All sick nodes w/o tickets.
    self.sickNoTicket = sickNoTicket
    # Sites we've squeezed: loginbase -> (time, kind of squeeze).
    # Initialised here because other methods assign into it and iterate
    # over it without creating it first.
    self.squeezed = {}
    # NOTE(review): the class header is not visible in this view.  If the
    # class subclasses threading.Thread (the file does
    # `from threading import *`), the base-class initialiser must also be
    # called here -- confirm against the full file.
70 What to do when node is in dbg (as reported by CoMon).
def __actOnDebug(self, node):
    """Handle a node that is in dbg (as reported by CoMon).

    Unless the node's last recorded action was a "dbg" action taken no
    more than RINSTHRESH seconds ago, set its boot state to "rins" and
    reboot it.

    node -- hostname of the node.

    Returns the result of reboot.reboot(node) when the node was acted on,
    or None when the node was left alone because of the threshold.
    """
    # Check to see if we've done this before.
    if node in self.emailed and self.emailed[node][0] == "dbg":
        delta = time.time() - self.emailed[node][1]
        if delta <= RINSTHRESH:
            # Don't mess with node if under thresh.
            logger.info("POLICY: %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
            # NOTE(review): the early exit is reconstructed from the
            # comment and log message above; without it a node under the
            # threshold would still be reinstalled and rebooted.
            return None
    logger.info("POLICY: Node in dbg - " + node)
    plc.nodeBootState([node, "rins"])
    return reboot.reboot(node)
88 What to do when node is in filerw (as reported by CoMon).
def __actOnFilerw(self, node):
    """Email PLC about a node in the filerw bucket and record the action.

    node -- hostname of the node.

    The subject and body come from emailTxt.mailtxt.filerw, with
    %(hostname)s filled in.  The action is recorded in self.emailed as
    ("filerw", <current time>).
    """
    # NOTE(review): `target` was used but never assigned in the visible
    # method.  PLCEMAIL is used because the log line below says the mail
    # goes to PLC -- confirm against the full file.
    target = [PLCEMAIL]
    logger.info("POLICY: Emailing PLC for " + node)
    tmp = emailTxt.mailtxt.filerw
    fields = {'hostname': node}
    sbj = tmp[0] % fields
    msg = tmp[1] % fields
    mailer.email(sbj, msg, target)
    self.emailed[node] = ("filerw", time.time())
104 # Get list of nodes in debug from PLC
105 #dbgNodes = NodesDebug()
106 global TECHEMAIL, PIEMAIL
107 # Grab a node from the queue (pushed by rt thread).
108 node = self.sickNoTicket.get(block = True)
110 loginbase = plc.siteId([node])
113 if loginbase == "princeton": return
115 # Send appropriate message for node if in appropriate bucket.
116 # If we know where to send a message
118 logger.info("POLICY: loginbase for %s not found" %node)
119 # And we didn't email already.
121 # If first email, send to Tech
122 target = [TECHEMAIL % loginbase]
124 # If disk is foobarred, PLC should check it.
125 if (node in self.cmn.filerw) and \
126 (node not in self.emailed.keys()):
127 self.__actOnFilerw(node)
130 # If in dbg, set to rins, then reboot. Inform PLC.
131 if (node in self.cmn.dbg):
132 # If reboot failure via PCU, POD and send email
133 # if contacted PCU, return
134 if self.__actOnDebug(node): return
136 if (node in self.emailed.keys()) and \
137 (node not in self.cmn.filerw) and \
138 (node not in self.cmn.clock_drift):
139 # If we emailed before, how long ago?
140 delta = time.time() - self.emailed[node][1]
142 logger.info("POLICY: already acted on %s today." % node)
145 logger.info("POLICY: acted %s on %s days ago" % (node,
148 # If no luck with tech, email PI
150 target.append(PIEMAIL % loginbase)
152 # If more than PI thresh, but less than slicethresh
153 if (delta >= PITHRESH) and (delta < SLICETHRESH):
154 #remove slice creation if enough nodes arent up
155 if not self.enoughUp(loginbase):
156 slices = plc.slices([loginbase])
159 target.append(SLICEMAIL % slice)
160 logger.info("POLICY: Removing slice creation from %s" % loginbase)
161 tmp = emailTxt.mailtxt.removedSliceCreation
163 msg = tmp[1] % {'loginbase': loginbase}
164 plc.removeSliceCreation([node])
165 mailer.email(sbj, msg, target)
166 self.squeezed[loginbase] = (time.time(), "creation")
167 self.emailed[node] = ("creation", time.time())
168 logger.info("POLICY: Emailing (%s) %s - %s"\
169 %("creation", node, target))
172 # If more than PI thresh and slicethresh
173 if (delta >= PITHRESH) and (delta > SLICETHRESH):
174 target.append(PIEMAIL % loginbase)
175 # Email slices at site.
176 slices = plc.slices([loginbase])
179 target.append(SLICEMAIL % slice)
180 # If not enough up, freeze slices and email everyone.
181 if not self.enoughUp(loginbase):
182 logger.info("POLICY: Suspending %s slices." % loginbase)
183 tmp = emailTxt.mailtxt.suspendSlices
185 msg = tmp[1] % {'loginbase': loginbase}
186 plc.suspendSlices([node])
187 self.squeezed[loginbase] = (time.time(), "freeze")
188 mailer.email(sbj, msg, target)
189 self.emailed[node] = ("freeze", time.time())
190 logger.info("POLICY: Emailing (%s) %s - %s"\
191 %("freeze", node, target))
195 # Find the bucket the node is in and send appropriate email
196 # to the appropriate list of people.
197 for bkt in self.cmn.comonbkts.keys():
198 if (node in getattr(self.cmn, bkt)):
199 # Send predefined message for that bucket.
200 logger.info("POLICY: Emailing (%s) %s - %s"\
201 %(bkt, node, target))
202 tmp = getattr(emailTxt.mailtxt, bkt)
203 sbj = tmp[0] % {'hostname': node}
204 msg = tmp[1] % {'hostname': node}
205 mailer.email(sbj, msg, target)
206 self.emailed[node] = (bkt , time.time())
211 Prints, logs, and emails status of up nodes, down nodes, and buckets.
214 sub = "Monitor Summary"
215 msg = "\nThe following nodes were acted upon: \n\n"
216 for (node, (type, date)) in self.emailed.items():
217 # Print only things acted on today.
218 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
219 msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
220 msg +="\n\nThe following sites have been 'squeezed':\n\n"
221 for (loginbase, (date, type)) in self.squeezed.items():
222 # Print only things acted on today.
223 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
224 msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
225 mailer.email(sub, msg, [SUMTO])
230 Store/Load state of emails. When, where, what.
def emailedStore(self, action):
    """Store/load the state of emails (self.emailed) in the file DAT.

    action -- "WRITE" pickles self.emailed into DAT; "LOAD" merges the
              dict pickled in DAT into self.emailed.  Any other value
              does nothing.

    Best effort: any failure is logged and swallowed, so a missing or
    unreadable state file never stops the caller.

    NOTE(review): the lines that opened and closed the file, and the
    test for the load action, are not visible in this view.  The "LOAD"
    action name and the binary file modes are reconstructed -- confirm
    against the full file.
    """
    try:
        if action == "LOAD":
            # NOTE(review): pickle.load runs whatever the file encodes;
            # DAT must only ever be written by this program.
            with open(DAT, "rb") as f:
                logger.info("POLICY: Found and reading " + DAT)
                self.emailed.update(pickle.load(f))
        elif action == "WRITE":
            with open(DAT, "wb") as f:
                pickle.dump(self.emailed, f)
    except Exception as err:
        logger.info("POLICY: Problem with DAT, %s" % err)
247 Returns True if more than MINUP nodes are up at a site.
# Works out how many of a site's nodes are up (the figure logged below as
# "nodes ... up"), using the CoMon buckets to decide which nodes are sick.
#   loginbase -- site identifier, passed to plc.getSiteNodes.
# NOTE(review): several statements of this method are not visible in this
# view: the initialisation of `sicknodes`, what happens after the
# "Node not in db" log, the body of the `if node in sicknodes` branch, the
# opening of the final logging call, and the return statement.  Confirm
# them against the full file before relying on this description.
249 def enoughUp(self, loginbase):
# All nodes PLC knows for this site.
250 allsitenodes = plc.getSiteNodes([loginbase])
# An empty answer is logged as the site/node being unknown to the db.
251 if len(allsitenodes) == 0:
252 logger.info("Node not in db")
# Start from the total number of nodes at the site.
255 numnodes = len(allsitenodes)
257 # Get all sick nodes from comon
# Every host found in any bucket named by self.cmn.comonbkts is collected;
# each bucket is read as an attribute of the CoMon thread object.
258 for bucket in self.cmn.comonbkts.keys():
259 for host in getattr(self.cmn, bucket):
260 sicknodes.append(host)
# Check each of the site's nodes against the collected sick hosts.
262 for node in allsitenodes:
263 if node in sicknodes:
# Tail of a logging call that reports the resulting count for the site.
268 "POLICY: site with %s has nodes %s up." %(loginbase, numnodes))
279 self.emailedStore("WRITE")
283 logger.setLevel(logging.DEBUG)
284 ch = logging.StreamHandler()
285 ch.setLevel(logging.DEBUG)
286 formatter = logging.Formatter('%(message)s')
287 ch.setFormatter(formatter)
288 logger.addHandler(ch)
292 #a = Policy(None, tmp)
293 #a.emailedStore("LOAD")
296 print plc.slices([plc.siteId(["alice.cs.princeton.edu"])])
# Script entry point: only runs when the file is executed directly.
# NOTE(review): the statements between this guard and the `except` clause
# (the matching `try:` and the call it protects) are not visible in this
# view -- confirm against the full file.
298 if __name__ == '__main__':
# Ctrl-C: report the shutdown on stdout and in the monitor log.
303 except KeyboardInterrupt:
304 print "Killed. Exitting."
305 logger.info('Monitor Killed')