2 # Copyright (c) 2004 The Trustees of Princeton University (Trustees).
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 # $Id: policy.py,v 1.4 2006/11/14 19:20:13 faiyaza Exp $
10 #from monitor import *
11 from threading import *
# Logger shared by the policy code; the "POLICY: ..." messages are written through it.
24 logger = logging.getLogger("monitor")
26 # Time to enforce policy
29 # Where to email the summary
30 #SUMTO = "pupadm@lists.planet-lab.org"
31 SUMTO = "faiyaza@cs.princeton.edu"
# Address templates. The %s is filled with a site loginbase for TECHEMAIL and
# PIEMAIL, and with an entry from plc.slices(loginbase) for SLICEMAIL.
32 TECHEMAIL="tech-%s@sites.planet-lab.org"
33 PIEMAIL="pi-%s@sites.planet-lab.org"
34 SLICEMAIL="%s@slices.planet-lab.org"
# NOTE(review): PLCEMAIL is not referenced by any line visible in this listing.
35 PLCEMAIL="support@planet-lab.org"
# Escalation thresholds, compared against differences of time.time() values.
# NOTE(review): SPERDAY is not defined in the visible lines -- presumably
# seconds per day; confirm.
# PITHRESH: elapsed time from which the site PI is added to the recipients.
39 PITHRESH = 1 * SPERDAY
# SLICETHRESH: elapsed time beyond which the site's slices are emailed too.
40 SLICETHRESH = 5 * SPERDAY
41 # Days before attempting rins again
42 RINSTHRESH = 5 * SPERDAY
44 # Minimum number of nodes up before squeezing
50 # DNS, kinda down (sick)
51 # clock, kinda down (sick)
52 # Full disk, going to be down
56 # suspend slice creation
59 def __init__(self, comonthread, sickNoTicket, emailed):
# Store the collaborators the policy code works with.
#   comonthread  - CoMon data source; its bucket attributes (dbg, filerw,
#                  clock_drift, ...) and its comonbkts dict are read later.
#   sickNoTicket - source of nodes to act on, consumed with .get(block = True).
#   emailed      - dict of actions already taken, keyed by node.
# NOTE(review): the original continues past the last line shown here (the
# attribute holding squeezed sites is assigned on a line not in this listing).
60 self.cmn = comonthread
61 # host -> (type of email, time of email)
62 self.emailed = emailed
63 # all sick nodes w/o tickets
64 self.sickNoTicket = sickNoTicket
65 # Sites we've squeezed: loginbase -> (time of squeeze, type of squeeze).
71 What to do when node is in dbg (as reported by CoMon).
73 def __actOnDebug(self, node):
# Handle a node that CoMon reports in the dbg bucket: ask PLC to set its boot
# state to "rins", then reboot it.
# Returns the result of reboot.reboot(node); the caller stops processing the
# node when that result is truthy.
# NOTE(review): original lines 80, 82 and 85 are missing from this listing, so
# what happens after the "acted on ... days ago" message (presumably an early
# return) cannot be confirmed from here.
74 # Check to see if we've done this before
75 if (node in self.emailed.keys()):
# emailed[node] is (action type, timestamp); only a previous "dbg" action counts.
76 if (self.emailed[node][0] == "dbg"):
77 delta = time.time() - self.emailed[node][1]
78 if (delta <= RINSTHRESH ):
79 # Don't mess with node if under Thresh.
# delta // SPERDAY is reported as whole days (assumes SPERDAY is seconds per day).
81 logger.info("POLICY: %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
83 logger.info("POLICY: Node in dbg - " + node)
84 plc.nodeBootState(node, "rins")
86 return reboot.reboot(node)
89 What to do when node is in filerw (as reported by CoMon).
91 def __actOnFilerw(self, node):
# Handle a node in the filerw bucket: send the predefined filerw message and
# record the action in self.emailed.
# NOTE(review): `target` is passed to mailer.email below but is not assigned on
# any line visible here (original line 92 is missing) -- presumably the PLC
# support address, given the log message; confirm.
93 logger.info("POLICY: Emailing PLC for " + node)
# mailtxt.filerw is indexed as (subject template, body template); both take
# a 'hostname' key.
94 tmp = emailTxt.mailtxt.filerw
95 sbj = tmp[0] % {'hostname': node}
96 msg = tmp[1] % {'hostname': node}
97 mailer.email(sbj, msg, target)
# Remember what was sent and when, as (action type, timestamp).
98 self.emailed[node] = ("filerw", time.time())
# Body of the per-node policy routine (its def line is not part of this listing).
# It takes one node from self.sickNoTicket, builds the recipient list according
# to how long ago the node was last acted on, may squeeze the site (remove slice
# creation / suspend slices) when enoughUp() is false, then emails the
# predefined message for each CoMon bucket the node is in and records the
# action in self.emailed.
# NOTE(review): a number of original lines between 105 and 177 are missing
# here, so the branch structure around the visible statements cannot be
# confirmed from this listing.
105 # Get list of nodes in debug from PLC
106 #dbgNodes = NodesDebug()
# NOTE(review): no visible line assigns TECHEMAIL or PIEMAIL, so this global
# declaration looks unnecessary -- confirm against the full file.
107 global TECHEMAIL, PIEMAIL
108 # Grab a node from the queue (pushed by rt thread).
# get(block = True) waits for an item if sickNoTicket is a Queue.Queue -- confirm.
109 node = self.sickNoTicket.get(block = True)
111 loginbase = plc.siteId(node)
113 # Send appropriate message for node if in appropriate bucket.
114 # If we know where to send a message
116 logger.info("POLICY: loginbase for %s not found" %node)
117 # And we didn't email already.
119 # If first email, send to Tech
120 target = [TECHEMAIL % loginbase]
122 # If disk is foobarred, PLC should check it.
123 if (node in self.cmn.filerw) and \
124 (node not in self.emailed.keys()):
125 self.__actOnFilerw(node)
128 # If in dbg, set to rins, then reboot. Inform PLC.
129 if (node in self.cmn.dbg):
130 # If reboot failure via PCU, POD and send email
131 # if contacted PCU, return
132 if self.__actOnDebug(node): return
# Escalation applies only to nodes acted on before that are in neither the
# filerw nor the clock_drift bucket.
134 if (node in self.emailed.keys()) and \
135 (node not in self.cmn.filerw) and \
136 (node not in self.cmn.clock_drift):
137 # If we emailed before, how long ago?
138 delta = time.time() - self.emailed[node][1]
140 logger.info("POLICY: already acted on %s today." % node)
143 logger.info("POLICY: acted %s on %s days ago" % (node,
146 # If more than PI thresh, but less than slicethresh
147 if (delta >= PITHRESH) and (delta < SLICETHRESH):
148 target.append(PIEMAIL % loginbase)
149 #remove slice creation if enough nodes aren't up
150 if not self.enoughUp(loginbase):
151 logger.info("POLICY: Removing slice creation from %s" % loginbase)
152 plc.removeSliceCreation(node)
153 self.squeezed[loginbase] = (time.time(), "creation")
154 # If more than PI thresh and slicethresh
# NOTE(review): delta == SLICETHRESH satisfies neither this test (>) nor the
# one above (<), so that exact value gets no escalation.
155 if (delta >= PITHRESH) and (delta > SLICETHRESH):
156 # Email slices at site.
157 slices = plc.slices(loginbase)
160 target.append(SLICEMAIL % slice)
161 if not self.enoughUp(loginbase):
162 plc.suspendSlices(node)
163 self.squeezed[loginbase] = (time.time(),
166 # Find the bucket the node is in and send appropriate email
167 # to appropriate list of people.
168 for bkt in self.cmn.comonbkts.keys():
169 if (node in getattr(self.cmn, bkt)):
170 # Send predefined message for that bucket.
171 logger.info("POLICY: Emailing (%s) %s - %s"\
172 %(bkt, node, target))
# The template is looked up by bucket name and indexed as (subject, body).
173 tmp = getattr(emailTxt.mailtxt, bkt)
174 sbj = tmp[0] % {'hostname': node}
175 msg = tmp[1] % {'hostname': node}
176 mailer.email(sbj, msg, target)
# Record (action type, timestamp); a node in several buckets keeps only
# the last bucket written here.
177 self.emailed[node] = (bkt , time.time())
182 Prints, logs, and emails status of up nodes, down nodes, and buckets.
# Body of the summary routine (its def line is not part of this listing).
# Builds one text message listing the nodes acted upon and the sites squeezed,
# then emails it to SUMTO.
185 sub = "Monitor Summary"
186 msg = "\nThe following nodes were acted upon: \n\n"
# emailed values unpack as (type, date). NOTE(review): `type` shadows the builtin.
187 for (node, (type, date)) in self.emailed.items():
188 # Print only things acted on today.
# NOTE(review): index 2 of the gmtime() result is the day of the month only,
# so an entry from the same day number in another month or year also matches.
# The comparison uses UTC (gmtime) while the printed time uses ctime().
189 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
190 msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
191 msg +="\n\nThe following sites have been 'squeezed':\n\n"
# squeezed values unpack as (date, type) -- the opposite order to emailed.
192 for (loginbase, (date, type)) in self.squeezed.items():
193 # Print only things acted on today.
194 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
195 msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
196 mailer.email(sub, msg, [SUMTO])
201 Store/Load state of emails. When, where, what.
203 def emailedStore(self, action):
# Load or save self.emailed using pickle.
#   action - "WRITE" dumps self.emailed; the visible lines also show a read
#            path that merges a pickled dict into self.emailed.
# NOTE(review): `f` and DAT are not defined on any line visible here, and
# original lines 204-206, 210 and 213 are missing, so how the file is opened
# and closed cannot be confirmed from this listing.
207 logger.info("POLICY: Found and reading " + DAT)
# NOTE(review): pickle.load executes whatever the file encodes; only safe if
# the DAT file is trusted.
208 self.emailed.update(pickle.load(f))
209 if action == "WRITE":
211 #logger.debug("Writing " + DAT)
212 pickle.dump(self.emailed, f)
# NOTE(review): "except Exception, err" is Python 2 only syntax. Any failure is
# logged and otherwise ignored.
214 except Exception, err:
215 logger.info("POLICY: Problem with DAT, %s" %err)
218 Returns True if more than MINUP nodes are up at a site.
220 def enoughUp(self, loginbase):
# Count how many of a site's nodes are not listed in any CoMon bucket.
#   loginbase - site identifier passed to plc.getSiteNodes().
# NOTE(review): the initialisation of `sicknodes`, the statement executed for
# each sick site node, and the return statements are on original lines that
# are missing from this listing; the return value cannot be confirmed here.
221 allsitenodes = plc.getSiteNodes(loginbase)
222 if len(allsitenodes) == 0:
223 logger.info("Node not in db")
# Start from the total node count for the site.
226 numnodes = len(allsitenodes)
228 # Get all sick nodes from comon
# Every host in every bucket named by comonbkts is treated as sick.
229 for bucket in self.cmn.comonbkts.keys():
230 for host in getattr(self.cmn, bucket):
231 sicknodes.append(host)
233 for node in allsitenodes:
234 if node in sicknodes:
239 "POLICY: site with %s has nodes %s up." %(loginbase, numnodes))
250 self.emailedStore("WRITE")
# Stand-alone debugging setup (the enclosing def line is not part of this
# listing): send every message of the "monitor" logger at DEBUG and above to a
# StreamHandler, formatted as the bare message text.
254 logger.setLevel(logging.DEBUG)
255 ch = logging.StreamHandler()
256 ch.setLevel(logging.DEBUG)
257 formatter = logging.Formatter('%(message)s')
258 ch.setFormatter(formatter)
259 logger.addHandler(ch)
# NOTE(review): the commented-out call below passes two arguments, while
# Policy.__init__ takes three (comonthread, sickNoTicket, emailed).
263 #a = Policy(None, tmp)
264 #a.emailedStore("LOAD")
# Ad-hoc check: print the slices of the site that owns this host.
# NOTE(review): print statement, Python 2 only.
267 print plc.slices(plc.siteId("alice.cs.princeton.edu"))
# Script entry point. The guarded call itself is on original lines missing from
# this listing; only the Ctrl-C handler is visible.
269 if __name__ == '__main__':
274 except KeyboardInterrupt:
# NOTE(review): "Exitting" is misspelled in the message below; left unchanged
# because it is runtime output.
275 print "Killed. Exitting."
276 logger.info('Monitor Killed')