2 # Copyright (c) 2004 The Trustees of Princeton University (Trustees).
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
6 # $Id: policy.py,v 1.4 2006/11/14 19:20:13 faiyaza Exp $
10 #from monitor import *
11 from threading import *
24 logger = logging.getLogger("monitor")
26 # Time to enforce policy
29 # Where to email the summary
30 SUMTO = "pupadm@lists.planet-lab.org"
31 TECHEMAIL="tech-%s@sites.planet-lab.org"
32 PIEMAIL="pi-%s@sites.planet-lab.org"
33 SLICEMAIL="%s@slices.planet-lab.org"
34 PLCEMAIL="support@planet-lab.org"
38 PITHRESH = 1 * SPERDAY
39 SLICETHRESH = 5 * SPERDAY
40 # Days before attempting rins again
41 RINSTHRESH = 5 * SPERDAY
43 # Minimum number of nodes up before squeezing
49 # DNS, kinda down (sick)
50 # clock, kinda down (sick)
51 # Full disk, going to be down
55 # suspend slice creation
58 def __init__(self, comonthread, sickNoTicket, emailed):
# Keep references to the three collaborators handed in by the caller.
# comonthread is read elsewhere in this file as self.cmn.dbg,
# self.cmn.filerw, self.cmn.clock_drift and self.cmn.comonbkts.
59 self.cmn = comonthread
60 # host -> (type of email, time of email). This is the order the rest of
# the file uses: entries are written as (bucket, time.time()) and read as
# [0] == type, [1] == timestamp.
61 self.emailed = emailed
62 # Queue of sick nodes w/o tickets; consumed with a blocking .get().
63 self.sickNoTicket = sickNoTicket
64 # Sites we've squeezed.
70 What to do when node is in dbg (as reported by CoMon).
72 def __actOnDebug(self, node):
# Handle a node found in the CoMon dbg bucket: calls
# plc.nodeBootState(node, "rins") and returns the result of
# reboot.reboot(node). The caller stops processing the node when the
# returned value is truthy.
# NOTE(review): nesting of the statements below cannot be determined from
# the lines visible here -- confirm against the full file.
73 # Check to see if we've done this before
74 if (node in self.emailed.keys()):
75 if (self.emailed[node][0] == "dbg"):
# self.emailed[node] is (type, timestamp); delta is seconds elapsed since
# the last recorded action on this node.
76 delta = time.time() - self.emailed[node][1]
77 if (delta <= RINSTHRESH ):
78 # Don't mess with node if under Thresh.
80 logger.info("POLICY: %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
82 logger.info("POLICY: Node in dbg - " + node)
83 plc.nodeBootState(node, "rins")
85 return reboot.reboot(node)
88 What to do when node is in filerw (as reported by CoMon).
90 def __actOnFilerw(self, node):
# Handle a node found in the CoMon filerw bucket: send the predefined
# filerw message for this host and record that it was sent.
92 logger.info("POLICY: Emailing PLC for " + node)
# emailTxt.mailtxt.filerw is indexed as [0] subject template and
# [1] body template; both are filled in with the hostname.
93 tmp = emailTxt.mailtxt.filerw
94 sbj = tmp[0] % {'hostname': node}
95 msg = tmp[1] % {'hostname': node}
# NOTE(review): `target` is not assigned in the lines visible here --
# confirm it is set (presumably to the PLC address) before this call.
96 mailer.email(sbj, msg, target)
# Remember the action as (type, timestamp) so later passes can see that
# this node has already been handled.
97 self.emailed[node] = ("filerw", time.time())
104 # Get list of nodes in debug from PLC
105 #dbgNodes = NodesDebug()
106 global TECHEMAIL, PIEMAIL
107 # Grab a node from the queue (pushed by rt thread).
108 node = self.sickNoTicket.get(block = True)
110 loginbase = plc.siteId(node)
112 # Send appropriate message for node if in appropriate bucket.
113 # If we know where to send a message
115 logger.info("POLICY: loginbase for %s not found" %node)
116 # And we didn't email already.
118 # If first email, send to Tech
119 target = [TECHEMAIL % loginbase]
121 # If disk is foobarred, PLC should check it.
122 if (node in self.cmn.filerw) and \
123 (node not in self.emailed.keys()):
124 self.__actOnFilerw(node)
127 # If in dbg, set to rins, then reboot. Inform PLC.
128 if (node in self.cmn.dbg):
129 # If reboot failure via PCU, POD and send email
130 # if contacted PCU, return
131 if self.__actOnDebug(node): return
133 if (node in self.emailed.keys()) and \
134 (node not in self.cmn.filerw) and \
135 (node not in self.cmn.clock_drift):
136 # If we emailed before, how long ago?
137 delta = time.time() - self.emailed[node][1]
139 logger.info("POLICY: already acted on %s today." % node)
142 logger.info("POLICY: acted %s on %s days ago" % (node,
145 # If more than PI thresh, but less than slicethresh
146 if (delta >= PITHRESH) and (delta < SLICETHRESH):
147 target.append(PIEMAIL % loginbase)
148 #remove slice creation if enough nodes arent up
149 if not self.enoughUp(loginbase):
150 slices = plc.slices(loginbase)
153 target.append(SLICEMAIL % slice)
154 logger.info("POLICY: Removing slice creation from %s" % loginbase)
155 tmp = emailTxt.mailtxt.removedSliceCreation
157 msg = tmp[1] % {'loginbase': loginbase}
158 plc.removeSliceCreation(node)
159 mailer.email(sbj, msg, target)
160 self.squeezed[loginbase] = (time.time(), "creation")
163 # If more than PI thresh and slicethresh
164 if (delta >= PITHRESH) and (delta > SLICETHRESH):
165 target.append(PIEMAIL % loginbase)
166 # Email slices at site.
167 slices = plc.slices(loginbase)
170 target.append(SLICEMAIL % slice)
171 # If not enough up, freeze slices and email everyone.
172 if not self.enoughUp(loginbase):
173 logger.info("POLICY: Suspending %s slices." % loginbase)
174 tmp = emailTxt.mailtxt.suspendSlices
176 msg = tmp[1] % {'loginbase': loginbase}
177 plc.suspendSlices(node)
178 self.squeezed[loginbase] = (time.time(), "freeze")
179 mailer.email(sbj, msg, target)
182 # Find the bucket the node is in and send appropriate email
183 # to appropriate list of people.
184 for bkt in self.cmn.comonbkts.keys():
185 if (node in getattr(self.cmn, bkt)):
186 # Send predefined message for that bucket.
187 logger.info("POLICY: Emailing (%s) %s - %s"\
188 %(bkt, node, target))
189 tmp = getattr(emailTxt.mailtxt, bkt)
190 sbj = tmp[0] % {'hostname': node}
191 msg = tmp[1] % {'hostname': node}
192 mailer.email(sbj, msg, target)
193 self.emailed[node] = (bkt , time.time())
198 Prints, logs, and emails status of up nodes, down nodes, and buckets.
201 sub = "Monitor Summary"
202 msg = "\nThe following nodes were acted upon: \n\n"
203 for (node, (type, date)) in self.emailed.items():
204 # Print only things acted on today (compares the UTC day-of-month only).
205 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
206 msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
207 msg +="\n\nThe following sites have been 'squeezed':\n\n"
208 for (loginbase, (date, type)) in self.squeezed.items():
209 # Print only things acted on today (compares the UTC day-of-month only).
210 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
211 msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
212 mailer.email(sub, msg, [SUMTO])
217 Store/Load state of emails. When, where, what.
219 def emailedStore(self, action):
# Save or restore self.emailed with pickle, depending on `action`.
# action == "WRITE" dumps self.emailed to `f`; the other visible path
# merges a previously pickled dict into self.emailed.
# NOTE(review): the lines that open `f`, the condition guarding the read
# path, and the `try` matching the `except` below are not visible here --
# confirm against the full file.
223 logger.info("POLICY: Found and reading " + DAT)
# NOTE(review): pickle.load can execute arbitrary code from a crafted
# file; DAT should only ever be a file this program wrote itself.
224 self.emailed.update(pickle.load(f))
225 if action == "WRITE":
227 #logger.debug("Writing " + DAT)
228 pickle.dump(self.emailed, f)
230 except Exception, err:
# The visible handler only logs the error.
231 logger.info("POLICY: Problem with DAT, %s" %err)
234 Returns True if more than MINUP nodes are up at a site.
236 def enoughUp(self, loginbase):
# Look up the site's nodes via plc.getSiteNodes(loginbase) and compare
# them against every host listed in any CoMon bucket.
# NOTE(review): the initialisation of `sicknodes`, the per-node counting
# and the return statement are not in the lines visible here -- confirm
# against the full file.
237 allsitenodes = plc.getSiteNodes(loginbase)
238 if len(allsitenodes) == 0:
239 logger.info("Node not in db")
242 numnodes = len(allsitenodes)
244 # Get all sick nodes from comon
# Bucket names are the keys of self.cmn.comonbkts; each bucket's host
# collection is an attribute of the same name on self.cmn.
245 for bucket in self.cmn.comonbkts.keys():
246 for host in getattr(self.cmn, bucket):
247 sicknodes.append(host)
249 for node in allsitenodes:
250 if node in sicknodes:
255 "POLICY: site with %s has nodes %s up." %(loginbase, numnodes))
266 self.emailedStore("WRITE")
270 logger.setLevel(logging.DEBUG)
271 ch = logging.StreamHandler()
272 ch.setLevel(logging.DEBUG)
273 formatter = logging.Formatter('%(message)s')
274 ch.setFormatter(formatter)
275 logger.addHandler(ch)
279 #a = Policy(None, tmp)
280 #a.emailedStore("LOAD")
283 print plc.slices(plc.siteId("alice.cs.princeton.edu"))
285 if __name__ == '__main__':
290 except KeyboardInterrupt:
291 print "Killed. Exitting."
292 logger.info('Monitor Killed')