2 # Copyright (c) 2004 The Trustees of Princeton University (Trustees).
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
10 #from monitor import *
11 from threading import *
# Module-wide logger; messages from this file are prefixed "POLICY:".
24 logger = logging.getLogger("monitor")
26 # Time to enforce policy
29 # Where to email the summary
30 SUMTO = "pupadm@lists.planet-lab.org"
# Address templates. TECHEMAIL and PIEMAIL are filled with a site loginbase,
# SLICEMAIL with a per-slice value, at the points where recipient lists are built.
31 TECHEMAIL="tech-%s@sites.planet-lab.org"
32 PIEMAIL="pi-%s@sites.planet-lab.org"
33 SLICEMAIL="%s@slices.planet-lab.org"
34 PLCEMAIL="support@planet-lab.org"
# Escalation thresholds, expressed as multiples of SPERDAY (defined outside
# this view). They are compared against differences of time.time() values.
38 PITHRESH = 1 * SPERDAY
39 SLICETHRESH = 5 * SPERDAY
40 # Days before attempting rins again
41 RINSTHRESH = 5 * SPERDAY
43 # Minimum number of nodes up before squeezing
49 # DNS, kinda down (sick)
50 # clock, kinda down (sick)
51 # Full disk, going to be down
55 # suspend slice creation
# Constructor: keeps references to the objects handed in by the caller.
# NOTE(review): the remainder of the constructor is not visible in this view.
58 def __init__(self, comonthread, sickNoTicket, emailed):
# CoMon thread object; its per-bucket attributes (filerw, dbg, clock_drift,
# comonbkts) are read by the other methods of this class.
59 self.cmn = comonthread
60 # host -> (type of email, time of email)
61 self.emailed = emailed
62 # all sick nodes w/o tickets
# Nodes are pulled from this object with .get(block = True).
63 self.sickNoTicket = sickNoTicket
64 # Sites we've squeezed.
70 What to do when node is in dbg (as reported by CoMon).
# Handle a node reported in dbg. If the last recorded action for the node was
# "dbg" and happened within RINSTHRESH, only log it; on the reboot path the
# node's boot state is set to "rins" and the result of reboot.reboot(node) is
# returned (the caller treats a true result as "done with this node").
# NOTE(review): several lines between the statements below are not visible in
# this view, so the exact branch structure should be confirmed against the file.
72 def __actOnDebug(self, node):
73 # Check to see if we've done this before
74 if (node in self.emailed.keys()):
75 if (self.emailed[node][0] == "dbg"):
# emailed[node] is (type, timestamp); delta is the time elapsed since then.
76 delta = time.time() - self.emailed[node][1]
77 if (delta <= RINSTHRESH ):
78 # Don't mess with node if under Thresh.
80 logger.info("POLICY: %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
82 logger.info("POLICY: Node in dbg - " + node)
83 plc.nodeBootState(node, "rins")
85 return reboot.reboot(node)
88 What to do when node is in the filerw bucket (as reported by CoMon).
# Send the predefined "filerw" message about node and record that we did so.
90 def __actOnFilerw(self, node):
92 logger.info("POLICY: Emailing PLC for " + node)
# mailtxt.filerw is indexed as [0] subject template, [1] body template;
# both are formatted with a 'hostname' key.
93 tmp = emailTxt.mailtxt.filerw
94 sbj = tmp[0] % {'hostname': node}
95 msg = tmp[1] % {'hostname': node}
# NOTE(review): `target` is not assigned in the lines visible here -
# confirm it is set earlier in this method.
96 mailer.email(sbj, msg, target)
# Record (type, timestamp) so later checks against self.emailed see this node.
97 self.emailed[node] = ("filerw", time.time())
# Body of the method that processes one sick node taken from self.sickNoTicket.
# NOTE(review): the def line and several interior lines are not visible in
# this view; the comments below describe only the statements that are shown.
104 # Get list of nodes in debug from PLC
105 #dbgNodes = NodesDebug()
106 global TECHEMAIL, PIEMAIL
107 # Grab a node from the queue (pushed by rt thread).
108 node = self.sickNoTicket.get(block = True)
110 loginbase = plc.siteId(node)
112 # Send appropriate message for node if in appropriate bucket.
113 # If we know where to send a message
115 logger.info("POLICY: loginbase for %s not found" %node)
116 # And we didn't email already.
118 # If first email, send to Tech
# Recipient list starts with the site's tech address and is extended below.
119 target = [TECHEMAIL % loginbase]
121 # If disk is foobarred, PLC should check it.
122 if (node in self.cmn.filerw) and \
123 (node not in self.emailed.keys()):
124 self.__actOnFilerw(node)
127 # If in dbg, set to rins, then reboot. Inform PLC.
128 if (node in self.cmn.dbg):
129 # If reboot failure via PCU, POD and send email
130 # if contacted PCU, return
131 if self.__actOnDebug(node): return
# Escalation applies only to nodes already acted on that are in neither the
# filerw nor the clock_drift bucket.
133 if (node in self.emailed.keys()) and \
134 (node not in self.cmn.filerw) and \
135 (node not in self.cmn.clock_drift):
136 # If we emailed before, how long ago?
137 delta = time.time() - self.emailed[node][1]
139 logger.info("POLICY: already acted on %s today." % node)
142 logger.info("POLICY: acted %s on %s days ago" % (node,
145 # If more than PI thresh, but less than slicethresh
146 if (delta >= PITHRESH) and (delta < SLICETHRESH):
147 target.append(PIEMAIL % loginbase)
148 #remove slice creation if enough nodes aren't up
149 if not self.enoughUp(loginbase):
150 logger.info("POLICY: Removing slice creation from %s" % loginbase)
151 plc.removeSliceCreation(node)
# squeezed entries are (timestamp, kind) - the reverse order of emailed entries.
152 self.squeezed[loginbase] = (time.time(), "creation")
153 # If more than PI thresh and slicethresh
# NOTE(review): the previous branch uses `< SLICETHRESH` and this one uses
# `> SLICETHRESH`, so delta == SLICETHRESH matches neither - confirm intent.
154 if (delta >= PITHRESH) and (delta > SLICETHRESH):
155 # Email slices at site.
156 slices = plc.slices(loginbase)
159 target.append(SLICEMAIL % slice)
160 if not self.enoughUp(loginbase):
161 plc.suspendSlices(node)
162 self.squeezed[loginbase] = (time.time(),
165 # Find the bucket the node is in and send appropriate email
166 # to appropriate list of people.
167 for bkt in self.cmn.comonbkts.keys():
168 if (node in getattr(self.cmn, bkt)):
169 # Send predefined message for that bucket.
170 logger.info("POLICY: Emailing (%s) %s - %s"\
171 %(bkt, node, target))
# The message template is looked up on emailTxt.mailtxt by bucket name:
# [0] is the subject template, [1] the body template.
172 tmp = getattr(emailTxt.mailtxt, bkt)
173 sbj = tmp[0] % {'hostname': node}
174 msg = tmp[1] % {'hostname': node}
175 mailer.email(sbj, msg, target)
# Record (bucket, timestamp) as the latest action for this node.
176 self.emailed[node] = (bkt , time.time())
181 Prints, logs, and emails status of up nodes, down nodes, and buckets.
# Body of the summary method: builds one text report from self.emailed and
# self.squeezed and mails it to SUMTO.
# NOTE(review): the def line is not visible in this view.
184 sub = "Monitor Summary"
185 msg = "\nThe following nodes were acted upon: \n\n"
# emailed maps node -> (type, timestamp).
186 for (node, (type, date)) in self.emailed.items():
187 # Print only things acted on today.
# NOTE(review): index 2 of time.gmtime() is the day of the month only, so an
# entry from the same day number in a different month or year also matches.
188 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
189 msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
190 msg +="\n\nThe following sites have been 'squeezed':\n\n"
# squeezed maps loginbase -> (timestamp, kind); note the order differs from emailed.
191 for (loginbase, (date, type)) in self.squeezed.items():
192 # Print only things acted on today.
193 if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
194 msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
195 mailer.email(sub, msg, [SUMTO])
200 Store/Load state of emails. When, where, what.
# Persist or restore self.emailed with pickle, depending on `action`.
# The read path merges the unpickled mapping into self.emailed; the "WRITE"
# path dumps self.emailed.
# NOTE(review): the lines that open `f` (and the read-side condition) are not
# visible in this view; DAT is defined elsewhere in the file.
202 def emailedStore(self, action):
206 logger.info("POLICY: Found and reading " + DAT)
# NOTE(review): unpickling can execute arbitrary code; this is only safe if
# the DAT file can be written by trusted users alone.
207 self.emailed.update(pickle.load(f))
208 if action == "WRITE":
210 #logger.debug("Writing " + DAT)
211 pickle.dump(self.emailed, f)
# Python 2 except syntax. The failure is logged at info level; no re-raise
# appears in the visible lines.
213 except Exception, err:
214 logger.info("POLICY: Problem with DAT, %s" %err)
217 Returns True if more than MINUP nodes are up at a site.
# Compare a site's node list against every host CoMon has placed in a bucket.
# NOTE(review): the initialisation of `sicknodes`, the body of the final
# loop, and the return statement are not visible in this view.
219 def enoughUp(self, loginbase):
220 allsitenodes = plc.getSiteNodes(loginbase)
# An empty node list for the site is logged as "Node not in db".
221 if len(allsitenodes) == 0:
222 logger.info("Node not in db")
225 numnodes = len(allsitenodes)
227 # Get all sick nodes from comon
# Flatten every bucket named in comonbkts into one list of hosts.
228 for bucket in self.cmn.comonbkts.keys():
229 for host in getattr(self.cmn, bucket):
230 sicknodes.append(host)
# Walk the site's nodes and test each one against the sick list.
232 for node in allsitenodes:
233 if node in sicknodes:
238 "POLICY: site with %s has nodes %s up." %(loginbase, numnodes))
249 self.emailedStore("WRITE")
# Console logging setup: DEBUG level on both the logger and a stream handler,
# with a message-only format.
# NOTE(review): the def line of the enclosing routine is not visible here.
253 logger.setLevel(logging.DEBUG)
254 ch = logging.StreamHandler()
255 ch.setLevel(logging.DEBUG)
256 formatter = logging.Formatter('%(message)s')
257 ch.setFormatter(formatter)
258 logger.addHandler(ch)
262 #a = Policy(None, tmp)
263 #a.emailedStore("LOAD")
# Python 2 print statement: looks up the site of a hard-coded host and prints
# what plc.slices returns for it.
266 print plc.slices(plc.siteId("alice.cs.princeton.edu"))
# Script entry point. NOTE(review): the guarded try body is not visible here.
268 if __name__ == '__main__':
# On Ctrl-C, report the shutdown on stdout and in the monitor log.
273 except KeyboardInterrupt:
274 print "Killed. Exitting."
275 logger.info('Monitor Killed')