* Sets nodes to reboot, uses PCU if available. Defaults to POD/email (with site...

author Faiyaz Ahmed <faiyaza@cs.princeton.edu>

Tue, 14 Nov 2006 19:20:13 +0000 (19:20 +0000)

committer Faiyaz Ahmed <faiyaza@cs.princeton.edu>

Tue, 14 Nov 2006 19:20:13 +0000 (19:20 +0000)
author Faiyaz Ahmed <faiyaza@cs.princeton.edu>
Tue, 14 Nov 2006 19:20:13 +0000 (19:20 +0000)
committer Faiyaz Ahmed <faiyaza@cs.princeton.edu>
Tue, 14 Nov 2006 19:20:13 +0000 (19:20 +0000)
diff --git a/comon.py b/comon.py

index 7526f0d..f547230 100755 (executable)
--- a/comon.py
+++ b/comon.py
@@ -40,7 +40,6 @@ class Comon(Thread):
                         "ssh": "sshstatus%20%3E%202h",
                         "clock_drift": "drift%20%3E%201m",
                         "dns": "dns1udp%20%3E%2080%20&&%20dns2udp%20%3E%2080",
-                       "disk": "resptime%20%3E%200%20&&%20gbfree%20%3C%205",
                         "filerw": "filerw%3E0",
                         "dbg" : "keyok==0"}
                 Thread.__init__(self)
diff --git a/config.py b/config.py

index 112173f..19c590d 100644 (file)
--- a/config.py
+++ b/config.py
@@ -1,13 +1,16 @@
-from xml.sax import saxutils
-
-class config(saxutils.DefaultHandler):
-       def __init__(self, file, start):
-               self.file = file
-               self.start = start
-               self.config = {}
-
-       def startElement(self,name, attrs):
-               if name != self.start: return
  
+debug = False
  
+#from xml.sax import saxutils
+#
+#class config(saxutils.DefaultHandler):
+#      def __init__(self, file, start):
+#              self.file = file
+#              self.start = start
+#              self.config = {}
+#
+#      def startElement(self,name, attrs):
+#              if name != self.start: return
+#
+#
  #incomplete
diff --git a/emailTxt.py b/emailTxt.py

index c611cdf..00c7462 100644 (file)
--- a/emailTxt.py
+++ b/emailTxt.py
@@ -60,6 +60,27 @@ Thanks.
    -- PlanetLab Central (support@planet-lab.org)
  """)
  
+   clock_drift=("""Planetlab node %(hostname)s and NTP.""", """As part of PlanetLab node monitoring, we noticed %(hostname)s cannot reach our NTP server.
+
+Please verify that the NTP port (tcp/123) is not blocked by your site. 
+
+Thanks.
+
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
+
+   dbg=("""Planetlab node %(hostname)s requires reboot.""", """As part of PlanetLab node monitoring, we noticed %(hostname)s is in debug mode.  This usually implies the node was rebooted unexpectedly and could not come up cleanly.  
+
+We have set the node to reinstall upon reboot.  Please reboot the machine.  It would be helpful if you could forward any error messages on the console to support@planet-lab.org.
+
+
+Thanks.
+
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
+
     STANDARD_PI="""As part of PlanetLab nodes monitoring, we noticed the node %(hostname)s is not available for ssh. We have made several attempts to contact the techinical contacts for this site (they are CCed) to help us bring the node back online. If there should be a different technical contact appointed, you may add the 'tech' role to any user registered for your site via the website. (Manage Users off the left nav bar on the PI tab, then click the user)
  
  Our records indicate that there is no remote power control unit connected to this node. If this is not the case, please log into the PlanetLab Website and update the PCU information.
diff --git a/mailer.py b/mailer.py

index ca194ed..8df3b98 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -6,28 +6,12 @@
  #
  # $Id: $
  from emailTxt import *
-import xml, xmlrpclib
  import smtplib
+import config
  
  MTA="localhost"
  FROM="support@planet-lab.org"
  
-XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'
-
-def siteId(hostname):
-       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
-       anon = {'AuthMethod': "anonymous"}
-       site_id = api.AnonAdmQuerySite (anon, {"node_hostname": hostname})
-       if len(site_id) == 1:  
-               loginbase = api.AnonAdmGetSites (anon, site_id, ["login_base"])
-               return loginbase[0]['login_base']
-
-
-def slices(loginbase):
-        api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
-        return api.SliceListNames (auth.auth, loginbase)
-
-
  def email (subject, text, to):
         """Create a mime-message that will render HTML in popular
         MUAs, text in better ones"""
@@ -78,14 +62,16 @@ def email (subject, text, to):
         writer.lastpart()
         msg = out.getvalue()
         out.close()
-       server = smtplib.SMTP(MTA)
-       server.sendmail(FROM, to,  msg)
-       server.quit()
+       if not config.debug:
+               server = smtplib.SMTP(MTA)
+               server.sendmail(FROM, to,  msg)
+               server.quit()
  
  if __name__=="__main__":
         import smtplib
         import emailTxt
-       id = siteId("alice.cs.princeton.edu")
+       import plc 
+       id = plc.siteId("alice.cs.princeton.edu")
         print id
         #if id:
                 #email('TEST', emailTxt.mailtxt.ssh % {'hostname': "ALICE.cs.princeton.edu"}, "tech-" + id + "@sites.planet-lab.org")
diff --git a/monitor.py b/monitor.py

index 4b80d9a..eacf14d 100644 (file)
--- a/monitor.py
+++ b/monitor.py
@@ -14,6 +14,8 @@ from threading import *
  import time
  import logging
  import Queue
+# Global config options
+import config
  # daemonize and *pid
  from util.process import * 
  
@@ -26,8 +28,6 @@ import policy
  # Email
  import mailer
  import emailTxt
-# Defaults
-debug = False 
  
  # Log to what 
  LOG="./monitor.log"
@@ -110,7 +110,7 @@ class ThreadWatcher(Thread):
                 for thread in runningthreads.keys():
                         # If thread found dead, remove from queue
                         if not runningthreads[thread].isAlive():
-                               logger.error("Thread Died: %s" %(thread))
+                               logger.error("***********Thread died: %s**********" %(thread))
                                 del runningthreads[thread]
  
  
@@ -127,7 +127,7 @@ Start threads, do some housekeeping, then daemonize.
  """
  def main():
         # Defaults
-       global debug, status, logger
+       global status, logger
  
         try:
                 longopts = ["debug", "status", "help"]
@@ -139,7 +139,8 @@ def main():
  
         for (opt, optval) in opts:
                 if opt == "-d" or opt == "--debug":
-                       debug = True
+                       config.debug = True
+                       print "Running in DEBUG mode:  NO EMAILS SENT AND NO SLICES SQUEEZED."
                 elif opt == "--status":
                         #print summary(names)
                         sys.exit(0)
@@ -212,8 +213,6 @@ def main():
  
  
  
-       pol.status()
-
         # Store state of emails
         pol.emailedStore("WRITE")
  
diff --git a/policy.py b/policy.py

index f263464..11e8a68 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -14,11 +14,11 @@ import logging
  import mailer
  import emailTxt
  import pickle
-import xml, xmlrpclib
  import Queue
+import plc
+import reboot
+import config
  
-#Hack to auth structure
-import auth 
  DAT="./monitor.dat"
  
  logger = logging.getLogger("monitor")
@@ -26,19 +26,22 @@ logger = logging.getLogger("monitor")
  # Time to enforce policy
  POLSLEEP = 7200
  
-# Days between emails (enforce 'squeeze' after this time).
-SQUEEZE = 3
-
  # Where to email the summary
-SUMTO = "faiyaza@cs.princeton.edu"
+SUMTO = "pupadm@lists.planet-lab.org"
  TECHEMAIL="tech-%s@sites.planet-lab.org"
  PIEMAIL="pi-%s@sites.planet-lab.org"
  SLICEMAIL="%s@slices.planet-lab.org"
  PLCEMAIL="support@planet-lab.org"
  
-#Thresholds
-PITHRESH = 3
-SLICETHRESH = 5
+#Thresholds (in seconds: number of days * SPERDAY)
+SPERDAY = 86400
+PITHRESH = 1 * SPERDAY
+SLICETHRESH = 5 * SPERDAY
+# Days before attempting rins again
+RINSTHRESH = 5 * SPERDAY
+
+# Minimum number of nodes up before squeezing
+MINUP = 2
  
  # IF:
  #  no SSH, down.
@@ -57,63 +60,107 @@ class Policy(Thread):
                 # host - > (time of email, type of email)
                 self.emailed = emailed 
                 # all sick nodes w/o tickets
-               self.sickNoTicket = sickNoTicket 
+               self.sickNoTicket = sickNoTicket
+               # Sites we've squeezed.
+               self.squeezed = {}
                 Thread.__init__(self)
         
-       #def getAllSick(self):
-       #       for bucket in self.cmn.comonbkts.keys():
-       #               for host in getattr(self.cmn, bucket):
-       #                       if host not in self.cursickw.keys():
-       #                               self.cursick.put(host)
  
         '''
-       Acts on sick nodes
+       What to do when node is in dbg (as reported by CoMon).
+       '''
+       def __actOnDebug(self, node):
+               # Check to see if we've done this before
+               if (node in self.emailed.keys()):
+                       if (self.emailed[node][0] == "dbg"):
+                               delta = time.time() - self.emailed[node][1]
+                               if (delta <= RINSTHRESH ):
+                                       # Don't mess with node if under Thresh. 
+                                       # Return, move on.
+                                       logger.info("POLICY:  %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
+                                       return
+                       logger.info("POLICY:  Node in dbg - " + node)
+                       plc.nodeBootState(node, "rins") 
+                       # If it has a PCU
+                       return reboot.reboot(node)
+       
+       '''
+       What to do when node is in filerw (as reported by CoMon):  email PLC.
+       '''
+       def __actOnFilerw(self, node):
+               target = [PLCEMAIL]     
+               logger.info("POLICY:  Emailing PLC for " + node)
+               tmp = emailTxt.mailtxt.filerw
+               sbj = tmp[0] % {'hostname': node}
+               msg = tmp[1] % {'hostname': node}
+               mailer.email(sbj, msg, target)  
+               self.emailed[node] = ("filerw", time.time())
+
+
+       '''
+       Acts on sick nodes.
         '''
         def actOnSick(self):
                 # Get list of nodes in debug from PLC
                 #dbgNodes = NodesDebug()
                 global TECHEMAIL, PIEMAIL
+               # Grab a node from the queue (pushed by rt thread).
                 node = self.sickNoTicket.get(block = True)
                 # Get the login base    
-               id = mailer.siteId(node)
+               loginbase = plc.siteId(node)
  
                 # Send appropriate message for node if in appropriate bucket.
                 # If we know where to send a message
-               if not id: 
-                       logger.info("loginbase for %s not found" %node)
+               if not loginbase: 
+                       logger.info("POLICY:  loginbase for %s not found" %node)
                 # And we didn't email already.
                 else:
                         # If first email, send to Tech
-                       target = [TECHEMAIL % id]
+                       target = [TECHEMAIL % loginbase]
                         
                         # If disk is foobarred, PLC should check it.
                         if (node in self.cmn.filerw) and \
                         (node not in self.emailed.keys()):
-                               target = [PLCEMAIL]     
-                               logger.info("Emailing PLC for " + node)
+                               self.__actOnFilerw(node)
+                               return 
  
                         # If in dbg, set to rins, then reboot.  Inform PLC.
                         if (node in self.cmn.dbg):
-                               logger.info("Node in dbg - " + node)
-                               return
+                       # If reboot failure via PCU, POD and send email
+                       # if contacted PCU, return
+                               if self.__actOnDebug(node):  return
  
-                       # If its a disk, email PLC;  dont bother going through this loop.
                         if (node in self.emailed.keys()) and \
-                       (node not in self.cmn.filerw):
+                       (node not in self.cmn.filerw)    and \
+                       (node not in self.cmn.clock_drift):
                                 # If we emailed before, how long ago?   
-                               delta = time.localtime()[2] - self.emailed[node][1][2]
+                               delta = time.time() - self.emailed[node][1]
+                               if delta < SPERDAY:  
+                                       logger.info("POLICY:  already acted on %s today." % node)
+                                       return
+
+                               logger.info("POLICY:  acted %s on %s days ago" % (node, 
+                               delta // SPERDAY))
+
                                 # If more than PI thresh, but less than slicethresh
                                 if (delta >= PITHRESH) and (delta < SLICETHRESH): 
-                                       logger.info("Emailing PI for " + node)
-                                       target.append(PIEMAIL % id)
+                                       target.append(PIEMAIL % loginbase)
+                                       # Remove slice creation if not enough nodes are up
+                                       if not self.enoughUp(loginbase):
+                                               logger.info("POLICY:  Removing slice creation from %s" % loginbase)
+                                               plc.removeSliceCreation(node)
+                                               self.squeezed[loginbase] = (time.time(), "creation")
                                 # If more than PI thresh and slicethresh
                                 if (delta >= PITHRESH) and (delta > SLICETHRESH):
-                                       logger.info("Emailing slices for " + node)
                                         # Email slices at site.
-                                       slices = mailer.slices(id)
+                                       slices = plc.slices(loginbase)
                                         if len(slices) >= 1:
                                                 for slice in slices:
                                                         target.append(SLICEMAIL % slice)
+                                               if not self.enoughUp(loginbase):
+                                                       plc.suspendSlices(node)
+                                                       self.squeezed[loginbase] = (time.time(),
+                                                                "freeze")
  
                         # Find the bucket the node is in and send appropriate email
                         # to approriate list of people.
@@ -126,7 +173,7 @@ class Policy(Thread):
                                         sbj = tmp[0] % {'hostname': node}
                                         msg = tmp[1] % {'hostname': node}
                                         mailer.email(sbj, msg, target)  
-                                       self.emailed[node] = (bkt , time.localtime())
+                                       self.emailed[node] = (bkt , time.time())
                                         return
  
  
@@ -137,7 +184,14 @@ class Policy(Thread):
                 sub = "Monitor Summary"
                 msg = "\nThe following nodes were acted upon:  \n\n"
                 for (node, (type, date)) in self.emailed.items():
-                       msg +="%s\t(%s)\t%s:%s:%s\n" %(node,type,date[3],date[4],date[5])
+                       # Print only things acted on today.
+                       if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
+                               msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
+               msg +="\n\nThe following sites have been 'squeezed':\n\n"
+               for (loginbase, (date, type)) in self.squeezed.items():
+                       # Print only things acted on today.
+                       if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
+                               msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
                 mailer.email(sub, msg, [SUMTO])
                 logger.info(msg)
                 return 
@@ -149,34 +203,50 @@ class Policy(Thread):
                 try:
                         if action == "LOAD":
                                 f = open(DAT, "r+")
-                               logger.info("Found and reading " + DAT)
+                               logger.info("POLICY:  Found and reading " + DAT)
                                 self.emailed.update(pickle.load(f))
                         if action == "WRITE":
                                 f = open(DAT, "w")
-                               logger.debug("Writing " + DAT)
+                               #logger.debug("Writing " + DAT)
                                 pickle.dump(self.emailed, f)
                         f.close()
                 except Exception, err:
-                       logger.info("Problem with DAT, %s" %err)
+                       logger.info("POLICY:  Problem with DAT, %s" %err)
+
+       '''
+       Returns True if at least MINUP nodes are up at a site, False if fewer (None if the site has no nodes).
+       '''
+       def enoughUp(self, loginbase):
+               allsitenodes = plc.getSiteNodes(loginbase)
+               if len(allsitenodes) == 0:
+                       logger.info("Node not in db")
+                       return
+
+               numnodes = len(allsitenodes)
+               sicknodes = []
+               # Get all sick nodes from comon
+               for bucket in self.cmn.comonbkts.keys():
+                       for host in getattr(self.cmn, bucket):
+                               sicknodes.append(host)
+               # Diff.
+               for node in allsitenodes:
+                       if node in sicknodes:
+                               numnodes -= 1
+
+               if numnodes < MINUP:
+                       logger.info(\
+"POLICY:  site with %s has nodes %s up." %(loginbase, numnodes))
+                       return False 
+               else: 
+                       return True 
+                       
+               
+
  
         def run(self):
                 while 1:
                         self.actOnSick()
                         self.emailedStore("WRITE")
-'''
-Returns list of nodes in dbg as reported by PLC
-'''
-def NodesDebug():
-       dbgNodes = []
-       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
-       anon = {'AuthMethod': "anonymous"}
-       allnodes = api.AnonAdmGetNodes(anon, [], ['hostname','boot_state'])
-       for node in allnodes:
-               if node['boot_state'] == 'dbg': dbgNodes.append(node['hostname'])
-       logger.info("%s nodes in debug according to PLC." %len(dbgNodes))
-       return dbgNodes
-
-
  
  
  def main():
@@ -192,12 +262,12 @@ def main():
         #a = Policy(None, tmp) 
         #a.emailedStore("LOAD")
         #print a.emailed
-       print siteId("princetoan")
  
+       print plc.slices(plc.siteId("alice.cs.princeton.edu"))
         os._exit(0)
  if __name__ == '__main__':
         import os
-       XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'
+       import plc
         try:
                 main()
         except KeyboardInterrupt:
diff --git a/rt.py b/rt.py

index 05dce04..f3ce1ed 100644 (file)
--- a/rt.py
+++ b/rt.py
@@ -8,6 +8,7 @@ import Queue
  import time 
  import comon
  from threading import *
+import config
  
  # RT database access constants file
  RT_DB_CONSTANTS_PATH='/etc/planetlab/rt_db'
@@ -170,7 +171,7 @@ class RT(Thread):
                                 #logger.debug("RT: tickets for %s" %host)
                                 self.tickets[host] = tmp
                         else:
-                               logger.debug("RT: no tix for %s - policy" %host)
+                               logger.debug("RT: no tix for %s" %host)
                                 self.sickNoTicket.put(host) 
  
         # Removes hosts that are no longer down.
author	Faiyaz Ahmed <faiyaza@cs.princeton.edu>
	Tue, 14 Nov 2006 19:20:13 +0000 (19:20 +0000)
committer	Faiyaz Ahmed <faiyaza@cs.princeton.edu>
	Tue, 14 Nov 2006 19:20:13 +0000 (19:20 +0000)
comon.py		patch \| blob \| history
config.py		patch \| blob \| history
emailTxt.py		patch \| blob \| history
mailer.py		patch \| blob \| history
monitor.py		patch \| blob \| history
policy.py		patch \| blob \| history
rt.py		patch \| blob \| history