Uses CoMon's ability to find 'upness' to email. Changed queueing between threads...
[monitor.git] / policy.py
index 04c2a1c..09884d7 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -13,12 +13,19 @@ import time
 import logging
 import mailer
 import emailTxt
+import pickle
+import xml, xmlrpclib
+import Queue
+
+DAT="./monitor.dat"
 
 logger = logging.getLogger("monitor")
 
 # Time to enforce policy
 POLSLEEP = 7200
 
+# Days between emails (enforce 'squeeze' after this time).
+SQUEEZE = 3
 # IF:
 #  no SSH, down.
 #  bad disk, down
@@ -31,43 +38,65 @@ POLSLEEP = 7200
 #  suspend slice creation
 #  kill slices
 class Policy(Thread):
-       def __init__(self, comonthread, tickets):
+       def __init__(self, comonthread, sickNoTicket, emailed):
                self.cmn = comonthread
-               self.tickets = tickets
-               # host - > time of email
-               self.emailed = {}
-               # all sick nodes w/ tickets
-               self.cursickw = tickets
+               # host - > (time of email, type of email)
+               self.emailed = emailed 
                # all sick nodes w/o tickets
-               self.cursick = []
+               self.sickNoTicket = sickNoTicket 
                Thread.__init__(self)
-       '''
-       Gets all nodes without tickets and puts them in self.cursick
-       '''
-       def getAllSick(self):
-               self.cursick = []
-               for bucket in self.cmn.comonbkts.keys():
-                       for host in getattr(self.cmn, bucket):
-                               if host not in self.cursickw.keys():
-                                       if host not in self.cursick: 
-                                               self.cursick.append(host)
-               logger.debug("Nodes sick wo tickets %s " % len(self.cursick))
+       
+       #def getAllSick(self):
+       #       for bucket in self.cmn.comonbkts.keys():
+       #               for host in getattr(self.cmn, bucket):
+       #                       if host not in self.cursickw.keys():
+       #                               self.cursick.put(host)
 
        '''
-       Acts on sick nodes.
+       Acts on sick nodes
        '''
-       def emailSick(self):
-               for node in self.cmn.ssh:
-                       if node in self.cursick:
-                               if node not in self.emailed.keys():
-                                       logger.debug("Emailing " + node)
-                                       try:
-                                               self.emailed[node] = "ssh"
-                                               mailer.email('DISREGARD', 
-                                               emailTxt.mailtxt.STANDARD % {'hostname': node}, 
-                                               "tech-" + mailer.siteId(node) + "@sites.planet-lab.org")
-                                       except Exception, err:
-                                               logger.info(err)
+       def emailsick(self):
+               # Get list of nodes in debug from PLC
+               #dbgNodes = NodesDebug()
+
+               node = self.sickNoTicket.get(block = True)
+               # Get the login base    
+               id = mailer.siteId(node)
+
+               if not id: 
+                       logger.info("loginbase for %s not found" %node)
+               elif node not in self.emailed.keys():
+                       # Email about Down.
+                       if node in self.cmn.down:
+                               logger.debug("POLICY: Emailing (down) " + node)
+                               self.emailed[node] = ("down", time.localtime())
+                               msg = emailTxt.mailtxt.DOWN \
+                                       % {'hostname': node}
+                               mailer.email(node + " down", msg, 
+                               "tech-" + id + "@sites.planet-lab.org")
+                               return  
+
+                       # Email about no SSH.
+                       if node in self.cmn.ssh:
+                               logger.debug("POLICY: Emailing (ssh) " + node)
+                               self.emailed[node] = ("ssh", time.localtime())
+                               msg = emailTxt.mailtxt.SSH \
+                                       % {'hostname': node}
+                               mailer.email(node + " down", msg, 
+                               "tech-" + id + "@sites.planet-lab.org")
+                               return 
+
+                       # Email about DNS
+                       if node in self.cmn.dns:
+                               logger.debug("POLICY: Emailing (dns)" + node)
+                               self.emailed[node] = ("dns", time.localtime())
+                               msg = emailTxt.mailtxt.DNS \
+                                       % {'hostname': node}
+                               mailer.email("Please update DNS used by " \
+                               + node, msg, 
+                               "tech-" + id + "@sites.planet-lab.org")
+                               return 
+       
 
        '''
        Prints, logs, and emails status of up nodes, down nodes, and buckets.
@@ -75,7 +104,64 @@ class Policy(Thread):
        def status(self):
                return 0
 
+       '''
+       Store/Load state of emails.  When, where, what.
+       '''
+       def emailedStore(self, action):
+               try:
+                       if action == "LOAD":
+                               f = open(DAT, "r+")
+                               logger.info("Found and reading " + DAT)
+                               self.emailed.update(pickle.load(f))
+                       if action == "WRITE":
+                               f = open(DAT, "w")
+                               logger.info("Writing " + DAT)
+                               pickle.dump(self.emailed, f)
+                       f.close()
+               except Exception, err:
+                       logger.info("Problem with DAT, %s" %err)
+
        def run(self):
-               #while 1:
-               self.getAllSick()
-               self.emailSick()
+               while 1:
+                       self.emailsick()
+
+'''
+Returns list of nodes in dbg as reported by PLC
+'''
+def NodesDebug():
+       dbgNodes = []
+       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
+       anon = {'AuthMethod': "anonymous"}
+       allnodes = api.AnonAdmGetNodes(anon, [], ['hostname','boot_state'])
+       for node in allnodes:
+               if node['boot_state'] == 'dbg': dbgNodes.append(node['hostname'])
+       logger.info("%s nodes in debug according to PLC." %len(dbgNodes))
+       return dbgNodes
+
+
+
+
+def main():
+       logger.setLevel(logging.DEBUG)
+       ch = logging.StreamHandler()
+       ch.setLevel(logging.DEBUG)
+       formatter = logging.Formatter('%(message)s')
+       ch.setFormatter(formatter)
+       logger.addHandler(ch)
+
+       #print NodesDebug()
+       tmp = Queue.Queue()
+       a = Policy(None, tmp) 
+       a.emailedStore("LOAD")
+       print a.emailed
+
+       os._exit(0)
+if __name__ == '__main__':
+       import os
+       XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'
+       try:
+               main()
+       except KeyboardInterrupt:
+               print "Killed.  Exitting."
+               logger.info('Monitor Killed')
+               os._exit(0)