Big change set.
authorStephen Soltesz <soltesz@cs.princeton.edu>
Sat, 4 Apr 2009 00:28:13 +0000 (00:28 +0000)
committerStephen Soltesz <soltesz@cs.princeton.edu>
Sat, 4 Apr 2009 00:28:13 +0000 (00:28 +0000)
added plccache objects for the db rather than pkl
moved SiteInterface into interface.py to make it accessible by the web server
added blacklist support for hostnames and loginbases
fixed pcubad logic
fixed sitebad logic
added new *history functions to controllers.py
updated files to use new cached plc data where possible.

24 files changed:
automate-default.sh
bootman.py
findall.py
findbad.py
findbadpcu.py
grouprins.py [deleted file]
monitor/database/info/__init__.py
monitor/database/info/interface.py [new file with mode: 0644]
monitor/database/info/model.py
monitor/database/info/plc.py [new file with mode: 0644]
monitor/wrapper/plc.py
monitor/wrapper/plccache.py
nodegroups.py
nodeinfo.py
pcubad.py
pcucontrol/models/IPAL.py
policy.py
sitebad.py
siteinfo.py
testapi.py
tests/nodenetwork.py [moved from nodenetwork.py with 100% similarity]
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/static/css/style.css
web/MonitorWeb/monitorweb/templates/pcuview.kid

index 8d67c94..24a9e61 100755 (executable)
@@ -82,10 +82,5 @@ for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags
        fi
 done
 
-############################
-# 5. Check if there are any nodes in dbg state.  Clean up afterward.
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || :
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || :
-
 cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log
 rm -f $MONITOR_PID
index cfc47a1..1a04ef0 100755 (executable)
@@ -25,6 +25,7 @@ from monitor.model import *
 from monitor.common import email_exception, found_within
 from monitor.database.info.model import *
 from monitor.wrapper import plc
+from monitor.wrapper import plccache
 from monitor.wrapper.emailTxt import mailtxt
 
 from pcucontrol.util import command as moncommands
@@ -114,7 +115,7 @@ class NodeConnection:
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True
 
-               plcnode = api.GetNodes({'hostname': self.node}, None)[0]
+               plcnode = plccache.GetNodeByName(self.node)
 
                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
@@ -808,7 +809,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                                print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
                                args = {}
                                try:
-                                       node = api.GetNodes(hostname)[0]
+                                       node = plccache.GetNodeByName(hostname)
                                        net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
                                except:
                                        email_exception()
index 41e23c6..64c4987 100755 (executable)
@@ -6,6 +6,7 @@ from findbadpcu import main as findbadpcu_main
 from sitebad import main as sitebad_main
 from nodebad import main as nodebad_main
 from pcubad import main as pcubad_main
+from monitor.wrapper import plccache
 import sys
 
 if __name__ == '__main__':
@@ -28,6 +29,8 @@ if __name__ == '__main__':
        cfg = parsermodule.parse_args(parser)
 
        try:
+               print "sync with plc"
+               plccache.sync()
                print "findbad"
                findbad_main()
                print "findbadpcu"
index 12b0080..7ae4b13 100755 (executable)
@@ -119,24 +119,24 @@ def main():
                l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
        elif config.nodegroup:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
-               l_nodes = api.GetNodes(ng[0]['node_ids'])
+               l_nodes = plccache.GetNodesByIds(ng[0]['node_ids'])
        elif config.site:
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
        elif config.sitelist:
                site_list = config.sitelist.split(',')
-               sites = api.GetSites(site_list)
+               sites = plccache.GetSitesByName(site_list)
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
-               l_nodes = api.GetNodes(node_ids, ['hostname'])
+               l_nodes = plccache.GetNodesByIds(node_ids)
                
        l_nodes = [node['hostname'] for node in l_nodes]
 
        # perform this query after the above options, so that the filter above
        # does not break.
        if config.nodeselect:
-               plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
+               plcnodes = plccache.l_nodes
                plcnodes = [ node['hostname'] for node in plcnodes ]
                l_nodes = node_select(config.nodeselect, plcnodes, None)
 
index 893c2b7..ab4f5ff 100755 (executable)
@@ -94,8 +94,8 @@ def main():
        api = plc.getAuthAPI()
 
        if config.site is not None:
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
@@ -103,7 +103,7 @@ def main():
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
        elif config.node is not None:
-               l_nodes = api.GetNodes(config.node, ['pcu_ids'])
+               l_nodes = plcacche.GetNodeByName(config.node)
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
@@ -113,12 +113,12 @@ def main():
        elif config.sitelist:
                site_list = config.sitelist.split(',')
 
-               sites = api.GetSites(site_list)
+               sites = plccache.GetSitesByName(site_list)
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
 
-               l_nodes = api.GetNodes(node_ids, ['pcu_ids'])
+               l_nodes = plccache.GetNodesByIds(node_ids)
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
diff --git a/grouprins.py b/grouprins.py
deleted file mode 100755 (executable)
index 5529418..0000000
+++ /dev/null
@@ -1,387 +0,0 @@
-#!/usr/bin/python
-
-# This script is used to manipulate the operational state of nodes in
-# different node groups.  These are basically set operations on nodes via the
-# PLC api.
-# 
-# Take the ng name as an argument....
-# optionally, 
-#  * get a list of nodes in the given nodegroup.
-#  * set some or all in the set to rins.
-#  * restart them all.
-#  * do something else to them all.
-# 
-
-from monitor import config
-from monitor import util
-from monitor import const
-from monitor import database
-from monitor import parser as parsermodule
-from monitor import reboot
-from monitor.database.info.model import *
-from monitor.wrapper import plc
-api = plc.getAuthAPI()
-
-import traceback
-from optparse import OptionParser
-
-from monitor.common import *
-from nodequery import verify,query_to_dict,node_select
-from monitor.model import *
-import os
-
-import time
-
-import bootman                 # debug nodes
-import mailmonitor     # down nodes without pcu
-from monitor.wrapper.emailTxt import mailtxt
-import sys
-
-class Reboot(object):
-       def __init__(self, fbnode):
-               self.fbnode = fbnode
-
-       def _send_pcunotice(self, host):
-               args = {}
-               args['hostname'] = host
-               try:
-                       args['pcu_id'] = plc.getpcu(host)['pcu_id']
-               except:
-                       args['pcu_id'] = host
-                       
-               m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
-                                                                mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
-
-               loginbase = plc.siteId(host)
-               m.send([const.TECHEMAIL % loginbase])
-
-       def pcu(self, host):
-               # TODO: It should be possible to diagnose the various conditions of
-               #               the PCU here, and send different messages as appropriate.
-               print "'%s'" % self.fbnode['pcu']
-               if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
-                       self.action = "reboot.reboot('%s')" % host
-
-                       pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
-                       #pflags.resetRecentFlag('pcutried')
-                       if not pflags.getRecentFlag('pcutried'):
-                               try:
-                                       print "CALLING REBOOT!!!"
-                                       ret = reboot.reboot(host)
-
-                                       pflags.setRecentFlag('pcutried')
-                                       pflags.save()
-                                       return ret
-
-                               except Exception,e:
-                                       email_exception()
-                                       print traceback.print_exc(); print e
-
-                                       # NOTE: this failure could be an implementation issue on
-                                       #               our end.  So, extra notices are confusing...
-                                       # self._send_pcunotice(host) 
-
-                                       pflags.setRecentFlag('pcufailed')
-                                       pflags.save()
-                                       return False
-
-                       elif not pflags.getRecentFlag('pcu_rins_tried'):
-                               try:
-                                       # set node to 'rins' boot state.
-                                       print "CALLING REBOOT +++ RINS"
-                                       plc.nodeBootState(host, 'rins')
-                                       ret = reboot.reboot(host)
-
-                                       pflags.setRecentFlag('pcu_rins_tried')
-                                       pflags.save()
-                                       return ret
-
-                               except Exception,e:
-                                       email_exception()
-                                       print traceback.print_exc(); print e
-
-                                       # NOTE: this failure could be an implementation issue on
-                                       #               our end.  So, extra notices are confusing...
-                                       # self._send_pcunotice(host) 
-
-                                       pflags.setRecentFlag('pcufailed')
-                                       pflags.save()
-                                       return False
-                       else:
-                               # we've tried the pcu recently, but it didn't work,
-                               # so did we send a message about it recently?
-                               if not pflags.getRecentFlag('pcumessagesent'): 
-
-                                       self._send_pcunotice(host)
-
-                                       pflags.setRecentFlag('pcumessagesent')
-                                       pflags.save()
-
-                               # This will result in mail() being called next, to try to
-                               # engage the technical contact to take care of it also.
-                               print "RETURNING FALSE"
-                               return False
-
-               else:
-                       print "NO PCUOK"
-                       self.action = "None"
-                       return False
-
-       def mail(self, host):
-
-               # Reset every 4 weeks or so
-               pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
-               if not pflags.getRecentFlag('endrecord'):
-                       node_end_record(host)
-                       pflags.setRecentFlag('endrecord')
-                       pflags.save()
-
-               # Then in either case, run mailmonitor.reboot()
-               self.action = "mailmonitor.reboot('%s')" % host
-               try:
-                       return mailmonitor.reboot(host)
-               except Exception, e:
-                       email_exception(host)
-                       print traceback.print_exc(); print e
-                       return False
-
-class RebootDebug(Reboot):
-
-       def direct(self, host):
-               self.action = "bootman.reboot('%s', config, None)" % host
-               return bootman.reboot(host, config, None)
-       
-class RebootBoot(Reboot):
-
-       def direct(self, host):
-               self.action = "bootman.reboot('%s', config, 'reboot')" % host
-               return bootman.reboot(host, config, 'reboot')
-
-class RebootDown(Reboot):
-
-       def direct(self, host):
-               self.action = "None"
-               return False    # this always fails, since the node will be down.
-
-def set_node_to_rins(host, fb):
-
-       node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
-       record = {'observation' : node[0], 
-                         'model' : 'USER_REQUEST', 
-                         'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
-                         'time' : time.time()}
-       l = Log(host, record)
-
-       ret = api.UpdateNode(host, {'boot_state' : 'rins'})
-       if ret:
-               # it's nice to see the current status rather than the previous status on the console
-               node = api.GetNodes(host)[0]
-               print l
-               print "%-2d" % (i-1), nodegroup_display(node, fb)
-               return l
-       else:
-               print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
-               return None
-
-
-try:
-       rebootlog = database.dbLoad("rebootlog")
-except:
-       rebootlog = LogRoll()
-
-parser = parsermodule.getParser(['nodesets'])
-parser.set_defaults( timewait=0,
-                                       skip=0,
-                                       rins=False,
-                                       reboot=False,
-                                       findbad=False,
-                                       force=False, 
-                                       nosetup=False, 
-                                       verbose=False, 
-                                       quiet=False,
-                                       )
-
-parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
-                                       help="The select string that must evaluate to true for the node to be considered 'done'")
-parser.add_option("", "--findbad", dest="findbad", action="store_true", 
-                                       help="Re-run findbad on the nodes we're going to check before acting.")
-parser.add_option("", "--force", dest="force", action="store_true", 
-                                       help="Force action regardless of previous actions/logs.")
-parser.add_option("", "--rins", dest="rins", action="store_true", 
-                                       help="Set the boot_state to 'rins' for all nodes.")
-parser.add_option("", "--reboot", dest="reboot", action="store_true", 
-                                       help="Actively try to reboot the nodes, keeping a log of actions.")
-
-parser.add_option("", "--verbose", dest="verbose", action="store_true", 
-                                       help="Extra debug output messages.")
-parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
-                                       help="Do not perform the orginary setup phase.")
-parser.add_option("", "--skip", dest="skip", 
-                                       help="Number of machines to skip on the input queue.")
-parser.add_option("", "--timewait", dest="timewait", 
-                                       help="Minutes to wait between iterations of 10 nodes.")
-
-parser = parsermodule.getParser(['defaults'], parser)
-config = parsermodule.parse_args(parser)
-
-# COLLECT nodegroups, nodes and node lists
-if config.nodegroup:
-       ng = api.GetNodeGroups({'name' : config.nodegroup})
-       nodelist = api.GetNodes(ng[0]['node_ids'])
-       hostnames = [ n['hostname'] for n in nodelist ]
-
-if config.site:
-       site = api.GetSites(config.site)
-       l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
-       hostnames = [ n['hostname'] for n in l_nodes ]
-
-if config.node or config.nodelist:
-       if config.node: hostnames = [ config.node ] 
-       else: hostnames = util.file.getListFromFile(config.nodelist)
-
-fbquery = FindbadNodeRecord.get_all_latest()
-fb_nodelist = [ n.hostname for n in fbquery ]
-
-if config.nodeselect:
-       hostnames = node_select(config.nodeselect, fb_nodelist)
-
-if config.findbad:
-       # rerun findbad with the nodes in the given nodes.
-       file = "findbad.txt"
-       util.file.setFileFromList(file, hostnames)
-       os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
-       # TODO: shouldn't we reload the node list now?
-
-q_blacklist = BlacklistRecord.query.all()
-l_blacklist = [ n.hostname for n in q_blacklist ]
-# commands:
-i = 1
-count = 1
-#print "hosts: %s" % hostnames
-for host in hostnames:
-
-       #if 'echo' in host or 'hptest-1' in host: continue
-
-       try:
-               try:
-                       node = api.GetNodes(host)[0]
-               except:
-                       email_exception()
-                       print traceback.print_exc(); 
-                       print "FAILED GETNODES for host: %s" % host
-                       continue
-                       
-               print "%-2d" % i, nodegroup_display(node, fb)
-               i += 1
-               if i-1 <= int(config.skip): continue
-               if host in l_blacklist:
-                       print "%s is blacklisted.  Skipping." % host
-                       continue
-
-               if config.stopselect:
-                       dict_query = query_to_dict(config.stopselect)
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
-
-                       if verify(dict_query, fbnode) and observed_state != "dbg ":
-                               # evaluates to true, therefore skip.
-                               print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
-                               try:
-                                       # todo: clean up act_all record here.
-                                       # todo: send thank you, etc.
-                                       mailmonitor.reboot(host)
-                               except Exception, e:
-                                       email_exception()
-                                       print traceback.print_exc(); print e
-
-                               continue
-                       #else:
-                               #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
-                               #sys.exit(1)
-
-               if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
-                       print "recently rebooted %s.  skipping... " % host
-                       continue
-
-               if config.reboot:
-
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
-
-                       if       observed_state == "dbg ":
-                               o = RebootDebug(fbnode)
-
-                       elif observed_state == "boot" :
-                               if config.rins:
-                                       l = set_node_to_rins(host, fb)
-                                       if l: rebootlog.add(l)
-
-                               o = RebootBoot(fbnode)
-
-                       elif observed_state == "down":
-                               if config.rins:
-                                       l = set_node_to_rins(host, fb)
-                                       if l: rebootlog.add(l)
-
-                               o = RebootDown(fbnode)
-
-
-                       if o.direct(host):
-                               record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       elif o.pcu(host):
-                               record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       elif o.mail(host):
-                               record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       else:
-                               record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
-                                                 'action' : "log failure",
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-
-                               print "ALL METHODS OF RESTARTING %s FAILED" % host
-                               args = {}
-                               args['hostname'] = host
-                               #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
-                               #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
-                               #m.reset()
-                               #m.send(['monitor-list@lists.planet-lab.org'])
-
-                       l = Log(host, record)
-                       print l
-                       rebootlog.add(l)
-       except KeyboardInterrupt:
-               print "Killed by interrupt"
-               sys.exit(0)
-       except:
-               email_exception()
-               print traceback.print_exc();
-               print "Continuing..."
-
-       time.sleep(1)
-       if count % 10 == 0:
-               print "Saving rebootlog"
-               database.dbDump("rebootlog", rebootlog)
-               wait_time = int(config.timewait)
-               print "Sleeping %d minutes" % wait_time
-               ti = 0
-               print "Minutes slept: ",
-               sys.stdout.flush()
-               while ti < wait_time:
-                       print "%s" % ti,
-                       sys.stdout.flush()
-                       time.sleep(60)
-                       ti = ti+1
-
-       count = count + 1
-
-print "Saving rebootlog"
-database.dbDump("rebootlog", rebootlog)
index 9c3df82..03a1b74 100644 (file)
@@ -44,4 +44,5 @@ Entity.findby_or_create = classmethod(findby_or_create)
 from monitor.database.info.action import *
 from monitor.database.info.findbad import *
 from monitor.database.info.history import *
+from monitor.database.info.plc import *
 setup_all()
diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py
new file mode 100644 (file)
index 0000000..2e5064d
--- /dev/null
@@ -0,0 +1,198 @@
+import bootman                 # debug nodes
+
+from monitor import reboot
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
+from monitor.database.info.model import *
+
+class SiteInterface(HistorySiteRecord):
+       @classmethod
+       def get_or_make(cls, if_new_set={}, **kwargs):
+               if 'hostname' in kwargs:
+                       kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']]
+                       del kwargs['hostname']
+               res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
+               return SiteInterface(res)
+       
+       def __init__(self, sitehist):
+               self.db = sitehist
+
+       def getRecentActions(self, **kwargs):
+               # TODO: make query only return records within a certain time range,
+               # i.e. greater than 0.5 days ago. or 5 days, etc.
+
+               #print "kwargs: ", kwargs
+
+               recent_actions = []
+               if 'loginbase' in kwargs:
+                       recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
+               elif 'hostname' in kwargs:
+                       recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
+               return recent_actions
+       
+       def increasePenalty(self):
+               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
+               self.db.penalty_level += 1
+               # NOTE: this is to prevent overflow or index errors in applyPenalty.
+               #       there's probably a better approach to this.
+               if self.db.penalty_level >= 2:
+                       self.db.penalty_level = 2
+               self.db.penalty_applied = True
+       
+       def applyPenalty(self):
+               penalty_map = [] 
+               penalty_map.append( { 'name': 'noop',                   'enable'   : lambda site: None,
+                                                                                                               'disable'  : lambda site: None } )
+               penalty_map.append( { 'name': 'nocreate',               'enable'   : lambda site: plc.removeSiteSliceCreation(site),
+                                                                                                               'disable'  : lambda site: plc.enableSiteSliceCreation(site) } )
+               penalty_map.append( { 'name': 'suspendslices',  'enable'   : lambda site: plc.suspendSiteSlices(site),
+                                                                                                               'disable'  : lambda site: plc.enableSiteSlices(site) } )
+
+               for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
+                       print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+                       penalty_map[i]['disable'](self.db.loginbase) 
+
+               for i in range(0,self.db.penalty_level+1):
+                       print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+                       penalty_map[i]['enable'](self.db.loginbase)
+
+               return
+
+       def pausePenalty(self):
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       action='penalty',
+                                                       action_type='pause_penalty',)
+       
+       def clearPenalty(self):
+               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',)
+               self.db.penalty_level = 0
+               self.db.penalty_applied = False
+       
+       def getTicketStatus(self):
+               if self.db.message_id != 0:
+                       rtstatus = mailer.getTicketStatus(self.db.message_id)
+                       self.db.message_status = rtstatus['Status']
+                       self.db.message_queue = rtstatus['Queue']
+                       self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+       def setTicketStatus(self, status):
+               print 'SETTING status %s' % status
+               if self.db.message_id != 0:
+                       rtstatus = mailer.setTicketStatus(self.db.message_id, status)
+
+       def getContacts(self):
+               contacts = []
+               if self.db.penalty_level >= 0:
+                       contacts += plc.getTechEmails(self.db.loginbase)
+
+               if self.db.penalty_level >= 1:
+                       contacts += plc.getPIEmails(self.db.loginbase)
+
+               if self.db.penalty_level >= 2:
+                       contacts += plc.getSliceUserEmails(self.db.loginbase)
+
+               return contacts
+
+       def sendMessage(self, type, **kwargs):
+
+               # NOTE: evidently changing an RT message's subject opens the ticket.
+               #       the logic in this policy depends upon a ticket only being 'open'
+        #       if a user has replied to it.
+        #       So, to preserve these semantics, we check the status before
+        #           sending, then after sending, reset the status to the
+        #           previous status.
+        #       There is a very tiny race here, where a user sends a reply
+        #           within the time it takes to check, send, and reset.
+        #       This sucks.  It's almost certainly fragile.
+
+               # 
+               # TODO: catch any errors here, and add an ActionRecord that contains
+               #       those errors.
+               
+               args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
+               args.update(kwargs)
+
+               hostname = None
+               if 'hostname' in args:
+                       hostname = args['hostname']
+
+               if hasattr(mailtxt, type):
+
+                       message = getattr(mailtxt, type)
+                       viart = True
+                       if 'viart' in kwargs:
+                               viart = kwargs['viart']
+
+                       if viart:
+                               self.getTicketStatus()          # get current message status
+
+                       m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
+
+                       contacts = self.getContacts()
+                       contacts = [config.cc_email]    # TODO: remove after testing...
+
+                       print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)
+
+                       ret = m.send(contacts)
+                       if viart:
+                               self.db.message_id = ret
+                               # reset to previous status, since a new subject 'opens' RT tickets.
+                               self.setTicketStatus(self.db.message_status) 
+
+                               # NOTE: only make a record of it if it's in RT.
+                               act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', 
+                                                               action_type=type, message_id=self.db.message_id)
+
+               else:
+                       print "+-- WARNING! ------------------------------"
+                       print "| No such message name in emailTxt.mailtxt: %s" % type
+                       print "+------------------------------------------"
+
+               return
+
+       def closeTicket(self):
+               # TODO: close the rt ticket before overwriting the message_id
+               mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
+               act = ActionRecord(loginbase=self.db.loginbase, action='notice', 
+                                                       action_type='close_ticket', message_id=self.db.message_id)
+               self.db.message_id = 0
+               self.db.message_status = "new"
+
+       def runBootManager(self, hostname):
+               print "attempting BM reboot of %s" % hostname
+               ret = ""
+               try:
+                       ret = bootman.restore(self, hostname)
+                       err = ""
+               except:
+                       err = traceback.format_exc()
+                       print err
+
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       hostname=hostname,
+                                                       action='reboot',
+                                                       action_type='bootmanager_restore',
+                                                       error_string=err)
+               return ret
+
+       def attemptReboot(self, hostname):
+               print "attempting PCU reboot of %s" % hostname
+               err = ""
+               try:
+                       ret = reboot.reboot_str(hostname)
+               except Exception, e:
+                       err = traceback.format_exc()
+                       ret = str(e)
+
+               if ret == 0 or ret == "0":
+                       ret = ""
+
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       hostname=hostname,
+                                                       action='reboot',
+                                                       action_type='first_try_reboot',
+                                                       error_string=err)
+
index 151f428..c538c66 100644 (file)
@@ -1,4 +1,5 @@
 from monitor.database.info.action import *
 from monitor.database.info.findbad import *
 from monitor.database.info.history import *
+from monitor.database.info.plc import *
 from monitor.database.dborm import mon_session as session
diff --git a/monitor/database/info/plc.py b/monitor/database/info/plc.py
new file mode 100644 (file)
index 0000000..0847057
--- /dev/null
@@ -0,0 +1,33 @@
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all
+from elixir import PickleType, String, Integer, DateTime, Boolean
+from elixir.ext.versioned import *
+
+from datetime import datetime,timedelta
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__  = mon_session
+
+class PlcSite(Entity):
+       """Database row holding one site record, keyed by site_id."""
+       site_id = Field(Integer,primary_key=True)
+       loginbase = Field(String,default=None)
+       # the default is the callable datetime.now, not a fixed timestamp
+       date_checked = Field(DateTime,default=datetime.now)
+
+       # pickled payload for the site (presumably the dict returned by the
+       # PLC API -- confirm against the code that fills this table)
+       plc_site_stats = Field(PickleType,default=None)
+       # keep a version history of this entity; date_checked is on the ignore list
+       acts_as_versioned(ignore=['date_checked'])
+
+class PlcNode(Entity):
+       """Database row holding one node record, keyed by node_id."""
+       node_id = Field(Integer,primary_key=True)
+       hostname = Field(String,default=None)
+       # the default is the callable datetime.now, not a fixed timestamp
+       date_checked = Field(DateTime,default=datetime.now)
+
+       # pickled payload for the node (presumably the dict returned by the
+       # PLC API -- confirm against the code that fills this table)
+       plc_node_stats = Field(PickleType,default=None)
+       # keep a version history of this entity; date_checked is on the ignore list
+       acts_as_versioned(ignore=['date_checked'])
+
+class PlcPCU(Entity):
+       """Database row holding one PCU record, keyed by pcu_id."""
+       pcu_id = Field(Integer,primary_key=True)
+       # the default is the callable datetime.now, not a fixed timestamp
+       date_checked = Field(DateTime,default=datetime.now)
+
+       # pickled payload for the PCU (presumably the dict returned by the
+       # PLC API -- confirm against the code that fills this table)
+       plc_pcu_stats = Field(PickleType,default=None)
+       # keep a version history of this entity; date_checked is on the ignore list
+       acts_as_versioned(ignore=['date_checked'])
index d2d627f..2f0f19d 100644 (file)
@@ -17,8 +17,12 @@ from monitor import database
 try:
        from monitor import config
        debug = config.debug
+       XMLRPC_SERVER=config.API_SERVER
 except:
        debug = False
+       # NOTE: this host is used by default when there are no auth files.
+       XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
+
 logger = logging.getLogger("monitor")
        
 class Auth:
@@ -34,8 +38,6 @@ class Auth:
                                                        'AuthMethod' : 'password',
                                                        'AuthString' : password}
 
-# NOTE: this host is used by default when there are no auth files.
-XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
 
 # NOTE: by default, use anonymous access, but if auth files are 
 #       configured, use them, with their auth definitions.
@@ -54,8 +56,6 @@ except:
                auth = Auth()
                auth.server = XMLRPC_SERVER
 
-api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
-
 global_error_count = 0
 
 class PLC:
@@ -84,6 +84,8 @@ class PLC:
        def __repr__(self):
                return self.api.__repr__()
 
+api = PLC(auth.auth, auth.server)
+
 class CachedPLC(PLC):
 
        def _param_to_str(self, name, *params):
index db71b16..0645b18 100755 (executable)
@@ -2,8 +2,7 @@
 
 import sys
 from monitor.wrapper import plc
-from monitor import database
-from monitor import config
+from monitor.database.info.model import *
 
 def dsites_from_lsites(l_sites):
        d_sites = {}
@@ -53,98 +52,107 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                hn2lb[hostname] = login_base
        return (dsn, hn2lb, lb2hn)
 
-def create_netid2ip(l_nodes, l_nodenetworks):
-       netid2ip = {}
-       for node in l_nodes:
-               for netid in node['nodenetwork_ids']:
-                       found = False
-                       for nn in l_nodenetworks:
-                               if nn['nodenetwork_id'] == netid:
-                                       found = True
-                                       netid2ip[netid] = nn['ip']
-                       if not found:
-                               print "ERROR! %s" % node
-
-       return netid2ip
-
 l_sites = None
 l_nodes = None
 l_pcus = None
-l_nodenetworks = None
 
 plcdb_hn2lb = None
 plcdb_lb2hn = None
-plcdb_netid2ip = None
 plcdb_id2lb = None
 
 def init():
+       # Load every PlcSite, PlcNode and PlcPCU row from the database into the
+       # module-level l_sites / l_nodes / l_pcus lists, then rebuild the
+       # plcdb_hn2lb, plcdb_lb2hn and plcdb_id2lb lookup maps from them.
+       # No PLC API call is made here; the data is whatever the tables hold.
        global l_sites
        global l_nodes
        global l_pcus
-       global l_nodenetworks
        global plcdb_hn2lb
        global plcdb_lb2hn
-       global plcdb_netid2ip
        global plcdb_id2lb
 
-       api = plc.getCachedAuthAPI()
-       l_sites = api.GetSites({'peer_id':None}, 
-                                                       ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
-                                                       'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled', 'date_created' ])
-       l_nodes = api.GetNodes({'peer_id':None}, 
-                                                       ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', 
-                                                        'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
-       l_pcus = api.GetPCUs()
-       l_nodenetworks = api.GetNodeNetworks()
+       # each list holds the unpickled *_stats payload of every row
+       dbsites = PlcSite.query.all()
+       l_sites = [ s.plc_site_stats for s in dbsites ]
+
+       dbnodes = PlcNode.query.all()
+       l_nodes = [ s.plc_node_stats for s in dbnodes ]
+
+       dbpcus = PlcPCU.query.all()
+       l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
 
        (d_sites,id2lb) = dsites_from_lsites(l_sites)
        (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
-       netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
 
        plcdb_hn2lb = hn2lb
        plcdb_lb2hn = lb2hn
-       plcdb_netid2ip = netid2ip
        plcdb_id2lb = id2lb
        
-       return l_nodes
-
-
-def create_plcdb():
-
-       # get sites, and stats
-       l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude', 
-                                                                                         'max_slices', 'slice_ids', 'node_ids' ])
-       if len(l_sites) == 0:
-               print "no sites! exiting..."
-               sys.exit(1)
-       (d_sites,id2lb) = dsites_from_lsites(l_sites)
+       # nothing is returned any more; callers read the module-level names
+       return
+
+def GetNodesByIds(ids):
+       ret = []
+       for node_id in ids:
+               node = PlcNode.get_by(node_id=node_id)
+               ret.append(node.plc_node_stats)
+       return ret
+
+def GetNodesBySite(loginbase):
+       site = PlcSite.get_by(loginbase=loginbase)
+       return GetNodesByIds(site.plc_site_stats['node_ids'])
+
+def GetNodeByName(hostname):
+       node = PlcNode.get_by(hostname=hostname)
+       return node.plc_node_stats
+
+def GetSitesByName(sitelist):
+       ret = []
+       for site in sitelist:
+               site = PlcSite.get_by(loginbase=site)
+               ret.append(site.plc_site_stats)
+       return ret
+
+def sync():
+       # Refresh the database cache from PLC: fetch sites, nodes and PCUs
+       # through plc.api, store each record in its PlcSite / PlcNode / PlcPCU
+       # row (found or created by primary id), flush after each group, and
+       # finally call init() to reload the module-level lists and maps.
+       # Rows for records PLC no longer returns are not removed (see TODOs).
+       l_sites = plc.api.GetSites({'peer_id':None}, 
+                                               ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
+                                               'longitude', 'max_slices', 'slice_ids', 'node_ids', 
+                                               'enabled', 'date_created' ])
+       l_nodes = plc.api.GetNodes({'peer_id':None}, 
+                                               ['hostname', 'node_id', 'ports', 'site_id', 
+                                                'version', 'last_updated', 'date_created', 
+                                                'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       l_pcus = plc.api.GetPCUs()
+
+       print "sync sites"
+       for site in l_sites:
+               dbsite = PlcSite.findby_or_create(site_id=site['site_id'])
+               dbsite.loginbase = site['login_base']
+               dbsite.date_checked = datetime.now()
+               dbsite.plc_site_stats = site
+               #dbsite.flush()
+       # TODO: delete old records.
+       session.flush()
+
+       print "sync nodes"
+       for node in l_nodes:
+               dbnode = PlcNode.findby_or_create(node_id=node['node_id'])
+               dbnode.hostname = node['hostname']
+               dbnode.date_checked = datetime.now()
+               dbnode.plc_node_stats = node
+               #dbnode.flush()
+       # TODO: delete old records.
+       session.flush()
+
+       print "sync pcus"
+       for pcu in l_pcus:
+               dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+               dbpcu.date_checked = datetime.now()
+               dbpcu.plc_pcu_stats = pcu
+               #dbpcu.flush()
+       # TODO: delete old records.
+       session.flush()
 
-       # get nodes at each site, and 
-       l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'version', 
-                                                 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       # reload the in-memory lists and lookup maps from the rows just written
+       init()
 
-       l_nodenetworks = plc.getNodeNetworks()
-       (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
-       netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
-
-       # save information for future.
-       id2lb = id2lb
-       hn2lb = hn2lb
-       db = plcdb
-
-       if ('cachenodes' in dir(config) and config.cachenodes) or \
-               'cachenodes' not in dir(config):
-               database.dbDump("plcdb_hn2lb", hn2lb)
-               database.dbDump("plcdb_lb2hn", lb2hn)
-               database.dbDump("plcdb_netid2ip", netid2ip)
-               database.dbDump("l_plcnodenetworks", l_nodenetworks)
-               database.dbDump("l_plcnodes", l_nodes)
-               database.dbDump("l_plcsites", l_sites)
-       
-       return l_nodes
+       return
 
 if __name__ == '__main__':
-       create_plcdb()
+       # run as a script: refresh the database cache from PLC (sync() ends
+       # by calling init() itself)
+       sync()
 else:
-       #print "calling plccache init()"
+       # imported as a module: importing has the side effect of querying the
+       # database to fill the module-level lists and lookup maps
        init()
index 056f5b8..999902f 100755 (executable)
@@ -59,16 +59,15 @@ def main():
                # given to GetNodes
                nodelist = []
                for h in hostlist:
-                       nodelist += api.GetNodes(h)
+                       nodelist.append( plccache.GetNodeByName(h) )
 
-               #nodelist = api.GetNodes(hostlist)
                group_str = "Given"
 
        elif config.site:
-               site = api.GetSites(config.site)
+               site = plccache.GetSitesByName([config.site])
                if len (site) > 0:
                        site = site[0]
-                       nodelist = api.GetNodes(site['node_ids'])
+                       nodelist = plccache.GetNodesByIds(site['node_ids'])
                else:
                        nodelist = []
 
@@ -76,13 +75,13 @@ def main():
 
        elif config.nodeselect:
                hostlist = node_select(config.nodeselect)
-               nodelist = api.GetNodes(hostlist)
+               nodelist = [ plccache.GetNodeByName(h) for h in hostlist ]
 
                group_str = "selection"
                
        else:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
-               nodelist = api.GetNodes(ng[0]['node_ids'])
+               nodelist = plccache.GetNodesByIds(ng[0]['node_ids'])
 
                group_str = config.nodegroup
 
@@ -91,7 +90,7 @@ def main():
                ng_nodes = nodelist
 
                # Get all nodes
-               all_nodes = api.GetNodes({'peer_id': None})
+               all_nodes = plccache.l_nodes
                
                # remove ngnodes from all node list
                ng_list = [ x['hostname'] for x in ng_nodes ]
index a237a8c..726f250 100755 (executable)
@@ -140,7 +140,7 @@ if config.findbad:
 for node in config.args:
        config.node = node
 
-       plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0]
+       plc_nodeinfo = plccache.GetNodeByName(config.node)
        fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) 
        fb_nodeinfo = fb_noderec.to_dict()
        plc_print_nodeinfo(plc_nodeinfo)
index 16a8f4f..9f0468c 100755 (executable)
--- a/pcubad.py
+++ b/pcubad.py
@@ -31,8 +31,8 @@ def main2(config):
 
        l_pcus = None
        if config.site is not None:
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
@@ -40,7 +40,7 @@ def main2(config):
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
        elif config.node:
-               l_nodes = api.GetNodes(config.node, ['pcu_ids'])
+               l_nodes = plccache.GetNodeByName(config.node)
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
@@ -72,7 +72,7 @@ def check_pcu_state(rec, pcu):
                pcu.status = 'offline'
                pcu.last_changed = datetime.now()
 
-       if ( pcu_state == 0 or pcu_state == "0" ) and changed_lessthan(pcu.last_changed, 0.5) and pcu.status != 'online':
+       if ( pcu_state == 0 or pcu_state == "0" ) and pcu.status not in [ 'online', 'good' ]:
                print "changed status from %s to online" % pcu.status
                pcu.status = 'online'
                pcu.last_changed = datetime.now()
index 1f52190..48394df 100644 (file)
@@ -78,7 +78,9 @@ class IPAL(PCUControl):
                        s.close()
                        if e[0] == errno.ECONNREFUSED:
                                # cannot connect to remote host
-                               raise Exception(e[1])
+                               raise ExceptionNotFound(e[1])
+                       elif e[0] == errno.ETIMEDOUT:
+                               raise ExceptionTimeout(e[1])
                        else:
                                # TODO: what other conditions are there?
                                raise Exception(e)
index fcbbb94..4befbd9 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -18,229 +18,31 @@ import traceback
 import sys
 from optparse import OptionParser
 
-import bootman                 # debug nodes
-
-from monitor import util
-from monitor import const
-from monitor import reboot
 from monitor import config
-from monitor import database
 from monitor import parser as parsermodule
 from monitor.common import *
 from monitor.model import *
 from monitor.wrapper import plc
 from monitor.wrapper import plccache
-from monitor.wrapper.emailTxt import mailtxt
 from monitor.database.info.model import *
+from monitor.database.info.interface import *
 
 from nodequery import verify,query_to_dict,node_select
 
 api = plc.getAuthAPI()
 
-
-class SiteInterface(HistorySiteRecord):
-       @classmethod
-       def get_or_make(cls, if_new_set={}, **kwargs):
-               if 'hostname' in kwargs:
-                       kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']]
-                       del kwargs['hostname']
-               res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
-               return SiteInterface(res)
-       
-       def __init__(self, sitehist):
-               self.db = sitehist
-
-       def getRecentActions(self, **kwargs):
-               # TODO: make query only return records within a certin time range,
-               # i.e. greater than 0.5 days ago. or 5 days, etc.
-
-               #print "kwargs: ", kwargs
-
-               recent_actions = []
-               if 'loginbase' in kwargs:
-                       recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
-               elif 'hostname' in kwargs:
-                       recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
-               return recent_actions
-       
-       def increasePenalty(self):
-               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
-               self.db.penalty_level += 1
-               # NOTE: this is to prevent overflow or index errors in applyPenalty.
-               #       there's probably a better approach to this.
-               if self.db.penalty_level >= 2:
-                       self.db.penalty_level = 2
-               self.db.penalty_applied = True
-       
-       def applyPenalty(self):
-               penalty_map = [] 
-               penalty_map.append( { 'name': 'noop',                   'enable'   : lambda site: None,
-                                                                                                               'disable'  : lambda site: None } )
-               penalty_map.append( { 'name': 'nocreate',               'enable'   : lambda site: plc.removeSiteSliceCreation(site),
-                                                                                                               'disable'  : lambda site: plc.enableSiteSliceCreation(site) } )
-               penalty_map.append( { 'name': 'suspendslices',  'enable'   : lambda site: plc.suspendSiteSlices(site),
-                                                                                                               'disable'  : lambda site: plc.enableSiteSlices(site) } )
-
-               for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
-                       print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
-                       penalty_map[i]['disable'](self.db.loginbase) 
-
-               for i in range(0,self.db.penalty_level+1):
-                       print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
-                       penalty_map[i]['enable'](self.db.loginbase)
-
-               return
-
-       def pausePenalty(self):
-               act = ActionRecord(loginbase=self.db.loginbase,
-                                                       action='penalty',
-                                                       action_type='pause_penalty',)
-       
-       def clearPenalty(self):
-               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',)
-               self.db.penalty_level = 0
-               self.db.penalty_applied = False
-       
-       def getTicketStatus(self):
-               if self.db.message_id != 0:
-                       rtstatus = mailer.getTicketStatus(self.db.message_id)
-                       self.db.message_status = rtstatus['Status']
-                       self.db.message_queue = rtstatus['Queue']
-                       self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])
-
-       def setTicketStatus(self, status):
-               print 'SETTING status %s' % status
-               if self.db.message_id != 0:
-                       rtstatus = mailer.setTicketStatus(self.db.message_id, status)
-
-       def getContacts(self):
-               contacts = []
-               if self.db.penalty_level >= 0:
-                       contacts += plc.getTechEmails(self.db.loginbase)
-
-               if self.db.penalty_level >= 1:
-                       contacts += plc.getPIEmails(self.db.loginbase)
-
-               if self.db.penalty_level >= 2:
-                       contacts += plc.getSliceUserEmails(self.db.loginbase)
-
-               return contacts
-
-       def sendMessage(self, type, **kwargs):
-
-               # NOTE: evidently changing an RT message's subject opens the ticket.
-               #       the logic in this policy depends up a ticket only being 'open'
-        #       if a user has replied to it.
-        #       So, to preserve these semantics, we check the status before
-        #           sending, then after sending, reset the status to the
-        #           previous status.
-        #       There is a very tiny race here, where a user sends a reply
-        #           within the time it takes to check, send, and reset.
-        #       This sucks.  It's almost certainly fragile.
-
-               # 
-               # TODO: catch any errors here, and add an ActionRecord that contains
-               #       those errors.
-               
-               args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
-               args.update(kwargs)
-
-               hostname = None
-               if 'hostname' in args:
-                       hostname = args['hostname']
-
-               if hasattr(mailtxt, type):
-
-                       message = getattr(mailtxt, type)
-                       viart = True
-                       if 'viart' in kwargs:
-                               viart = kwargs['viart']
-
-                       if viart:
-                               self.getTicketStatus()          # get current message status
-
-                       m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
-
-                       contacts = self.getContacts()
-                       contacts = [config.cc_email]    # TODO: remove after testing...
-
-                       print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)
-
-                       ret = m.send(contacts)
-                       if viart:
-                               self.db.message_id = ret
-                               # reset to previous status, since a new subject 'opens' RT tickets.
-                               self.setTicketStatus(self.db.message_status) 
-
-                               # NOTE: only make a record of it if it's in RT.
-                               act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', 
-                                                               action_type=type, message_id=self.db.message_id)
-
-               else:
-                       print "+-- WARNING! ------------------------------"
-                       print "| No such message name in emailTxt.mailtxt: %s" % type
-                       print "+------------------------------------------"
-
-               return
-
-       def closeTicket(self):
-               # TODO: close the rt ticket before overwriting the message_id
-               mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
-               act = ActionRecord(loginbase=self.db.loginbase, action='notice', 
-                                                       action_type='end_notice', message_id=self.db.message_id)
-               self.db.message_id = 0
-               self.db.message_status = "new"
-
-       def runBootManager(self, hostname):
-               print "attempting BM reboot of %s" % hostname
-               ret = ""
-               try:
-                       ret = bootman.restore(self, hostname)
-                       err = ""
-               except:
-                       err = traceback.format_exc()
-                       print err
-
-               act = ActionRecord(loginbase=self.db.loginbase,
-                                                       hostname=hostname,
-                                                       action='reboot',
-                                                       action_type='bootmanager_restore',
-                                                       error_string=err)
-               return ret
-
-       def attemptReboot(self, hostname):
-               print "attempting PCU reboot of %s" % hostname
-               err = ""
-               try:
-                       ret = reboot.reboot_str(hostname)
-               except Exception, e:
-                       err = traceback.format_exc()
-                       ret = str(e)
-
-               if ret == 0 or ret == "0":
-                       ret = ""
-
-               act = ActionRecord(loginbase=self.db.loginbase,
-                                                       hostname=hostname,
-                                                       action='reboot',
-                                                       action_type='first_try_reboot',
-                                                       error_string=err)
-
 def logic():
+       # Sets the PLC boot state of 'host' to 'rins' and then calls
+       # node_end_record() for it.
+       # NOTE(review): 'host' is neither a parameter nor assigned in this
+       # function, and no module-level 'host' is visible in this view; calling
+       # this as written looks like it would raise NameError -- confirm.
 
        plc.nodeBootState(host, 'rins')
        node_end_record(host)
 
-
-
-
 def main(hostnames, sitenames):
        # commands:
        i = 1
        node_count = 1
        site_count = 1
        #print "hosts: %s" % hostnames
-       for host in hostnames:
+       for i,host in enumerate(hostnames):
                try:
                        lb = plccache.plcdb_hn2lb[host]
                except:
@@ -259,13 +61,13 @@ def main(hostnames, sitenames):
 
                nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
 
-               print "%s %s" % ( nodehist.hostname, nodehist.status)
+               print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
                if nodehist.status == 'good' and \
                        changed_lessthan(nodehist.last_changed, 1.0) and \
                        not found_within(recent_actions, 'online_notice', 0.5):
                                # NOTE: there is a narrow window in which this command must be
                                # evaluated, otherwise the notice will not go out.  this is not ideal.
-                               sitehist.sendMessage('online_notice', hostname=host)
+                               sitehist.sendMessage('online_notice', hostname=host, viart=False)
                                print "send message for host %s online" % host
 
                                pass
@@ -314,15 +116,19 @@ def main(hostnames, sitenames):
                node_count = node_count + 1
                session.flush()
 
-       for site in sitenames:
+       for i,site in enumerate(sitenames):
                sitehist = SiteInterface.get_or_make(loginbase=site)
+               siteblack = BlacklistRecord.get_by(loginbase=site)
+
+               if siteblack and not siteblack.expired():
+                       print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
+                       continue
+
                # TODO: make query only return records within a certin time range,
                #               i.e. greater than 0.5 days ago. or 5 days, etc.
                recent_actions = sitehist.getRecentActions(loginbase=site)
 
-               #sitehist.sendMessage('test_notice', host)
-
-               print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status)
+               print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
                if sitehist.db.status == 'down':
                        if  not found_within(recent_actions, 'pause_penalty', 30) and \
                                not found_within(recent_actions, 'increase_penalty', 7) and \
@@ -375,8 +181,7 @@ if __name__ == "__main__":
                                                force=False, 
                                                nosetup=False, 
                                                verbose=False, 
-                                               quiet=False,
-                                               )
+                                               quiet=False,)
 
        parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
                                                help="The select string that must evaluate to true for the node to be considered 'done'")
@@ -401,12 +206,6 @@ if __name__ == "__main__":
        parser = parsermodule.getParser(['defaults'], parser)
        config = parsermodule.parse_args(parser)
 
-#      # COLLECT nodegroups, nodes and node lists
-#      if config.nodegroup:
-#              ng = api.GetNodeGroups({'name' : config.nodegroup})
-#              nodelist = api.GetNodes(ng[0]['node_ids'])
-#              hostnames = [ n['hostname'] for n in nodelist ]
-
        fbquery = HistoryNodeRecord.query.all()
        hostnames = [ n.hostname for n in fbquery ]
        
@@ -416,8 +215,7 @@ if __name__ == "__main__":
        if config.site:
                # TODO: replace with calls to local db.  the api fails so often that
                #               these calls should be regarded as unreliable.
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+               l_nodes = plccache.GetNodesBySite(config.site)
                filter_hostnames = [ n['hostname'] for n in l_nodes ]
 
                hostnames = filter(lambda x: x in filter_hostnames, hostnames)
index cf5ab4e..4d9ee33 100755 (executable)
@@ -74,7 +74,7 @@ def check_site_state(rec, sitehist):
                        print "changed status from %s to good" % sitehist.status
                        sitehist.status = 'good'
 
-       if not sitehist.new:
+       elif not sitehist.new:
        
                if sitehist.status != 'offline' and sitehist.status != 'down':
                        sitehist.last_changed = datetime.now()
index 6fe6496..4b4daf7 100755 (executable)
@@ -62,7 +62,7 @@ def plc_print_siteinfo(plcsite):
                         diff_time(plcsite['last_updated']))
 
        print ""
-       nodes = api.GetNodes(plcsite['node_ids'])
+       nodes = plccache.GetNodesByIds(plcsite['node_ids'])
        print "   Checked: %s" % time.ctime()
        print "\t                               host     | state | obs   |   created   |   updated   | last_contact "
        for plcnode in nodes:
@@ -79,7 +79,7 @@ act_all = database.dbLoad("act_all")
 for site in config.args:
        config.site = site
 
-       plc_siteinfo = api.GetSites({'login_base': config.site})[0]
+       plc_siteinfo = plccache.GetSitesByName([config.site])
        url = "https://www.planet-lab.org/db/sites/index.php?site_pattern="
        plc_siteinfo['url'] = url + plc_siteinfo['login_base']
 
@@ -87,7 +87,7 @@ for site in config.args:
                # rerun findbad with the nodes in the given nodes.
                import os
                file = "findbad.txt"
-               nodes = api.GetNodes(plc_siteinfo['node_ids'], ['hostname'])
+               nodes = plccache.GetNodesByIds(plc_siteinfo['node_ids'])
                nodes = [ n['hostname'] for n in nodes ]
                util.file.setFileFromList(file, nodes)
                os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
index f473d4b..d60effb 100755 (executable)
@@ -16,5 +16,5 @@ try:
                network = api.GetNodeNetworks(node['nodenetwork_ids'])
        print "ok"
 except:
-       sys.stderr.write(traceback.print_exc())
+       sys.stderr.write(traceback.format_exc())
        print "fail"
similarity index 100%
rename from nodenetwork.py
rename to tests/nodenetwork.py
index 337139f..1c4efe9 100644 (file)
@@ -11,6 +11,7 @@ from monitor.database.info.model import *
 from monitor.database.zabbixapi.model import *
 from monitor.database.dborm import zab_session as session
 from monitor.database.dborm import zab_metadata as metadata
+from monitor_xmlrpc import MonitorXmlrpcServer
 
 from monitor import reboot
 from monitor import scanapi
@@ -149,7 +150,7 @@ def prep_node_for_display(node):
 
 
 
-class Root(controllers.RootController):
+class Root(controllers.RootController, MonitorXmlrpcServer):
        @expose(template="monitorweb.templates.welcome")
        def index(self):
                import time
@@ -382,6 +383,32 @@ class Root(controllers.RootController):
                        
                return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions, exceptions=exceptions)
 
+       @expose(template="monitorweb.templates.nodehistory")
+       def nodehistory(self, hostname=None):
+               """Return the last 100 recorded versions of hostname's findbad record, newest first."""
+               query = []
+               # get_by() yields None for an unknown hostname; show an empty history instead of failing.
+               fbnode = hostname and FindbadNodeRecord.get_by(hostname=hostname) or None
+               if fbnode is not None:
+                       # TODO: add links for earlier history if desired.
+                       for node in reversed(fbnode.versions[-100:]):
+                               prep_node_for_display(node)
+                               query.append(node)
+               return dict(query=query, hostname=hostname)
+
+       @expose(template="monitorweb.templates.sitehistory")
+       def sitehistory(self, loginbase=None):
+               """Return the last 100 recorded versions of loginbase's site history record, newest first."""
+               query = []
+               # get_by() yields None for an unknown loginbase; show an empty history instead of failing.
+               fbsite = loginbase and HistorySiteRecord.get_by(loginbase=loginbase) or None
+               if fbsite is not None:
+                       # TODO: add links for earlier history if desired.
+                       for site in reversed(fbsite.versions[-100:]):
+                               query.append(site)
+               return dict(query=query, loginbase=loginbase)
+
+
        @expose(template="monitorweb.templates.pculist")
        def pcu(self, filter='all'):
                import time
@@ -441,8 +468,10 @@ class Root(controllers.RootController):
                                filtercount['new'] += 1
                        elif not site.enabled:
                                filtercount['pending'] += 1
-                       else:
-                               filtercount[site.status] += 1
+                       elif site.status in ['good', 'online']:
+                               filtercount['good'] += 1
+                       elif site.status in ['down', 'offline']:
+                               filtercount['down'] += 1
 
                        # apply filter
                        if filter == "all":
@@ -451,7 +480,9 @@ class Root(controllers.RootController):
                                query.append(site)
                        elif filter == "pending" and not site.enabled:
                                query.append(site)
-                       elif filter == site.status:
+                       elif filter == 'good' and site.status in ['good', 'online']:
+                               query.append(site)
+                       elif filter == 'down' and site.status in ['down', 'offline']:
                                query.append(site)
                                
                return dict(query=query, fc=filtercount)
index 40a1691..4367a0a 100644 (file)
@@ -108,7 +108,7 @@ a.right { float: right; }
 #site-offline { background-color: red; }\r
 #site-down { background-color: indianred; }\r
 \r
-#site-0 { background-color : white; }\r
+/*#site-0 { background-color : white; }*/\r
 #site-1 { background-color: gold; }\r
 #site-2 { background-color: indianred; }\r
 \r
index e51c743..fc471d9 100644 (file)
@@ -16,6 +16,7 @@ from links import *
                <table py:if="len(sitequery) > 0" id="sub-table" border="1" width="100%">
                        <thead>
                                <tr>
+                                       <th>History</th>
                                        <th>Site name</th>
                                        <th>Enabled</th>
                                        <th>Penalty</th>
@@ -26,6 +27,7 @@ from links import *
                        </thead>
                        <tbody>
                                <tr py:for="i,site in enumerate(sitequery)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td><a href="sitehistory?loginbase=${site.loginbase}">history</a></td>
                                        <td nowrap="true"><a class="ext-link" href="${plc_site_uri(site.loginbase)}">
                                                        <span class="icon">${site.loginbase}</span></a>
                                        </td>
@@ -131,7 +133,7 @@ from links import *
                </table>
                                </span> </a>
        </div>
-       <h3>Nodes</h3>
+       <h3>Nodes</h3> 
                <p py:if="len(nodequery) == 0">
                        There are no registered nodes for this site.
                </p>
@@ -139,6 +141,7 @@ from links import *
                        <thead>
                                <tr>
                                        <th mochi:format="int"></th>
+                                       <th>History</th>
                                        <th>Hostname</th>
                                        <th>last_contact</th>
                                        <th>last_checked</th>
@@ -151,6 +154,7 @@ from links import *
                        <tbody>
                                <tr py:for="i,node in enumerate(nodequery)" class="${i%2 and 'odd' or 'even'}" >
                                        <td></td>
+                                       <td><a href="nodehistory?hostname=${node.hostname}">history</a></td>
                                        <td id="node-${node.observed_status}" nowrap="true" >
                                                <a class="ext-link" href="${plc_node_uri(node.hostname)}">
                                                        <span class="icon">${node.hostname}</span></a>