many improvements.
[monitor.git] / findbad.py
index 9d2758c..c7449d2 100755 (executable)
@@ -11,15 +11,16 @@ import threading
 from monitor import util
 from monitor.util import command
 from monitor import config
-from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
+
+from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+
 from monitor.sources import comon
-from monitor.wrapper import plc
+from monitor.wrapper import plc, plccache
 
-import syncplcdb
 from nodequery import verify,query_to_dict,node_select
 import traceback
 
-print "starting sqlfindbad.py"
+#print "starting sqlfindbad.py"
 # QUERY all nodes.
 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
                                "table=table_nodeview&" + \
@@ -253,29 +254,51 @@ def recordPingAndSSH(request, result):
                        fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
                                                                                                                        if_new_set={'round' : global_round})
 
-                       fbrec = FindbadNodeRecord(
-                                               date_checked=datetime.fromtimestamp(values['date_checked']),
-                                               hostname=nodename,
-                                               loginbase=values['loginbase'],
-                                               kernel_version=values['kernel'],
-                                               bootcd_version=values['bootcd'],
-                                               nm_status=values['nm'],
-                                               fs_status=values['readonlyfs'],
-                                               dns_status=values['dns'],
-                                               princeton_comon_dir=values['princeton_comon'],
-                                               princeton_comon_running=values['princeton_comon_running'],
-                                               princeton_comon_procs=values['princeton_comon_procs'],
-                                               plc_node_stats = values['plcnode'],
-                                               plc_site_stats = values['plcsite'],
-                                               plc_pcuid = values['pcu'],
-                                               comon_stats = values['comonstats'],
-                                               ping_status = (values['ping'] == "PING"),
-                                               ssh_portused = values['sshport'],
-                                               ssh_status = (values['ssh'] == "SSH"),
-                                               ssh_error = values['ssherror'],
-                                               observed_status = values['state'],
-                                       )
+                       # NOTE: This code will either add a new record for the new global_round, 
+                       #               OR it will find the previous value, and update it
+                       #               with new information.
+                       #               The data that is 'lost' is not that important, b/c older
+                       #               history still exists.  
+                       fbrec = FindbadNodeRecord.findby_or_create(
+                                               round=global_round,
+                                               hostname=nodename)
+                       # Debug instrumentation disabled (commented out, matching the
+                       # file's existing style): the 2s sleep per node crippled full runs.
+                       fbrec.flush()
+                       #before = fbrec.to_dict()
+                       #time.sleep(2)
+                       fbrec.set(  date_checked=datetime.fromtimestamp(values['date_checked']),
+                                               loginbase=values['loginbase'],
+                                               kernel_version=values['kernel'],
+                                               bootcd_version=values['bootcd'],
+                                               nm_status=values['nm'],
+                                               fs_status=values['readonlyfs'],
+                                               dns_status=values['dns'],
+                                               princeton_comon_dir=values['princeton_comon'],
+                                               princeton_comon_running=values['princeton_comon_running'],
+                                               princeton_comon_procs=values['princeton_comon_procs'],
+                                               plc_node_stats = values['plcnode'],
+                                               plc_site_stats = values['plcsite'],
+                                               plc_pcuid = values['pcu'],
+                                               comon_stats = values['comonstats'],
+                                               ping_status = (values['ping'] == "PING"),
+                                               ssh_portused = values['sshport'],
+                                               ssh_status = (values['ssh'] == "SSH"),
+                                               ssh_error = values['ssherror'],
+                                               observed_status = values['state'],
+                                               observed_category = values['category'])
+                       #after = fbrec.to_dict()
+                       #print "AFTER , ", after
+
+                       #for v in before.keys():
+                       #       if before[v] == after[v]:
+                       #               print "SAME FOR KEY %s" % v
+                       #       print "%s : %s\t%s" % ( v, before[v], after[v] )
+
+                       fbrec.flush()
                        fbnodesync.round = global_round
+                       fbnodesync.flush()
+                       fbsync.flush()
 
                        count += 1
                        print "%d %s %s" % (count, nodename, values)
@@ -289,6 +312,16 @@ def handle_exception(request, result):
        for i in result:
                print "Result: %s" % i
 
+def probe(hostname):
+       try:
+               (nodename, values) = collectPingAndSSH(hostname, {})
+               recordPingAndSSH(None, (nodename, values))
+               session.flush()
+               return True
+       except Exception:
+               # print_exc() writes the traceback itself and returns None;
+               # 'print traceback.print_exc()' also emitted a spurious "None".
+               traceback.print_exc()
+               return False
 
 def checkAndRecordState(l_nodes, cohash):
        global global_round
@@ -299,8 +332,9 @@ def checkAndRecordState(l_nodes, cohash):
        # CREATE all the work requests
        for nodename in l_nodes:
                fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
-
                node_round   = fbnodesync.round
+               fbnodesync.flush()
+
                if node_round < global_round:
                        # recreate node stats when refreshed
                        #print "%s" % nodename
@@ -332,6 +366,7 @@ def checkAndRecordState(l_nodes, cohash):
 
        print FindbadNodeRecordSync.query.count()
        print FindbadNodeRecord.query.count()
+       session.flush()
 
 def main():
        global global_round
@@ -345,15 +380,17 @@ def main():
                global_round += 1
                fbsync.round = global_round
 
+       fbsync.flush()
+
        cotop = comon.Comon()
        # lastcotop measures whether cotop is actually running.  this is a better
        # metric than sshstatus, or other values from CoMon
        cotop_url = COMON_COTOPURL
 
        # history information for all nodes
-       #cohash = {}
-       cohash = cotop.coget(cotop_url)
-       l_nodes = syncplcdb.create_plcdb()
+       cohash = {}
+       #cohash = cotop.coget(cotop_url)
+       l_nodes = plccache.l_nodes
        if config.nodelist:
                f_nodes = util.file.getListFromFile(config.nodelist)
                l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)