from monitor import util
from monitor.util import command
from monitor import config
-from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
+
+from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+
from monitor.sources import comon
-from monitor.wrapper import plc
+from monitor.wrapper import plc, plccache
-import syncplcdb
from nodequery import verify,query_to_dict,node_select
import traceback
-print "starting sqlfindbad.py"
+#print "starting sqlfindbad.py"
# QUERY all nodes.
COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
"table=table_nodeview&" + \
echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
-
echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
echo "}"
oval = values['kernel']
if "2.6.17" in oval or "2.6.2" in oval:
values['ssh'] = 'SSH'
- values['category'] = 'ALPHA'
+ values['category'] = 'PROD'
if "bm.log" in values['bmlog']:
values['state'] = 'DEBUG'
else:
values['state'] = 'BOOT'
elif "2.6.12" in oval or "2.6.10" in oval:
values['ssh'] = 'SSH'
- values['category'] = 'PROD'
+ values['category'] = 'OLDPROD'
if "bm.log" in values['bmlog']:
values['state'] = 'DEBUG'
else:
fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
if_new_set={'round' : global_round})
- fbrec = FindbadNodeRecord(
- date_checked=datetime.fromtimestamp(values['date_checked']),
- hostname=nodename,
- loginbase=values['loginbase'],
- kernel_version=values['kernel'],
- bootcd_version=values['bootcd'],
- nm_status=values['nm'],
- fs_status=values['readonlyfs'],
- dns_status=values['dns'],
- princeton_comon_dir=values['princeton_comon'],
- princeton_comon_running=values['princeton_comon_running'],
- princeton_comon_procs=values['princeton_comon_procs'],
- plc_node_stats = values['plcnode'],
- plc_site_stats = values['plcsite'],
- plc_pcuid = values['pcu'],
- comon_stats = values['comonstats'],
- ping_status = (values['ping'] == "PING"),
- ssh_portused = values['sshport'],
- ssh_status = (values['ssh'] == "SSH"),
- ssh_error = values['ssherror'],
- observed_status = values['state'],
- )
+ # NOTE: This code will either add a new record for the new global_round,
+ # OR it will find the previous value, and update it
+ # with new information.
+ # The data that is 'lost' is not that important, b/c older
+ # history still exists.
+ fbrec = FindbadNodeRecord.findby_or_create(
+ round=global_round,
+ hostname=nodename)
+ before = fbrec.to_dict()
+ print "BEFORE, ", before
+ fbrec.flush()
+ time.sleep(2)
+ print "Setting VALUES"
+ fbrec.set( date_checked=datetime.fromtimestamp(values['date_checked']),
+ loginbase=values['loginbase'],
+ kernel_version=values['kernel'],
+ bootcd_version=values['bootcd'],
+ nm_status=values['nm'],
+ fs_status=values['readonlyfs'],
+ dns_status=values['dns'],
+ princeton_comon_dir=values['princeton_comon'],
+ princeton_comon_running=values['princeton_comon_running'],
+ princeton_comon_procs=values['princeton_comon_procs'],
+ plc_node_stats = values['plcnode'],
+ plc_site_stats = values['plcsite'],
+ plc_pcuid = values['pcu'],
+ comon_stats = values['comonstats'],
+ ping_status = (values['ping'] == "PING"),
+ ssh_portused = values['sshport'],
+ ssh_status = (values['ssh'] == "SSH"),
+ ssh_error = values['ssherror'],
+ observed_status = values['state'],
+ observed_category = values['category'])
+ after = fbrec.to_dict()
+ print "AFTER , ", after
+
+ for v in before.keys():
+ if before[v] == after[v]:
+ print "SAME FOR KEY %s" % v
+ print "%s : %s\t%s" % ( v, before[v], after[v] )
+
+ fbrec.flush()
fbnodesync.round = global_round
+ fbnodesync.flush()
+ fbsync.flush()
count += 1
print "%d %s %s" % (count, nodename, values)
for i in result:
print "Result: %s" % i
+def probe(hostname):
+	"""Run a one-off findbad check of a single node.
+
+	Collects ping/ssh status for `hostname`, records it in the
+	FindbadNodeRecord tables, and flushes the DB session.
+	Returns True on success, False if anything went wrong (the
+	traceback is printed for diagnosis; presumably this is called
+	interactively/from tools, so best-effort is intended -- confirm).
+	"""
+	try:
+		(nodename, values) = collectPingAndSSH(hostname, {})
+		recordPingAndSSH(None, (nodename, values))
+		session.flush()
+		return True
+	except:
+		# traceback.print_exc() writes the traceback itself and
+		# returns None; wrapping it in `print` emitted a stray
+		# "None" line after every traceback.
+		traceback.print_exc()
+		return False
+
def checkAndRecordState(l_nodes, cohash):
global global_round
# CREATE all the work requests
for nodename in l_nodes:
fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
-
node_round = fbnodesync.round
+ fbnodesync.flush()
+
if node_round < global_round:
# recreate node stats when refreshed
#print "%s" % nodename
print FindbadNodeRecordSync.query.count()
print FindbadNodeRecord.query.count()
+ session.flush()
def main():
global global_round
global_round += 1
fbsync.round = global_round
+ fbsync.flush()
+
cotop = comon.Comon()
# lastcotop measures whether cotop is actually running. this is a better
# metric than sshstatus, or other values from CoMon
cotop_url = COMON_COTOPURL
# history information for all nodes
- #cohash = {}
- cohash = cotop.coget(cotop_url)
- l_nodes = syncplcdb.create_plcdb()
+ cohash = {}
+ #cohash = cotop.coget(cotop_url)
+ l_nodes = plccache.l_nodes
if config.nodelist:
f_nodes = util.file.getListFromFile(config.nodelist)
l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)