Run external checks on hosts to determine their boot state.

[monitor.git] / findbad.py
diff --git a/findbad.py b/findbad.py

index c08fbc8..77dd120 100755 (executable)
--- a/findbad.py
+++ b/findbad.py
@@ -11,11 +11,12 @@ import threading
  from monitor import util
  from monitor.util import command
  from monitor import config
-from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
+
+from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+
  from monitor.sources import comon
-from monitor.wrapper import plc
+from monitor.wrapper import plc, plccache
  
-import syncplcdb
  from nodequery import verify,query_to_dict,node_select
  import traceback
  
@@ -63,7 +64,6 @@ def collectPingAndSSH(nodename, cohash):
                                         echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
  
                                         ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
-
                                         echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                         echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                         echo "}"
@@ -97,14 +97,14 @@ EOF                         """)
                 oval = values['kernel']
                 if "2.6.17" in oval or "2.6.2" in oval:
                         values['ssh'] = 'SSH'
-                       values['category'] = 'ALPHA'
+                       values['category'] = 'PROD'
                         if "bm.log" in values['bmlog']:
                                 values['state'] = 'DEBUG'
                         else:
                                 values['state'] = 'BOOT'
                 elif "2.6.12" in oval or "2.6.10" in oval:
                         values['ssh'] = 'SSH'
-                       values['category'] = 'PROD'
+                       values['category'] = 'OLDPROD'
                         if "bm.log" in values['bmlog']:
                                 values['state'] = 'DEBUG'
                         else:
@@ -256,6 +256,7 @@ def recordPingAndSSH(request, result):
  
                         fbrec = FindbadNodeRecord(
                                                 date_checked=datetime.fromtimestamp(values['date_checked']),
+                                               round=global_round,
                                                 hostname=nodename,
                                                 loginbase=values['loginbase'],
                                                 kernel_version=values['kernel'],
@@ -275,8 +276,12 @@ def recordPingAndSSH(request, result):
                                                 ssh_status = (values['ssh'] == "SSH"),
                                                 ssh_error = values['ssherror'],
                                                 observed_status = values['state'],
+                                               observed_category = values['category'],
                                         )
                         fbnodesync.round = global_round
+                       fbnodesync.flush()
+                       fbsync.flush()
+                       fbrec.flush()
  
                         count += 1
                         print "%d %s %s" % (count, nodename, values)
@@ -300,8 +305,9 @@ def checkAndRecordState(l_nodes, cohash):
         # CREATE all the work requests
         for nodename in l_nodes:
                 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
-
                 node_round   = fbnodesync.round
+               fbnodesync.flush()
+
                 if node_round < global_round:
                         # recreate node stats when refreshed
                         #print "%s" % nodename
@@ -333,6 +339,7 @@ def checkAndRecordState(l_nodes, cohash):
  
         print FindbadNodeRecordSync.query.count()
         print FindbadNodeRecord.query.count()
+       session.flush()
  
  def main():
         global global_round
@@ -346,15 +353,17 @@ def main():
                 global_round += 1
                 fbsync.round = global_round
  
+       fbsync.flush()
+
         cotop = comon.Comon()
         # lastcotop measures whether cotop is actually running.  this is a better
         # metric than sshstatus, or other values from CoMon
         cotop_url = COMON_COTOPURL
  
         # history information for all nodes
-       #cohash = {}
-       cohash = cotop.coget(cotop_url)
-       l_nodes = syncplcdb.create_plcdb()
+       cohash = {}
+       #cohash = cotop.coget(cotop_url)
+       l_nodes = plccache.l_nodes
         if config.nodelist:
                 f_nodes = util.file.getListFromFile(config.nodelist)
                 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)