run external checks on hosts to determine their boot state.
[monitor.git] / findbad.py
index c08fbc8..77dd120 100755 (executable)
@@ -11,11 +11,12 @@ import threading
 from monitor import util
 from monitor.util import command
 from monitor import config
-from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
+
+from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+
 from monitor.sources import comon
-from monitor.wrapper import plc
+from monitor.wrapper import plc, plccache
 
-import syncplcdb
 from nodequery import verify,query_to_dict,node_select
 import traceback
 
@@ -63,7 +64,6 @@ def collectPingAndSSH(nodename, cohash):
                                        echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
 
                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
-
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo "}"
@@ -97,14 +97,14 @@ EOF                         """)
                oval = values['kernel']
                if "2.6.17" in oval or "2.6.2" in oval:
                        values['ssh'] = 'SSH'
-                       values['category'] = 'ALPHA'
+                       values['category'] = 'PROD'
                        if "bm.log" in values['bmlog']:
                                values['state'] = 'DEBUG'
                        else:
                                values['state'] = 'BOOT'
                elif "2.6.12" in oval or "2.6.10" in oval:
                        values['ssh'] = 'SSH'
-                       values['category'] = 'PROD'
+                       values['category'] = 'OLDPROD'
                        if "bm.log" in values['bmlog']:
                                values['state'] = 'DEBUG'
                        else:
@@ -256,6 +256,7 @@ def recordPingAndSSH(request, result):
 
                        fbrec = FindbadNodeRecord(
                                                date_checked=datetime.fromtimestamp(values['date_checked']),
+                                               round=global_round,
                                                hostname=nodename,
                                                loginbase=values['loginbase'],
                                                kernel_version=values['kernel'],
@@ -275,8 +276,12 @@ def recordPingAndSSH(request, result):
                                                ssh_status = (values['ssh'] == "SSH"),
                                                ssh_error = values['ssherror'],
                                                observed_status = values['state'],
+                                               observed_category = values['category'],
                                        )
                        fbnodesync.round = global_round
+                       fbnodesync.flush()
+                       fbsync.flush()
+                       fbrec.flush()
 
                        count += 1
                        print "%d %s %s" % (count, nodename, values)
@@ -300,8 +305,9 @@ def checkAndRecordState(l_nodes, cohash):
        # CREATE all the work requests
        for nodename in l_nodes:
                fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
-
                node_round   = fbnodesync.round
+               fbnodesync.flush()
+
                if node_round < global_round:
                        # recreate node stats when refreshed
                        #print "%s" % nodename
@@ -333,6 +339,7 @@ def checkAndRecordState(l_nodes, cohash):
 
        print FindbadNodeRecordSync.query.count()
        print FindbadNodeRecord.query.count()
+       session.flush()
 
 def main():
        global global_round
@@ -346,15 +353,17 @@ def main():
                global_round += 1
                fbsync.round = global_round
 
+       fbsync.flush()
+
        cotop = comon.Comon()
        # lastcotop measures whether cotop is actually running.  this is a better
        # metric than sshstatus, or other values from CoMon
        cotop_url = COMON_COTOPURL
 
        # history information for all nodes
-       #cohash = {}
-       cohash = cotop.coget(cotop_url)
-       l_nodes = syncplcdb.create_plcdb()
+       cohash = {}
+       #cohash = cotop.coget(cotop_url)
+       l_nodes = plccache.l_nodes
        if config.nodelist:
                f_nodes = util.file.getListFromFile(config.nodelist)
                l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)