many improvements.
[monitor.git] / findbadpcu.py
index 1af600c..468107d 100755 (executable)
@@ -13,9 +13,10 @@ import threadpool
 import threading
 
 import monitor
-from monitor.pcu import reboot
+from pcucontrol  import reboot
 from monitor import config
-from monitor.database import FindbadPCURecordSync, FindbadPCURecord
+from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
+from monitor import database
 from monitor import util 
 from monitor.wrapper import plc, plccache
 from nodequery import pcu_select
@@ -25,7 +26,7 @@ global_round = 1
 errorState = {}
 count = 0
 
-def nmap_portstatus(status):
+def nmap_port_status(status):
        ps = {}
        l_nmap = status.split()
        ports = l_nmap[4:]
@@ -178,31 +179,31 @@ def collectPingAndSSH(pcuname, cohash):
 
                #### COMPLETE ENTRY   #######################
 
-               values['complete_entry'] = []
+               values['entry_complete'] = []
                #if values['protocol'] is None or values['protocol'] is "":
-               #       values['complete_entry'] += ["protocol"]
+               #       values['entry_complete'] += ["protocol"]
                if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
-                       values['complete_entry'] += ["model"]
+                       values['entry_complete'] += ["model"]
                        # Cannot continue due to this condition
                        continue_probe = False
 
                if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
-                       values['complete_entry'] += ["password"]
+                       values['entry_complete'] += ["password"]
                        # Cannot continue due to this condition
                        continue_probe = False
 
-               if len(values['complete_entry']) > 0:
+               if len(values['entry_complete']) > 0:
                        continue_probe = False
 
                if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
-                       values['complete_entry'] += ["hostname"]
+                       values['entry_complete'] += ["hostname"]
                if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
-                       values['complete_entry'] += ["ip"]
+                       values['entry_complete'] += ["ip"]
 
                # If there are no nodes associated with this PCU, then we cannot continue.
                if len(values['plc_pcu_stats']['node_ids']) == 0:
                        continue_probe = False
-                       values['complete_entry'] += ['NoNodeIds']
+                       values['entry_complete'] += ['NoNodeIds']
 
                #### DNS and IP MATCH #######################
                if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
@@ -211,21 +212,21 @@ def collectPingAndSSH(pcuname, cohash):
                        try:
                                ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
                                if ipaddr == values['plc_pcu_stats']['ip']:
-                                       values['dnsmatch'] = "DNS-OK"
+                                       values['dns_status'] = "DNS-OK"
                                else:
-                                       values['dnsmatch'] = "DNS-MISMATCH"
+                                       values['dns_status'] = "DNS-MISMATCH"
                                        continue_probe = False
 
                        except Exception, err:
-                               values['dnsmatch'] = "DNS-NOENTRY"
+                               values['dns_status'] = "DNS-NOENTRY"
                                values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
                                #print err
                else:
                        if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
-                               values['dnsmatch'] = "NOHOSTNAME"
+                               values['dns_status'] = "NOHOSTNAME"
                                values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
                        else:
-                               values['dnsmatch'] = "NO-DNS-OR-IP"
+                               values['dns_status'] = "NO-DNS-OR-IP"
                                values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
                                continue_probe = False
 
@@ -234,14 +235,14 @@ def collectPingAndSSH(pcuname, cohash):
                        nmap = util.command.CMD()
                        (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
                        # NOTE: an empty / error value for oval, will still work.
-                       (values['portstatus'], continue_probe) = nmap_portstatus(oval)
+                       (values['port_status'], continue_probe) = nmap_port_status(oval)
                else:
-                       values['portstatus'] = None
+                       values['port_status'] = None
                        
 
                ######  DRY RUN  ############################
                if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
-                       rb_ret = reboot.reboot_test(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True)
+                       rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True)
                else:
                        rb_ret = "Not_Run" # No nodes to test"
 
@@ -254,6 +255,7 @@ def collectPingAndSSH(pcuname, cohash):
                print "____________________________________"
                errors['traceback'] = traceback.format_exc()
                print errors['traceback']
+               values['reboot'] = errors['traceback']
 
        values['date_checked'] = time.time()
        return (pcuname, values, errors)
@@ -274,15 +276,20 @@ def recordPingAndSSH(request, result):
 
                fbrec = FindbadPCURecord(
                                        date_checked=datetime.fromtimestamp(values['date_checked']),
-                                       record=fbsync.round,
+                                       round=fbsync.round,
                                        plc_pcuid=pcu_id,
                                        plc_pcu_stats=values['plc_pcu_stats'],
-                                       dns_status=values['dnsmatch'],
-                                       port_status=values['portstatus'],
-                                       entry_complete=" ".join(values['complete_entry']),
+                                       dns_status=values['dns_status'],
+                                       port_status=values['port_status'],
+                                       entry_complete=" ".join(values['entry_complete']),
                                        reboot_trial_status="%s" % values['reboot'],
                                )
                fbnodesync.round = global_round
+
+               fbnodesync.flush()
+               fbsync.flush()
+               fbrec.flush()
+
                count += 1
                print "%d %s %s" % (count, nodename, values)
 
@@ -308,9 +315,10 @@ def checkAndRecordState(l_pcus, cohash):
        for pcuname in l_pcus:
                pcu_id = int(pcuname)
                fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
+               fbnodesync.flush()
 
                node_round   = fbnodesync.round
-               if node_round < global_round:
+               if node_round < global_round or config.force:
                        # recreate node stats when refreshed
                        #print "%s" % nodename
                        req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {}, 
@@ -340,6 +348,7 @@ def checkAndRecordState(l_pcus, cohash):
 
        print FindbadPCURecordSync.query.count()
        print FindbadPCURecord.query.count()
+       session.flush()
 
 
 def main():
@@ -353,10 +362,6 @@ def main():
 
        global_round = fbsync.round
 
-       if config.increment:
-               # update global round number to force refreshes across all nodes
-               global_round += 1
-               fbsync.round = global_round
 
        if config.site is not None:
                api = plc.getAuthAPI()
@@ -369,6 +374,7 @@ def main():
                l_pcus = [pcu for pcu in sets.Set(pcus)]
        elif config.pcuselect is not None:
                n, pcus = pcu_select(config.pcuselect)
+               print pcus
                # clear out dups.
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
@@ -382,11 +388,18 @@ def main():
                l_pcus = [ config.pcuid ] 
                l_pcus = [int(pcu) for pcu in l_pcus]
 
+       if config.increment:
+               # update global round number to force refreshes across all nodes
+               global_round += 1
+               fbsync.round = global_round
+       fbsync.flush()
+
        checkAndRecordState(l_pcus, cohash)
 
        return 0
 
 
+print "main"
 if __name__ == '__main__':
        import logging
        logger = logging.getLogger("monitor")
@@ -405,7 +418,8 @@ if __name__ == '__main__':
                                                site=None,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
-                                               refresh=False,
+                                               cachecalls=True,
+                                               force=False,
                                                )
        parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", 
                                                help="Provide the input file for the node list")
@@ -420,12 +434,18 @@ if __name__ == '__main__':
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
                                                help="Specify the name of the database to which the information is saved")
-       parser.add_option("", "--refresh", action="store_true", dest="refresh",
+       parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls",
                                                help="Refresh the cached values")
        parser.add_option("-i", "--increment", action="store_true", dest="increment", 
                                                help="Increment round number to force refresh or retry")
+       parser.add_option("", "--force", action="store_true", dest="force", 
+                                               help="Force probe without incrementing global 'round'.")
        parser = parsermodule.getParser(['defaults'], parser)
        config = parsermodule.parse_args(parser)
+       if hasattr(config, 'cachecalls') and not config.cachecalls:
+               # NOTE: if explicitly asked, refresh cached values.
+               print "Reloading PLCCache"
+               plccache.init()
        try:
                # NOTE: evidently, there is a bizarre interaction between iLO and ssh
                # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.