nodebad.py

   1 #!/usr/bin/python
   2
   3 import os
   4 import sys
   5 import string
   6 import time
   7 from datetime import datetime,timedelta
   8
   9 from nodequery import verify,query_to_dict,node_select
  10
  11 from monitor.common import *
  12
  13 from monitor import config
  14 from monitor.wrapper import plc,plccache
  15 from monitor.const import MINUP
  16 from monitor.database.info.model import  FindbadNodeRecord, HistoryNodeRecord
  17 from monitor.database.dborm import  mon_session as session
  18
  19 from monitor.model import *
  20
  21 api = plc.getAuthAPI()
  22
  23 round = 1
  24 count = 0
  25 def main():
  26         main2(config)
  27
  28 def main2(config):
  29
  30         l_plcnodes = plccache.l_nodes
  31         l_nodes = get_nodeset(config)
  32
  33         checkAndRecordState(l_nodes, l_plcnodes)
  34
  35 # Node states:
  36
  37 def check_node_state(rec, node):
  38
  39         node_state = rec.observed_status
  40         if rec.plc_node_stats:
  41                 print rec.plc_node_stats
  42                 boot_state = rec.plc_node_stats['boot_state']
  43                 last_contact = rec.plc_node_stats['last_contact']
  44                 node.plc_nodeid = rec.plc_node_stats['node_id']
  45         else:
  46                 boot_state = "unknown"
  47                 last_contact = None
  48
  49         if boot_state == 'disable': boot_state = 'disabled'
  50         if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
  51
  52         if len(rec.plc_node_stats['pcu_ids']) > 0:
  53                 node.haspcu = True
  54         else:
  55                 node.haspcu = False
  56
  57
  58         # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
  59         #                       'translations' into the node.status state
  60         #               'BOOT' is a permanent state, but we want it to have a bit of
  61         #                       hysteresis (less than 0.5 days)
  62
  63         #################################################################
  64         # "Initialize" the findbad states into nodebad status if they are not already set
  65
  66         if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
  67                 print "changed status from %s to offline" % node.status
  68                 node.status = 'offline'
  69                 node.last_changed = datetime.now()
  70
  71         if node_state == 'DEBUG' and node.status != 'monitordebug' and \
  72                                                                  node.status != 'disabled' and \
  73                                                                  node.status != 'safeboot':
  74                 if boot_state != 'disabled' and boot_state != 'safeboot':
  75
  76                         print "changed status from %s to monitordebug" % (node.status)
  77                         node.status = "monitordebug"
  78                         node.last_changed = datetime.now()
  79                 else:
  80                         print "changed status from %s to %s" % (node.status, boot_state)
  81                         node.status = boot_state
  82                         node.last_changed = datetime.now()
  83
  84         if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
  85                 print "changed status from %s to online" % node.status
  86                 node.status = 'online'
  87                 node.last_changed = datetime.now()
  88
  89         #################################################################
  90         # Switch temporary hystersis states into their 'firm' states.
  91         #         online -> good                after half a day
  92         #         offline -> down               after two days
  93         #         monitordebug -> down  after 30 days
  94         #         safeboot -> monitordebug after 60 days
  95         #         disabled -> down              after 60 days
  96
  97         if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
  98                 print "changed status from %s to good" % node.status
  99                 node.status = 'good'
 100                 # NOTE: do not reset last_changed, or you lose how long it's been up.
 101
 102         if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
 103                 print "changed status from %s to down" % node.status
 104                 node.status = 'down'
 105                 # NOTE: do not reset last_changed, or you lose how long it's been down.
 106
 107         if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
 108                 print "changed status from %s to down" % node.status
 109                 node.status = 'down'
 110                 # NOTE: do not reset last_changed, or you lose how long it's been down.
 111
 112         if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
 113                 print "changed status from %s to down" % node.status
 114                 # NOTE: change an admin mode back into monitordebug after two months.
 115                 node.status = 'monitordebug'
 116                 node.last_changed = datetime.now()
 117
 118         # extreme cases of offline nodes
 119         if ( boot_state == 'disabled' or last_contact == None ) and \
 120                         changed_greaterthan(node.last_changed, 2*30) and \
 121                         node.status != 'down':
 122                 print "changed status from %s to down" % node.status
 123                 node.status = 'down'
 124                 node.last_changed = datetime.now()
 125
 126 def checkAndRecordState(l_nodes, l_plcnodes):
 127         global count
 128
 129         for nodename in l_nodes:
 130
 131                 nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename,
 132                                                         if_new_set={'status' : 'offline',
 133                                                                                 'last_changed' : datetime.now()})
 134                 nodehist.last_checked = datetime.now()
 135
 136                 try:
 137                         # Find the most recent record
 138                         noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
 139                 except:
 140                         print "COULD NOT FIND %s" % nodename
 141                         import traceback
 142                         email_exception()
 143                         print traceback.print_exc()
 144                         continue
 145
 146                 if not noderec:
 147                         print "none object for %s"% nodename
 148                         continue
 149
 150                 check_node_state(noderec, nodehist)
 151
 152                 count += 1
 153                 print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 154
 155         # NOTE: this commits all pending operations to the DB.  Do not remove.
 156         session.flush()
 157
 158         return True
 159
 160 if __name__ == '__main__':
 161         from monitor import parser as parsermodule
 162         parser = parsermodule.getParser(['nodesets'])
 163         parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
 164         parser = parsermodule.getParser(['defaults'], parser)
 165         config = parsermodule.parse_args(parser)
 166
 167         try:
 168                 main2(config)
 169         except Exception, err:
 170                 import traceback
 171                 print traceback.print_exc()
 172                 print "Exception: %s" % err
 173                 sys.exit(0)