findbadpcu.py

   1 #!/usr/bin/python
   2
   3 import os
   4 import sys
   5 import string
   6 import time
   7 import socket
   8 import sets
   9 import signal
  10 import traceback
  11 from datetime import datetime,timedelta
  12 import threadpool
  13 import threading
  14
  15 import monitor
  16 from pcucontrol  import reboot
  17 from monitor import config
  18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
  19 from monitor import database
  20 from monitor import util
  21 from monitor.wrapper import plc, plccache
  22 from nodequery import pcu_select
  23 from nodecommon import nmap_port_status
  24
  25 plc_lock = threading.Lock()
  26 global_round = 1
  27 errorState = {}
  28 count = 0
  29
  30 def get_pcu(pcuname):
  31         plc_lock.acquire()
  32         try:
  33                 #print "GetPCU from PLC %s" % pcuname
  34                 l_pcu  = plc.GetPCUs({'pcu_id' : pcuname})
  35                 #print l_pcu
  36                 if len(l_pcu) > 0:
  37                         l_pcu = l_pcu[0]
  38         except:
  39                 try:
  40                         #print "GetPCU from file %s" % pcuname
  41                         l_pcus = plccache.l_pcus
  42                         for i in l_pcus:
  43                                 if i['pcu_id'] == pcuname:
  44                                         l_pcu = i
  45                 except:
  46                         traceback.print_exc()
  47                         l_pcu = None
  48
  49         plc_lock.release()
  50         return l_pcu
  51
  52 def get_nodes(node_ids):
  53         plc_lock.acquire()
  54         l_node = []
  55         try:
  56                 l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
  57         except:
  58                 try:
  59                         plc_nodes = plccache.l_plcnodes
  60                         for n in plc_nodes:
  61                                 if n['node_id'] in node_ids:
  62                                         l_node.append(n)
  63                 except:
  64                         traceback.print_exc()
  65                         l_node = None
  66
  67         plc_lock.release()
  68         if l_node == []:
  69                 l_node = None
  70         return l_node
  71
  72
  73 def get_plc_pcu_values(pcuname):
  74         """
  75                 Try to contact PLC to get the PCU info.
  76                 If that fails, try a backup copy from the last run.
  77                 If that fails, return None
  78         """
  79         values = {}
  80
  81         l_pcu = get_pcu(pcuname)
  82
  83         if l_pcu is not None:
  84                 site_id = l_pcu['site_id']
  85                 node_ids = l_pcu['node_ids']
  86                 l_node = get_nodes(node_ids)
  87
  88                 if l_node is not None:
  89                         for node in l_node:
  90                                 values[node['hostname']] = node['ports'][0]
  91
  92                         values['nodenames'] = [node['hostname'] for node in l_node]
  93
  94                         # NOTE: this is for a dry run later. It doesn't matter which node.
  95                         values['node_id'] = l_node[0]['node_id']
  96
  97                 values.update(l_pcu)
  98         else:
  99                 values = None
 100
 101         return values
 102
 103 def get_plc_site_values(site_id):
 104         ### GET PLC SITE ######################
 105         plc_lock.acquire()
 106         values = {}
 107         d_site = None
 108
 109         try:
 110                 d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
 111                 if len(d_site) > 0:
 112                         d_site = d_site[0]
 113         except:
 114                 try:
 115                         plc_sites = plccache.l_plcsites
 116                         for site in plc_sites:
 117                                 if site['site_id'] == site_id:
 118                                         d_site = site
 119                                         break
 120                 except:
 121                         traceback.print_exc()
 122                         values = None
 123
 124         plc_lock.release()
 125
 126         if d_site is not None:
 127                 max_slices = d_site['max_slices']
 128                 num_slices = len(d_site['slice_ids'])
 129                 num_nodes = len(d_site['node_ids'])
 130                 loginbase = d_site['login_base']
 131                 values['plcsite'] = {'num_nodes' : num_nodes,
 132                                                         'max_slices' : max_slices,
 133                                                         'num_slices' : num_slices,
 134                                                         'login_base' : loginbase,
 135                                                         'status'     : 'SUCCESS'}
 136         else:
 137                 values = None
 138
 139
 140         return values
 141
 142
 143 def collectPingAndSSH(pcuname, cohash):
 144
 145         continue_probe = True
 146         errors = None
 147         values = {'reboot' : 'novalue'}
 148         ### GET PCU ######################
 149         try:
 150                 b_except = False
 151                 try:
 152                         v = get_plc_pcu_values(pcuname)
 153                         if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
 154                         if v['ip'] is not None: v['ip'] = v['ip'].strip()
 155
 156                         if v is not None:
 157                                 values['plc_pcu_stats'] = v
 158                         else:
 159                                 continue_probe = False
 160                 except:
 161                         b_except = True
 162                         traceback.print_exc()
 163                         continue_probe = False
 164
 165                 if b_except or not continue_probe: return (None, None, None)
 166
 167                 #### RUN NMAP ###############################
 168                 if continue_probe:
 169                         nmap = util.command.CMD()
 170                         print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])
 171                         (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
 172                         # NOTE: an empty / error value for oval, will still work.
 173                         (values['port_status'], continue_probe) = nmap_port_status(oval)
 174                 else:
 175                         values['port_status'] = None
 176
 177                 #### COMPLETE ENTRY   #######################
 178
 179                 values['entry_complete'] = []
 180                 #if values['protocol'] is None or values['protocol'] is "":
 181                 #       values['entry_complete'] += ["protocol"]
 182                 if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
 183                         values['entry_complete'] += ["model"]
 184                         # Cannot continue due to this condition
 185                         continue_probe = False
 186
 187                 if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
 188                         values['entry_complete'] += ["password"]
 189                         # Cannot continue due to this condition
 190                         continue_probe = False
 191
 192                 if len(values['entry_complete']) > 0:
 193                         continue_probe = False
 194
 195                 if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
 196                         values['entry_complete'] += ["hostname"]
 197                 if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
 198                         values['entry_complete'] += ["ip"]
 199
 200                 # If there are no nodes associated with this PCU, then we cannot continue.
 201                 if len(values['plc_pcu_stats']['node_ids']) == 0:
 202                         continue_probe = False
 203                         values['entry_complete'] += ['nodeids']
 204
 205
 206                 #### DNS and IP MATCH #######################
 207                 if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
 208                    values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
 209                         #print "Calling socket.gethostbyname(%s)" % values['hostname']
 210                         try:
 211                                 ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
 212                                 if ipaddr == values['plc_pcu_stats']['ip']:
 213                                         values['dns_status'] = "DNS-OK"
 214                                 else:
 215                                         values['dns_status'] = "DNS-MISMATCH"
 216                                         continue_probe = False
 217
 218                         except Exception, err:
 219                                 values['dns_status'] = "DNS-NOENTRY"
 220                                 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
 221                                 #print err
 222                 else:
 223                         if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
 224                                 values['dns_status'] = "NOHOSTNAME"
 225                                 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
 226                         else:
 227                                 values['dns_status'] = "NO-DNS-OR-IP"
 228                                 values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
 229                                 continue_probe = False
 230
 231
 232                 ######  DRY RUN  ############################
 233                 if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
 234                         rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0],
 235                                                                                         values, 1, True)
 236                 else:
 237                         rb_ret = "Not_Run" # No nodes to test"
 238
 239                 values['reboot'] = rb_ret
 240
 241         except:
 242                 print "____________________________________"
 243                 print values
 244                 errors = values
 245                 print "____________________________________"
 246                 errors['traceback'] = traceback.format_exc()
 247                 print errors['traceback']
 248                 values['reboot'] = errors['traceback']
 249
 250         values['date_checked'] = time.time()
 251         return (pcuname, values, errors)
 252
 253 def recordPingAndSSH(request, result):
 254         global errorState
 255         global count
 256         global global_round
 257         (nodename, values, errors) = result
 258
 259         if values is not None:
 260                 pcu_id = int(nodename)
 261                 #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
 262                 #                                                                       if_new_set={'round': global_round})
 263                 #global_round = fbsync.round
 264                 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
 265                                                                                         if_new_set={'round' : global_round})
 266
 267                 fbrec = FindbadPCURecord(
 268                                         date_checked=datetime.fromtimestamp(values['date_checked']),
 269                                         round=global_round,
 270                                         plc_pcuid=pcu_id,
 271                                         plc_pcu_stats=values['plc_pcu_stats'],
 272                                         dns_status=values['dns_status'],
 273                                         port_status=values['port_status'],
 274                                         entry_complete=" ".join(values['entry_complete']),
 275                                         reboot_trial_status="%s" % values['reboot'],
 276                                 )
 277                 fbnodesync.round = global_round
 278
 279                 fbnodesync.flush()
 280                 #fbsync.flush()
 281                 fbrec.flush()
 282
 283                 count += 1
 284                 print "%d %s %s" % (count, nodename, values)
 285
 286         if errors is not None:
 287                 pcu_id = "id_%s" % nodename
 288                 errorState[pcu_id] = errors
 289                 database.dbDump("findbadpcu_errors", errorState)
 290
 291 # this will be called when an exception occurs within a thread
 292 def handle_exception(request, result):
 293         print "Exception occured in request %s" % request.requestID
 294         for i in result:
 295                 print "Result: %s" % i
 296
 297
 298 def checkAndRecordState(l_pcus, cohash):
 299         global global_round
 300         global count
 301
 302         tp = threadpool.ThreadPool(10)
 303
 304         # CREATE all the work requests
 305         for pcuname in l_pcus:
 306                 pcu_id = int(pcuname)
 307                 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
 308                 fbnodesync.flush()
 309
 310                 node_round   = fbnodesync.round
 311                 if node_round < global_round or config.force:
 312                         # recreate node stats when refreshed
 313                         #print "%s" % nodename
 314                         req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
 315                                                                                  None, recordPingAndSSH, handle_exception)
 316                         tp.putRequest(req)
 317                 else:
 318                         # We just skip it, since it's "up to date"
 319                         count += 1
 320                         print "%d %s %s" % (count, pcu_id, node_round)
 321
 322         # WAIT while all the work requests are processed.
 323         begin = time.time()
 324         while 1:
 325                 try:
 326                         time.sleep(1)
 327                         tp.poll()
 328                         # if more than two hours
 329                         if time.time() - begin > (60*60*1):
 330                                 print "findbadpcus.py has run out of time!!!!!!"
 331                                 os._exit(1)
 332                 except KeyboardInterrupt:
 333                         print "Interrupted!"
 334                         break
 335                 except threadpool.NoResultsPending:
 336                         print "All results collected."
 337                         break
 338
 339         print FindbadPCURecordSync.query.count()
 340         print FindbadPCURecord.query.count()
 341         session.flush()
 342
 343
 344 def main():
 345         global global_round
 346
 347         #  monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
 348         l_pcus = plccache.l_pcus
 349         cohash = {}
 350
 351         fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
 352
 353         global_round = fbsync.round
 354
 355
 356         if config.site is not None:
 357                 api = plc.getAuthAPI()
 358                 site = api.GetSites(config.site)
 359                 l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
 360                 pcus = []
 361                 for node in l_nodes:
 362                         pcus += node['pcu_ids']
 363                 # clear out dups.
 364                 l_pcus = [pcu for pcu in sets.Set(pcus)]
 365         elif config.pcuselect is not None:
 366                 n, pcus = pcu_select(config.pcuselect)
 367                 print pcus
 368                 # clear out dups.
 369                 l_pcus = [pcu for pcu in sets.Set(pcus)]
 370
 371         elif config.nodelist == None and config.pcuid == None:
 372                 print "Calling API GetPCUs() : cachecalls(%s)" % config.cachecalls
 373                 l_pcus  = [pcu['pcu_id'] for pcu in l_pcus]
 374         elif config.nodelist is not None:
 375                 l_pcus = util.file.getListFromFile(config.nodelist)
 376                 l_pcus = [int(pcu) for pcu in l_pcus]
 377         elif config.pcuid is not None:
 378                 l_pcus = [ config.pcuid ]
 379                 l_pcus = [int(pcu) for pcu in l_pcus]
 380
 381         if config.increment:
 382                 # update global round number to force refreshes across all nodes
 383                 global_round += 1
 384
 385         checkAndRecordState(l_pcus, cohash)
 386
 387         if config.increment:
 388                 # update global round number to force refreshes across all nodes
 389                 fbsync.round = global_round
 390                 fbsync.flush()
 391                 session.flush()
 392
 393         return 0
 394
 395
 396 print "main"
 397 if __name__ == '__main__':
 398         import logging
 399         logger = logging.getLogger("monitor")
 400         logger.setLevel(logging.DEBUG)
 401         fh = logging.FileHandler("monitor.log", mode = 'a')
 402         fh.setLevel(logging.DEBUG)
 403         formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
 404         fh.setFormatter(formatter)
 405         logger.addHandler(fh)
 406         from monitor import parser as parsermodule
 407         parser = parsermodule.getParser()
 408         parser.set_defaults(nodelist=None,
 409                                                 increment=False,
 410                                                 pcuid=None,
 411                                                 pcuselect=None,
 412                                                 site=None,
 413                                                 dbname="findbadpcus",
 414                                                 cachenodes=False,
 415                                                 cachecalls=True,
 416                                                 force=False,
 417                                                 )
 418         parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
 419                                                 help="Provide the input file for the node list")
 420         parser.add_option("", "--site", dest="site", metavar="FILE",
 421                                                 help="Get all pcus associated with the given site's nodes")
 422         parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
 423                                                 help="Query string to apply to the findbad pcus")
 424         parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
 425                                                 help="Provide the id for a single pcu")
 426
 427         parser.add_option("", "--cachenodes", action="store_true",
 428                                                 help="Cache node lookup from PLC")
 429         parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
 430                                                 help="Specify the name of the database to which the information is saved")
 431         parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls",
 432                                                 help="Refresh the cached values")
 433         parser.add_option("-i", "--increment", action="store_true", dest="increment",
 434                                                 help="Increment round number to force refresh or retry")
 435         parser.add_option("", "--force", action="store_true", dest="force",
 436                                                 help="Force probe without incrementing global 'round'.")
 437         parser = parsermodule.getParser(['defaults'], parser)
 438         config = parsermodule.parse_args(parser)
 439         if hasattr(config, 'cachecalls') and not config.cachecalls:
 440                 # NOTE: if explicilty asked, refresh cached values.
 441                 print "Reloading PLCCache"
 442                 plccache.init()
 443         try:
 444                 # NOTE: evidently, there is a bizarre interaction between iLO and ssh
 445                 # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.
 446                 if 'LANG' in os.environ:
 447                         del os.environ['LANG']
 448                 main()
 449                 time.sleep(1)
 450         except Exception, err:
 451                 traceback.print_exc()
 452                 print "Exception: %s" % err
 453                 print "Saving data... exitting."
 454                 sys.exit(0)