findbadpcu.py

   1 #!/usr/bin/python
   2
   3 import os
   4 import sys
   5 import string
   6 import time
   7 import socket
   8 import sets
   9 import signal
  10 import traceback
  11 from datetime import datetime,timedelta
  12 import threadpool
  13 import threading
  14
  15 import monitor
  16 from pcucontrol  import reboot
  17 from monitor import config
  18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
  19 from monitor import database
  20 from monitor import util
  21 from monitor.wrapper import plc, plccache
  22 from nodequery import pcu_select
  23
# Held around the PLC / plccache lookups in get_pcu(), get_nodes() and
# get_plc_site_values().
plc_lock = threading.Lock()
# Current scan round; overwritten with the value stored in the
# FindbadPCURecordSync row whose plc_pcuid is 0.
global_round = 1
# Maps "id_<pcu>" to the error dict of a failed probe; dumped by
# recordPingAndSSH() under the name "findbadpcu_errors".
errorState = {}
# Number of PCUs recorded or skipped so far; only used in progress output.
count = 0
  28
  29 def nmap_port_status(status):
  30         ps = {}
  31         l_nmap = status.split()
  32         ports = l_nmap[4:]
  33
  34         continue_probe = False
  35         for port in ports:
  36                 results = port.split('/')
  37                 ps[results[0]] = results[1]
  38                 if results[1] == "open":
  39                         continue_probe = True
  40         return (ps, continue_probe)
  41
  42 def get_pcu(pcuname):
  43         plc_lock.acquire()
  44         try:
  45                 #print "GetPCU from PLC %s" % pcuname
  46                 l_pcu  = plc.GetPCUs({'pcu_id' : pcuname})
  47                 #print l_pcu
  48                 if len(l_pcu) > 0:
  49                         l_pcu = l_pcu[0]
  50         except:
  51                 try:
  52                         #print "GetPCU from file %s" % pcuname
  53                         l_pcus = plccache.l_pcus
  54                         for i in l_pcus:
  55                                 if i['pcu_id'] == pcuname:
  56                                         l_pcu = i
  57                 except:
  58                         traceback.print_exc()
  59                         l_pcu = None
  60
  61         plc_lock.release()
  62         return l_pcu
  63
  64 def get_nodes(node_ids):
  65         plc_lock.acquire()
  66         l_node = []
  67         try:
  68                 l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
  69         except:
  70                 try:
  71                         plc_nodes = plccache.l_plcnodes
  72                         for n in plc_nodes:
  73                                 if n['node_id'] in node_ids:
  74                                         l_node.append(n)
  75                 except:
  76                         traceback.print_exc()
  77                         l_node = None
  78
  79         plc_lock.release()
  80         if l_node == []:
  81                 l_node = None
  82         return l_node
  83
  84
  85 def get_plc_pcu_values(pcuname):
  86         """
  87                 Try to contact PLC to get the PCU info.
  88                 If that fails, try a backup copy from the last run.
  89                 If that fails, return None
  90         """
  91         values = {}
  92
  93         l_pcu = get_pcu(pcuname)
  94
  95         if l_pcu is not None:
  96                 site_id = l_pcu['site_id']
  97                 node_ids = l_pcu['node_ids']
  98                 l_node = get_nodes(node_ids)
  99
 100                 if l_node is not None:
 101                         for node in l_node:
 102                                 values[node['hostname']] = node['ports'][0]
 103
 104                         values['nodenames'] = [node['hostname'] for node in l_node]
 105
 106                         # NOTE: this is for a dry run later. It doesn't matter which node.
 107                         values['node_id'] = l_node[0]['node_id']
 108
 109                 values.update(l_pcu)
 110         else:
 111                 values = None
 112
 113         return values
 114
 115 def get_plc_site_values(site_id):
 116         ### GET PLC SITE ######################
 117         plc_lock.acquire()
 118         values = {}
 119         d_site = None
 120
 121         try:
 122                 d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
 123                 if len(d_site) > 0:
 124                         d_site = d_site[0]
 125         except:
 126                 try:
 127                         plc_sites = plccache.l_plcsites
 128                         for site in plc_sites:
 129                                 if site['site_id'] == site_id:
 130                                         d_site = site
 131                                         break
 132                 except:
 133                         traceback.print_exc()
 134                         values = None
 135
 136         plc_lock.release()
 137
 138         if d_site is not None:
 139                 max_slices = d_site['max_slices']
 140                 num_slices = len(d_site['slice_ids'])
 141                 num_nodes = len(d_site['node_ids'])
 142                 loginbase = d_site['login_base']
 143                 values['plcsite'] = {'num_nodes' : num_nodes,
 144                                                         'max_slices' : max_slices,
 145                                                         'num_slices' : num_slices,
 146                                                         'login_base' : loginbase,
 147                                                         'status'     : 'SUCCESS'}
 148         else:
 149                 values = None
 150
 151
 152         return values
 153
 154
 155 def collectPingAndSSH(pcuname, cohash):
 156
 157         continue_probe = True
 158         errors = None
 159         values = {'reboot' : 'novalue'}
 160         ### GET PCU ######################
 161         try:
 162                 b_except = False
 163                 try:
 164                         v = get_plc_pcu_values(pcuname)
 165                         if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
 166                         if v['ip'] is not None: v['ip'] = v['ip'].strip()
 167
 168                         if v is not None:
 169                                 values['plc_pcu_stats'] = v
 170                         else:
 171                                 continue_probe = False
 172                 except:
 173                         b_except = True
 174                         traceback.print_exc()
 175                         continue_probe = False
 176
 177                 if b_except or not continue_probe: return (None, None, None)
 178
 179
 180                 #### COMPLETE ENTRY   #######################
 181
 182                 values['entry_complete'] = []
 183                 #if values['protocol'] is None or values['protocol'] is "":
 184                 #       values['entry_complete'] += ["protocol"]
 185                 if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
 186                         values['entry_complete'] += ["model"]
 187                         # Cannot continue due to this condition
 188                         continue_probe = False
 189
 190                 if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
 191                         values['entry_complete'] += ["password"]
 192                         # Cannot continue due to this condition
 193                         continue_probe = False
 194
 195                 if len(values['entry_complete']) > 0:
 196                         continue_probe = False
 197
 198                 if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
 199                         values['entry_complete'] += ["hostname"]
 200                 if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
 201                         values['entry_complete'] += ["ip"]
 202
 203                 # If there are no nodes associated with this PCU, then we cannot continue.
 204                 if len(values['plc_pcu_stats']['node_ids']) == 0:
 205                         continue_probe = False
 206                         values['entry_complete'] += ['NoNodeIds']
 207
 208                 #### DNS and IP MATCH #######################
 209                 if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
 210                    values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
 211                         #print "Calling socket.gethostbyname(%s)" % values['hostname']
 212                         try:
 213                                 ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
 214                                 if ipaddr == values['plc_pcu_stats']['ip']:
 215                                         values['dns_status'] = "DNS-OK"
 216                                 else:
 217                                         values['dns_status'] = "DNS-MISMATCH"
 218                                         continue_probe = False
 219
 220                         except Exception, err:
 221                                 values['dns_status'] = "DNS-NOENTRY"
 222                                 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
 223                                 #print err
 224                 else:
 225                         if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
 226                                 values['dns_status'] = "NOHOSTNAME"
 227                                 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
 228                         else:
 229                                 values['dns_status'] = "NO-DNS-OR-IP"
 230                                 values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
 231                                 continue_probe = False
 232
 233                 #### RUN NMAP ###############################
 234                 if continue_probe:
 235                         nmap = util.command.CMD()
 236                         (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
 237                         # NOTE: an empty / error value for oval, will still work.
 238                         (values['port_status'], continue_probe) = nmap_port_status(oval)
 239                 else:
 240                         values['port_status'] = None
 241
 242
 243                 ######  DRY RUN  ############################
 244                 if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
 245                         rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True)
 246                 else:
 247                         rb_ret = "Not_Run" # No nodes to test"
 248
 249                 values['reboot'] = rb_ret
 250
 251         except:
 252                 print "____________________________________"
 253                 print values
 254                 errors = values
 255                 print "____________________________________"
 256                 errors['traceback'] = traceback.format_exc()
 257                 print errors['traceback']
 258                 values['reboot'] = errors['traceback']
 259
 260         values['date_checked'] = time.time()
 261         return (pcuname, values, errors)
 262
 263 def recordPingAndSSH(request, result):
 264         global errorState
 265         global count
 266         global global_round
 267         (nodename, values, errors) = result
 268
 269         if values is not None:
 270                 pcu_id = int(nodename)
 271                 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
 272                                                                                         if_new_set={'round': global_round})
 273                 global_round = fbsync.round
 274                 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
 275                                                                                         if_new_set={'round' : global_round})
 276
 277                 fbrec = FindbadPCURecord(
 278                                         date_checked=datetime.fromtimestamp(values['date_checked']),
 279                                         round=fbsync.round,
 280                                         plc_pcuid=pcu_id,
 281                                         plc_pcu_stats=values['plc_pcu_stats'],
 282                                         dns_status=values['dns_status'],
 283                                         port_status=values['port_status'],
 284                                         entry_complete=" ".join(values['entry_complete']),
 285                                         reboot_trial_status="%s" % values['reboot'],
 286                                 )
 287                 fbnodesync.round = global_round
 288
 289                 fbnodesync.flush()
 290                 fbsync.flush()
 291                 fbrec.flush()
 292
 293                 count += 1
 294                 print "%d %s %s" % (count, nodename, values)
 295
 296         if errors is not None:
 297                 pcu_id = "id_%s" % nodename
 298                 errorState[pcu_id] = errors
 299                 database.dbDump("findbadpcu_errors", errorState)
 300
 301 # this will be called when an exception occurs within a thread
 302 def handle_exception(request, result):
 303         print "Exception occured in request %s" % request.requestID
 304         for i in result:
 305                 print "Result: %s" % i
 306
 307
 308 def checkAndRecordState(l_pcus, cohash):
 309         global global_round
 310         global count
 311
 312         tp = threadpool.ThreadPool(10)
 313
 314         # CREATE all the work requests
 315         for pcuname in l_pcus:
 316                 pcu_id = int(pcuname)
 317                 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
 318                 fbnodesync.flush()
 319
 320                 node_round   = fbnodesync.round
 321                 if node_round < global_round or config.force:
 322                         # recreate node stats when refreshed
 323                         #print "%s" % nodename
 324                         req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
 325                                                                                  None, recordPingAndSSH, handle_exception)
 326                         tp.putRequest(req)
 327                 else:
 328                         # We just skip it, since it's "up to date"
 329                         count += 1
 330                         print "%d %s %s" % (count, pcu_id, node_round)
 331
 332         # WAIT while all the work requests are processed.
 333         begin = time.time()
 334         while 1:
 335                 try:
 336                         time.sleep(1)
 337                         tp.poll()
 338                         # if more than two hours
 339                         if time.time() - begin > (60*60*1):
 340                                 print "findbadpcus.py has run out of time!!!!!!"
 341                                 os._exit(1)
 342                 except KeyboardInterrupt:
 343                         print "Interrupted!"
 344                         break
 345                 except threadpool.NoResultsPending:
 346                         print "All results collected."
 347                         break
 348
 349         print FindbadPCURecordSync.query.count()
 350         print FindbadPCURecord.query.count()
 351         session.flush()
 352
 353
 354 def main():
 355         global global_round
 356
 357         #  monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
 358         l_pcus = plccache.l_pcus
 359         cohash = {}
 360
 361         fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
 362
 363         global_round = fbsync.round
 364
 365
 366         if config.site is not None:
 367                 api = plc.getAuthAPI()
 368                 site = api.GetSites(config.site)
 369                 l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
 370                 pcus = []
 371                 for node in l_nodes:
 372                         pcus += node['pcu_ids']
 373                 # clear out dups.
 374                 l_pcus = [pcu for pcu in sets.Set(pcus)]
 375         elif config.pcuselect is not None:
 376                 n, pcus = pcu_select(config.pcuselect)
 377                 print pcus
 378                 # clear out dups.
 379                 l_pcus = [pcu for pcu in sets.Set(pcus)]
 380
 381         elif config.nodelist == None and config.pcuid == None:
 382                 print "Calling API GetPCUs() : refresh(%s)" % config.refresh
 383                 l_pcus  = [pcu['pcu_id'] for pcu in l_pcus]
 384         elif config.nodelist is not None:
 385                 l_pcus = util.file.getListFromFile(config.nodelist)
 386                 l_pcus = [int(pcu) for pcu in l_pcus]
 387         elif config.pcuid is not None:
 388                 l_pcus = [ config.pcuid ]
 389                 l_pcus = [int(pcu) for pcu in l_pcus]
 390
 391         if config.increment:
 392                 # update global round number to force refreshes across all nodes
 393                 global_round += 1
 394                 fbsync.round = global_round
 395         fbsync.flush()
 396
 397         checkAndRecordState(l_pcus, cohash)
 398
 399         return 0
 400
 401
# Printed whenever this module is loaded: the statement sits outside the
# __main__ guard below.
print "main"
if __name__ == '__main__':
        # Send DEBUG-and-above records of the "monitor" logger to monitor.log
        # (opened in append mode).
        import logging
        logger = logging.getLogger("monitor")
        logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler("monitor.log", mode = 'a')
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
        # Build the command-line parser: defaults first, then the options
        # specific to this script.
        from monitor import parser as parsermodule
        parser = parsermodule.getParser()
        parser.set_defaults(nodelist=None,
                                                increment=False,
                                                pcuid=None,
                                                pcuselect=None,
                                                site=None,
                                                dbname="findbadpcus",
                                                cachenodes=False,
                                                cachecalls=True,
                                                force=False,
                                                )
        parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
                                                help="Provide the input file for the node list")
        parser.add_option("", "--site", dest="site", metavar="FILE",
                                                help="Get all pcus associated with the given site's nodes")
        parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
                                                help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
                                                help="Provide the id for a single pcu")

        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                                                help="Specify the name of the database to which the information is saved")
        parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls",
                                                help="Refresh the cached values")
        parser.add_option("-i", "--increment", action="store_true", dest="increment",
                                                help="Increment round number to force refresh or retry")
        parser.add_option("", "--force", action="store_true", dest="force",
                                                help="Force probe without incrementing global 'round'.")
        parser = parsermodule.getParser(['defaults'], parser)
        # Rebinds the module-level name 'config' (imported from monitor above)
        # to the parsed options; main() and checkAndRecordState() read it.
        config = parsermodule.parse_args(parser)
        if hasattr(config, 'cachecalls') and not config.cachecalls:
                # NOTE: if explicitly asked, refresh cached values.
                print "Reloading PLCCache"
                plccache.init()
        try:
                # NOTE: evidently, there is a bizarre interaction between iLO and ssh
                # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.
                if 'LANG' in os.environ:
                        del os.environ['LANG']
                main()
                time.sleep(1)
        except Exception, err:
                # Any failure is reported on stdout; the process still exits with
                # status 0.
                traceback.print_exc()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)