findbadpcu.py

   1 #!/usr/bin/python
   2
   3 import os
   4 import sys
   5 import string
   6 import time
   7 import socket
   8 import sets
   9 import signal
  10 import traceback
  11 from datetime import datetime,timedelta
  12 import threadpool
  13 import threading
  14
  15 import monitor
  16 from pcucontrol  import reboot
  17 from monitor import config
  18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
  19 from monitor import database
  20 from monitor import util
  21 from monitor.wrapper import plc, plccache
  22 from nodequery import pcu_select
  23
  24 plc_lock = threading.Lock()
  25 global_round = 1
  26 errorState = {}
  27 count = 0
  28
  29 def nmap_portstatus(status):
  30         ps = {}
  31         l_nmap = status.split()
  32         ports = l_nmap[4:]
  33
  34         continue_probe = False
  35         for port in ports:
  36                 results = port.split('/')
  37                 ps[results[0]] = results[1]
  38                 if results[1] == "open":
  39                         continue_probe = True
  40         return (ps, continue_probe)
  41
  42 def get_pcu(pcuname):
  43         plc_lock.acquire()
  44         try:
  45                 #print "GetPCU from PLC %s" % pcuname
  46                 l_pcu  = plc.GetPCUs({'pcu_id' : pcuname})
  47                 #print l_pcu
  48                 if len(l_pcu) > 0:
  49                         l_pcu = l_pcu[0]
  50         except:
  51                 try:
  52                         #print "GetPCU from file %s" % pcuname
  53                         l_pcus = plccache.l_pcus
  54                         for i in l_pcus:
  55                                 if i['pcu_id'] == pcuname:
  56                                         l_pcu = i
  57                 except:
  58                         traceback.print_exc()
  59                         l_pcu = None
  60
  61         plc_lock.release()
  62         return l_pcu
  63
  64 def get_nodes(node_ids):
  65         plc_lock.acquire()
  66         l_node = []
  67         try:
  68                 l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
  69         except:
  70                 try:
  71                         plc_nodes = plccache.l_plcnodes
  72                         for n in plc_nodes:
  73                                 if n['node_id'] in node_ids:
  74                                         l_node.append(n)
  75                 except:
  76                         traceback.print_exc()
  77                         l_node = None
  78
  79         plc_lock.release()
  80         if l_node == []:
  81                 l_node = None
  82         return l_node
  83
  84
  85 def get_plc_pcu_values(pcuname):
  86         """
  87                 Try to contact PLC to get the PCU info.
  88                 If that fails, try a backup copy from the last run.
  89                 If that fails, return None
  90         """
  91         values = {}
  92
  93         l_pcu = get_pcu(pcuname)
  94
  95         if l_pcu is not None:
  96                 site_id = l_pcu['site_id']
  97                 node_ids = l_pcu['node_ids']
  98                 l_node = get_nodes(node_ids)
  99
 100                 if l_node is not None:
 101                         for node in l_node:
 102                                 values[node['hostname']] = node['ports'][0]
 103
 104                         values['nodenames'] = [node['hostname'] for node in l_node]
 105
 106                         # NOTE: this is for a dry run later. It doesn't matter which node.
 107                         values['node_id'] = l_node[0]['node_id']
 108
 109                 values.update(l_pcu)
 110         else:
 111                 values = None
 112
 113         return values
 114
 115 def get_plc_site_values(site_id):
 116         ### GET PLC SITE ######################
 117         plc_lock.acquire()
 118         values = {}
 119         d_site = None
 120
 121         try:
 122                 d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
 123                 if len(d_site) > 0:
 124                         d_site = d_site[0]
 125         except:
 126                 try:
 127                         plc_sites = plccache.l_plcsites
 128                         for site in plc_sites:
 129                                 if site['site_id'] == site_id:
 130                                         d_site = site
 131                                         break
 132                 except:
 133                         traceback.print_exc()
 134                         values = None
 135
 136         plc_lock.release()
 137
 138         if d_site is not None:
 139                 max_slices = d_site['max_slices']
 140                 num_slices = len(d_site['slice_ids'])
 141                 num_nodes = len(d_site['node_ids'])
 142                 loginbase = d_site['login_base']
 143                 values['plcsite'] = {'num_nodes' : num_nodes,
 144                                                         'max_slices' : max_slices,
 145                                                         'num_slices' : num_slices,
 146                                                         'login_base' : loginbase,
 147                                                         'status'     : 'SUCCESS'}
 148         else:
 149                 values = None
 150
 151
 152         return values
 153
 154
 155 def collectPingAndSSH(pcuname, cohash):
 156
 157         continue_probe = True
 158         errors = None
 159         values = {'reboot' : 'novalue'}
 160         ### GET PCU ######################
 161         try:
 162                 b_except = False
 163                 try:
 164                         v = get_plc_pcu_values(pcuname)
 165                         if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
 166                         if v['ip'] is not None: v['ip'] = v['ip'].strip()
 167
 168                         if v is not None:
 169                                 values['plc_pcu_stats'] = v
 170                         else:
 171                                 continue_probe = False
 172                 except:
 173                         b_except = True
 174                         traceback.print_exc()
 175                         continue_probe = False
 176
 177                 if b_except or not continue_probe: return (None, None, None)
 178
 179
 180                 #### COMPLETE ENTRY   #######################
 181
 182                 values['complete_entry'] = []
 183                 #if values['protocol'] is None or values['protocol'] is "":
 184                 #       values['complete_entry'] += ["protocol"]
 185                 if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
 186                         values['complete_entry'] += ["model"]
 187                         # Cannot continue due to this condition
 188                         continue_probe = False
 189
 190                 if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
 191                         values['complete_entry'] += ["password"]
 192                         # Cannot continue due to this condition
 193                         continue_probe = False
 194
 195                 if len(values['complete_entry']) > 0:
 196                         continue_probe = False
 197
 198                 if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
 199                         values['complete_entry'] += ["hostname"]
 200                 if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
 201                         values['complete_entry'] += ["ip"]
 202
 203                 # If there are no nodes associated with this PCU, then we cannot continue.
 204                 if len(values['plc_pcu_stats']['node_ids']) == 0:
 205                         continue_probe = False
 206                         values['complete_entry'] += ['NoNodeIds']
 207
 208                 #### DNS and IP MATCH #######################
 209                 if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
 210                    values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
 211                         #print "Calling socket.gethostbyname(%s)" % values['hostname']
 212                         try:
 213                                 ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
 214                                 if ipaddr == values['plc_pcu_stats']['ip']:
 215                                         values['dnsmatch'] = "DNS-OK"
 216                                 else:
 217                                         values['dnsmatch'] = "DNS-MISMATCH"
 218                                         continue_probe = False
 219
 220                         except Exception, err:
 221                                 values['dnsmatch'] = "DNS-NOENTRY"
 222                                 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
 223                                 #print err
 224                 else:
 225                         if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
 226                                 values['dnsmatch'] = "NOHOSTNAME"
 227                                 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
 228                         else:
 229                                 values['dnsmatch'] = "NO-DNS-OR-IP"
 230                                 values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
 231                                 continue_probe = False
 232
 233                 #### RUN NMAP ###############################
 234                 if continue_probe:
 235                         nmap = util.command.CMD()
 236                         (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
 237                         # NOTE: an empty / error value for oval, will still work.
 238                         (values['portstatus'], continue_probe) = nmap_portstatus(oval)
 239                 else:
 240                         values['portstatus'] = None
 241
 242
 243                 ######  DRY RUN  ############################
 244                 if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
 245                         rb_ret = reboot.reboot_test(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True)
 246                 else:
 247                         rb_ret = "Not_Run" # No nodes to test"
 248
 249                 values['reboot'] = rb_ret
 250
 251         except:
 252                 print "____________________________________"
 253                 print values
 254                 errors = values
 255                 print "____________________________________"
 256                 errors['traceback'] = traceback.format_exc()
 257                 print errors['traceback']
 258
 259         values['date_checked'] = time.time()
 260         return (pcuname, values, errors)
 261
 262 def recordPingAndSSH(request, result):
 263         global errorState
 264         global count
 265         global global_round
 266         (nodename, values, errors) = result
 267
 268         if values is not None:
 269                 pcu_id = int(nodename)
 270                 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
 271                                                                                         if_new_set={'round': global_round})
 272                 global_round = fbsync.round
 273                 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
 274                                                                                         if_new_set={'round' : global_round})
 275
 276                 fbrec = FindbadPCURecord(
 277                                         date_checked=datetime.fromtimestamp(values['date_checked']),
 278                                         round=fbsync.round,
 279                                         plc_pcuid=pcu_id,
 280                                         plc_pcu_stats=values['plc_pcu_stats'],
 281                                         dns_status=values['dnsmatch'],
 282                                         port_status=values['portstatus'],
 283                                         entry_complete=" ".join(values['complete_entry']),
 284                                         reboot_trial_status="%s" % values['reboot'],
 285                                 )
 286                 fbnodesync.round = global_round
 287
 288                 fbnodesync.flush()
 289                 fbsync.flush()
 290                 fbrec.flush()
 291
 292                 count += 1
 293                 print "%d %s %s" % (count, nodename, values)
 294
 295         if errors is not None:
 296                 pcu_id = "id_%s" % nodename
 297                 errorState[pcu_id] = errors
 298                 database.dbDump("findbadpcu_errors", errorState)
 299
 300 # this will be called when an exception occurs within a thread
 301 def handle_exception(request, result):
 302         print "Exception occured in request %s" % request.requestID
 303         for i in result:
 304                 print "Result: %s" % i
 305
 306
 307 def checkAndRecordState(l_pcus, cohash):
 308         global global_round
 309         global count
 310
 311         tp = threadpool.ThreadPool(10)
 312
 313         # CREATE all the work requests
 314         for pcuname in l_pcus:
 315                 pcu_id = int(pcuname)
 316                 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
 317                 fbnodesync.flush()
 318
 319                 node_round   = fbnodesync.round
 320                 if node_round < global_round or config.force:
 321                         # recreate node stats when refreshed
 322                         #print "%s" % nodename
 323                         req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
 324                                                                                  None, recordPingAndSSH, handle_exception)
 325                         tp.putRequest(req)
 326                 else:
 327                         # We just skip it, since it's "up to date"
 328                         count += 1
 329                         print "%d %s %s" % (count, pcu_id, node_round)
 330
 331         # WAIT while all the work requests are processed.
 332         begin = time.time()
 333         while 1:
 334                 try:
 335                         time.sleep(1)
 336                         tp.poll()
 337                         # if more than two hours
 338                         if time.time() - begin > (60*60*1):
 339                                 print "findbadpcus.py has run out of time!!!!!!"
 340                                 os._exit(1)
 341                 except KeyboardInterrupt:
 342                         print "Interrupted!"
 343                         break
 344                 except threadpool.NoResultsPending:
 345                         print "All results collected."
 346                         break
 347
 348         print FindbadPCURecordSync.query.count()
 349         print FindbadPCURecord.query.count()
 350         session.flush()
 351
 352
 353 def main():
 354         global global_round
 355
 356         #  monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
 357         l_pcus = plccache.l_pcus
 358         cohash = {}
 359
 360         fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
 361
 362         global_round = fbsync.round
 363
 364
 365         if config.site is not None:
 366                 api = plc.getAuthAPI()
 367                 site = api.GetSites(config.site)
 368                 l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
 369                 pcus = []
 370                 for node in l_nodes:
 371                         pcus += node['pcu_ids']
 372                 # clear out dups.
 373                 l_pcus = [pcu for pcu in sets.Set(pcus)]
 374         elif config.pcuselect is not None:
 375                 n, pcus = pcu_select(config.pcuselect)
 376                 print pcus
 377                 # clear out dups.
 378                 l_pcus = [pcu for pcu in sets.Set(pcus)]
 379
 380         elif config.nodelist == None and config.pcuid == None:
 381                 print "Calling API GetPCUs() : refresh(%s)" % config.refresh
 382                 l_pcus  = [pcu['pcu_id'] for pcu in l_pcus]
 383         elif config.nodelist is not None:
 384                 l_pcus = util.file.getListFromFile(config.nodelist)
 385                 l_pcus = [int(pcu) for pcu in l_pcus]
 386         elif config.pcuid is not None:
 387                 l_pcus = [ config.pcuid ]
 388                 l_pcus = [int(pcu) for pcu in l_pcus]
 389
 390         if config.increment:
 391                 # update global round number to force refreshes across all nodes
 392                 global_round += 1
 393                 fbsync.round = global_round
 394         fbsync.flush()
 395
 396         checkAndRecordState(l_pcus, cohash)
 397
 398         return 0
 399
 400
 401 if __name__ == '__main__':
 402         import logging
 403         logger = logging.getLogger("monitor")
 404         logger.setLevel(logging.DEBUG)
 405         fh = logging.FileHandler("monitor.log", mode = 'a')
 406         fh.setLevel(logging.DEBUG)
 407         formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
 408         fh.setFormatter(formatter)
 409         logger.addHandler(fh)
 410         from monitor import parser as parsermodule
 411         parser = parsermodule.getParser()
 412         parser.set_defaults(nodelist=None,
 413                                                 increment=False,
 414                                                 pcuid=None,
 415                                                 pcuselect=None,
 416                                                 site=None,
 417                                                 dbname="findbadpcus",
 418                                                 cachenodes=False,
 419                                                 refresh=False,
 420                                                 force=False,
 421                                                 )
 422         parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
 423                                                 help="Provide the input file for the node list")
 424         parser.add_option("", "--site", dest="site", metavar="FILE",
 425                                                 help="Get all pcus associated with the given site's nodes")
 426         parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
 427                                                 help="Query string to apply to the findbad pcus")
 428         parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
 429                                                 help="Provide the id for a single pcu")
 430
 431         parser.add_option("", "--cachenodes", action="store_true",
 432                                                 help="Cache node lookup from PLC")
 433         parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
 434                                                 help="Specify the name of the database to which the information is saved")
 435         parser.add_option("", "--refresh", action="store_true", dest="refresh",
 436                                                 help="Refresh the cached values")
 437         parser.add_option("-i", "--increment", action="store_true", dest="increment",
 438                                                 help="Increment round number to force refresh or retry")
 439         parser.add_option("", "--force", action="store_true", dest="force",
 440                                                 help="Force probe without incrementing global 'round'.")
 441         parser = parsermodule.getParser(['defaults'], parser)
 442         config = parsermodule.parse_args(parser)
 443         try:
 444                 # NOTE: evidently, there is a bizarre interaction between iLO and ssh
 445                 # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.
 446                 if 'LANG' in os.environ:
 447                         del os.environ['LANG']
 448                 main()
 449                 time.sleep(1)
 450         except Exception, err:
 451                 traceback.print_exc()
 452                 print "Exception: %s" % err
 453                 print "Saving data... exitting."
 454                 sys.exit(0)