findbad.py

   1 #!/usr/bin/python
   2
   3 import os
   4 import sys
   5 import string
   6 import time
   7 from datetime import datetime,timedelta
   8 import threadpool
   9 import threading
  10
  11 from monitor import util
  12 from monitor.util import command
  13 from monitor import config
  14
  15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
  16
  17 from monitor.sources import comon
  18 from monitor.wrapper import plc, plccache
  19
  20 from nodequery import verify,query_to_dict,node_select
  21 import traceback
  22 from nodecommon import nmap_port_status
  23
  24 #print "starting sqlfindbad.py"
  25 # QUERY all nodes.
  26 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
  27                                 "table=table_nodeview&" + \
  28                                 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
  29                                 "formatcsv"
  30                                     #"formatcsv&" + \
  31                                         #"select='lastcotop!=0'"
  32
  33 api = plc.getAuthAPI()
  34 plc_lock = threading.Lock()
  35 round = 1
  36 global_round = round
  37 count = 0
  38
  39 def collectNMAP(nodename, cohash):
  40         #### RUN NMAP ###############################
  41         values = {}
  42         nmap = util.command.CMD()
  43         print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
  44         (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
  45         # NOTE: an empty / error value for oval, will still work.
  46         (values['port_status'], continue_probe) = nmap_port_status(oval)
  47
  48         values['date_checked'] = datetime.now()
  49
  50         return (nodename, values)
  51
  52 def collectPingAndSSH(nodename, cohash):
  53         ### RUN PING ######################
  54         ping = command.CMD()
  55         (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
  56
  57         try:
  58                 values = {}
  59
  60                 if oval == "":
  61                         # An error occurred
  62                         values['ping_status'] = False
  63                 else:
  64                         values['ping_status'] = True
  65
  66                 try:
  67                         for port in [22, 806]:
  68                                 ssh = command.SSH('root', nodename, port)
  69
  70                                 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
  71                                         echo "{"
  72                                         echo '  "kernel_version":"'`uname -a`'",'
  73                                         echo '  "bmlog":"'`ls /tmp/bm.log`'",'
  74                                         echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
  75                                         echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
  76                                         echo '  "fs_status":"'`touch /var/log/monitor 2>&1`'",'
  77                                         echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
  78                                         echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'
  79
  80                                         ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
  81                                         echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
  82                                         echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
  83                                         echo "}"
  84 EOF                             """)
  85
  86                                 values['ssh_error'] = errval
  87                                 if len(oval) > 0:
  88                                         #print "OVAL: %s" % oval
  89                                         values.update(eval(oval))
  90                                         values['ssh_portused'] = port
  91                                         break
  92                                 else:
  93                                         values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '',
  94                                                                         'nm_status' : '',
  95                                                                         'fs_status' : '',
  96                                                                         'dns_status' : '',
  97                                                                         'princeton_comon_dir' : "",
  98                                                                         'princeton_comon_running' : "",
  99                                                                         'princeton_comon_procs' : "", 'ssh_portused' : None})
 100                 except:
 101                         print traceback.print_exc()
 102                         sys.exit(1)
 103
 104                 ### RUN SSH ######################
 105                 b_getbootcd_id = True
 106                 #ssh = command.SSH('root', nodename)
 107                 #oval = ""
 108                 #errval = ""
 109                 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
 110
 111                 oval = values['kernel_version']
 112                 if "2.6.17" in oval or "2.6.2" in oval:
 113                         values['ssh_status'] = True
 114                         values['observed_category'] = 'PROD'
 115                         if "bm.log" in values['bmlog']:
 116                                 values['observed_status'] = 'DEBUG'
 117                         else:
 118                                 values['observed_status'] = 'BOOT'
 119                 elif "2.6.12" in oval or "2.6.10" in oval:
 120                         values['ssh_status'] = True
 121                         values['observed_category'] = 'OLDPROD'
 122                         if "bm.log" in values['bmlog']:
 123                                 values['observed_status'] = 'DEBUG'
 124                         else:
 125                                 values['observed_status'] = 'BOOT'
 126
 127                 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.  I have no idea why.
 128                 elif "2.4" in oval or "2.6.8" in oval:
 129                         b_getbootcd_id = False
 130                         values['ssh_status'] = True
 131                         values['observed_category'] = 'OLDBOOTCD'
 132                         values['observed_status'] = 'DEBUG'
 133                 elif oval != "":
 134                         values['ssh_status'] = True
 135                         values['observed_category'] = 'UNKNOWN'
 136                         if "bm.log" in values['bmlog']:
 137                                 values['observed_status'] = 'DEBUG'
 138                         else:
 139                                 values['observed_status'] = 'BOOT'
 140                 else:
 141                         # An error occurred.
 142                         b_getbootcd_id = False
 143                         values['ssh_status'] = False
 144                         values['observed_category'] = 'ERROR'
 145                         values['observed_status'] = 'DOWN'
 146                         val = errval.strip()
 147                         values['ssh_error'] = val
 148                         values['kernel_version'] = ""
 149
 150                 #values['kernel_version'] = val
 151
 152                 if b_getbootcd_id:
 153                         # try to get BootCD for all nodes that are not 2.4 nor inaccessible
 154                         #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
 155                         oval = values['bootcd_version']
 156                         if "BootCD" in oval:
 157                                 values['bootcd_version'] = oval
 158                                 if "v2" in oval and \
 159                                         ( nodename is not "planetlab1.cs.unc.edu" and \
 160                                           nodename is not "planetlab2.cs.unc.edu" ):
 161                                         values['observed_category'] = 'OLDBOOTCD'
 162                         else:
 163                                 values['bootcd_version'] = ""
 164                 else:
 165                         values['bootcd_version'] = ""
 166
 167                 # TODO: get bm.log for debug nodes.
 168                 # 'zcat /tmp/bm.log'
 169
 170                 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
 171                 oval = values['nm_status']
 172                 if "nm.py" in oval:
 173                         values['nm_status'] = "Y"
 174                 else:
 175                         values['nm_status'] = "N"
 176
 177                 continue_slice_check = True
 178                 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
 179                 oval = values['princeton_comon_dir']
 180                 if "princeton_comon_dir" in oval:
 181                         values['princeton_comon_dir'] = True
 182                 else:
 183                         values['princeton_comon_dir'] = False
 184                         continue_slice_check = False
 185
 186                 if continue_slice_check:
 187                         #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
 188                         oval = values['princeton_comon_running']
 189                         if len(oval) > len('/proc/virtual/'):
 190                                 values['princeton_comon_running'] = True
 191                         else:
 192                                 values['princeton_comon_running'] = False
 193                                 continue_slice_check = False
 194                 else:
 195                         values['princeton_comon_running'] = False
 196
 197                 if continue_slice_check:
 198                         #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
 199                         oval = values['princeton_comon_procs']
 200                         values['princeton_comon_procs'] = int(oval)
 201                 else:
 202                         values['princeton_comon_procs'] = None
 203
 204
 205                 if nodename in cohash:
 206                         values['comon_stats'] = cohash[nodename]
 207                 else:
 208                         values['comon_stats'] = {'resptime':  '-1',
 209                                                                         'uptime':    '-1',
 210                                                                         'sshstatus': '-1',
 211                                                                         'lastcotop': '-1',
 212                                                                         'cpuspeed' : "null",
 213                                                                         'disksize' : 'null',
 214                                                                         'memsize'  : 'null'}
 215                 # include output value
 216                 ### GET PLC NODE ######################
 217                 plc_lock.acquire()
 218                 d_node = None
 219                 try:
 220                         d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
 221                                                                         'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
 222                 except:
 223                         traceback.print_exc()
 224                 plc_lock.release()
 225                 values['plc_node_stats'] = d_node
 226
 227                 ##### NMAP  ###################
 228                 (n, v) = collectNMAP(nodename, None)
 229                 values.update(v)
 230
 231                 ### GET PLC PCU ######################
 232                 site_id = -1
 233                 d_pcu = None
 234                 if d_node:
 235                         pcu = d_node['pcu_ids']
 236                         if len(pcu) > 0:
 237                                 d_pcu = pcu[0]
 238
 239                         site_id = d_node['site_id']
 240
 241                 values['plc_pcuid'] = d_pcu
 242
 243                 ### GET PLC SITE ######################
 244                 plc_lock.acquire()
 245                 d_site = None
 246                 values['loginbase'] = ""
 247                 try:
 248                         d_site = plc.getSites({'site_id': site_id},
 249                                                                 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
 250                         values['loginbase'] = d_site['login_base']
 251                 except:
 252                         traceback.print_exc()
 253                 plc_lock.release()
 254
 255                 values['plc_site_stats'] = d_site
 256                 values['date_checked'] = datetime.now()
 257         except:
 258                 print traceback.print_exc()
 259
 260         return (nodename, values)
 261
 262 def recordPingAndSSH(request, result):
 263         global global_round
 264         global count
 265         (nodename, values) = result
 266
 267         try:
 268                 if values is not None:
 269                         #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
 270                         #                                                                                               if_new_set={'round' : global_round})
 271                         #global_round = fbsync.round
 272                         fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
 273                                                                                                                         if_new_set={'round' : global_round})
 274
 275                         # NOTE: This code will either add a new record for the new global_round,
 276                         #               OR it will find the previous value, and update it
 277                         #               with new information.
 278                         #               The data that is 'lost' is not that important, b/c older
 279                         #               history still exists.
 280                         fbrec = FindbadNodeRecord.findby_or_create(
 281                                                 round=global_round,
 282                                                 hostname=nodename)
 283
 284                         fbrec.set(  **values )
 285                                                 #date_checked=values['date_checked'],
 286                                                 #loginbase=values['loginbase'],
 287                                                 #kernel_version=values['kernel_version'],
 288                                                 #bootcd_version=values['bootcd_version'],
 289                                                 #nm_status=values['nm_status'],
 290                                                 #fs_status=values['fs_status'],
 291                                                 #dns_status=values['dns_status'],
 292                                                 #princeton_comon_dir=values['princeton_comon_dir'],
 293                                                 #princeton_comon_running=values['princeton_comon_running'],
 294                                                 #princeton_comon_procs=values['princeton_comon_procs'],
 295                                                 #plc_node_stats = values['plc_node_stats'],
 296                                                 #plc_site_stats = values['plc_site_stats'],
 297                                                 #plc_pcuid = values['plc_pcuid'],
 298                                                 #comon_stats = values['comon_stats'],
 299                                                 #ping_status = values['ping_status'],
 300                                                 #ssh_portused = values['ssh_portused'],
 301                                                 #ssh_status = values['ssh_status'],
 302                                                 #ssh_error = values['ssh_error'],
 303                                                 #observed_status = values['observed_status'],
 304                                                 #observed_category = values['observed_category'])
 305
 306                         #for v in before.keys():
 307                         #       if before[v] == after[v]:
 308                         #               print "SAME FOR KEY %s" % v
 309                         #       print "%s : %s\t%s" % ( v, before[v], after[v] )
 310
 311                         fbrec.flush()
 312                         fbnodesync.round = global_round
 313                         fbnodesync.flush()
 314                         #fbsync.flush()
 315
 316                         count += 1
 317                         print "%d %s %s" % (count, nodename, values)
 318         except:
 319                 print "ERROR:"
 320                 print traceback.print_exc()
 321
 322 # this will be called when an exception occurs within a thread
 323 def handle_exception(request, result):
 324         print "Exception occured in request %s" % request.requestID
 325         for i in result:
 326                 print "Result: %s" % i
 327
 328 def externalprobe(hostname):
 329         try:
 330                 (nodename, values) = collectNMAP(hostname, {})
 331                 recordPingAndSSH(None, (nodename, values))
 332                 session.flush()
 333                 return True
 334         except:
 335                 print traceback.print_exc()
 336                 return False
 337
 338 def probe(hostname):
 339         try:
 340                 (nodename, values) = collectPingAndSSH(hostname, {})
 341                 recordPingAndSSH(None, (nodename, values))
 342                 session.flush()
 343                 return True
 344         except:
 345                 print traceback.print_exc()
 346                 return False
 347
 348
 349 def checkAndRecordState(l_nodes, cohash):
 350         global global_round
 351         global count
 352
 353         tp = threadpool.ThreadPool(20)
 354
 355         # CREATE all the work requests
 356         for nodename in l_nodes:
 357                 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
 358                 node_round   = fbnodesync.round
 359                 fbnodesync.flush()
 360
 361                 if node_round < global_round or config.force:
 362                         # recreate node stats when refreshed
 363                         #print "%s" % nodename
 364                         req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
 365                                                                                  None, recordPingAndSSH, handle_exception)
 366                         tp.putRequest(req)
 367                 else:
 368                         # We just skip it, since it's "up to date"
 369                         count += 1
 370                         #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
 371                         print "%d %s %s" % (count, nodename, node_round)
 372
 373         # WAIT while all the work requests are processed.
 374         begin = time.time()
 375         while 1:
 376                 try:
 377                         time.sleep(1)
 378                         tp.poll()
 379                         # if more than two hours
 380                         if time.time() - begin > (60*60*1.5):
 381                                 print "findbad.py has run out of time!!!!!!"
 382                                 os._exit(1)
 383                 except KeyboardInterrupt:
 384                         print "Interrupted!"
 385                         break
 386                 except threadpool.NoResultsPending:
 387                         print "All results collected."
 388                         break
 389
 390         print FindbadNodeRecordSync.query.count()
 391         print FindbadNodeRecord.query.count()
 392         session.flush()
 393
 394 def main():
 395         global global_round
 396
 397         fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
 398                                                                                                         if_new_set={'round' : global_round})
 399         global_round = fbsync.round
 400
 401         if config.increment:
 402                 # update global round number to force refreshes across all nodes
 403                 global_round += 1
 404
 405         cotop = comon.Comon()
 406         # lastcotop measures whether cotop is actually running.  this is a better
 407         # metric than sshstatus, or other values from CoMon
 408         cotop_url = COMON_COTOPURL
 409
 410         # history information for all nodes
 411         cohash = {}
 412         #cohash = cotop.coget(cotop_url)
 413         l_nodes = plccache.l_nodes
 414         if config.nodelist:
 415                 f_nodes = util.file.getListFromFile(config.nodelist)
 416                 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
 417         elif config.node:
 418                 f_nodes = [config.node]
 419                 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
 420         elif config.nodegroup:
 421                 ng = api.GetNodeGroups({'name' : config.nodegroup})
 422                 l_nodes = api.GetNodes(ng[0]['node_ids'])
 423         elif config.site:
 424                 site = api.GetSites(config.site)
 425                 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 426
 427         l_nodes = [node['hostname'] for node in l_nodes]
 428
 429         # perform this query after the above options, so that the filter above
 430         # does not break.
 431         if config.nodeselect:
 432                 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
 433                 plcnodes = [ node['hostname'] for node in plcnodes ]
 434                 l_nodes = node_select(config.nodeselect, plcnodes, None)
 435
 436         print "fetching %s hosts" % len(l_nodes)
 437
 438         checkAndRecordState(l_nodes, cohash)
 439
 440         if config.increment:
 441                 # update global round number to force refreshes across all nodes
 442                 fbsync.round = global_round
 443                 fbsync.flush()
 444
 445         return 0
 446
 447
 448 if __name__ == '__main__':
 449         from monitor import parser as parsermodule
 450
 451         parser = parsermodule.getParser(['nodesets'])
 452
 453         parser.set_defaults( increment=False, dbname="findbad", cachenodes=False,
 454                                                 force=False,)
 455         parser.add_option("", "--cachenodes", action="store_true",
 456                                                 help="Cache node lookup from PLC")
 457         parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
 458                                                 help="Specify the name of the database to which the information is saved")
 459         parser.add_option("-i", "--increment", action="store_true", dest="increment",
 460                                                 help="Increment round number to force refresh or retry")
 461         parser.add_option("", "--force", action="store_true", dest="force",
 462                                                 help="Force probe without incrementing global 'round'.")
 463
 464         parser = parsermodule.getParser(['defaults'], parser)
 465
 466         cfg = parsermodule.parse_args(parser)
 467
 468         try:
 469                 main()
 470         except Exception, err:
 471                 print traceback.print_exc()
 472                 print "Exception: %s" % err
 473                 print "Saving data... exitting."
 474                 sys.exit(0)
 475         print "sleeping"
 476         #print "final commit"
 477         #time.sleep(10)