findbad.py

   1 #!/usr/bin/python
   2
   3 import os
   4 import sys
   5 import string
   6 import time
   7 from datetime import datetime,timedelta
   8 import threadpool
   9 import threading
  10
  11 from monitor import util
  12 from monitor.util import command
  13 from monitor import config
  14
  15 from monitor.database.infovacuum import FindbadNodeRecordSync, FindbadNodeRecord
  16 from monitor.database.dborm import mon_session as session
  17
  18 from monitor.sources import comon
  19 from monitor.wrapper import plc, plccache
  20
  21 from nodequery import verify,query_to_dict,node_select
  22 import traceback
  23
  24 print "starting sqlfindbad.py"
  25 # QUERY all nodes.
  26 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
  27                                 "table=table_nodeview&" + \
  28                                 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
  29                                 "formatcsv"
  30                                     #"formatcsv&" + \
  31                                         #"select='lastcotop!=0'"
  32
  33 api = plc.getAuthAPI()
  34 plc_lock = threading.Lock()
  35 round = 1
  36 global_round = round
  37 count = 0
  38
  39 def collectPingAndSSH(nodename, cohash):
  40         ### RUN PING ######################
  41         ping = command.CMD()
  42         (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
  43
  44         try:
  45                 values = {}
  46
  47                 if oval == "":
  48                         # An error occurred
  49                         values['ping'] = "NOPING"
  50                 else:
  51                         values['ping'] = "PING"
  52
  53                 try:
  54                         for port in [22, 806]:
  55                                 ssh = command.SSH('root', nodename, port)
  56
  57                                 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
  58                                         echo "{"
  59                                         echo '  "kernel":"'`uname -a`'",'
  60                                         echo '  "bmlog":"'`ls /tmp/bm.log`'",'
  61                                         echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
  62                                         echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
  63                                         echo '  "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
  64                                         echo '  "dns":"'`host boot.planet-lab.org 2>&1`'",'
  65                                         echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
  66
  67                                         ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
  68                                         echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
  69                                         echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
  70                                         echo "}"
  71 EOF                             """)
  72
  73                                 values['ssherror'] = errval
  74                                 if len(oval) > 0:
  75                                         #print "OVAL: %s" % oval
  76                                         values.update(eval(oval))
  77                                         values['sshport'] = port
  78                                         break
  79                                 else:
  80                                         values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '',
  81                                                                         'nm' : '',
  82                                                                         'readonlyfs' : '',
  83                                                                         'dns' : '',
  84                                                                         'princeton_comon' : "",
  85                                                                         'princeton_comon_running' : "",
  86                                                                         'princeton_comon_procs' : "", 'sshport' : None})
  87                 except:
  88                         print traceback.print_exc()
  89                         sys.exit(1)
  90
  91                 ### RUN SSH ######################
  92                 b_getbootcd_id = True
  93                 #ssh = command.SSH('root', nodename)
  94                 #oval = ""
  95                 #errval = ""
  96                 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
  97
  98                 oval = values['kernel']
  99                 if "2.6.17" in oval or "2.6.2" in oval:
 100                         values['ssh'] = 'SSH'
 101                         values['category'] = 'PROD'
 102                         if "bm.log" in values['bmlog']:
 103                                 values['state'] = 'DEBUG'
 104                         else:
 105                                 values['state'] = 'BOOT'
 106                 elif "2.6.12" in oval or "2.6.10" in oval:
 107                         values['ssh'] = 'SSH'
 108                         values['category'] = 'OLDPROD'
 109                         if "bm.log" in values['bmlog']:
 110                                 values['state'] = 'DEBUG'
 111                         else:
 112                                 values['state'] = 'BOOT'
 113
 114                 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.  I have no idea why.
 115                 elif "2.4" in oval or "2.6.8" in oval:
 116                         b_getbootcd_id = False
 117                         values['ssh'] = 'SSH'
 118                         values['category'] = 'OLDBOOTCD'
 119                         values['state'] = 'DEBUG'
 120                 elif oval != "":
 121                         values['ssh'] = 'SSH'
 122                         values['category'] = 'UNKNOWN'
 123                         if "bm.log" in values['bmlog']:
 124                                 values['state'] = 'DEBUG'
 125                         else:
 126                                 values['state'] = 'BOOT'
 127                 else:
 128                         # An error occurred.
 129                         b_getbootcd_id = False
 130                         values['ssh'] = 'NOSSH'
 131                         values['category'] = 'ERROR'
 132                         values['state'] = 'DOWN'
 133                         val = errval.strip()
 134                         values['ssherror'] = val
 135                         values['kernel'] = ""
 136
 137                 #values['kernel'] = val
 138
 139                 if b_getbootcd_id:
 140                         # try to get BootCD for all nodes that are not 2.4 nor inaccessible
 141                         #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
 142                         oval = values['bootcd']
 143                         if "BootCD" in oval:
 144                                 values['bootcd'] = oval
 145                                 if "v2" in oval and \
 146                                         ( nodename is not "planetlab1.cs.unc.edu" and \
 147                                           nodename is not "planetlab2.cs.unc.edu" ):
 148                                         values['category'] = 'OLDBOOTCD'
 149                         else:
 150                                 values['bootcd'] = ""
 151                 else:
 152                         values['bootcd'] = ""
 153
 154                 # TODO: get bm.log for debug nodes.
 155                 # 'zcat /tmp/bm.log'
 156
 157                 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
 158                 oval = values['nm']
 159                 if "nm.py" in oval:
 160                         values['nm'] = "Y"
 161                 else:
 162                         values['nm'] = "N"
 163
 164                 continue_slice_check = True
 165                 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
 166                 oval = values['princeton_comon']
 167                 if "princeton_comon" in oval:
 168                         values['princeton_comon'] = True
 169                 else:
 170                         values['princeton_comon'] = False
 171                         continue_slice_check = False
 172
 173                 if continue_slice_check:
 174                         #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
 175                         oval = values['princeton_comon_running']
 176                         if len(oval) > len('/proc/virtual/'):
 177                                 values['princeton_comon_running'] = True
 178                         else:
 179                                 values['princeton_comon_running'] = False
 180                                 continue_slice_check = False
 181                 else:
 182                         values['princeton_comon_running'] = False
 183
 184                 if continue_slice_check:
 185                         #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
 186                         oval = values['princeton_comon_procs']
 187                         values['princeton_comon_procs'] = int(oval)
 188                 else:
 189                         values['princeton_comon_procs'] = None
 190
 191
 192                 if nodename in cohash:
 193                         values['comonstats'] = cohash[nodename]
 194                 else:
 195                         values['comonstats'] = {'resptime':  '-1',
 196                                                                         'uptime':    '-1',
 197                                                                         'sshstatus': '-1',
 198                                                                         'lastcotop': '-1',
 199                                                                         'cpuspeed' : "null",
 200                                                                         'disksize' : 'null',
 201                                                                         'memsize'  : 'null'}
 202                 # include output value
 203                 ### GET PLC NODE ######################
 204                 plc_lock.acquire()
 205                 d_node = None
 206                 try:
 207                         d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
 208                                                                         'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
 209                 except:
 210                         traceback.print_exc()
 211                 plc_lock.release()
 212                 values['plcnode'] = d_node
 213
 214                 ### GET PLC PCU ######################
 215                 site_id = -1
 216                 d_pcu = None
 217                 if d_node:
 218                         pcu = d_node['pcu_ids']
 219                         if len(pcu) > 0:
 220                                 d_pcu = pcu[0]
 221
 222                         site_id = d_node['site_id']
 223
 224                 values['pcu'] = d_pcu
 225
 226                 ### GET PLC SITE ######################
 227                 plc_lock.acquire()
 228                 d_site = None
 229                 values['loginbase'] = ""
 230                 try:
 231                         d_site = plc.getSites({'site_id': site_id},
 232                                                                 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
 233                         values['loginbase'] = d_site['login_base']
 234                 except:
 235                         traceback.print_exc()
 236                 plc_lock.release()
 237
 238                 values['plcsite'] = d_site
 239                 values['date_checked'] = time.time()
 240         except:
 241                 print traceback.print_exc()
 242
 243         return (nodename, values)
 244
 245 def recordPingAndSSH(request, result):
 246         global global_round
 247         global count
 248         (nodename, values) = result
 249
 250         try:
 251                 if values is not None:
 252                         fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
 253                                                                                                                         if_new_set={'round' : global_round})
 254                         global_round = fbsync.round
 255                         fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
 256                                                                                                                         if_new_set={'round' : global_round})
 257
 258                         fbrec = FindbadNodeRecord(
 259                                                 date_checked=datetime.fromtimestamp(values['date_checked']),
 260                                                 round=global_round,
 261                                                 hostname=nodename,
 262                                                 loginbase=values['loginbase'],
 263                                                 kernel_version=values['kernel'],
 264                                                 bootcd_version=values['bootcd'],
 265                                                 nm_status=values['nm'],
 266                                                 fs_status=values['readonlyfs'],
 267                                                 dns_status=values['dns'],
 268                                                 princeton_comon_dir=values['princeton_comon'],
 269                                                 princeton_comon_running=values['princeton_comon_running'],
 270                                                 princeton_comon_procs=values['princeton_comon_procs'],
 271                                                 plc_node_stats = values['plcnode'],
 272                                                 plc_site_stats = values['plcsite'],
 273                                                 plc_pcuid = values['pcu'],
 274                                                 comon_stats = values['comonstats'],
 275                                                 ping_status = (values['ping'] == "PING"),
 276                                                 ssh_portused = values['sshport'],
 277                                                 ssh_status = (values['ssh'] == "SSH"),
 278                                                 ssh_error = values['ssherror'],
 279                                                 observed_status = values['state'],
 280                                                 observed_category = values['category'],
 281                                         )
 282                         fbnodesync.round = global_round
 283                         fbnodesync.flush()
 284                         fbsync.flush()
 285                         fbrec.flush()
 286
 287                         count += 1
 288                         print "%d %s %s" % (count, nodename, values)
 289         except:
 290                 print "ERROR:"
 291                 print traceback.print_exc()
 292
 293 # this will be called when an exception occurs within a thread
 294 def handle_exception(request, result):
 295         print "Exception occured in request %s" % request.requestID
 296         for i in result:
 297                 print "Result: %s" % i
 298
 299
 300 def checkAndRecordState(l_nodes, cohash):
 301         global global_round
 302         global count
 303
 304         tp = threadpool.ThreadPool(20)
 305
 306         # CREATE all the work requests
 307         for nodename in l_nodes:
 308                 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
 309                 node_round   = fbnodesync.round
 310                 fbnodesync.flush()
 311
 312                 if node_round < global_round:
 313                         # recreate node stats when refreshed
 314                         #print "%s" % nodename
 315                         req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
 316                                                                                  None, recordPingAndSSH, handle_exception)
 317                         tp.putRequest(req)
 318                 else:
 319                         # We just skip it, since it's "up to date"
 320                         count += 1
 321                         #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
 322                         print "%d %s %s" % (count, nodename, node_round)
 323
 324         # WAIT while all the work requests are processed.
 325         begin = time.time()
 326         while 1:
 327                 try:
 328                         time.sleep(1)
 329                         tp.poll()
 330                         # if more than two hours
 331                         if time.time() - begin > (60*60*1.5):
 332                                 print "findbad.py has run out of time!!!!!!"
 333                                 os._exit(1)
 334                 except KeyboardInterrupt:
 335                         print "Interrupted!"
 336                         break
 337                 except threadpool.NoResultsPending:
 338                         print "All results collected."
 339                         break
 340
 341         print FindbadNodeRecordSync.query.count()
 342         print FindbadNodeRecord.query.count()
 343         session.flush()
 344
 345 def main():
 346         global global_round
 347
 348         fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
 349                                                                                                         if_new_set={'round' : global_round})
 350         global_round = fbsync.round
 351
 352         if config.increment:
 353                 # update global round number to force refreshes across all nodes
 354                 global_round += 1
 355                 fbsync.round = global_round
 356
 357         fbsync.flush()
 358
 359         cotop = comon.Comon()
 360         # lastcotop measures whether cotop is actually running.  this is a better
 361         # metric than sshstatus, or other values from CoMon
 362         cotop_url = COMON_COTOPURL
 363
 364         # history information for all nodes
 365         cohash = {}
 366         #cohash = cotop.coget(cotop_url)
 367         l_nodes = plccache.l_nodes
 368         if config.nodelist:
 369                 f_nodes = util.file.getListFromFile(config.nodelist)
 370                 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
 371         elif config.node:
 372                 f_nodes = [config.node]
 373                 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
 374         elif config.nodegroup:
 375                 ng = api.GetNodeGroups({'name' : config.nodegroup})
 376                 l_nodes = api.GetNodes(ng[0]['node_ids'])
 377         elif config.site:
 378                 site = api.GetSites(config.site)
 379                 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 380
 381         l_nodes = [node['hostname'] for node in l_nodes]
 382
 383         # perform this query after the above options, so that the filter above
 384         # does not break.
 385         if config.nodeselect:
 386                 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
 387                 plcnodes = [ node['hostname'] for node in plcnodes ]
 388                 l_nodes = node_select(config.nodeselect, plcnodes, None)
 389
 390         print "fetching %s hosts" % len(l_nodes)
 391
 392         checkAndRecordState(l_nodes, cohash)
 393
 394         return 0
 395
 396
 397 if __name__ == '__main__':
 398         from monitor import parser as parsermodule
 399
 400         parser = parsermodule.getParser(['nodesets'])
 401
 402         parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
 403         parser.add_option("", "--cachenodes", action="store_true",
 404                                                 help="Cache node lookup from PLC")
 405         parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
 406                                                 help="Specify the name of the database to which the information is saved")
 407         parser.add_option("-i", "--increment", action="store_true", dest="increment",
 408                                                 help="Increment round number to force refresh or retry")
 409
 410         parser = parsermodule.getParser(['defaults'], parser)
 411
 412         cfg = parsermodule.parse_args(parser)
 413
 414         try:
 415                 main()
 416         except Exception, err:
 417                 print traceback.print_exc()
 418                 print "Exception: %s" % err
 419                 print "Saving data... exitting."
 420                 sys.exit(0)
 421         print "sleeping"
 422         #print "final commit"
 423         #time.sleep(10)