findbad.py

   1 #!/usr/bin/python
   2
   3 import os
   4 import sys
   5 import string
   6 import time
   7 from datetime import datetime,timedelta
   8 import threadpool
   9 import threading
  10
  11 from monitor import util
  12 from monitor.util import command
  13 from monitor import config
  14
  15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
  16
  17 from monitor.sources import comon
  18 from monitor.wrapper import plc, plccache
  19
  20 from nodequery import verify,query_to_dict,node_select
  21 import traceback
  22
  23 #print "starting sqlfindbad.py"
  24 # QUERY all nodes.
  25 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
  26                                 "table=table_nodeview&" + \
  27                                 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
  28                                 "formatcsv"
  29                                     #"formatcsv&" + \
  30                                         #"select='lastcotop!=0'"
  31
  32 api = plc.getAuthAPI()
  33 plc_lock = threading.Lock()
  34 round = 1
  35 global_round = round
  36 count = 0
  37
  38 def collectPingAndSSH(nodename, cohash):
  39         ### RUN PING ######################
  40         ping = command.CMD()
  41         (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
  42
  43         try:
  44                 values = {}
  45
  46                 if oval == "":
  47                         # An error occurred
  48                         values['ping'] = "NOPING"
  49                 else:
  50                         values['ping'] = "PING"
  51
  52                 try:
  53                         for port in [22, 806]:
  54                                 ssh = command.SSH('root', nodename, port)
  55
  56                                 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
  57                                         echo "{"
  58                                         echo '  "kernel":"'`uname -a`'",'
  59                                         echo '  "bmlog":"'`ls /tmp/bm.log`'",'
  60                                         echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
  61                                         echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
  62                                         echo '  "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
  63                                         echo '  "dns":"'`host boot.planet-lab.org 2>&1`'",'
  64                                         echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
  65
  66                                         ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
  67                                         echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
  68                                         echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
  69                                         echo "}"
  70 EOF                             """)
  71
  72                                 values['ssherror'] = errval
  73                                 if len(oval) > 0:
  74                                         #print "OVAL: %s" % oval
  75                                         values.update(eval(oval))
  76                                         values['sshport'] = port
  77                                         break
  78                                 else:
  79                                         values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '',
  80                                                                         'nm' : '',
  81                                                                         'readonlyfs' : '',
  82                                                                         'dns' : '',
  83                                                                         'princeton_comon' : "",
  84                                                                         'princeton_comon_running' : "",
  85                                                                         'princeton_comon_procs' : "", 'sshport' : None})
  86                 except:
  87                         print traceback.print_exc()
  88                         sys.exit(1)
  89
  90                 ### RUN SSH ######################
  91                 b_getbootcd_id = True
  92                 #ssh = command.SSH('root', nodename)
  93                 #oval = ""
  94                 #errval = ""
  95                 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
  96
  97                 oval = values['kernel']
  98                 if "2.6.17" in oval or "2.6.2" in oval:
  99                         values['ssh'] = 'SSH'
 100                         values['category'] = 'PROD'
 101                         if "bm.log" in values['bmlog']:
 102                                 values['state'] = 'DEBUG'
 103                         else:
 104                                 values['state'] = 'BOOT'
 105                 elif "2.6.12" in oval or "2.6.10" in oval:
 106                         values['ssh'] = 'SSH'
 107                         values['category'] = 'OLDPROD'
 108                         if "bm.log" in values['bmlog']:
 109                                 values['state'] = 'DEBUG'
 110                         else:
 111                                 values['state'] = 'BOOT'
 112
 113                 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.  I have no idea why.
 114                 elif "2.4" in oval or "2.6.8" in oval:
 115                         b_getbootcd_id = False
 116                         values['ssh'] = 'SSH'
 117                         values['category'] = 'OLDBOOTCD'
 118                         values['state'] = 'DEBUG'
 119                 elif oval != "":
 120                         values['ssh'] = 'SSH'
 121                         values['category'] = 'UNKNOWN'
 122                         if "bm.log" in values['bmlog']:
 123                                 values['state'] = 'DEBUG'
 124                         else:
 125                                 values['state'] = 'BOOT'
 126                 else:
 127                         # An error occurred.
 128                         b_getbootcd_id = False
 129                         values['ssh'] = 'NOSSH'
 130                         values['category'] = 'ERROR'
 131                         values['state'] = 'DOWN'
 132                         val = errval.strip()
 133                         values['ssherror'] = val
 134                         values['kernel'] = ""
 135
 136                 #values['kernel'] = val
 137
 138                 if b_getbootcd_id:
 139                         # try to get BootCD for all nodes that are not 2.4 nor inaccessible
 140                         #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
 141                         oval = values['bootcd']
 142                         if "BootCD" in oval:
 143                                 values['bootcd'] = oval
 144                                 if "v2" in oval and \
 145                                         ( nodename is not "planetlab1.cs.unc.edu" and \
 146                                           nodename is not "planetlab2.cs.unc.edu" ):
 147                                         values['category'] = 'OLDBOOTCD'
 148                         else:
 149                                 values['bootcd'] = ""
 150                 else:
 151                         values['bootcd'] = ""
 152
 153                 # TODO: get bm.log for debug nodes.
 154                 # 'zcat /tmp/bm.log'
 155
 156                 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
 157                 oval = values['nm']
 158                 if "nm.py" in oval:
 159                         values['nm'] = "Y"
 160                 else:
 161                         values['nm'] = "N"
 162
 163                 continue_slice_check = True
 164                 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
 165                 oval = values['princeton_comon']
 166                 if "princeton_comon" in oval:
 167                         values['princeton_comon'] = True
 168                 else:
 169                         values['princeton_comon'] = False
 170                         continue_slice_check = False
 171
 172                 if continue_slice_check:
 173                         #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
 174                         oval = values['princeton_comon_running']
 175                         if len(oval) > len('/proc/virtual/'):
 176                                 values['princeton_comon_running'] = True
 177                         else:
 178                                 values['princeton_comon_running'] = False
 179                                 continue_slice_check = False
 180                 else:
 181                         values['princeton_comon_running'] = False
 182
 183                 if continue_slice_check:
 184                         #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
 185                         oval = values['princeton_comon_procs']
 186                         values['princeton_comon_procs'] = int(oval)
 187                 else:
 188                         values['princeton_comon_procs'] = None
 189
 190
 191                 if nodename in cohash:
 192                         values['comonstats'] = cohash[nodename]
 193                 else:
 194                         values['comonstats'] = {'resptime':  '-1',
 195                                                                         'uptime':    '-1',
 196                                                                         'sshstatus': '-1',
 197                                                                         'lastcotop': '-1',
 198                                                                         'cpuspeed' : "null",
 199                                                                         'disksize' : 'null',
 200                                                                         'memsize'  : 'null'}
 201                 # include output value
 202                 ### GET PLC NODE ######################
 203                 plc_lock.acquire()
 204                 d_node = None
 205                 try:
 206                         d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
 207                                                                         'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
 208                 except:
 209                         traceback.print_exc()
 210                 plc_lock.release()
 211                 values['plcnode'] = d_node
 212
 213                 ### GET PLC PCU ######################
 214                 site_id = -1
 215                 d_pcu = None
 216                 if d_node:
 217                         pcu = d_node['pcu_ids']
 218                         if len(pcu) > 0:
 219                                 d_pcu = pcu[0]
 220
 221                         site_id = d_node['site_id']
 222
 223                 values['pcu'] = d_pcu
 224
 225                 ### GET PLC SITE ######################
 226                 plc_lock.acquire()
 227                 d_site = None
 228                 values['loginbase'] = ""
 229                 try:
 230                         d_site = plc.getSites({'site_id': site_id},
 231                                                                 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
 232                         values['loginbase'] = d_site['login_base']
 233                 except:
 234                         traceback.print_exc()
 235                 plc_lock.release()
 236
 237                 values['plcsite'] = d_site
 238                 values['date_checked'] = time.time()
 239         except:
 240                 print traceback.print_exc()
 241
 242         return (nodename, values)
 243
 244 def recordPingAndSSH(request, result):
 245         global global_round
 246         global count
 247         (nodename, values) = result
 248
 249         try:
 250                 if values is not None:
 251                         fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
 252                                                                                                                         if_new_set={'round' : global_round})
 253                         global_round = fbsync.round
 254                         fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
 255                                                                                                                         if_new_set={'round' : global_round})
 256
 257                         # NOTE: This code will either add a new record for the new global_round,
 258                         #               OR it will find the previous value, and update it
 259                         #               with new information.
 260                         #               The data that is 'lost' is not that important, b/c older
 261                         #               history still exists.
 262                         fbrec = FindbadNodeRecord.findby_or_create(
 263                                                 round=global_round,
 264                                                 hostname=nodename)
 265                         before = fbrec.to_dict()
 266                         print "BEFORE, ", before
 267                         fbrec.flush()
 268                         time.sleep(2)
 269                         print "Setting VALUES"
 270                         fbrec.set(  date_checked=datetime.fromtimestamp(values['date_checked']),
 271                                                 loginbase=values['loginbase'],
 272                                                 kernel_version=values['kernel'],
 273                                                 bootcd_version=values['bootcd'],
 274                                                 nm_status=values['nm'],
 275                                                 fs_status=values['readonlyfs'],
 276                                                 dns_status=values['dns'],
 277                                                 princeton_comon_dir=values['princeton_comon'],
 278                                                 princeton_comon_running=values['princeton_comon_running'],
 279                                                 princeton_comon_procs=values['princeton_comon_procs'],
 280                                                 plc_node_stats = values['plcnode'],
 281                                                 plc_site_stats = values['plcsite'],
 282                                                 plc_pcuid = values['pcu'],
 283                                                 comon_stats = values['comonstats'],
 284                                                 ping_status = (values['ping'] == "PING"),
 285                                                 ssh_portused = values['sshport'],
 286                                                 ssh_status = (values['ssh'] == "SSH"),
 287                                                 ssh_error = values['ssherror'],
 288                                                 observed_status = values['state'],
 289                                                 observed_category = values['category'])
 290                         after = fbrec.to_dict()
 291                         print "AFTER , ", after
 292
 293                         for v in before.keys():
 294                                 if before[v] == after[v]:
 295                                         print "SAME FOR KEY %s" % v
 296                                 print "%s : %s\t%s" % ( v, before[v], after[v] )
 297
 298                         fbrec.flush()
 299                         fbnodesync.round = global_round
 300                         fbnodesync.flush()
 301                         fbsync.flush()
 302
 303                         count += 1
 304                         print "%d %s %s" % (count, nodename, values)
 305         except:
 306                 print "ERROR:"
 307                 print traceback.print_exc()
 308
 309 # this will be called when an exception occurs within a thread
 310 def handle_exception(request, result):
 311         print "Exception occured in request %s" % request.requestID
 312         for i in result:
 313                 print "Result: %s" % i
 314
 315 def probe(hostname):
 316         try:
 317                 (nodename, values) = collectPingAndSSH(hostname, {})
 318                 recordPingAndSSH(None, (nodename, values))
 319                 session.flush()
 320                 return True
 321         except:
 322                 print traceback.print_exc()
 323                 return False
 324
 325
 326 def checkAndRecordState(l_nodes, cohash):
 327         global global_round
 328         global count
 329
 330         tp = threadpool.ThreadPool(20)
 331
 332         # CREATE all the work requests
 333         for nodename in l_nodes:
 334                 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
 335                 node_round   = fbnodesync.round
 336                 fbnodesync.flush()
 337
 338                 if node_round < global_round:
 339                         # recreate node stats when refreshed
 340                         #print "%s" % nodename
 341                         req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
 342                                                                                  None, recordPingAndSSH, handle_exception)
 343                         tp.putRequest(req)
 344                 else:
 345                         # We just skip it, since it's "up to date"
 346                         count += 1
 347                         #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
 348                         print "%d %s %s" % (count, nodename, node_round)
 349
 350         # WAIT while all the work requests are processed.
 351         begin = time.time()
 352         while 1:
 353                 try:
 354                         time.sleep(1)
 355                         tp.poll()
 356                         # if more than two hours
 357                         if time.time() - begin > (60*60*1.5):
 358                                 print "findbad.py has run out of time!!!!!!"
 359                                 os._exit(1)
 360                 except KeyboardInterrupt:
 361                         print "Interrupted!"
 362                         break
 363                 except threadpool.NoResultsPending:
 364                         print "All results collected."
 365                         break
 366
 367         print FindbadNodeRecordSync.query.count()
 368         print FindbadNodeRecord.query.count()
 369         session.flush()
 370
 371 def main():
 372         global global_round
 373
 374         fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
 375                                                                                                         if_new_set={'round' : global_round})
 376         global_round = fbsync.round
 377
 378         if config.increment:
 379                 # update global round number to force refreshes across all nodes
 380                 global_round += 1
 381                 fbsync.round = global_round
 382
 383         fbsync.flush()
 384
 385         cotop = comon.Comon()
 386         # lastcotop measures whether cotop is actually running.  this is a better
 387         # metric than sshstatus, or other values from CoMon
 388         cotop_url = COMON_COTOPURL
 389
 390         # history information for all nodes
 391         cohash = {}
 392         #cohash = cotop.coget(cotop_url)
 393         l_nodes = plccache.l_nodes
 394         if config.nodelist:
 395                 f_nodes = util.file.getListFromFile(config.nodelist)
 396                 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
 397         elif config.node:
 398                 f_nodes = [config.node]
 399                 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
 400         elif config.nodegroup:
 401                 ng = api.GetNodeGroups({'name' : config.nodegroup})
 402                 l_nodes = api.GetNodes(ng[0]['node_ids'])
 403         elif config.site:
 404                 site = api.GetSites(config.site)
 405                 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 406
 407         l_nodes = [node['hostname'] for node in l_nodes]
 408
 409         # perform this query after the above options, so that the filter above
 410         # does not break.
 411         if config.nodeselect:
 412                 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
 413                 plcnodes = [ node['hostname'] for node in plcnodes ]
 414                 l_nodes = node_select(config.nodeselect, plcnodes, None)
 415
 416         print "fetching %s hosts" % len(l_nodes)
 417
 418         checkAndRecordState(l_nodes, cohash)
 419
 420         return 0
 421
 422
 423 if __name__ == '__main__':
 424         from monitor import parser as parsermodule
 425
 426         parser = parsermodule.getParser(['nodesets'])
 427
 428         parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
 429         parser.add_option("", "--cachenodes", action="store_true",
 430                                                 help="Cache node lookup from PLC")
 431         parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
 432                                                 help="Specify the name of the database to which the information is saved")
 433         parser.add_option("-i", "--increment", action="store_true", dest="increment",
 434                                                 help="Increment round number to force refresh or retry")
 435
 436         parser = parsermodule.getParser(['defaults'], parser)
 437
 438         cfg = parsermodule.parse_args(parser)
 439
 440         try:
 441                 main()
 442         except Exception, err:
 443                 print traceback.print_exc()
 444                 print "Exception: %s" % err
 445                 print "Saving data... exitting."
 446                 sys.exit(0)
 447         print "sleeping"
 448         #print "final commit"
 449         #time.sleep(10)