#!/usr/bin/python

import os
import sys
import string
import time
from datetime import datetime, timedelta
import threadpool
import threading

from monitor import util
from monitor.util import command
from monitor import config
from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
from monitor.sources import comon
from monitor.wrapper import plc, plccache

from nodequery import verify, query_to_dict, node_select
import traceback

print "starting sqlfindbad.py"

# QUERY all nodes.
COMON_COTOPURL = "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
                 "table=table_nodeview&" + \
                 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
                 "formatcsv"
                 #"formatcsv&" + \
                 #"select='lastcotop!=0'"

api = plc.getAuthAPI()
plc_lock = threading.Lock()
round = 1        # default round; the real value is read from the "global" sync record in main()
global_round = round
count = 0

# Probe a single node: ping it, collect state over ssh, then look up its PLC
# node and site records.  Returns (nodename, values).
def collectPingAndSSH(nodename, cohash):
    ### RUN PING ######################
    ping = command.CMD()
    (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

    try:
        values = {}

        if oval == "":
            # An error occurred
            values['ping'] = "NOPING"
        else:
            values['ping'] = "PING"

        try:
            for port in [22, 806]:
                ssh = command.SSH('root', nodename, port)
                (oval, errval) = ssh.run_noexcept2(""" <<\EOF
    echo "{"
    echo ' "kernel":"'`uname -a`'",'
    echo ' "bmlog":"'`ls /tmp/bm.log`'",'
    echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
    echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
    echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
    echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",'
    echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'

    ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
    echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
    echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
    echo "}"
EOF
""")

                values['ssherror'] = errval
                if len(oval) > 0:
                    #print "OVAL: %s" % oval
                    values.update(eval(oval))
                    values['sshport'] = port
                    break
                else:
                    values.update({'kernel': "", 'bmlog': "", 'bootcd': '',
                                   'nm': '', 'readonlyfs': '', 'dns': '',
                                   'princeton_comon': "",
                                   'princeton_comon_running': "",
                                   'princeton_comon_procs': "",
                                   'sshport': None})
        except:
            traceback.print_exc()
            sys.exit(1)

        ### RUN SSH ######################
        b_getbootcd_id = True
        #ssh = command.SSH('root', nodename)
        #oval = ""
        #errval = ""
        #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')

        # Classify the node by its reported kernel version.
        oval = values['kernel']
        if "2.6.17" in oval or "2.6.2" in oval:
            values['ssh'] = 'SSH'
            values['category'] = 'PROD'
            if "bm.log" in values['bmlog']:
                values['state'] = 'DEBUG'
            else:
                values['state'] = 'BOOT'
        elif "2.6.12" in oval or "2.6.10" in oval:
            values['ssh'] = 'SSH'
            values['category'] = 'OLDPROD'
            if "bm.log" in values['bmlog']:
                values['state'] = 'DEBUG'
            else:
                values['state'] = 'BOOT'
        # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.
        # I have no idea why.
        elif "2.4" in oval or "2.6.8" in oval:
            b_getbootcd_id = False
            values['ssh'] = 'SSH'
            values['category'] = 'OLDBOOTCD'
            values['state'] = 'DEBUG'
        elif oval != "":
            values['ssh'] = 'SSH'
            values['category'] = 'UNKNOWN'
            if "bm.log" in values['bmlog']:
                values['state'] = 'DEBUG'
            else:
                values['state'] = 'BOOT'
        else:
            # An error occurred.
            b_getbootcd_id = False
            values['ssh'] = 'NOSSH'
            values['category'] = 'ERROR'
            values['state'] = 'DOWN'
            val = errval.strip()
            values['ssherror'] = val
            values['kernel'] = ""
            #values['kernel'] = val

        if b_getbootcd_id:
            # try to get BootCD for all nodes that are not 2.4 nor inaccessible
            #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
            oval = values['bootcd']
            if "BootCD" in oval:
                values['bootcd'] = oval
                if "v2" in oval and \
                   nodename != "planetlab1.cs.unc.edu" and \
                   nodename != "planetlab2.cs.unc.edu":
                    values['category'] = 'OLDBOOTCD'
            else:
                values['bootcd'] = ""
        else:
            values['bootcd'] = ""

        # TODO: get bm.log for debug nodes.
        # 'zcat /tmp/bm.log'

        #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
        oval = values['nm']
        if "nm.py" in oval:
            values['nm'] = "Y"
        else:
            values['nm'] = "N"

        continue_slice_check = True
        #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
        oval = values['princeton_comon']
        if "princeton_comon" in oval:
            values['princeton_comon'] = True
        else:
            values['princeton_comon'] = False
            continue_slice_check = False

        if continue_slice_check:
            #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
            oval = values['princeton_comon_running']
            if len(oval) > len('/proc/virtual/'):
                values['princeton_comon_running'] = True
            else:
                values['princeton_comon_running'] = False
                continue_slice_check = False
        else:
            values['princeton_comon_running'] = False

        if continue_slice_check:
            #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
            oval = values['princeton_comon_procs']
            values['princeton_comon_procs'] = int(oval)
        else:
            values['princeton_comon_procs'] = None

        if nodename in cohash:
            values['comonstats'] = cohash[nodename]
        else:
            values['comonstats'] = {'resptime': '-1', 'uptime': '-1',
                                    'sshstatus': '-1', 'lastcotop': '-1',
                                    'cpuspeed': "null", 'disksize': 'null',
                                    'memsize': 'null'}
        # include output value

        ### GET PLC NODE ######################
        plc_lock.acquire()
        d_node = None
        try:
            d_node = plc.getNodes({'hostname': nodename},
                                  ['pcu_ids', 'site_id', 'date_created',
                                   'last_updated', 'last_contact',
                                   'boot_state', 'nodegroup_ids'])[0]
        except:
            traceback.print_exc()
        plc_lock.release()
        values['plcnode'] = d_node

        ### GET PLC PCU ######################
        site_id = -1
        d_pcu = None
        if d_node:
            pcu = d_node['pcu_ids']
            if len(pcu) > 0:
                d_pcu = pcu[0]
            site_id = d_node['site_id']
        values['pcu'] = d_pcu

        ### GET PLC SITE ######################
        plc_lock.acquire()
        d_site = None
        values['loginbase'] = ""
        try:
            d_site = plc.getSites({'site_id': site_id},
                                  ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
            values['loginbase'] = d_site['login_base']
        except:
            traceback.print_exc()
        plc_lock.release()
        values['plcsite'] = d_site

        values['date_checked'] = time.time()

    except:
        traceback.print_exc()

    return (nodename, values)


# threadpool result callback: persist the values collected for one node and
# advance that node's sync record to the current round.
def recordPingAndSSH(request, result):
    global global_round
    global count

    (nodename, values) = result
    try:
        if values is not None:
            fbsync = FindbadNodeRecordSync.findby_or_create(
                            hostname="global",
                            if_new_set={'round': global_round})
            global_round = fbsync.round
            fbnodesync = FindbadNodeRecordSync.findby_or_create(
                            hostname=nodename,
                            if_new_set={'round': global_round})

            fbrec = FindbadNodeRecord(
                            date_checked=datetime.fromtimestamp(values['date_checked']),
                            round=global_round,
                            hostname=nodename,
                            loginbase=values['loginbase'],
                            kernel_version=values['kernel'],
                            bootcd_version=values['bootcd'],
                            nm_status=values['nm'],
                            fs_status=values['readonlyfs'],
                            dns_status=values['dns'],
                            princeton_comon_dir=values['princeton_comon'],
                            princeton_comon_running=values['princeton_comon_running'],
                            princeton_comon_procs=values['princeton_comon_procs'],
                            plc_node_stats=values['plcnode'],
                            plc_site_stats=values['plcsite'],
                            plc_pcuid=values['pcu'],
                            comon_stats=values['comonstats'],
                            ping_status=(values['ping'] == "PING"),
                            ssh_portused=values['sshport'],
                            ssh_status=(values['ssh'] == "SSH"),
                            ssh_error=values['ssherror'],
                            observed_status=values['state'],
                            observed_category=values['category'],
                        )
            fbnodesync.round = global_round

            fbnodesync.flush()
            fbsync.flush()
            fbrec.flush()

        count += 1
        print "%d %s %s" % (count, nodename, values)
    except:
        print "ERROR:"
        traceback.print_exc()


# this will be called when an exception occurs within a thread
def handle_exception(request, result):
    print "Exception occurred in request %s" % request.requestID
    for i in result:
        print "Result: %s" % i


# Queue a probe for every node whose sync record is behind the current round,
# then wait for the thread pool to drain.
def checkAndRecordState(l_nodes, cohash):
    global global_round
    global count

    tp = threadpool.ThreadPool(20)

    # CREATE all the work requests
    for nodename in l_nodes:
        fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
                                                            if_new_set={'round': 0})
        node_round = fbnodesync.round
        fbnodesync.flush()

        if node_round < global_round:
            # recreate node stats when refreshed
            #print "%s" % nodename
            req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
                                         None, recordPingAndSSH, handle_exception)
            tp.putRequest(req)
        else:
            # We just skip it, since it's "up to date"
            count += 1
            #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
            print "%d %s %s" % (count, nodename, node_round)

    # WAIT while all the work requests are processed.
    begin = time.time()
    while 1:
        try:
            time.sleep(1)
            tp.poll()
            # bail out if the scan has been running for more than 1.5 hours
            if time.time() - begin > (60*60*1.5):
                print "findbad.py has run out of time!!!!!!"
                os._exit(1)
        except KeyboardInterrupt:
            print "Interrupted!"
            break
        except threadpool.NoResultsPending:
            print "All results collected."
            break

    print FindbadNodeRecordSync.query.count()
    print FindbadNodeRecord.query.count()
    session.flush()


def main():
    global global_round

    fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                    if_new_set={'round': global_round})
    global_round = fbsync.round

    if config.increment:
        # update global round number to force refreshes across all nodes
        global_round += 1
        fbsync.round = global_round

    fbsync.flush()

    cotop = comon.Comon()
    # lastcotop measures whether cotop is actually running.  this is a better
    # metric than sshstatus, or other values from CoMon
    cotop_url = COMON_COTOPURL

    # history information for all nodes
    cohash = {}
    #cohash = cotop.coget(cotop_url)

    l_nodes = plccache.l_nodes
    if config.nodelist:
        f_nodes = util.file.getListFromFile(config.nodelist)
        l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
    elif config.node:
        f_nodes = [config.node]
        l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
    elif config.nodegroup:
        ng = api.GetNodeGroups({'name': config.nodegroup})
        l_nodes = api.GetNodes(ng[0]['node_ids'])
    elif config.site:
        site = api.GetSites(config.site)
        l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])

    l_nodes = [node['hostname'] for node in l_nodes]

    # perform this query after the above options, so that the filter above
    # does not break.
    if config.nodeselect:
        plcnodes = api.GetNodes({'peer_id': None}, ['hostname'])
        plcnodes = [node['hostname'] for node in plcnodes]
        l_nodes = node_select(config.nodeselect, plcnodes, None)

    print "fetching %s hosts" % len(l_nodes)

    checkAndRecordState(l_nodes, cohash)

    return 0


if __name__ == '__main__':
    from monitor import parser as parsermodule

    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(increment=False, dbname="findbad", cachenodes=False)
    parser.add_option("", "--cachenodes", action="store_true",
                      help="Cache node lookup from PLC")
    parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                      help="Specify the name of the database to which the information is saved")
    parser.add_option("-i", "--increment", action="store_true", dest="increment",
                      help="Increment round number to force refresh or retry")
    parser = parsermodule.getParser(['defaults'], parser)
    cfg = parsermodule.parse_args(parser)

    try:
        main()
    except Exception, err:
        traceback.print_exc()
        print "Exception: %s" % err
        print "Saving data... exiting."
        sys.exit(0)

    print "sleeping"
    #print "final commit"
    #time.sleep(10)
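
# Example invocations (illustrative only): --increment is defined above, while
# the node-selection flags (--nodelist, --node, ...) are assumed to come from
# the 'nodesets' parser group in monitor.parser, so their exact names may
# differ in your checkout; hostnames and file paths below are hypothetical.
#
#   python sqlfindbad.py --increment                    # bump the round and re-scan every node
#   python sqlfindbad.py --node planetlab1.example.org  # scan a single host
#   python sqlfindbad.py --nodelist nodes.txt           # scan hosts listed one per line in nodes.txt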