X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=blobdiff_plain;f=monitor%2Fscanapi.py;h=22e3e74fadf449a851e960e656cddc4614e2d8f8;hp=af7fcd430b4e3373eaf2647556012409a5bed4da;hb=32e64e33bc81735e22024c5a44510848bb3c88df;hpb=28582f7068d5ef8e74cb3b70134f682d4ab471bc diff --git a/monitor/scanapi.py b/monitor/scanapi.py index af7fcd4..22e3e74 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -167,138 +167,107 @@ class ScanNodeInternal(ScanInterface): syncclass = None primarykey = 'hostname' + def collectPorts(self, nodename, port_list=[22,80,806]): + values = {} + for port in port_list: + ret = os.system("nc -w 5 -z %s %s > /dev/null" % (nodename, port) ) + if ret == 0: + values[str(port)] = "open" + else: + values[str(port)] = "closed" + return {'port_status' : values } + def collectNMAP(self, nodename, cohash): #### RUN NMAP ############################### + # NOTE: run the same command three times and take the best of three + # runs. NMAP can drop packets, and especially so when it runs many + # commands at once. values = {} nmap = command.CMD() print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) # NOTE: an empty / error value for oval, will still work. - (values['port_status'], continue_probe) = nmap_port_status(oval) + values['port_status'] = {} + (o1,continue_probe) = nmap_port_status(oval1) + (o2,continue_probe) = nmap_port_status(oval2) + (o3,continue_probe) = nmap_port_status(oval3) + for p in ['22', '80', '806']: + l = [ o1[p], o2[p], o3[p] ] + if len(filter(lambda x: x == 'open', l)) > 1: + values['port_status'][p] = 'open' + else: + values['port_status'][p] = o1[p] - values['date_checked'] = datetime.now() - + print values['port_status'] return (nodename, values) - def collectInternal(self, nodename, cohash): - ### RUN PING ###################### + def collectPING(self, nodename, cohash): + values = {} ping = command.CMD() (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename) - try: - values = {} + values = {} + if oval == "": + # An error occurred + values['ping_status'] = False + else: + values['ping_status'] = True - if oval == "": - # An error occurred - values['ping_status'] = False - else: - values['ping_status'] = True + return values - try: - for port in [22, 806]: - ssh = command.SSH('root', nodename, port) - - (oval, errval) = ssh.run_noexcept2(""" <<\EOF - echo "{" - echo ' "kernel_version":"'`uname -a`'",' - echo ' "bmlog":"'`ls /tmp/bm.log`'",' - echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' - echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' - echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' - echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' - echo ' "uptime":"'`uptime`'",' - - ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` - echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' - echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' - echo ' "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1 ; fi ; fi`'",' - echo ' "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 30 rpm -q NodeManager ; fi`'",' - echo ' "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 45 rpm -q -a ; fi`'",' - echo "}" -EOF """) - - values['ssh_error'] = errval - if len(oval) > 0: - #print "OVAL: %s" % oval - values.update(eval(oval)) - values['ssh_portused'] = port - break - else: - values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', - 'nm_status' : '', - 'fs_status' : '', - 'uptime' : '', - 'dns_status' : '', - 'rpm_version' : '', - 'rpm_versions' : '', - 'princeton_comon_dir' : "", - 'princeton_comon_running' : "", - 'princeton_comon_procs' : "", 'ssh_portused' : None}) - except: - print traceback.print_exc() - sys.exit(1) + def collectTRACEROUTE(self, nodename, cohash): + values = {} + trace = command.CMD() + (oval,errval) = trace.run_noexcept("traceroute %s" % nodename) - values['fs_status'] = "" - print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']) + values['traceroute'] = oval - print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) - print "UPTIME: %s %s" % (nodename, values['uptime']) - ### RUN SSH ###################### - b_getbootcd_id = True - - oval = values['kernel_version'] - if "2.6.17" in oval or "2.6.2" in oval: - values['ssh_status'] = True - values['observed_category'] = 'PROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - elif "2.6.12" in oval or "2.6.10" in oval: - values['ssh_status'] = True - values['observed_category'] = 'OLDPROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - - # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot - # command fails. I have no idea why. - elif "2.4" in oval or "2.6.8" in oval: - b_getbootcd_id = False - values['ssh_status'] = True - values['observed_category'] = 'OLDBOOTCD' - values['observed_status'] = 'DEBUG' - elif oval != "": - values['ssh_status'] = True - values['observed_category'] = 'UNKNOWN' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - else: - # An error occurred. - b_getbootcd_id = False - values['ssh_status'] = False - values['observed_category'] = 'ERROR' - values['observed_status'] = 'DOWN' - val = errval.strip() - values['ssh_error'] = val - values['kernel_version'] = "" - - if b_getbootcd_id: - # try to get BootCD for all nodes that are not 2.4 nor inaccessible - oval = values['bootcd_version'] - if "BootCD" in oval: - values['bootcd_version'] = oval - if "v2" in oval and \ - ( nodename is not "planetlab1.cs.unc.edu" and \ - nodename is not "planetlab2.cs.unc.edu" ): - values['observed_category'] = 'OLDBOOTCD' + return values + + def collectSSH(self, nodename, cohash): + values = {} + try: + for port in [22, 806]: + ssh = command.SSH('root', nodename, port) + + (oval, errval) = ssh.run_noexcept2(""" <<\EOF + echo "{" + echo ' "kernel_version":"'`uname -a`'",' + echo ' "bmlog":"'`ls /tmp/bm.log`'",' + echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' + echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' + echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' + echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' + echo ' "uptime":"'`cat /proc/uptime`'",' + + ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` + echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' + echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' + echo ' "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1 ; fi ; fi`'",' + echo ' "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 30 rpm -q NodeManager ; fi`'",' + echo ' "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 45 rpm -q -a ; fi`'",' + echo "}" +EOF """) + + values['ssh_error'] = errval + if len(oval) > 0: + #print "OVAL: %s" % oval + values.update(eval(oval)) + values['ssh_portused'] = port + break else: - values['bootcd_version'] = "" - else: - values['bootcd_version'] = "" + values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', + 'nm_status' : '', + 'fs_status' : '', + 'uptime' : '', + 'dns_status' : '', + 'rpm_version' : '', + 'rpm_versions' : '', + 'princeton_comon_dir' : "", + 'princeton_comon_running' : "", + 'princeton_comon_procs' : "", 'ssh_portused' : None}) oval = values['nm_status'] if "nm.py" in oval: @@ -329,8 +298,144 @@ EOF """) values['princeton_comon_procs'] = int(oval) else: values['princeton_comon_procs'] = None + except: + print traceback.print_exc() + sys.exit(1) + + return values + + def collectPLC(self, nodename, cohash): + values = {} + ### GET PLC NODE ###################### + d_node = plccache.GetNodeByName(nodename) + values['plc_node_stats'] = d_node + + ### GET PLC PCU ###################### + site_id = -1 + d_pcu = None + if d_node and len(d_node['pcu_ids']) > 0: + d_pcu = d_node['pcu_ids'][0] + + site_id = d_node['site_id'] + + values['plc_pcuid'] = d_pcu + + ### GET PLC SITE ###################### + print "SITEID: %s" % site_id + d_site = plccache.GetSitesById([ site_id ])[0] + values['loginbase'] = d_site['login_base'] + values['plc_site_stats'] = d_site + + return values + + def evaluate(self, nodename, values): + # TODO: this section can probably be reduced to a policy statement + # using patterns and values collected so far. + # NOTE: A node is "DOWN" if + # * cannot ssh into it. + # * all ports are not open for a 'BOOT' node + # * dns for hostname does not exist. + b_getbootcd_id = True + + oval = values['kernel_version'] + values['ssh_status'] = True + if "2.6.17" in oval or "2.6.2" in oval: + values['observed_category'] = 'PROD' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + elif "2.6.12" in oval or "2.6.10" in oval: + values['observed_category'] = 'OLDPROD' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + + # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot + # command fails. I have no idea why. + elif "2.4" in oval or "2.6.8" in oval: + b_getbootcd_id = False + values['observed_category'] = 'OLDBOOTCD' + values['observed_status'] = 'DEBUG' + elif oval != "": + values['observed_category'] = 'UNKNOWN' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + else: + # An error occurred. + b_getbootcd_id = False + values['ssh_status'] = False + values['observed_category'] = 'ERROR' + values['observed_status'] = 'DOWN' + values['kernel_version'] = "" + + values['firewall'] = False + + # NOTE: A node is down if some of the public ports are not open + if values['observed_status'] == "BOOT": + # verify that all ports are open. Else, report node as down. + if not ( values['port_status']['22'] == "open" and \ + values['port_status']['80'] == "open" and \ + values['port_status']['806'] == "open") : + #email_exception(nodename, "%s FILTERED HOST" % nodename) + values['observed_status'] = 'DOWN' + values['firewall'] = True + + #if not values['external_dns_status']: + # email_exception("%s DNS down" % nodename) + + if b_getbootcd_id: + # try to get BootCD for all nodes that are not 2.4 nor inaccessible + oval = values['bootcd_version'] + if "BootCD" in oval: + values['bootcd_version'] = oval + if "v2" in oval and \ + ( nodename is not "planetlab1.cs.unc.edu" and \ + nodename is not "planetlab2.cs.unc.edu" ): + values['observed_category'] = 'OLDBOOTCD' + else: + values['bootcd_version'] = "" + else: + values['bootcd_version'] = "" + + return values + + def collectDNS(self, nodename, cohash): + values = {} + try: + ipaddr = socket.gethostbyname(nodename) + # TODO: check that IP returned matches IP in plc db. + values['external_dns_status'] = True + except Exception, err: + values['external_dns_status'] = False + + return values + + def collectInternal(self, nodename, cohash): + try: + values = {} + + v = self.collectPING(nodename, cohash) + values.update(v) + + v = self.collectPorts(nodename) + values.update(v) + + v = self.collectSSH(nodename, cohash) + values.update(v) + + v = self.collectDNS(nodename, cohash) + values.update(v) + + v = self.collectTRACEROUTE(nodename, cohash) + values.update(v) + + v = self.collectPLC(nodename, cohash) + values.update(v) - if nodename in cohash: values['comon_stats'] = cohash[nodename] else: @@ -341,51 +446,23 @@ EOF """) 'cpuspeed' : "null", 'disksize' : 'null', 'memsize' : 'null'} - # include output value - ### GET PLC NODE ###################### - d_node = plccache.GetNodeByName(nodename) - values['plc_node_stats'] = d_node - - ##### NMAP ################### - (n, v) = self.collectNMAP(nodename, None) - values.update(v) - - ### GET PLC PCU ###################### - site_id = -1 - d_pcu = None - if d_node: - pcu = d_node['pcu_ids'] - if len(pcu) > 0: - d_pcu = pcu[0] - - site_id = d_node['site_id'] - values['plc_pcuid'] = d_pcu - - ### GET PLC SITE ###################### - plc_lock.acquire() - d_site = None - values['loginbase'] = "" - try: - d_site = plccache.GetSitesById([ site_id ])[0] - #d_site = plc.getSites({'site_id': site_id}, - # ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0] - values['loginbase'] = d_site['login_base'] - except: - traceback.print_exc() - plc_lock.release() + values['rpms'] = values['rpm_versions'] + print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']) + print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) + print "UPTIME: %s %s" % (nodename, values['uptime']) - values['plc_site_stats'] = d_site + values = self.evaluate(nodename, values) values['date_checked'] = datetime.now() + except: print traceback.print_exc() return (nodename, values) + def internalprobe(hostname): - #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - # if_new_set={'round' : 1}) - scannode = ScanNodeInternal() # fbsync.round) + scannode = ScanNodeInternal() try: (nodename, values) = scannode.collectInternal(hostname, {}) scannode.record(None, (nodename, values)) @@ -396,12 +473,10 @@ def internalprobe(hostname): return False def externalprobe(hostname): - #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - # if_new_set={'round' : 1}) - scannode = ScanNodeInternal() # fbsync.round) + scannode = ScanNodeInternal() try: - (nodename, values) = scannode.collectNMAP(hostname, {}) - scannode.record(None, (nodename, values)) + values = self.collectPorts(hostname) + scannode.record(None, (hostname, values)) session.flush() return True except: