X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=blobdiff_plain;f=monitor%2Fscanapi.py;h=177affcb16341b6ebc2ca86bb1d76723f6a01de8;hp=f5c4f5f24761900a6f4c7bc627e7f62862f54fa9;hb=HEAD;hpb=13b93b7152cc4789c9554ad11a2e1ffdd34a1304 diff --git a/monitor/scanapi.py b/monitor/scanapi.py index f5c4f5f..177affc 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -167,134 +167,117 @@ class ScanNodeInternal(ScanInterface): syncclass = None primarykey = 'hostname' + def collectPorts(self, nodename, port_list=[22,80,806]): + values = {} + for port in port_list: + ret = os.system("nc -w 5 -z %s %s > /dev/null" % (nodename, port) ) + if ret == 0: + values[str(port)] = "open" + else: + values[str(port)] = "closed" + return {'port_status' : values } + def collectNMAP(self, nodename, cohash): #### RUN NMAP ############################### + # NOTE: run the same command three times and take the best of three + # runs. NMAP can drop packets, and especially so when it runs many + # commands at once. values = {} nmap = command.CMD() - print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + print "nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename + (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) + (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) + (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) # NOTE: an empty / error value for oval, will still work. - (values['port_status'], continue_probe) = nmap_port_status(oval) + values['port_status'] = {} + (o1,continue_probe) = nmap_port_status(oval1) + (o2,continue_probe) = nmap_port_status(oval2) + (o3,continue_probe) = nmap_port_status(oval3) + for p in ['22', '80', '806']: + l = [ o1[p], o2[p], o3[p] ] + if len(filter(lambda x: x == 'open', l)) > 1: + values['port_status'][p] = 'open' + else: + values['port_status'][p] = o1[p] - values['date_checked'] = datetime.now() - + print values['port_status'] return (nodename, values) - def collectInternal(self, nodename, cohash): - ### RUN PING ###################### + def collectPING(self, nodename, cohash): + values = {} ping = command.CMD() (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename) - try: - values = {} + values = {} + if oval == "": + # An error occurred + values['ping_status'] = False + else: + values['ping_status'] = True - if oval == "": - # An error occurred - values['ping_status'] = False - else: - values['ping_status'] = True + return values - try: - for port in [22, 806]: - ssh = command.SSH('root', nodename, port) - - (oval, errval) = ssh.run_noexcept2(""" <<\EOF - echo "{" - echo ' "kernel_version":"'`uname -a`'",' - echo ' "bmlog":"'`ls /tmp/bm.log`'",' - echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' - echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' - echo ' "fs_status":"'`touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then touch /vservers/monitor.log 2>&1 ; fi ; grep proc /proc/mounts | grep ro,`'",' - echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' - echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' - - ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` - echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' - echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' - echo ' "rpm_version":"'`rpm -q NodeManager`'",' - echo ' "rpm_versions":"'`rpm -q -a`'",' - echo "}" -EOF """) - - values['ssh_error'] = errval - if len(oval) > 0: - #print "OVAL: %s" % oval - values.update(eval(oval)) - values['ssh_portused'] = port - break - else: - values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', - 'nm_status' : '', - 'fs_status' : '', - 'dns_status' : '', - 'rpm_version' : '', - 'rpm_versions' : '', - 'princeton_comon_dir' : "", - 'princeton_comon_running' : "", - 'princeton_comon_procs' : "", 'ssh_portused' : None}) - except: - print traceback.print_exc() - sys.exit(1) + def collectTRACEROUTE(self, nodename, cohash): + values = {} + trace = command.CMD() + (oval,errval) = trace.run_noexcept("traceroute %s" % nodename) - print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']) + values['traceroute'] = oval - print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) - ### RUN SSH ###################### - b_getbootcd_id = True - - oval = values['kernel_version'] - if "2.6.17" in oval or "2.6.2" in oval: - values['ssh_status'] = True - values['observed_category'] = 'PROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - elif "2.6.12" in oval or "2.6.10" in oval: - values['ssh_status'] = True - values['observed_category'] = 'OLDPROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - - # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot - # command fails. I have no idea why. - elif "2.4" in oval or "2.6.8" in oval: - b_getbootcd_id = False - values['ssh_status'] = True - values['observed_category'] = 'OLDBOOTCD' - values['observed_status'] = 'DEBUG' - elif oval != "": - values['ssh_status'] = True - values['observed_category'] = 'UNKNOWN' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - else: - # An error occurred. - b_getbootcd_id = False - values['ssh_status'] = False - values['observed_category'] = 'ERROR' - values['observed_status'] = 'DOWN' - val = errval.strip() - values['ssh_error'] = val - values['kernel_version'] = "" - - if b_getbootcd_id: - # try to get BootCD for all nodes that are not 2.4 nor inaccessible - oval = values['bootcd_version'] - if "BootCD" in oval: - values['bootcd_version'] = oval - if "v2" in oval and \ - ( nodename is not "planetlab1.cs.unc.edu" and \ - nodename is not "planetlab2.cs.unc.edu" ): - values['observed_category'] = 'OLDBOOTCD' + return values + + def collectSSH(self, nodename, cohash): + values = {} + try: + for port in [22, 806]: + ssh = command.SSH('root', nodename, port) + + (oval, errval) = ssh.run_noexcept2(""" <<\EOF + echo "{" + echo ' "kernel_version":"'`uname -a`'",' + echo ' "bmlog":"'`ls /tmp/bm.log || ls /tmp/source/BootManager.py`'",' + echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID || cat /usr/bootme/ID`'",' + echo ' "boot_server":"'`cat /mnt/cdrom/bootme/BOOTSERVER`'",' + echo ' "install_date":"'`python -c "import os,time,stat; print time.ctime(os.stat('/usr/boot/plnode.txt')[stat.ST_CTIME])" || python -c "import os,time,stat; print time.ctime(os.stat('/usr/boot/cacert.pem')[stat.ST_CTIME])"`'",' + echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' + echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' + echo ' "iptables_status":"'`iptables -t mangle -nL | awk '$1~/^[A-Z]+$/ {modules[$1]=1;}END{for (k in modules) {if (k) printf "%s ",k;}}'`'",' + echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' + echo ' "uptime":"'`cat /proc/uptime`'",' + + ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` + echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' + echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' + echo ' "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1 ; fi ; fi`'",' + echo ' "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then /usr/bin/timeout.pl 30 rpm -q NodeManager ; fi`'",' + echo ' "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then /usr/bin/timeout.pl 45 rpm -q -a | sort ; fi`'",' + echo ' "md5sums":"'`md5sum /etc/yum.conf /etc/yum.myplc.d/myplc.repo /etc/yum.myplc.d/stock.repo | awk '{print $1}'`'",' + echo ' "md5sum_yum":"'`grep -v -E "^#" /etc/yum.myplc.d/myplc.repo | md5sum`'",' + echo ' "nada":"'``'",' + echo "}" +EOF """) + + values['ssh_error'] = errval + if len(oval) > 0: + #print "OVAL: %s" % oval + values.update(eval(oval)) + values['ssh_portused'] = port + break else: - values['bootcd_version'] = "" - else: - values['bootcd_version'] = "" + values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', + 'boot_server' : '', + 'install_date' : '', + 'nm_status' : '', + 'fs_status' : '', + 'uptime' : '', + 'dns_status' : '', + 'md5sums' : '', + 'md5sum_yum' : '', + 'rpm_version' : '', + 'rpm_versions' : '', + 'princeton_comon_dir' : "", + 'princeton_comon_running' : "", + 'princeton_comon_procs' : "", 'ssh_portused' : None}) oval = values['nm_status'] if "nm.py" in oval: @@ -304,7 +287,7 @@ EOF """) continue_slice_check = True oval = values['princeton_comon_dir'] - if "princeton_comon_dir" in oval: + if "princeton_comon" in oval: values['princeton_comon_dir'] = True else: values['princeton_comon_dir'] = False @@ -325,8 +308,150 @@ EOF """) values['princeton_comon_procs'] = int(oval) else: values['princeton_comon_procs'] = None + except: + print traceback.print_exc() + sys.exit(1) + + return values + + def collectPLC(self, nodename, cohash): + values = {} + ### GET PLC NODE ###################### + d_node = plccache.GetNodeByName(nodename) + values['plc_node_stats'] = d_node + + ### GET PLC PCU ###################### + site_id = -1 + d_pcu = None + if d_node and len(d_node['pcu_ids']) > 0: + d_pcu = d_node['pcu_ids'][0] + + site_id = d_node['site_id'] + + values['plc_pcuid'] = d_pcu + + ### GET PLC SITE ###################### + print "SITEID: %s" % site_id + d_site = plccache.GetSitesById([ site_id ])[0] + values['loginbase'] = d_site['login_base'] + values['plc_site_stats'] = d_site + + return values + + def evaluate(self, nodename, values): + # TODO: this section can probably be reduced to a policy statement + # using patterns and values collected so far. + # NOTE: A node is "DOWN" if + # * cannot ssh into it. + # * all ports are not open for a 'BOOT' node + # * dns for hostname does not exist. + b_getbootcd_id = True + + oval = values['kernel_version'] + values['ssh_status'] = True + if "2.6.17" in oval or "2.6.2" in oval: + values['observed_category'] = 'PROD' + if "bm.log" in values['bmlog'] or "BootManager" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + elif "2.6.12" in oval or "2.6.10" in oval: + values['observed_category'] = 'OLDPROD' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + + # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot + # command fails. I have no idea why. + elif "2.4" in oval or "2.6.8" in oval: + b_getbootcd_id = False + values['observed_category'] = 'OLDBOOTCD' + values['observed_status'] = 'DEBUG' + elif oval != "": + values['observed_category'] = 'UNKNOWN' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + else: + # An error occurred. + b_getbootcd_id = False + values['ssh_status'] = False + values['observed_category'] = 'ERROR' + values['observed_status'] = 'DOWN' + values['kernel_version'] = "" + + values['firewall'] = False + + #print "BEFORE:%s" % values + # NOTE: A node is down if some of the public ports are not open + if values['observed_status'] == "BOOT": + # verify that all ports are open. Else, report node as down. + if not ( values['port_status']['22'] == "open" and \ + values['port_status']['80'] == "open" and \ + values['port_status']['806'] == "open") : + #email_exception(nodename, "%s FILTERED HOST" % nodename) + values['observed_status'] = 'DOWN' + values['firewall'] = True + + #if values['port_status']['22'] == "open" and \ + # values['port_status']['80'] == "closed" and \ + # values['port_status']['806'] == "open" : + # email_exception("%s port 80 blocked" % nodename, "possible VSERVER ref blocked") + + #if not values['external_dns_status']: + # email_exception("%s DNS down" % nodename) + + if b_getbootcd_id: + # try to get BootCD for all nodes that are not 2.4 nor inaccessible + oval = values['bootcd_version'] + if "BootCD" in oval: + values['bootcd_version'] = oval + if "v2" in oval and \ + ( nodename is not "planetlab1.cs.unc.edu" and \ + nodename is not "planetlab2.cs.unc.edu" ): + values['observed_category'] = 'OLDBOOTCD' + else: + values['bootcd_version'] = "" + else: + values['bootcd_version'] = "" + + return values + + def collectDNS(self, nodename, cohash): + values = {} + try: + ipaddr = socket.gethostbyname(nodename) + # TODO: check that IP returned matches IP in plc db. + values['external_dns_status'] = True + except Exception, err: + values['external_dns_status'] = False + + return values + + def collectInternal(self, nodename, cohash): + try: + values = {} + + v = self.collectPING(nodename, cohash) + values.update(v) + + v = self.collectPorts(nodename) + values.update(v) + + v = self.collectSSH(nodename, cohash) + values.update(v) + + v = self.collectDNS(nodename, cohash) + values.update(v) + + v = self.collectTRACEROUTE(nodename, cohash) + values.update(v) + + v = self.collectPLC(nodename, cohash) + values.update(v) - if nodename in cohash: values['comon_stats'] = cohash[nodename] else: @@ -337,60 +462,26 @@ EOF """) 'cpuspeed' : "null", 'disksize' : 'null', 'memsize' : 'null'} - # include output value - ### GET PLC NODE ###################### - plc_lock.acquire() - d_node = None - try: - d_node = plccache.GetNodeByName(nodename) - #d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', - # 'date_created', 'last_updated', - # 'last_contact', 'boot_state', 'nodegroup_ids'])[0] - except: - traceback.print_exc() - plc_lock.release() - values['plc_node_stats'] = d_node - - ##### NMAP ################### - (n, v) = self.collectNMAP(nodename, None) - values.update(v) - - ### GET PLC PCU ###################### - site_id = -1 - d_pcu = None - if d_node: - pcu = d_node['pcu_ids'] - if len(pcu) > 0: - d_pcu = pcu[0] - site_id = d_node['site_id'] - - values['plc_pcuid'] = d_pcu - - ### GET PLC SITE ###################### - plc_lock.acquire() - d_site = None - values['loginbase'] = "" - try: - d_site = plccache.GetSitesById([ site_id ])[0] - #d_site = plc.getSites({'site_id': site_id}, - # ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0] - values['loginbase'] = d_site['login_base'] - except: - traceback.print_exc() - plc_lock.release() + values['rpms'] = values['rpm_versions'] + print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']) + print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) + print "UPTIME: %s %s" % (nodename, values['uptime']) + print "MD5SUMS: %s %s" % (nodename, values['md5sums']) + print "MD5SUM_YUM: %s %s" % (nodename, values['md5sum_yum']) - values['plc_site_stats'] = d_site + values = self.evaluate(nodename, values) + #print "%s %s" % (nodename, values) values['date_checked'] = datetime.now() + except: print traceback.print_exc() return (nodename, values) + def internalprobe(hostname): - #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - # if_new_set={'round' : 1}) - scannode = ScanNodeInternal() # fbsync.round) + scannode = ScanNodeInternal() try: (nodename, values) = scannode.collectInternal(hostname, {}) scannode.record(None, (nodename, values)) @@ -401,12 +492,10 @@ def internalprobe(hostname): return False def externalprobe(hostname): - #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - # if_new_set={'round' : 1}) - scannode = ScanNodeInternal() # fbsync.round) + scannode = ScanNodeInternal() try: - (nodename, values) = scannode.collectNMAP(hostname, {}) - scannode.record(None, (nodename, values)) + values = self.collectPorts(hostname) + scannode.record(None, (hostname, values)) session.flush() return True except: @@ -440,13 +529,13 @@ class ScanPCU(ScanInterface): traceback.print_exc() continue_probe = False - if b_except or not continue_probe: return (None, None, None) + if b_except or not continue_probe: return (None, None) #### RUN NMAP ############################### if continue_probe: nmap = command.CMD() - print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']) - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) + print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats']) + (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats'])) # NOTE: an empty / error value for oval, will still work. (values['port_status'], continue_probe) = nmap_port_status(oval) else: