X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor%2Fscanapi.py;h=327b2b44d70c4144ca0186a8f7945981621fe3bc;hb=6c38b5cef7bf12fa4ade23ae1c54b3491797fb1d;hp=f7939e684b8f2cb2624ad13c8325e5d10aa0a771;hpb=6a452e8ece2ca8a47105c128eaebc38507bc76c5;p=monitor.git diff --git a/monitor/scanapi.py b/monitor/scanapi.py index f7939e6..327b2b4 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -63,7 +63,7 @@ def get_nodes(node_ids): l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports']) except: try: - plc_nodes = plccache.l_plcnodes + plc_nodes = plccache.l_nodes for n in plc_nodes: if n['node_id'] in node_ids: l_node.append(n) @@ -157,7 +157,7 @@ class ScanInterface(object): except: print "ERROR:" - email_exception(nodename) + email_exception(str(nodename)) print traceback.print_exc() pass @@ -167,127 +167,110 @@ class ScanNodeInternal(ScanInterface): syncclass = None primarykey = 'hostname' + def collectPorts(self, nodename, port_list=[22,80,806]): + values = {} + for port in port_list: + ret = os.system("nc -w 5 -z %s %s > /dev/null" % (nodename, port) ) + if ret == 0: + values[str(port)] = "open" + else: + values[str(port)] = "closed" + return {'port_status' : values } + def collectNMAP(self, nodename, cohash): #### RUN NMAP ############################### + # NOTE: run the same command three times and take the best of three + # runs. NMAP can drop packets, and especially so when it runs many + # commands at once. values = {} nmap = command.CMD() print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) # NOTE: an empty / error value for oval, will still work. - (values['port_status'], continue_probe) = nmap_port_status(oval) + values['port_status'] = {} + (o1,continue_probe) = nmap_port_status(oval1) + (o2,continue_probe) = nmap_port_status(oval2) + (o3,continue_probe) = nmap_port_status(oval3) + for p in ['22', '80', '806']: + l = [ o1[p], o2[p], o3[p] ] + if len(filter(lambda x: x == 'open', l)) > 1: + values['port_status'][p] = 'open' + else: + values['port_status'][p] = o1[p] - values['date_checked'] = datetime.now() - + print values['port_status'] return (nodename, values) - def collectInternal(self, nodename, cohash): - ### RUN PING ###################### + def collectPING(self, nodename, cohash): + values = {} ping = command.CMD() (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename) - try: - values = {} + values = {} + if oval == "": + # An error occurred + values['ping_status'] = False + else: + values['ping_status'] = True - if oval == "": - # An error occurred - values['ping_status'] = False - else: - values['ping_status'] = True + return values - try: - for port in [22, 806]: - ssh = command.SSH('root', nodename, port) - - (oval, errval) = ssh.run_noexcept2(""" <<\EOF - echo "{" - echo ' "kernel_version":"'`uname -a`'",' - echo ' "bmlog":"'`ls /tmp/bm.log`'",' - echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' - echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' - echo ' "fs_status":"'`touch /var/log/monitor 2>&1`'",' - echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' - echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' - - ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` - echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' - echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' - echo "}" -EOF """) - - values['ssh_error'] = errval - if len(oval) > 0: - #print "OVAL: %s" % oval - values.update(eval(oval)) - values['ssh_portused'] = port - break - else: - values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', - 'nm_status' : '', - 'fs_status' : '', - 'dns_status' : '', - 'princeton_comon_dir' : "", - 'princeton_comon_running' : "", - 'princeton_comon_procs' : "", 'ssh_portused' : None}) - except: - print traceback.print_exc() - sys.exit(1) - - ### RUN SSH ###################### - b_getbootcd_id = True - - oval = values['kernel_version'] - if "2.6.17" in oval or "2.6.2" in oval: - values['ssh_status'] = True - values['observed_category'] = 'PROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - elif "2.6.12" in oval or "2.6.10" in oval: - values['ssh_status'] = True - values['observed_category'] = 'OLDPROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - - # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot - # command fails. I have no idea why. - elif "2.4" in oval or "2.6.8" in oval: - b_getbootcd_id = False - values['ssh_status'] = True - values['observed_category'] = 'OLDBOOTCD' - values['observed_status'] = 'DEBUG' - elif oval != "": - values['ssh_status'] = True - values['observed_category'] = 'UNKNOWN' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - else: - # An error occurred. - b_getbootcd_id = False - values['ssh_status'] = False - values['observed_category'] = 'ERROR' - values['observed_status'] = 'DOWN' - val = errval.strip() - values['ssh_error'] = val - values['kernel_version'] = "" - - if b_getbootcd_id: - # try to get BootCD for all nodes that are not 2.4 nor inaccessible - oval = values['bootcd_version'] - if "BootCD" in oval: - values['bootcd_version'] = oval - if "v2" in oval and \ - ( nodename is not "planetlab1.cs.unc.edu" and \ - nodename is not "planetlab2.cs.unc.edu" ): - values['observed_category'] = 'OLDBOOTCD' + def collectTRACEROUTE(self, nodename, cohash): + values = {} + trace = command.CMD() + (oval,errval) = trace.run_noexcept("traceroute %s" % nodename) + + values['traceroute'] = oval + + return values + + def collectSSH(self, nodename, cohash): + values = {} + try: + for port in [22, 806]: + ssh = command.SSH('root', nodename, port) + + (oval, errval) = ssh.run_noexcept2(""" <<\EOF + echo "{" + echo ' "kernel_version":"'`uname -a`'",' + echo ' "bmlog":"'`ls /tmp/bm.log`'",' + echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' + echo ' "boot_server":"'`cat /mnt/cdrom/bootme/BOOTSERVER`'",' + echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' + echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' + echo ' "iptables_status":"'`iptables -t mangle -nL | awk '$1~/^[A-Z]+$/ {modules[$1]=1;}END{for (k in modules) {if (k) printf "%s ",k;}}'`'",' + echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' + echo ' "uptime":"'`cat /proc/uptime`'",' + + ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` + echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' + echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' + echo ' "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1 ; fi ; fi`'",' + echo ' "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 30 rpm -q NodeManager ; fi`'",' + echo ' "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 45 rpm -q -a ; fi`'",' + echo "}" +EOF """) + + values['ssh_error'] = errval + if len(oval) > 0: + #print "OVAL: %s" % oval + values.update(eval(oval)) + values['ssh_portused'] = port + break else: - values['bootcd_version'] = "" - else: - values['bootcd_version'] = "" + values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', + 'boot_server' : '', + 'nm_status' : '', + 'fs_status' : '', + 'uptime' : '', + 'dns_status' : '', + 'rpm_version' : '', + 'rpm_versions' : '', + 'princeton_comon_dir' : "", + 'princeton_comon_running' : "", + 'princeton_comon_procs' : "", 'ssh_portused' : None}) oval = values['nm_status'] if "nm.py" in oval: @@ -297,7 +280,7 @@ EOF """) continue_slice_check = True oval = values['princeton_comon_dir'] - if "princeton_comon_dir" in oval: + if "princeton_comon" in oval: values['princeton_comon_dir'] = True else: values['princeton_comon_dir'] = False @@ -318,8 +301,149 @@ EOF """) values['princeton_comon_procs'] = int(oval) else: values['princeton_comon_procs'] = None + except: + print traceback.print_exc() + sys.exit(1) + + return values + + def collectPLC(self, nodename, cohash): + values = {} + ### GET PLC NODE ###################### + d_node = plccache.GetNodeByName(nodename) + values['plc_node_stats'] = d_node + + ### GET PLC PCU ###################### + site_id = -1 + d_pcu = None + if d_node and len(d_node['pcu_ids']) > 0: + d_pcu = d_node['pcu_ids'][0] + + site_id = d_node['site_id'] + + values['plc_pcuid'] = d_pcu + + ### GET PLC SITE ###################### + print "SITEID: %s" % site_id + d_site = plccache.GetSitesById([ site_id ])[0] + values['loginbase'] = d_site['login_base'] + values['plc_site_stats'] = d_site + + return values + + def evaluate(self, nodename, values): + # TODO: this section can probably be reduced to a policy statement + # using patterns and values collected so far. + # NOTE: A node is "DOWN" if + # * cannot ssh into it. + # * all ports are not open for a 'BOOT' node + # * dns for hostname does not exist. + b_getbootcd_id = True + + oval = values['kernel_version'] + values['ssh_status'] = True + if "2.6.17" in oval or "2.6.2" in oval: + values['observed_category'] = 'PROD' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + elif "2.6.12" in oval or "2.6.10" in oval: + values['observed_category'] = 'OLDPROD' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + + # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot + # command fails. I have no idea why. + elif "2.4" in oval or "2.6.8" in oval: + b_getbootcd_id = False + values['observed_category'] = 'OLDBOOTCD' + values['observed_status'] = 'DEBUG' + elif oval != "": + values['observed_category'] = 'UNKNOWN' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + else: + # An error occurred. + b_getbootcd_id = False + values['ssh_status'] = False + values['observed_category'] = 'ERROR' + values['observed_status'] = 'DOWN' + values['kernel_version'] = "" + + values['firewall'] = False + + # NOTE: A node is down if some of the public ports are not open + if values['observed_status'] == "BOOT": + # verify that all ports are open. Else, report node as down. + if not ( values['port_status']['22'] == "open" and \ + values['port_status']['80'] == "open" and \ + values['port_status']['806'] == "open") : + #email_exception(nodename, "%s FILTERED HOST" % nodename) + values['observed_status'] = 'DOWN' + values['firewall'] = True + + #if values['port_status']['22'] == "open" and \ + # values['port_status']['80'] == "closed" and \ + # values['port_status']['806'] == "open" : + # email_exception("%s port 80 blocked" % nodename, "possible VSERVER ref blocked") + + #if not values['external_dns_status']: + # email_exception("%s DNS down" % nodename) + + if b_getbootcd_id: + # try to get BootCD for all nodes that are not 2.4 nor inaccessible + oval = values['bootcd_version'] + if "BootCD" in oval: + values['bootcd_version'] = oval + if "v2" in oval and \ + ( nodename is not "planetlab1.cs.unc.edu" and \ + nodename is not "planetlab2.cs.unc.edu" ): + values['observed_category'] = 'OLDBOOTCD' + else: + values['bootcd_version'] = "" + else: + values['bootcd_version'] = "" + + return values + + def collectDNS(self, nodename, cohash): + values = {} + try: + ipaddr = socket.gethostbyname(nodename) + # TODO: check that IP returned matches IP in plc db. + values['external_dns_status'] = True + except Exception, err: + values['external_dns_status'] = False + + return values + + def collectInternal(self, nodename, cohash): + try: + values = {} + + v = self.collectPING(nodename, cohash) + values.update(v) + + v = self.collectPorts(nodename) + values.update(v) + + v = self.collectSSH(nodename, cohash) + values.update(v) + + v = self.collectDNS(nodename, cohash) + values.update(v) + + v = self.collectTRACEROUTE(nodename, cohash) + values.update(v) + + v = self.collectPLC(nodename, cohash) + values.update(v) - if nodename in cohash: values['comon_stats'] = cohash[nodename] else: @@ -330,60 +454,23 @@ EOF """) 'cpuspeed' : "null", 'disksize' : 'null', 'memsize' : 'null'} - # include output value - ### GET PLC NODE ###################### - plc_lock.acquire() - d_node = None - try: - d_node = plccache.GetNodeByName(nodename) - #d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', - # 'date_created', 'last_updated', - # 'last_contact', 'boot_state', 'nodegroup_ids'])[0] - except: - traceback.print_exc() - plc_lock.release() - values['plc_node_stats'] = d_node - - ##### NMAP ################### - (n, v) = self.collectNMAP(nodename, None) - values.update(v) - - ### GET PLC PCU ###################### - site_id = -1 - d_pcu = None - if d_node: - pcu = d_node['pcu_ids'] - if len(pcu) > 0: - d_pcu = pcu[0] - - site_id = d_node['site_id'] - values['plc_pcuid'] = d_pcu - - ### GET PLC SITE ###################### - plc_lock.acquire() - d_site = None - values['loginbase'] = "" - try: - d_site = plccache.GetSitesById([ site_id ])[0] - #d_site = plc.getSites({'site_id': site_id}, - # ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0] - values['loginbase'] = d_site['login_base'] - except: - traceback.print_exc() - plc_lock.release() + values['rpms'] = values['rpm_versions'] + print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']) + print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) + print "UPTIME: %s %s" % (nodename, values['uptime']) - values['plc_site_stats'] = d_site + values = self.evaluate(nodename, values) values['date_checked'] = datetime.now() + except: print traceback.print_exc() return (nodename, values) + def internalprobe(hostname): - #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - # if_new_set={'round' : 1}) - scannode = ScanNodeInternal() # fbsync.round) + scannode = ScanNodeInternal() try: (nodename, values) = scannode.collectInternal(hostname, {}) scannode.record(None, (nodename, values)) @@ -394,12 +481,10 @@ def internalprobe(hostname): return False def externalprobe(hostname): - #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - # if_new_set={'round' : 1}) - scannode = ScanNodeInternal() # fbsync.round) + scannode = ScanNodeInternal() try: - (nodename, values) = scannode.collectNMAP(hostname, {}) - scannode.record(None, (nodename, values)) + values = self.collectPorts(hostname) + scannode.record(None, (hostname, values)) session.flush() return True except: @@ -483,7 +568,7 @@ class ScanPCU(ScanInterface): values['dns_status'] = "DNS-OK" else: values['dns_status'] = "DNS-MISMATCH" - continue_probe = False + values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip'] except Exception, err: values['dns_status'] = "DNS-NOENTRY"