X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor%2Fscanapi.py;h=177affcb16341b6ebc2ca86bb1d76723f6a01de8;hb=fbe2fbd7f5d866205f878e68968dcef14a3812ab;hp=bbc99d77ef1b92c83a38e6eab63b30c5e6e729d4;hpb=f6ae4843ec52f237b8c01c9fdcc9130a34518944;p=monitor.git diff --git a/monitor/scanapi.py b/monitor/scanapi.py index bbc99d7..177affc 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -11,8 +11,7 @@ import threading import socket from pcucontrol import reboot -from monitor import util -from monitor.util import command +from pcucontrol.util import command from monitor import config from monitor.database.info.model import * @@ -21,7 +20,7 @@ from monitor.sources import comon from monitor.wrapper import plc, plccache import traceback -from nodecommon import nmap_port_status +from monitor.common import nmap_port_status, email_exception COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ "table=table_nodeview&" + \ @@ -64,7 +63,7 @@ def get_nodes(node_ids): l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports']) except: try: - plc_nodes = plccache.l_plcnodes + plc_nodes = plccache.l_nodes for n in plc_nodes: if n['node_id'] in node_ids: l_node.append(n) @@ -113,7 +112,7 @@ class ScanInterface(object): syncclass = None primarykey = 'hostname' - def __init__(self, round): + def __init__(self, round=1): self.round = round self.count = 1 @@ -134,157 +133,151 @@ class ScanInterface(object): try: if values is None: return - - fbnodesync = self.syncclass.findby_or_create( - if_new_set={'round' : self.round}, + + if self.syncclass: + fbnodesync = self.syncclass.findby_or_create( + #if_new_set={'round' : self.round}, **{ self.primarykey : nodename}) # NOTE: This code will either add a new record for the new self.round, # OR it will find the previous value, and update it with new information. # The data that is 'lost' is not that important, b/c older # history still exists. fbrec = self.recordclass.findby_or_create( - **{'round':self.round, self.primarykey:nodename}) + **{ self.primarykey:nodename}) fbrec.set( **values ) fbrec.flush() - fbnodesync.round = self.round - fbnodesync.flush() + if self.syncclass: + fbnodesync.round = self.round + fbnodesync.flush() print "%d %s %s" % (self.count, nodename, values) self.count += 1 except: print "ERROR:" + email_exception(str(nodename)) print traceback.print_exc() pass class ScanNodeInternal(ScanInterface): recordclass = FindbadNodeRecord - syncclass = FindbadNodeRecordSync + #syncclass = FindbadNodeRecordSync + syncclass = None primarykey = 'hostname' + def collectPorts(self, nodename, port_list=[22,80,806]): + values = {} + for port in port_list: + ret = os.system("nc -w 5 -z %s %s > /dev/null" % (nodename, port) ) + if ret == 0: + values[str(port)] = "open" + else: + values[str(port)] = "closed" + return {'port_status' : values } + def collectNMAP(self, nodename, cohash): #### RUN NMAP ############################### + # NOTE: run the same command three times and take the best of three + # runs. NMAP can drop packets, and especially so when it runs many + # commands at once. values = {} - nmap = util.command.CMD() - print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + nmap = command.CMD() + print "nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename + (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) + (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) + (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename) # NOTE: an empty / error value for oval, will still work. - (values['port_status'], continue_probe) = nmap_port_status(oval) + values['port_status'] = {} + (o1,continue_probe) = nmap_port_status(oval1) + (o2,continue_probe) = nmap_port_status(oval2) + (o3,continue_probe) = nmap_port_status(oval3) + for p in ['22', '80', '806']: + l = [ o1[p], o2[p], o3[p] ] + if len(filter(lambda x: x == 'open', l)) > 1: + values['port_status'][p] = 'open' + else: + values['port_status'][p] = o1[p] - values['date_checked'] = datetime.now() - + print values['port_status'] return (nodename, values) - def collectInternal(self, nodename, cohash): - ### RUN PING ###################### + def collectPING(self, nodename, cohash): + values = {} ping = command.CMD() (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename) - try: - values = {} + values = {} + if oval == "": + # An error occurred + values['ping_status'] = False + else: + values['ping_status'] = True - if oval == "": - # An error occurred - values['ping_status'] = False - else: - values['ping_status'] = True + return values - try: - for port in [22, 806]: - ssh = command.SSH('root', nodename, port) - - (oval, errval) = ssh.run_noexcept2(""" <<\EOF - echo "{" - echo ' "kernel_version":"'`uname -a`'",' - echo ' "bmlog":"'`ls /tmp/bm.log`'",' - echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' - echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' - echo ' "fs_status":"'`touch /var/log/monitor 2>&1`'",' - echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' - echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' - - ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` - echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' - echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' - echo "}" - EOF """) - - values['ssh_error'] = errval - if len(oval) > 0: - #print "OVAL: %s" % oval - values.update(eval(oval)) - values['ssh_portused'] = port - break - else: - values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', - 'nm_status' : '', - 'fs_status' : '', - 'dns_status' : '', - 'princeton_comon_dir' : "", - 'princeton_comon_running' : "", - 'princeton_comon_procs' : "", 'ssh_portused' : None}) - except: - print traceback.print_exc() - sys.exit(1) - - ### RUN SSH ###################### - b_getbootcd_id = True - - oval = values['kernel_version'] - if "2.6.17" in oval or "2.6.2" in oval: - values['ssh_status'] = True - values['observed_category'] = 'PROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - elif "2.6.12" in oval or "2.6.10" in oval: - values['ssh_status'] = True - values['observed_category'] = 'OLDPROD' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - - # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot - # command fails. I have no idea why. - elif "2.4" in oval or "2.6.8" in oval: - b_getbootcd_id = False - values['ssh_status'] = True - values['observed_category'] = 'OLDBOOTCD' - values['observed_status'] = 'DEBUG' - elif oval != "": - values['ssh_status'] = True - values['observed_category'] = 'UNKNOWN' - if "bm.log" in values['bmlog']: - values['observed_status'] = 'DEBUG' - else: - values['observed_status'] = 'BOOT' - else: - # An error occurred. - b_getbootcd_id = False - values['ssh_status'] = False - values['observed_category'] = 'ERROR' - values['observed_status'] = 'DOWN' - val = errval.strip() - values['ssh_error'] = val - values['kernel_version'] = "" - - if b_getbootcd_id: - # try to get BootCD for all nodes that are not 2.4 nor inaccessible - oval = values['bootcd_version'] - if "BootCD" in oval: - values['bootcd_version'] = oval - if "v2" in oval and \ - ( nodename is not "planetlab1.cs.unc.edu" and \ - nodename is not "planetlab2.cs.unc.edu" ): - values['observed_category'] = 'OLDBOOTCD' + def collectTRACEROUTE(self, nodename, cohash): + values = {} + trace = command.CMD() + (oval,errval) = trace.run_noexcept("traceroute %s" % nodename) + + values['traceroute'] = oval + + return values + + def collectSSH(self, nodename, cohash): + values = {} + try: + for port in [22, 806]: + ssh = command.SSH('root', nodename, port) + + (oval, errval) = ssh.run_noexcept2(""" <<\EOF + echo "{" + echo ' "kernel_version":"'`uname -a`'",' + echo ' "bmlog":"'`ls /tmp/bm.log || ls /tmp/source/BootManager.py`'",' + echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID || cat /usr/bootme/ID`'",' + echo ' "boot_server":"'`cat /mnt/cdrom/bootme/BOOTSERVER`'",' + echo ' "install_date":"'`python -c "import os,time,stat; print time.ctime(os.stat('/usr/boot/plnode.txt')[stat.ST_CTIME])" || python -c "import os,time,stat; print time.ctime(os.stat('/usr/boot/cacert.pem')[stat.ST_CTIME])"`'",' + echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' + echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' + echo ' "iptables_status":"'`iptables -t mangle -nL | awk '$1~/^[A-Z]+$/ {modules[$1]=1;}END{for (k in modules) {if (k) printf "%s ",k;}}'`'",' + echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' + echo ' "uptime":"'`cat /proc/uptime`'",' + + ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` + echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' + echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' + echo ' "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1 ; fi ; fi`'",' + echo ' "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then /usr/bin/timeout.pl 30 rpm -q NodeManager ; fi`'",' + echo ' "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then /usr/bin/timeout.pl 45 rpm -q -a | sort ; fi`'",' + echo ' "md5sums":"'`md5sum /etc/yum.conf /etc/yum.myplc.d/myplc.repo /etc/yum.myplc.d/stock.repo | awk '{print $1}'`'",' + echo ' "md5sum_yum":"'`grep -v -E "^#" /etc/yum.myplc.d/myplc.repo | md5sum`'",' + echo ' "nada":"'``'",' + echo "}" +EOF """) + + values['ssh_error'] = errval + if len(oval) > 0: + #print "OVAL: %s" % oval + values.update(eval(oval)) + values['ssh_portused'] = port + break else: - values['bootcd_version'] = "" - else: - values['bootcd_version'] = "" + values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', + 'boot_server' : '', + 'install_date' : '', + 'nm_status' : '', + 'fs_status' : '', + 'uptime' : '', + 'dns_status' : '', + 'md5sums' : '', + 'md5sum_yum' : '', + 'rpm_version' : '', + 'rpm_versions' : '', + 'princeton_comon_dir' : "", + 'princeton_comon_running' : "", + 'princeton_comon_procs' : "", 'ssh_portused' : None}) oval = values['nm_status'] if "nm.py" in oval: @@ -294,7 +287,7 @@ class ScanNodeInternal(ScanInterface): continue_slice_check = True oval = values['princeton_comon_dir'] - if "princeton_comon_dir" in oval: + if "princeton_comon" in oval: values['princeton_comon_dir'] = True else: values['princeton_comon_dir'] = False @@ -315,8 +308,150 @@ class ScanNodeInternal(ScanInterface): values['princeton_comon_procs'] = int(oval) else: values['princeton_comon_procs'] = None + except: + print traceback.print_exc() + sys.exit(1) + + return values + + def collectPLC(self, nodename, cohash): + values = {} + ### GET PLC NODE ###################### + d_node = plccache.GetNodeByName(nodename) + values['plc_node_stats'] = d_node + + ### GET PLC PCU ###################### + site_id = -1 + d_pcu = None + if d_node and len(d_node['pcu_ids']) > 0: + d_pcu = d_node['pcu_ids'][0] + + site_id = d_node['site_id'] + + values['plc_pcuid'] = d_pcu + + ### GET PLC SITE ###################### + print "SITEID: %s" % site_id + d_site = plccache.GetSitesById([ site_id ])[0] + values['loginbase'] = d_site['login_base'] + values['plc_site_stats'] = d_site + + return values + + def evaluate(self, nodename, values): + # TODO: this section can probably be reduced to a policy statement + # using patterns and values collected so far. + # NOTE: A node is "DOWN" if + # * cannot ssh into it. + # * all ports are not open for a 'BOOT' node + # * dns for hostname does not exist. + b_getbootcd_id = True + + oval = values['kernel_version'] + values['ssh_status'] = True + if "2.6.17" in oval or "2.6.2" in oval: + values['observed_category'] = 'PROD' + if "bm.log" in values['bmlog'] or "BootManager" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + elif "2.6.12" in oval or "2.6.10" in oval: + values['observed_category'] = 'OLDPROD' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + + # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot + # command fails. I have no idea why. + elif "2.4" in oval or "2.6.8" in oval: + b_getbootcd_id = False + values['observed_category'] = 'OLDBOOTCD' + values['observed_status'] = 'DEBUG' + elif oval != "": + values['observed_category'] = 'UNKNOWN' + if "bm.log" in values['bmlog']: + values['observed_status'] = 'DEBUG' + else: + values['observed_status'] = 'BOOT' + else: + # An error occurred. + b_getbootcd_id = False + values['ssh_status'] = False + values['observed_category'] = 'ERROR' + values['observed_status'] = 'DOWN' + values['kernel_version'] = "" + + values['firewall'] = False + + #print "BEFORE:%s" % values + # NOTE: A node is down if some of the public ports are not open + if values['observed_status'] == "BOOT": + # verify that all ports are open. Else, report node as down. + if not ( values['port_status']['22'] == "open" and \ + values['port_status']['80'] == "open" and \ + values['port_status']['806'] == "open") : + #email_exception(nodename, "%s FILTERED HOST" % nodename) + values['observed_status'] = 'DOWN' + values['firewall'] = True + + #if values['port_status']['22'] == "open" and \ + # values['port_status']['80'] == "closed" and \ + # values['port_status']['806'] == "open" : + # email_exception("%s port 80 blocked" % nodename, "possible VSERVER ref blocked") + + #if not values['external_dns_status']: + # email_exception("%s DNS down" % nodename) + + if b_getbootcd_id: + # try to get BootCD for all nodes that are not 2.4 nor inaccessible + oval = values['bootcd_version'] + if "BootCD" in oval: + values['bootcd_version'] = oval + if "v2" in oval and \ + ( nodename is not "planetlab1.cs.unc.edu" and \ + nodename is not "planetlab2.cs.unc.edu" ): + values['observed_category'] = 'OLDBOOTCD' + else: + values['bootcd_version'] = "" + else: + values['bootcd_version'] = "" + + return values + + def collectDNS(self, nodename, cohash): + values = {} + try: + ipaddr = socket.gethostbyname(nodename) + # TODO: check that IP returned matches IP in plc db. + values['external_dns_status'] = True + except Exception, err: + values['external_dns_status'] = False + + return values + + def collectInternal(self, nodename, cohash): + try: + values = {} + + v = self.collectPING(nodename, cohash) + values.update(v) + + v = self.collectPorts(nodename) + values.update(v) + + v = self.collectSSH(nodename, cohash) + values.update(v) + + v = self.collectDNS(nodename, cohash) + values.update(v) + + v = self.collectTRACEROUTE(nodename, cohash) + values.update(v) + + v = self.collectPLC(nodename, cohash) + values.update(v) - if nodename in cohash: values['comon_stats'] = cohash[nodename] else: @@ -327,58 +462,26 @@ class ScanNodeInternal(ScanInterface): 'cpuspeed' : "null", 'disksize' : 'null', 'memsize' : 'null'} - # include output value - ### GET PLC NODE ###################### - plc_lock.acquire() - d_node = None - try: - d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', - 'date_created', 'last_updated', - 'last_contact', 'boot_state', 'nodegroup_ids'])[0] - except: - traceback.print_exc() - plc_lock.release() - values['plc_node_stats'] = d_node - - ##### NMAP ################### - (n, v) = self.collectNMAP(nodename, None) - values.update(v) - ### GET PLC PCU ###################### - site_id = -1 - d_pcu = None - if d_node: - pcu = d_node['pcu_ids'] - if len(pcu) > 0: - d_pcu = pcu[0] + values['rpms'] = values['rpm_versions'] + print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']) + print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) + print "UPTIME: %s %s" % (nodename, values['uptime']) + print "MD5SUMS: %s %s" % (nodename, values['md5sums']) + print "MD5SUM_YUM: %s %s" % (nodename, values['md5sum_yum']) - site_id = d_node['site_id'] - - values['plc_pcuid'] = d_pcu - - ### GET PLC SITE ###################### - plc_lock.acquire() - d_site = None - values['loginbase'] = "" - try: - d_site = plc.getSites({'site_id': site_id}, - ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0] - values['loginbase'] = d_site['login_base'] - except: - traceback.print_exc() - plc_lock.release() - - values['plc_site_stats'] = d_site + values = self.evaluate(nodename, values) + #print "%s %s" % (nodename, values) values['date_checked'] = datetime.now() + except: print traceback.print_exc() return (nodename, values) + def internalprobe(hostname): - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : 1}) - scannode = ScanNodeInternal(fbsync.round) + scannode = ScanNodeInternal() try: (nodename, values) = scannode.collectInternal(hostname, {}) scannode.record(None, (nodename, values)) @@ -389,12 +492,10 @@ def internalprobe(hostname): return False def externalprobe(hostname): - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : 1}) - scannode = ScanNodeInternal(fbsync.round) + scannode = ScanNodeInternal() try: - (nodename, values) = scannode.collectNMAP(hostname, {}) - scannode.record(None, (nodename, values)) + values = self.collectPorts(hostname) + scannode.record(None, (hostname, values)) session.flush() return True except: @@ -403,7 +504,7 @@ def externalprobe(hostname): class ScanPCU(ScanInterface): recordclass = FindbadPCURecord - syncclass = FindbadPCURecordSync + syncclass = None primarykey = 'plc_pcuid' def collectInternal(self, pcuname, cohash): @@ -428,13 +529,13 @@ class ScanPCU(ScanInterface): traceback.print_exc() continue_probe = False - if b_except or not continue_probe: return (None, None, None) + if b_except or not continue_probe: return (None, None) #### RUN NMAP ############################### if continue_probe: - nmap = util.command.CMD() - print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']) - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) + nmap = command.CMD() + print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats']) + (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats'])) # NOTE: an empty / error value for oval, will still work. (values['port_status'], continue_probe) = nmap_port_status(oval) else: @@ -478,7 +579,7 @@ class ScanPCU(ScanInterface): values['dns_status'] = "DNS-OK" else: values['dns_status'] = "DNS-MISMATCH" - continue_probe = False + values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip'] except Exception, err: values['dns_status'] = "DNS-NOENTRY" @@ -494,7 +595,7 @@ class ScanPCU(ScanInterface): ###### DRY RUN ############################ - if 'node_ids' in values['plc_pcu_stats'] and \ + if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \ len(values['plc_pcu_stats']['node_ids']) > 0: rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, 1, True) @@ -510,7 +611,8 @@ class ScanPCU(ScanInterface): print "____________________________________" errors['traceback'] = traceback.format_exc() print errors['traceback'] - values['reboot_trial_status'] = errors['traceback'] + values['reboot_trial_status'] = str(errors['traceback']) + print values values['entry_complete']=" ".join(values['entry_complete'])