clearer names for actions, and infer actions better
[monitor.git] / monitor / scanapi.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 import socket
12 from pcucontrol import reboot
13
14 from pcucontrol.util import command
15 from monitor import config
16
17 from monitor.database.info.model import *
18
19 from monitor.sources import comon
20 from monitor.wrapper import plc, plccache
21
22 import traceback
23 from monitor.common import nmap_port_status, email_exception
24
# Query URL for the CoMon status tabulator: the node-view table, limited to
# the columns named in 'dumpcols'.  The bare 'formatcsv' flag presumably asks
# for CSV output -- confirm against the CoMon CGI.
COMON_COTOPURL = "http://summer.cs.princeton.edu/status/tabulator.cgi?" + "&".join([
        "table=table_nodeview",
        "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'",
        "formatcsv",
])
29
# Handle returned by plc.getAuthAPI(); created once, when the module is imported.
api = plc.getAuthAPI()
# Held by get_pcu() and get_nodes() while they query PLC or the cache.
plc_lock = threading.Lock()
# Presumably the current scan round; both names start at 1.
# NOTE: 'round' shadows the builtin round() for the rest of this module.
round = global_round = 1
# NOTE(review): not referenced in the visible part of the file -- confirm use.
count = 0
35
36
def get_pcu(pcuname):
        """
                Look up one PCU record by its pcu_id.

                PLC is asked first (plc.GetPCUs).  If that call raises, the
                cached list plccache.l_pcus is searched instead.  plc_lock is
                held for the whole lookup and is always released.

                Returns the PCU record, or None when no record matches or when
                both sources fail.
        """
        # Start from None so that "not found" never leaves the name unbound.
        l_pcu = None
        plc_lock.acquire()
        try:
                try:
                        found = plc.GetPCUs({'pcu_id' : pcuname})
                        # An empty answer means "no such PCU": report None rather
                        # than handing the empty list back to the caller.
                        if len(found) > 0:
                                l_pcu = found[0]
                except:
                        # The PLC call failed (catch-all kept from the original);
                        # fall back to the copy held by plccache.
                        try:
                                for cached in plccache.l_pcus:
                                        if cached['pcu_id'] == pcuname:
                                                l_pcu = cached
                        except:
                                traceback.print_exc()
                                l_pcu = None
        finally:
                plc_lock.release()
        return l_pcu
58
def get_nodes(node_ids):
        """
                Fetch the node records for the given node ids.

                PLC is asked first (plc.getNodes, restricted to the fields
                listed below).  If that call raises, the records are picked out
                of plccache.l_nodes instead.  plc_lock is held during the lookup.

                Returns the list of records, or None when the result is an
                empty list or the cache lookup fails as well.
        """
        wanted_fields = ['hostname', 'last_contact', 'node_id', 'ports']

        plc_lock.acquire()
        try:
                found = plc.getNodes(node_ids, wanted_fields)
        except:
                # PLC failed: filter the cached node list by id instead.
                try:
                        found = [cached for cached in plccache.l_nodes
                                                if cached['node_id'] in node_ids]
                except:
                        traceback.print_exc()
                        found = None
        plc_lock.release()

        # Callers test for None, so an empty result is reported as None too.
        if found == []:
                return None
        return found
78         
79
def get_plc_pcu_values(pcuname):
        """
                Collect the PLC view of one PCU into a single dict.

                The PCU record comes from get_pcu() and its nodes from
                get_nodes(); both try PLC first and a cached copy second.

                The result holds every field of the PCU record and, when node
                records were found: one 'hostname -> first port' entry per
                node, 'nodenames' (list of hostnames) and 'node_id' (id of the
                first node).  Returns None when the PCU cannot be found.
        """
        l_pcu = get_pcu(pcuname)
        if l_pcu is None:
                return None

        # The value is not used below; the lookup is kept so that a record
        # without 'site_id' still raises KeyError here.
        site_id = l_pcu['site_id']
        l_node = get_nodes(l_pcu['node_ids'])

        values = {}
        if l_node is not None:
                hostnames = []
                for node in l_node:
                        values[node['hostname']] = node['ports'][0]
                        hostnames.append(node['hostname'])
                values['nodenames'] = hostnames

                # NOTE: this is for a dry run later. It doesn't matter which node.
                values['node_id'] = l_node[0]['node_id']

        # PCU fields go in last, so they win over any clashing key set above.
        values.update(l_pcu)
        return values
109
class ScanInterface(object):
        """
                Base class for scanners: subclasses collect values for one
                target and record() stores them.

                Class attributes set by subclasses:
                  recordclass -- class whose findby_or_create() yields the record
                  syncclass   -- optional class tracking the last recorded round
                  primarykey  -- keyword used to look records up by target name
        """
        recordclass = None
        syncclass = None
        primarykey = 'hostname'

        def __init__(self, round=1):
                self.round = round
                self.count = 1

        def __getattr__(self, name):
                """
                        Report a missing attribute.

                        Python calls this only after normal lookup has failed, so
                        there is nothing left to search: calling getattr(self, name)
                        here would re-enter this method without end.  AttributeError
                        (a subclass of Exception) keeps hasattr() and three-argument
                        getattr() working on instances.
                """
                raise AttributeError("No such method %s" % name)

        def collect(self, nodename, data):
                """ Placeholder; does nothing and returns None. """
                pass

        def record(self, request, result):
                """
                        Store one collected result.

                        request -- not used here
                        result  -- (nodename, values) pair; a values of None is
                                   skipped silently

                        Any failure while storing is printed and reported through
                        email_exception(); nothing is raised to the caller.
                """
                # Explicit unpacking replaces the Python-2-only tuple parameter;
                # positional callers are unaffected.
                (nodename, values) = result

                try:
                        if values is None:
                                return

                        if self.syncclass:
                                fbnodesync = self.syncclass.findby_or_create(
                                                                #if_new_set={'round' : self.round},
                                                                **{ self.primarykey : nodename})
                        # NOTE: This code will either add a new record for the new self.round,
                        #       OR it will find the previous value, and update it with new information.
                        #       The data that is 'lost' is not that important, b/c older
                        #       history still exists.
                        fbrec = self.recordclass.findby_or_create(
                                                **{ self.primarykey : nodename})

                        fbrec.set( **values )
                        fbrec.flush()

                        if self.syncclass:
                                fbnodesync.round = self.round
                                fbnodesync.flush()

                        print("%d %s %s" % (self.count, nodename, values))
                        self.count += 1

                except:
                        # Catch-all kept from the original: one bad record must
                        # not stop the scan.
                        print("ERROR:")
                        email_exception(str(nodename))
                        # print_exc() writes the traceback itself and returns None,
                        # so its result is not printed.
                        traceback.print_exc()
163
class ScanNodeInternal(ScanInterface):
        """
                Node scanner: probes one node (ping, ports, ssh, DNS,
                traceroute, PLC data) and derives an observed status from the
                results.  Records go through FindbadNodeRecord, keyed by
                'hostname'.
        """
        recordclass = FindbadNodeRecord
        #syncclass = FindbadNodeRecordSync
        syncclass = None
        primarykey = 'hostname'

        def collectPorts(self, nodename, port_list=(22, 80, 806)):
                """
                        Probe each port in port_list with 'nc -w 5 -z'.

                        Returns {'port_status': {'<port>': 'open' | 'closed'}}; a
                        port counts as open only when nc exits with status 0.
                """
                values = {}
                for port in port_list:
                        ret = os.system("nc -w 5 -z %s %s > /dev/null" % (nodename, port) )
                        if ret == 0:
                                values[str(port)] = "open"
                        else:
                                values[str(port)] = "closed"
                return {'port_status' : values }

        def collectNMAP(self, nodename, cohash):
                """
                        Scan ports 22, 80 and 806 with nmap, three times over.

                        Returns (nodename, {'port_status': {...}}).  Unlike the
                        other collect* methods this returns a pair, not a dict.
                        cohash is not used.
                """
                #### RUN NMAP ###############################
                # NOTE: run the same command three times and take the best of three
                #               runs.  NMAP can drop packets, and especially so when it runs many
                #               commands at once.
                cmdline = "nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename
                nmap = command.CMD()
                print(cmdline)
                outputs = []
                for attempt in range(3):
                        (oval, errval) = nmap.run_noexcept(cmdline)
                        outputs.append(oval)

                # NOTE: an empty / error value for oval, will still work.
                scans = []
                for oval in outputs:
                        (status, continue_probe) = nmap_port_status(oval)
                        scans.append(status)

                values = {'port_status' : {}}
                for p in ['22', '80', '806']:
                        votes = [ scan[p] for scan in scans ]
                        # Two or more 'open' results win; otherwise trust run one.
                        if votes.count('open') > 1:
                                values['port_status'][p] = 'open'
                        else:
                                values['port_status'][p] = scans[0][p]

                print(values['port_status'])
                return (nodename, values)

        def collectPING(self, nodename, cohash):
                """
                        Send one ping and report whether an 'rtt' summary line
                        came back.  Returns {'ping_status': bool}.  cohash is
                        not used.
                """
                ping = command.CMD()
                (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

                # Empty output means grep found no rtt line: an error occurred.
                return {'ping_status' : oval != ""}

        def collectTRACEROUTE(self, nodename, cohash):
                """
                        Run traceroute and return its raw output as
                        {'traceroute': output}.  cohash is not used.
                """
                trace = command.CMD()
                (oval,errval) = trace.run_noexcept("traceroute %s" % nodename)
                return {'traceroute' : oval}

        def collectSSH(self, nodename, cohash):
                """
                        Log in as root (port 22 first, then 806) and run a shell
                        script that prints a dict literal describing the node.

                        Returns the parsed fields plus 'ssh_error' and
                        'ssh_portused'.  When neither port yields output, every
                        field is the empty string and 'ssh_portused' is None.
                        Afterwards 'nm_status' is reduced to "Y"/"N" and the
                        princeton_comon_* fields to booleans / a process count.

                        Any exception is printed and then ends in sys.exit(1).
                        cohash is not used.
                """
                values = {}
                try:
                        for port in [22, 806]:
                                ssh = command.SSH('root', nodename, port)

                                # The literal below is sent verbatim; '\\EOF' yields
                                # the two characters '\E' followed by 'OF'.
                                (oval, errval) = ssh.run_noexcept2(""" <<\\EOF
                                        echo "{"
                                        echo '  "kernel_version":"'`uname -a`'",'
                                        echo '  "bmlog":"'`ls /tmp/bm.log || ls /tmp/source/BootManager.py`'",'
                                        echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID || cat /usr/bootme/ID`'",'
                                        echo '  "boot_server":"'`cat /mnt/cdrom/bootme/BOOTSERVER`'",'
                                        echo '  "install_date":"'`python -c "import os,time,stat; print time.ctime(os.stat('/usr/boot/plnode.txt')[stat.ST_CTIME])" || python -c "import os,time,stat; print  time.ctime(os.stat('/usr/boot/cacert.pem')[stat.ST_CTIME])"`'",'
                                        echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
                                        echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
                                        echo '  "iptables_status":"'`iptables -t mangle -nL | awk '$1~/^[A-Z]+$/ {modules[$1]=1;}END{for (k in modules) {if (k) printf "%s ",k;}}'`'",'
                                        echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'
                                        echo '  "uptime":"'`cat /proc/uptime`'",'

                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo '  "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1  ; fi ; fi`'",'
                                        echo '  "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then /usr/bin/timeout.pl 30 rpm -q NodeManager ; fi`'",'
                                        echo '  "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then /usr/bin/timeout.pl 45 rpm -q -a | sort ; fi`'",'
                                        echo '  "md5sums":"'`md5sum /etc/yum.conf /etc/yum.myplc.d/myplc.repo /etc/yum.myplc.d/stock.repo  | awk '{print $1}'`'",'
                                        echo '  "md5sum_yum":"'`grep -v -E "^#" /etc/yum.myplc.d/myplc.repo | md5sum`'",'
                                        echo '  "nada":"'``'",'
                                        echo "}"
EOF                     """)

                                values['ssh_error'] = errval
                                if len(oval) > 0:
                                        # SECURITY NOTE(review): eval() executes whatever
                                        # the remote node printed.  A compromised node can
                                        # run code in this process; a restricted parser
                                        # would be safer.  Left as is to keep behavior.
                                        values.update(eval(oval))
                                        values['ssh_portused'] = port
                                        break
                                else:
                                        # No output on this port: blank every field so the
                                        # checks below find the keys they expect.
                                        for key in ['kernel_version', 'bmlog', 'bootcd_version',
                                                                'boot_server',
                                                                'install_date',
                                                                'nm_status',
                                                                'fs_status',
                                                                'uptime',
                                                                'dns_status',
                                                                'md5sums',
                                                                'md5sum_yum',
                                                                'rpm_version',
                                                                'rpm_versions',
                                                                'princeton_comon_dir',
                                                                'princeton_comon_running',
                                                                'princeton_comon_procs']:
                                                values[key] = ""
                                        values['ssh_portused'] = None

                        # The node manager is up when its process line was captured.
                        if "nm.py" in values['nm_status']:
                                values['nm_status'] = "Y"
                        else:
                                values['nm_status'] = "N"

                        # Each CoMon check only matters if the previous one passed:
                        # directory present -> vserver context present -> count procs.
                        has_dir = "princeton_comon" in values['princeton_comon_dir']
                        is_running = has_dir and \
                                        len(values['princeton_comon_running']) > len('/proc/virtual/')
                        values['princeton_comon_dir'] = has_dir
                        values['princeton_comon_running'] = is_running
                        if is_running:
                                values['princeton_comon_procs'] = int(values['princeton_comon_procs'])
                        else:
                                values['princeton_comon_procs'] = None
                except:
                        traceback.print_exc()
                        # NOTE(review): sys.exit() raises SystemExit; the catch-all
                        # in collectInternal() traps it, so a failure here cuts
                        # that node's collection short rather than ending the
                        # process.
                        sys.exit(1)

                return values

        def collectPLC(self, nodename, cohash):
                """
                        Look up the node and its site in plccache.

                        Returns 'plc_node_stats', 'plc_pcuid' (first PCU id or
                        None), 'loginbase' and 'plc_site_stats'.  cohash is not
                        used.
                """
                values = {}
                ### GET PLC NODE ######################
                d_node = plccache.GetNodeByName(nodename)
                values['plc_node_stats'] = d_node

                ### GET PLC PCU ######################
                d_pcu = None
                if d_node and len(d_node['pcu_ids']) > 0:
                        d_pcu = d_node['pcu_ids'][0]
                values['plc_pcuid'] = d_pcu

                # NOTE(review): a node unknown to plccache (d_node is None) fails
                # here with TypeError; collectInternal() prints it and moves on.
                site_id = d_node['site_id']

                ### GET PLC SITE ######################
                print("SITEID: %s" % site_id)
                d_site = plccache.GetSitesById([ site_id ])[0]
                values['loginbase'] = d_site['login_base']
                values['plc_site_stats'] = d_site

                return values

        def evaluate(self, nodename, values):
                """
                        Derive the observed state of a node from collected values.

                        Reads 'kernel_version', 'bmlog', 'bootcd_version' and
                        'port_status'; sets 'ssh_status', 'observed_category',
                        'observed_status' and 'firewall', and normalizes
                        'bootcd_version'.  Returns the same dict.
                """
                # TODO: this section can probably be reduced to a policy statement
                #               using patterns and values collected so far.
                # NOTE: A node is "DOWN" if
                #       * cannot ssh into it.
                #   * all ports are not open for a 'BOOT' node
                #   * dns for hostname does not exist.
                b_getbootcd_id = True

                oval = values['kernel_version']
                values['ssh_status'] = True
                if "2.6.17" in oval or "2.6.2" in oval:
                        values['observed_category'] = 'PROD'
                        if "bm.log" in values['bmlog'] or "BootManager" in values['bmlog']:
                                values['observed_status'] = 'DEBUG'
                        else:
                                values['observed_status'] = 'BOOT'
                elif "2.6.12" in oval or "2.6.10" in oval:
                        values['observed_category'] = 'OLDPROD'
                        if "bm.log" in values['bmlog']:
                                values['observed_status'] = 'DEBUG'
                        else:
                                values['observed_status'] = 'BOOT'

                # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot
                #       command fails.  I have no idea why.
                elif "2.4" in oval or "2.6.8" in oval:
                        b_getbootcd_id = False
                        values['observed_category'] = 'OLDBOOTCD'
                        values['observed_status'] = 'DEBUG'
                elif oval != "":
                        values['observed_category'] = 'UNKNOWN'
                        if "bm.log" in values['bmlog']:
                                values['observed_status'] = 'DEBUG'
                        else:
                                values['observed_status'] = 'BOOT'
                else:
                        # An error occurred.
                        b_getbootcd_id = False
                        values['ssh_status'] = False
                        values['observed_category'] = 'ERROR'
                        values['observed_status'] = 'DOWN'
                        values['kernel_version'] = ""

                values['firewall'] = False

                # NOTE: A node is down if some of the public ports are not open
                if values['observed_status'] == "BOOT":
                        # verify that all ports are open.  Else, report node as down.
                        ports = values['port_status']
                        if ports['22'] != "open" or ports['80'] != "open" or ports['806'] != "open":
                                values['observed_status'] = 'DOWN'
                                values['firewall'] = True

                if b_getbootcd_id:
                        # try to get BootCD for all nodes that are not 2.4 nor inaccessible
                        oval = values['bootcd_version']
                        if "BootCD" in oval:
                                # Compare hostnames by value: an identity test against
                                # a string literal never matched, so the two UNC nodes
                                # were never exempted.
                                if "v2" in oval and nodename not in ("planetlab1.cs.unc.edu",
                                                                                                        "planetlab2.cs.unc.edu"):
                                        values['observed_category'] = 'OLDBOOTCD'
                        else:
                                values['bootcd_version'] = ""
                else:
                        values['bootcd_version'] = ""

                return values

        def collectDNS(self, nodename, cohash):
                """
                        Check that the hostname resolves.  Returns
                        {'external_dns_status': bool}.  cohash is not used.
                """
                values = {}
                try:
                        socket.gethostbyname(nodename)
                        # TODO: check that IP returned matches IP in plc db.
                        values['external_dns_status'] = True
                except Exception:
                        values['external_dns_status'] = False

                return values

        def collectInternal(self, nodename, cohash):
                """
                        Run every collector for one node, then evaluate().

                        cohash maps hostnames to CoMon statistics; a node missing
                        from it gets placeholder statistics.

                        Returns (nodename, values).  If a step fails, the traceback
                        is printed and values holds whatever was gathered so far.
                """
                values = {}
                try:
                        values.update(self.collectPING(nodename, cohash))
                        values.update(self.collectPorts(nodename))
                        values.update(self.collectSSH(nodename, cohash))
                        values.update(self.collectDNS(nodename, cohash))
                        values.update(self.collectTRACEROUTE(nodename, cohash))
                        values.update(self.collectPLC(nodename, cohash))

                        if nodename in cohash:
                                values['comon_stats'] = cohash[nodename]
                        else:
                                values['comon_stats'] = {'resptime':  '-1',
                                                                                'uptime':    '-1',
                                                                                'sshstatus': '-1',
                                                                                'lastcotop': '-1',
                                                                                'cpuspeed' : "null",
                                                                                'disksize' : 'null',
                                                                                'memsize'  : 'null'}

                        values['rpms'] = values['rpm_versions']
                        print("ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']))
                        print("RPMVERSION: %s %s" % (nodename, values['rpm_version']))
                        print("UPTIME: %s %s" % (nodename, values['uptime']))
                        print("MD5SUMS: %s %s" % (nodename, values['md5sums']))
                        print("MD5SUM_YUM: %s %s" % (nodename, values['md5sum_yum']))

                        values = self.evaluate(nodename, values)
                        values['date_checked'] = datetime.now()

                except:
                        # Deliberately a catch-all: it must also trap the SystemExit
                        # raised by collectSSH() when the ssh probe fails.
                        traceback.print_exc()

                return (nodename, values)
481
482
def internalprobe(hostname):
        """
                Run the full node scan for one host and store the result.

                Uses ScanNodeInternal.collectInternal() with no CoMon data,
                records the values and flushes the session.

                Returns True on success; on any exception the traceback is
                printed and False is returned.
        """
        scannode = ScanNodeInternal()
        try:
                (nodename, values) = scannode.collectInternal(hostname, {})
                scannode.record(None, (nodename, values))
                session.flush()
                return True
        except:
                # print_exc() writes the traceback itself and returns None, so
                # its result is not printed.
                traceback.print_exc()
                return False
493
def externalprobe(hostname):
        """
                Check only the public ports of one host and store the result.

                Uses ScanNodeInternal.collectPorts(), records the values and
                flushes the session.

                Returns True on success; on any exception the traceback is
                printed and False is returned.
        """
        scannode = ScanNodeInternal()
        try:
                # There is no 'self' in a module-level function; the scanner
                # instance created above is the object to call.
                values = scannode.collectPorts(hostname)
                scannode.record(None, (hostname, values))
                session.flush()
                return True
        except:
                traceback.print_exc()
                return False
504
class ScanPCU(ScanInterface):
        """
                PCU scanner: checks the PLC entry of one PCU, scans its ports,
                compares DNS with the registered IP and, when everything needed
                is present, asks the reboot module for a trial run.  Records go
                through FindbadPCURecord, keyed by 'plc_pcuid'.
        """
        recordclass = FindbadPCURecord
        syncclass = None
        primarykey = 'plc_pcuid'

        def collectInternal(self, pcuname, cohash):
                """
                        Collect everything known about one PCU.

                        pcuname -- handed to get_plc_pcu_values() to find the PCU
                        cohash  -- not used

                        Returns (None, None) when the PCU cannot be looked up.
                        Otherwise returns (pcuname, values), where values holds
                        'plc_pcu_stats', 'port_status', 'entry_complete' (space
                        separated names of missing fields), 'dns_status',
                        'reboot_trial_status' and 'date_checked'.  If a later step
                        raises, the traceback is stored under 'traceback' and in
                        'reboot_trial_status'.
                """
                def is_blank(field):
                        # Compare by value: an identity test against "" is not
                        # reliable, e.g. for unicode strings.
                        return field is None or field == ""

                continue_probe = True
                # 'entry_complete' exists from the start so the final join works
                # even when an early step raises.
                values = {'reboot_trial_status' : 'novalue', 'entry_complete' : []}
                try:
                        ### GET PCU ######################
                        try:
                                stats = get_plc_pcu_values(pcuname)
                                # Check for a missing PCU before touching its fields.
                                if stats is not None:
                                        if stats['hostname'] is not None:
                                                stats['hostname'] = stats['hostname'].strip()
                                        if stats['ip'] is not None:
                                                stats['ip'] = stats['ip'].strip()
                        except:
                                traceback.print_exc()
                                stats = None

                        if stats is None:
                                return (None, None)
                        values['plc_pcu_stats'] = stats

                        #### RUN NMAP ###############################
                        nmap = command.CMD()
                        cmdline = "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(stats)
                        print(cmdline)
                        (oval, errval) = nmap.run_noexcept(cmdline)
                        # NOTE: an empty / error value for oval, will still work.
                        (values['port_status'], continue_probe) = nmap_port_status(oval)

                        #### COMPLETE ENTRY   #######################
                        missing = values['entry_complete']
                        if is_blank(stats['model']):
                                missing.append("model")
                        if is_blank(stats['password']):
                                missing.append("password")
                        # Cannot continue without a model and a password.
                        if len(missing) > 0:
                                continue_probe = False

                        # A missing hostname or ip is reported but does not, by
                        # itself, stop the probe here.
                        if is_blank(stats['hostname']):
                                missing.append("hostname")
                        if is_blank(stats['ip']):
                                missing.append("ip")

                        # If there are no nodes associated with this PCU, then we cannot continue.
                        if len(stats['node_ids']) == 0:
                                continue_probe = False
                                missing.append('nodeids')

                        #### DNS and IP MATCH #######################
                        # Whenever DNS cannot confirm the hostname, the registered
                        # IP is used in its place from here on.
                        has_ip = not is_blank(stats['ip'])
                        if has_ip and not is_blank(stats['hostname']):
                                try:
                                        ipaddr = socket.gethostbyname(stats['hostname'])
                                        if ipaddr == stats['ip']:
                                                values['dns_status'] = "DNS-OK"
                                        else:
                                                values['dns_status'] = "DNS-MISMATCH"
                                                stats['hostname'] = stats['ip']
                                except Exception:
                                        values['dns_status'] = "DNS-NOENTRY"
                                        stats['hostname'] = stats['ip']
                        elif has_ip:
                                values['dns_status'] = "NOHOSTNAME"
                                stats['hostname'] = stats['ip']
                        else:
                                values['dns_status'] = "NO-DNS-OR-IP"
                                stats['hostname'] = "No_entry_in_DB"
                                continue_probe = False

                        ######  DRY RUN  ############################
                        if continue_probe and 'node_ids' in stats and \
                                len(stats['node_ids']) > 0:
                                rb_ret = reboot.reboot_test_new(stats['nodenames'][0],
                                                                                                values, 1, True)
                        else:
                                rb_ret = "Not_Run" # No nodes to test"

                        values['reboot_trial_status'] = rb_ret

                except:
                        # Catch-all kept from the original: the failure becomes part
                        # of the recorded values instead of aborting the scan.
                        print("____________________________________")
                        print(values)
                        print("____________________________________")
                        values['traceback'] = traceback.format_exc()
                        print(values['traceback'])
                        values['reboot_trial_status'] = str(values['traceback'])
                        print(values)

                values['entry_complete'] = " ".join(values['entry_complete'])

                values['date_checked'] = datetime.now()
                return (pcuname, values)
621