bug fix in summary template
[monitor.git] / monitor / scanapi.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 import socket
12 from pcucontrol import reboot
13
14 from pcucontrol.util import command
15 from monitor import config
16
17 from monitor.database.info.model import *
18
19 from monitor.sources import comon
20 from monitor.wrapper import plc, plccache
21
22 import traceback
23 from monitor.common import nmap_port_status, email_exception
24
# CoMon tabulator query: the "table_nodeview" table, restricted to the
# columns named in dumpcols, with the "formatcsv" flag set.
COMON_COTOPURL = (
        "http://summer.cs.princeton.edu/status/tabulator.cgi?"
        "table=table_nodeview&"
        "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
        "formatcsv"
)
29
# Handle returned by plc.getAuthAPI(); created once at import time.
api = plc.getAuthAPI()

# Held around every plc / plccache lookup made in this module.
plc_lock = threading.Lock()

# Round / progress counters.  NOTE(review): nothing in this part of the file
# reads them; presumably they are kept for importers of this module.
global_round = round = 1
count = 0
35
36
def get_pcu(pcuname):
        """
                Fetch the record of the PCU whose 'pcu_id' is pcuname.

                PLC is asked first (plc.GetPCUs).  If that call raises, the
                list kept in plccache.l_pcus is searched instead.  Both
                lookups run while plc_lock is held.

                Returns the PCU record, or None when no matching record is
                found or when the cache lookup fails as well.
        """
        l_pcu = None
        plc_lock.acquire()
        try:
                try:
                        #print "GetPCU from PLC %s" % pcuname
                        found = plc.GetPCUs({'pcu_id' : pcuname})
                        # An empty answer means "unknown PCU"; report it as None
                        # rather than handing the empty list back to the caller.
                        if len(found) > 0:
                                l_pcu = found[0]
                except Exception:
                        # The PLC call failed; fall back to the cached copy.
                        try:
                                #print "GetPCU from file %s" % pcuname
                                for cached in plccache.l_pcus:
                                        if cached['pcu_id'] == pcuname:
                                                l_pcu = cached
                        except Exception:
                                traceback.print_exc()
                                l_pcu = None
        finally:
                # Always give the lock back, even on an unexpected error.
                plc_lock.release()
        return l_pcu
58
def get_nodes(node_ids):
        """
                Fetch the node records for node_ids.

                PLC is asked first (plc.getNodes, limited to the 'hostname',
                'last_contact', 'node_id' and 'ports' fields).  If that call
                raises, plccache.l_nodes is filtered by 'node_id' instead.
                Both lookups run while plc_lock is held.

                Returns a non-empty list of node records, or None when
                nothing was found or both sources failed.
        """
        l_node = []
        plc_lock.acquire()
        try:
                try:
                        l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
                except Exception:
                        # The PLC call failed; fall back to the cached copy.
                        try:
                                l_node = [n for n in plccache.l_nodes
                                                        if n['node_id'] in node_ids]
                        except Exception:
                                traceback.print_exc()
                                l_node = None
        finally:
                # Always give the lock back, even on an unexpected error.
                plc_lock.release()

        # Callers index element 0 after an "is not None" test, so any empty
        # result is normalised to None.
        if not l_node:
                return None
        return l_node
78         
79
def get_plc_pcu_values(pcuname):
        """
                Try to contact PLC to get the PCU info.
                If that fails, try a backup copy from the last run.
                If that fails, return None

                On success the result is a dict holding every field of the
                PCU record and, when the PCU's nodes could be looked up:
                    <hostname>   -> first entry of that node's 'ports'
                    'nodenames'  -> list of the node hostnames
                    'node_id'    -> node_id of the first node
        """
        l_pcu = get_pcu(pcuname)

        # Treat any empty result (None or an empty list) as "no such PCU";
        # indexing it by field name below would otherwise raise.
        if not l_pcu:
                return None

        values = {}
        l_node = get_nodes(l_pcu['node_ids'])

        if l_node:
                for node in l_node:
                        values[node['hostname']] = node['ports'][0]

                values['nodenames'] = [node['hostname'] for node in l_node]

                # NOTE: this is for a dry run later. It doesn't matter which node.
                values['node_id'] = l_node[0]['node_id']

        # PCU fields are merged last, so they win over any per-node key
        # with the same name.
        values.update(l_pcu)
        return values
109
class ScanInterface(object):
        """
                Base class for scanners.

                Subclasses set:
                    recordclass - class used to store one scan result
                    syncclass   - optional class whose 'round' is stamped
                                  after each stored result (None disables it)
                    primarykey  - keyword used to find or create the row

                Both classes must offer findby_or_create(**kw) and their
                instances flush(); recordclass instances also need set(**kw).
        """
        recordclass = None
        syncclass = None
        primarykey = 'hostname'

        def __init__(self, round=1):
                self.round = round
                # 1-based counter printed with every stored record
                self.count = 1

        def __getattr__(self, name):
                # Python calls __getattr__ only after normal lookup has failed,
                # so the attribute is known to be missing.  Looking it up again
                # on self would re-enter this method without end.  Raising
                # AttributeError (a subclass of Exception) keeps hasattr() and
                # getattr(obj, name, default) working.
                raise AttributeError("No such method %s" % name)

        def collect(self, nodename, data):
                """Placeholder; does nothing and returns None."""
                pass

        def record(self, request, result):
                """
                        Store one scan result.

                        request - ignored by this method
                        result  - a (nodename, values) pair; values is a dict of
                                  fields for the record, or None to skip storing

                        Errors raised while storing are reported (printed and
                        passed to email_exception) instead of propagating.
                """
                (nodename, values) = result

                try:
                        if values is None:
                                return

                        if self.syncclass:
                                fbnodesync = self.syncclass.findby_or_create(
                                                                #if_new_set={'round' : self.round},
                                                                **{ self.primarykey : nodename})
                        # NOTE: This code will either add a new record for the new self.round,
                        #       OR it will find the previous value, and update it with new information.
                        #       The data that is 'lost' is not that important, b/c older
                        #       history still exists.
                        fbrec = self.recordclass.findby_or_create(
                                                **{ self.primarykey : nodename})

                        fbrec.set( **values )

                        fbrec.flush()
                        if self.syncclass:
                                fbnodesync.round = self.round
                                fbnodesync.flush()

                        print("%d %s %s" % (self.count, nodename, values))
                        self.count += 1

                except Exception:
                        print("ERROR:")
                        email_exception(str(nodename))
                        # print_exc() writes the traceback itself and returns None
                        traceback.print_exc()
163
class ScanNodeInternal(ScanInterface):
        """
                Node scanner whose results are stored as FindbadNodeRecord rows.

                collectNMAP() only port-scans the node.  collectInternal() also
                pings it, logs in over ssh to read kernel / boot / slice
                details, and adds the CoMon entry and the cached PLC node and
                site records.
        """
        recordclass = FindbadNodeRecord
        #syncclass = FindbadNodeRecordSync
        syncclass = None
        primarykey = 'hostname'

        # Hosts that are not re-categorised as OLDBOOTCD for reporting a v2 BootCD.
        _BOOTCD_V2_EXEMPT = ("planetlab1.cs.unc.edu", "planetlab2.cs.unc.edu")

        # Stored for every ssh-derived field when a login attempt prints nothing.
        _SSH_DEFAULTS = {'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '',
                                         'nm_status' : '',
                                         'fs_status' : '',
                                         'dns_status' : '',
                                         'rpm_version' : '',
                                         'rpm_versions' : '',
                                         'princeton_comon_dir' : "",
                                         'princeton_comon_running' : "",
                                         'princeton_comon_procs' : "", 'ssh_portused' : None}

        # Here-document handed to ssh.run_noexcept2(); it prints a Python dict
        # literal, one key per probe.  ("\\E" is a literal backslash + E.)
        # NOTE(review): the closing "EOF" line carries trailing blanks, which a
        # shell does not accept as the here-document terminator -- confirm
        # against run_noexcept2 before changing the text.
        _SSH_PROBE_SCRIPT = """ <<\\EOF
                                                echo "{"
                                                echo '  "kernel_version":"'`uname -a`'",'
                                                echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                                echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
                                                echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
                                                echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
                                                echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'

                                                ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                                echo '  "rpm_version":"'`rpm -q NodeManager`'",'
                                                echo '  "rpm_versions":"'`rpm -q -a`'",'
                                                echo "}"
EOF                             """

        def collectNMAP(self, nodename, cohash):
                """
                        Port-scan nodename (ports 22, 80, 806) with nmap.

                        cohash is accepted for signature compatibility and ignored.
                        Returns (nodename, values) where values holds
                        'port_status' and 'date_checked'.
                """
                #### RUN NMAP ###############################
                values = {}
                nmap = command.CMD()
                cmd = "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
                print(cmd)
                (oval, errval) = nmap.run_noexcept(cmd)
                # NOTE: an empty / error value for oval, will still work.
                (values['port_status'], continue_probe) = nmap_port_status(oval)

                values['date_checked'] = datetime.now()

                return (nodename, values)

        def _probe_ssh(self, nodename, values):
                """Run the probe script as root over ssh on port 22, then 806, and
                merge the first non-empty answer into values."""
                for port in [22, 806]:
                        ssh = command.SSH('root', nodename, port)
                        #echo '  "fs_status":"'`touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then touch /vservers/monitor.log 2>&1 ; fi ; grep proc /proc/mounts | grep ro,`'",'
                        (oval, errval) = ssh.run_noexcept2(self._SSH_PROBE_SCRIPT)

                        values['ssh_error'] = errval
                        if len(oval) > 0:
                                #print "OVAL: %s" % oval
                                # SECURITY NOTE(review): eval() executes whatever text the
                                # remote node printed.  A compromised node can run code in
                                # this process; a literal-only parser would be safer.
                                values.update(eval(oval))
                                values['ssh_portused'] = port
                                break
                        values.update(self._SSH_DEFAULTS)

        def _classify_kernel(self, values):
                """Derive ssh_status / observed_category / observed_status from
                values['kernel_version'].  Returns True when the BootCD id should
                be examined afterwards."""
                kernel = values['kernel_version']
                if kernel == "":
                        # An error occurred.
                        values['ssh_status'] = False
                        values['observed_category'] = 'ERROR'
                        values['observed_status'] = 'DOWN'
                        values['ssh_error'] = values['ssh_error'].strip()
                        values['kernel_version'] = ""
                        return False

                values['ssh_status'] = True
                if "2.6.17" in kernel or "2.6.2" in kernel:
                        values['observed_category'] = 'PROD'
                elif "2.6.12" in kernel or "2.6.10" in kernel:
                        values['observed_category'] = 'OLDPROD'
                elif "2.4" in kernel or "2.6.8" in kernel:
                        # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot
                        #       command fails.  I have no idea why.
                        values['observed_category'] = 'OLDBOOTCD'
                        values['observed_status'] = 'DEBUG'
                        return False
                else:
                        values['observed_category'] = 'UNKNOWN'

                if "bm.log" in values['bmlog']:
                        values['observed_status'] = 'DEBUG'
                else:
                        values['observed_status'] = 'BOOT'
                return True

        def _check_comon_slice(self, values):
                """Turn the raw princeton_comon_* probe strings into
                dir (bool), running (bool) and procs (int or None)."""
                # The probe lists /vservers/princeton_comon, so the directory
                # name -- not the dict key "princeton_comon_dir" -- is what
                # shows up in the output when the slice exists.
                has_dir = "princeton_comon" in values['princeton_comon_dir']

                # A bare "/proc/virtual/" means no context id was appended.
                running = has_dir and \
                                  len(values['princeton_comon_running']) > len('/proc/virtual/')

                procs = None
                if running:
                        try:
                                procs = int(values['princeton_comon_procs'])
                        except ValueError:
                                procs = None

                values['princeton_comon_dir'] = has_dir
                values['princeton_comon_running'] = running
                values['princeton_comon_procs'] = procs

        def collectInternal(self, nodename, cohash):
                """
                        Collect everything known about nodename.

                        cohash is looked up by hostname; the entry found (or a
                        placeholder dict) is stored as 'comon_stats'.

                        Returns (nodename, values).  Any error during collection is
                        printed and the values gathered so far are returned.
                """
                ### RUN PING ######################
                ping = command.CMD()
                (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

                values = {}
                try:
                        # Empty output is treated as a failed ping.
                        values['ping_status'] = (oval != "")

                        ### RUN SSH ######################
                        self._probe_ssh(nodename, values)

                        values['fs_status'] = ""
                        print("ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']))

                        print("RPMVERSION: %s %s" % (nodename, values['rpm_version']))

                        b_getbootcd_id = self._classify_kernel(values)

                        # try to get BootCD for all nodes that are not 2.4 nor inaccessible
                        if b_getbootcd_id and "BootCD" in values['bootcd_version']:
                                # Compare hostnames by value; "is not" against a literal
                                # tests object identity and never exempts anything.
                                if "v2" in values['bootcd_version'] and \
                                        nodename not in self._BOOTCD_V2_EXEMPT:
                                        values['observed_category'] = 'OLDBOOTCD'
                        else:
                                values['bootcd_version'] = ""

                        if "nm.py" in values['nm_status']:
                                values['nm_status'] = "Y"
                        else:
                                values['nm_status'] = "N"

                        self._check_comon_slice(values)

                        if nodename in cohash:
                                values['comon_stats'] = cohash[nodename]
                        else:
                                values['comon_stats'] = {'resptime':  '-1',
                                                                                 'uptime':    '-1',
                                                                                 'sshstatus': '-1',
                                                                                 'lastcotop': '-1',
                                                                                 'cpuspeed' : "null",
                                                                                 'disksize' : 'null',
                                                                                 'memsize'  : 'null'}
                        # include output value
                        ### GET PLC NODE ######################
                        d_node = None
                        plc_lock.acquire()
                        try:
                                try:
                                        d_node = plccache.GetNodeByName(nodename)
                                        #d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id',
                                        #                                               'date_created', 'last_updated',
                                        #                                               'last_contact', 'boot_state', 'nodegroup_ids'])[0]
                                except Exception:
                                        traceback.print_exc()
                        finally:
                                plc_lock.release()
                        values['plc_node_stats'] = d_node

                        ##### NMAP  ###################
                        (n, v) = self.collectNMAP(nodename, None)
                        values.update(v)

                        ### GET PLC PCU ######################
                        site_id = -1
                        d_pcu = None
                        if d_node:
                                pcu = d_node['pcu_ids']
                                if len(pcu) > 0:
                                        d_pcu = pcu[0]

                                site_id = d_node['site_id']

                        values['plc_pcuid'] = d_pcu

                        ### GET PLC SITE ######################
                        d_site = None
                        values['loginbase'] = ""
                        plc_lock.acquire()
                        try:
                                try:
                                        d_site = plccache.GetSitesById([ site_id ])[0]
                                        #d_site = plc.getSites({'site_id': site_id},
                                        #                                       ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
                                        values['loginbase'] = d_site['login_base']
                                except Exception:
                                        traceback.print_exc()
                        finally:
                                plc_lock.release()

                        values['plc_site_stats'] = d_site
                        values['date_checked'] = datetime.now()
                except Exception:
                        # Best effort: report the problem and hand back what we have.
                        traceback.print_exc()

                return (nodename, values)
390
def internalprobe(hostname):
        """
                Run ScanNodeInternal.collectInternal() for hostname, store the
                result with record(), and flush the session.

                Returns True on success, False if any step raised (the
                traceback is printed).
        """
        #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
        #                                                                                               if_new_set={'round' : 1})
        scannode = ScanNodeInternal() # fbsync.round)
        try:
                (nodename, values) = scannode.collectInternal(hostname, {})
                scannode.record(None, (nodename, values))
                session.flush()
                return True
        except Exception:
                # print_exc() writes the traceback itself and returns None,
                # so its result must not be printed.
                traceback.print_exc()
                return False
403
def externalprobe(hostname):
        """
                Run ScanNodeInternal.collectNMAP() for hostname, store the
                result with record(), and flush the session.

                Returns True on success, False if any step raised (the
                traceback is printed).
        """
        #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
        #                                                                                               if_new_set={'round' : 1})
        scannode = ScanNodeInternal() # fbsync.round)
        try:
                (nodename, values) = scannode.collectNMAP(hostname, {})
                scannode.record(None, (nodename, values))
                session.flush()
                return True
        except Exception:
                # print_exc() writes the traceback itself and returns None,
                # so its result must not be printed.
                traceback.print_exc()
                return False
416
class ScanPCU(ScanInterface):
        """
                PCU scanner whose results are stored as FindbadPCURecord rows,
                keyed by 'plc_pcuid'.
        """
        recordclass = FindbadPCURecord
        syncclass = None
        primarykey = 'plc_pcuid'

        @staticmethod
        def _is_blank(value):
                """True for None or the empty string (compared by value)."""
                return value is None or value == ""

        def collectInternal(self, pcuname, cohash):
                """
                        Collect the state of the PCU identified by pcuname.

                        Steps: fetch the PCU record, port-scan the PCU, note which
                        required fields are missing, compare DNS with the recorded
                        IP, and run a reboot dry run when nothing ruled it out.
                        cohash is accepted for signature compatibility and ignored.

                        Returns (pcuname, values), or (None, None) when the PCU
                        record could not be obtained.  If a later step raises, the
                        traceback is stored under 'traceback' and
                        'reboot_trial_status' and the partial values are returned.
                """
                continue_probe = True
                # 'entry_complete' exists from the start so the final join cannot
                # fail when an early step raises.
                values = {'reboot_trial_status' : 'novalue',
                                  'entry_complete' : []}

                ### GET PCU ######################
                try:
                        v = get_plc_pcu_values(pcuname)
                        # Check for None before touching any field.
                        if v is not None:
                                if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
                                if v['ip'] is not None: v['ip'] = v['ip'].strip()
                except Exception:
                        traceback.print_exc()
                        v = None

                if v is None:
                        # Same 2-tuple shape as the normal return; record() skips
                        # results whose values are None.
                        return (None, None)

                values['plc_pcu_stats'] = v

                try:
                        pcu = values['plc_pcu_stats']
                        incomplete = values['entry_complete']

                        #### RUN NMAP ###############################
                        nmap = command.CMD()
                        cmd = "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(pcu)
                        print(cmd)
                        (oval, errval) = nmap.run_noexcept(cmd)
                        # NOTE: an empty / error value for oval, will still work.
                        (values['port_status'], continue_probe) = nmap_port_status(oval)

                        #### COMPLETE ENTRY   #######################
                        #if values['protocol'] is None or values['protocol'] is "":
                        #       values['entry_complete'] += ["protocol"]

                        # Cannot continue without a model or a password
                        for field in ('model', 'password'):
                                if self._is_blank(pcu[field]):
                                        incomplete.append(field)
                                        continue_probe = False

                        # Missing hostname / ip are recorded here; whether they
                        # stop the probe is decided in the DNS section below.
                        for field in ('hostname', 'ip'):
                                if self._is_blank(pcu[field]):
                                        incomplete.append(field)

                        # If there are no nodes associated with this PCU, then we cannot continue.
                        if len(pcu['node_ids']) == 0:
                                continue_probe = False
                                incomplete.append('nodeids')

                        #### DNS and IP MATCH #######################
                        have_ip = not self._is_blank(pcu['ip'])
                        if have_ip and not self._is_blank(pcu['hostname']):
                                try:
                                        ipaddr = socket.gethostbyname(pcu['hostname'])
                                except Exception:
                                        values['dns_status'] = "DNS-NOENTRY"
                                        pcu['hostname'] = pcu['ip']
                                else:
                                        if ipaddr == pcu['ip']:
                                                values['dns_status'] = "DNS-OK"
                                        else:
                                                values['dns_status'] = "DNS-MISMATCH"
                                                pcu['hostname'] = pcu['ip']
                        elif have_ip:
                                values['dns_status'] = "NOHOSTNAME"
                                pcu['hostname'] = pcu['ip']
                        else:
                                values['dns_status'] = "NO-DNS-OR-IP"
                                pcu['hostname'] = "No_entry_in_DB"
                                continue_probe = False

                        ######  DRY RUN  ############################
                        # 'nodenames' is only present when the PCU's nodes could be
                        # looked up; without it there is nothing to test against.
                        nodenames = pcu.get('nodenames')
                        if continue_probe and len(pcu['node_ids']) > 0 and nodenames:
                                rb_ret = reboot.reboot_test_new(nodenames[0], values, 1, True)
                        else:
                                rb_ret = "Not_Run" # No nodes to test"

                        values['reboot_trial_status'] = rb_ret

                except Exception:
                        print("____________________________________")
                        print(values)
                        print("____________________________________")
                        values['traceback'] = traceback.format_exc()
                        print(values['traceback'])
                        values['reboot_trial_status'] = str(values['traceback'])
                        print(values)

                values['entry_complete'] = " ".join(values['entry_complete'])

                values['date_checked'] = datetime.now()
                return (pcuname, values)
533