unify the model by which probes are made to collect information about nodes or
[monitor.git] / monitor / scanapi.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 import socket
12 from pcucontrol import reboot
13
14 from monitor import util
15 from monitor.util import command
16 from monitor import config
17
18 from monitor.database.info.model import *
19
20 from monitor.sources import comon
21 from monitor.wrapper import plc, plccache
22
23 from nodequery import verify,query_to_dict,node_select
24 import traceback
25 from nodecommon import nmap_port_status
26
# CoMon tabulator query: the node-view table, restricted to the listed
# columns, returned as CSV.
COMON_COTOPURL = (
        "http://summer.cs.princeton.edu/status/tabulator.cgi?"
        "table=table_nodeview&"
        "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
        "formatcsv"
)

# Authenticated PLC API handle; note that this is created at import time.
api = plc.getAuthAPI()

# Held around the plc.* calls made by the functions and scanners below.
plc_lock = threading.Lock()

# NOTE: 'round' shadows the builtin round() for the rest of this module.
round = global_round = 1
count = 0
37
38
def get_pcu(pcuname):
        """
                Look up one PCU record by its pcu_id.

                PLC is asked first.  If that call raises, the copy held in
                plccache.l_pcus is searched instead.

                Returns the PCU record, or None when neither source has a
                match (including when PLC answers with an empty list).
        """
        l_pcu = None
        plc_lock.acquire()
        try:
                try:
                        matches = plc.GetPCUs({'pcu_id' : pcuname})
                        if len(matches) > 0:
                                l_pcu = matches[0]
                except Exception:
                        # PLC was unreachable or rejected the call; use the cache.
                        try:
                                for cached in plccache.l_pcus:
                                        if cached['pcu_id'] == pcuname:
                                                l_pcu = cached
                        except Exception:
                                traceback.print_exc()
                                l_pcu = None
        finally:
                # Always release, even if something unexpected escapes above.
                plc_lock.release()

        return l_pcu
60
def get_nodes(node_ids):
        """
                Fetch the node records for the given node ids.

                PLC is asked first (hostname, last_contact, node_id and ports
                only).  If that call raises, plccache.l_plcnodes is filtered
                by node_id instead.

                Returns a non-empty list of node records, or None when
                nothing was found or the fallback itself failed.
        """
        l_node = []
        plc_lock.acquire()
        try:
                try:
                        l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
                except Exception:
                        # PLC was unreachable or rejected the call; use the cache.
                        try:
                                for cached in plccache.l_plcnodes:
                                        if cached['node_id'] in node_ids:
                                                l_node.append(cached)
                        except Exception:
                                traceback.print_exc()
                                l_node = None
        finally:
                # Always release, even if something unexpected escapes above.
                plc_lock.release()

        # Callers test for None, so an empty result is reported as None too.
        if not l_node:
                l_node = None
        return l_node
80         
81
def get_plc_pcu_values(pcuname):
        """
                Try to contact PLC to get the PCU info.
                If that fails, try a backup copy from the last run.
                If that fails, return None

                On success the result is the PCU record itself, extended
                (when its nodes could be fetched) with:
                  <hostname>  : first entry of that node's 'ports' list
                  'nodenames' : hostnames of the nodes on this PCU
                  'node_id'   : node_id of the first node
                Keys of the PCU record win over the added ones on a clash.
        """
        l_pcu = get_pcu(pcuname)

        # Anything falsy (None, or an empty lookup result) means the PCU was
        # not found; indexing it by key below would only raise.
        if not l_pcu:
                return None

        values = {}
        l_node = get_nodes(l_pcu['node_ids'])

        if l_node is not None:
                for node in l_node:
                        values[node['hostname']] = node['ports'][0]

                values['nodenames'] = [node['hostname'] for node in l_node]

                # NOTE: this is for a dry run later. It doesn't matter which node.
                values['node_id'] = l_node[0]['node_id']

        values.update(l_pcu)
        return values
111
class ScanInterface(object):
        """
                Base class for the scanners in this module.

                Subclasses set:
                  recordclass : record class holding one row per (round, key)
                  syncclass   : record class holding the latest round per key
                  primarykey  : name of the key column in both classes
                and provide collect* methods returning (name, values) tuples
                that record() then stores.
        """
        recordclass = None
        syncclass = None
        primarykey = 'hostname'

        def __init__(self, round):
                self.round = round
                # Running number printed in front of every stored record.
                self.count = 1

        def __getattr__(self, name):
                """
                        Report a missing attribute.

                        Python only calls this after normal lookup has failed,
                        so the attribute does not exist.  Looking it up again
                        from here would re-enter this method without end.
                        AttributeError (rather than a bare Exception) keeps
                        hasattr() and getattr(obj, name, default) working.
                """
                raise AttributeError("No such method %s" % name)

        def collect(self, nodename, data):
                pass

        def record(self, request, result):
                """
                        Store one scan result.

                        request : not used here.
                        result  : (name, values) tuple as returned by the
                                  collect* methods; nothing is stored when
                                  values is None.

                        Errors while storing are printed, not raised.
                """
                # Unpacked here rather than in the signature: tuple
                # parameters are Python-2-only syntax.  Callers still pass
                # the tuple as the second positional argument.
                (nodename, values) = result

                try:
                        if values is None:
                                return

                        fbnodesync = self.syncclass.findby_or_create(
                                                if_new_set={'round' : self.round},
                                                **{ self.primarykey : nodename})
                        # NOTE: This code will either add a new record for the new self.round,
                        #       OR it will find the previous value, and update it with new information.
                        #       The data that is 'lost' is not that important, b/c older
                        #       history still exists.
                        fbrec = self.recordclass.findby_or_create(
                                                **{'round':self.round, self.primarykey:nodename})

                        fbrec.set( **values )

                        fbrec.flush()
                        fbnodesync.round = self.round
                        fbnodesync.flush()

                        print("%d %s %s" % (self.count, nodename, values))
                        self.count += 1

                except Exception:
                        print("ERROR:")
                        # print_exc() writes the traceback itself and returns
                        # None, so its result must not be printed.
                        traceback.print_exc()
162
class ScanNodeInternal(ScanInterface):
        """
                Scanner for individual nodes, keyed by hostname.

                collectNMAP() probes a node from the outside with nmap.
                collectInternal() also pings it, logs in over ssh and merges
                in CoMon and PLC data.  Both return (nodename, values).
        """
        recordclass = FindbadNodeRecord
        syncclass = FindbadNodeRecordSync
        primarykey = 'hostname'

        def collectNMAP(self, nodename, cohash):
                """
                        Port-scan nodename (ports 22, 80 and 806) with nmap.

                        cohash is not used.  Returns (nodename, values) with
                        'port_status' and 'date_checked' set.
                """
                #### RUN NMAP ###############################
                values = {}
                nmap = util.command.CMD()
                nmap_cmd = "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
                print(nmap_cmd)
                (oval, errval) = nmap.run_noexcept(nmap_cmd)
                # NOTE: an empty / error value for oval, will still work.
                (values['port_status'], continue_probe) = nmap_port_status(oval)

                values['date_checked'] = datetime.now()

                return (nodename, values)

        def collectInternal(self, nodename, cohash):
                """
                        Gather everything known about one node.

                        nodename : hostname to probe
                        cohash   : dict of CoMon stats keyed by hostname

                        Steps: ping, ssh (port 22, then 806) to read state
                        from the node, classification of that state, CoMon
                        stats, the PLC node record, an nmap scan and the PLC
                        site record.

                        Returns (nodename, values).  A failure part-way is
                        printed and leaves values holding whatever had been
                        gathered up to that point.
                """
                ### RUN PING ######################
                ping = command.CMD()
                (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

                values = {}
                try:
                        # Empty output (no "rtt" line survived the grep) is
                        # treated as a failed ping.
                        values['ping_status'] = (oval != "")

                        ### RUN SSH ######################
                        try:
                                for port in [22, 806]:
                                        ssh = command.SSH('root', nodename, port)

                                        # "\\E" is the same two characters as the
                                        # former "\E", spelled as a valid escape.
                                        (oval, errval) = ssh.run_noexcept2(""" <<\\EOF
                                                echo "{"
                                                echo '  "kernel_version":"'`uname -a`'",'
                                                echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                                echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
                                                echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
                                                echo '  "fs_status":"'`touch /var/log/monitor 2>&1`'",'
                                                echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
                                                echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'

                                                ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                                echo "}"
        EOF                             """)

                                        values['ssh_error'] = errval
                                        if len(oval) > 0:
                                                # SECURITY NOTE(review): this evaluates text
                                                # produced on the remote node as a Python
                                                # expression.  A compromised node can run
                                                # arbitrary code here; consider a real parser.
                                                values.update(eval(oval))
                                                values['ssh_portused'] = port
                                                break
                                        else:
                                                values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '',
                                                                'nm_status' : '',
                                                                'fs_status' : '',
                                                                'dns_status' : '',
                                                                'princeton_comon_dir' : "",
                                                                'princeton_comon_running' : "",
                                                                'princeton_comon_procs' : "", 'ssh_portused' : None})
                        except Exception:
                                traceback.print_exc()
                                # NOTE(review): this SystemExit is caught by the
                                # handler at the end of this method, so the effect
                                # is to skip the remaining steps, not to exit.
                                sys.exit(1)

                        ### CLASSIFY SSH RESULT ######################
                        def run_state():
                                # A boot manager log on the node means it is in debug mode.
                                if "bm.log" in values['bmlog']:
                                        return 'DEBUG'
                                return 'BOOT'

                        b_getbootcd_id = True

                        oval = values['kernel_version']
                        if "2.6.17" in oval or "2.6.2" in oval:
                                values['ssh_status'] = True
                                values['observed_category'] = 'PROD'
                                values['observed_status'] = run_state()
                        elif "2.6.12" in oval or "2.6.10" in oval:
                                values['ssh_status'] = True
                                values['observed_category'] = 'OLDPROD'
                                values['observed_status'] = run_state()

                        # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot
                        #       command fails.  I have no idea why.
                        elif "2.4" in oval or "2.6.8" in oval:
                                b_getbootcd_id = False
                                values['ssh_status'] = True
                                values['observed_category'] = 'OLDBOOTCD'
                                values['observed_status'] = 'DEBUG'
                        elif oval != "":
                                values['ssh_status'] = True
                                values['observed_category'] = 'UNKNOWN'
                                values['observed_status'] = run_state()
                        else:
                                # An error occurred.
                                b_getbootcd_id = False
                                values['ssh_status'] = False
                                values['observed_category'] = 'ERROR'
                                values['observed_status'] = 'DOWN'
                                values['ssh_error'] = errval.strip()
                                values['kernel_version'] = ""

                        if b_getbootcd_id:
                                # try to get BootCD for all nodes that are not 2.4 nor inaccessible
                                oval = values['bootcd_version']
                                if "BootCD" in oval:
                                        values['bootcd_version'] = oval
                                        # Compare by value: "is not" against a string
                                        # literal tests identity and never exempted
                                        # these two hosts.
                                        if "v2" in oval and \
                                                nodename not in ("planetlab1.cs.unc.edu",
                                                                 "planetlab2.cs.unc.edu"):
                                                values['observed_category'] = 'OLDBOOTCD'
                                else:
                                        values['bootcd_version'] = ""
                        else:
                                values['bootcd_version'] = ""

                        if "nm.py" in values['nm_status']:
                                values['nm_status'] = "Y"
                        else:
                                values['nm_status'] = "N"

                        ### COMON SLICE ######################
                        # The remote 'ls -d /vservers/princeton_comon' echoes that
                        # path when the directory exists, so the path component is
                        # what to look for (the dict key name never appears in it).
                        continue_slice_check = "princeton_comon" in values['princeton_comon_dir']
                        values['princeton_comon_dir'] = continue_slice_check

                        if continue_slice_check:
                                # Longer than the bare prefix means 'ls' found
                                # /proc/virtual/<id> for the slice.
                                oval = values['princeton_comon_running']
                                continue_slice_check = len(oval) > len('/proc/virtual/')
                        values['princeton_comon_running'] = continue_slice_check

                        if continue_slice_check:
                                try:
                                        values['princeton_comon_procs'] = int(values['princeton_comon_procs'])
                                except ValueError:
                                        # Unparseable count: record it as unknown
                                        # instead of abandoning the rest of the scan.
                                        values['princeton_comon_procs'] = None
                        else:
                                values['princeton_comon_procs'] = None

                        ### COMON STATS ######################
                        if nodename in cohash:
                                values['comon_stats'] = cohash[nodename]
                        else:
                                values['comon_stats'] = {'resptime':  '-1',
                                                        'uptime':    '-1',
                                                        'sshstatus': '-1',
                                                        'lastcotop': '-1',
                                                        'cpuspeed' : "null",
                                                        'disksize' : 'null',
                                                        'memsize'  : 'null'}

                        ### GET PLC NODE ######################
                        d_node = None
                        plc_lock.acquire()
                        try:
                                try:
                                        d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id',
                                                                'date_created', 'last_updated',
                                                                'last_contact', 'boot_state', 'nodegroup_ids'])[0]
                                except Exception:
                                        traceback.print_exc()
                        finally:
                                plc_lock.release()
                        values['plc_node_stats'] = d_node

                        ##### NMAP  ###################
                        # Must go through self: there is no module-level collectNMAP.
                        (n, v) = self.collectNMAP(nodename, None)
                        values.update(v)

                        ### GET PLC PCU ######################
                        site_id = -1
                        d_pcu = None
                        if d_node:
                                pcu = d_node['pcu_ids']
                                if len(pcu) > 0:
                                        d_pcu = pcu[0]

                                site_id = d_node['site_id']

                        values['plc_pcuid'] = d_pcu

                        ### GET PLC SITE ######################
                        d_site = None
                        values['loginbase'] = ""
                        plc_lock.acquire()
                        try:
                                try:
                                        d_site = plc.getSites({'site_id': site_id},
                                                                ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
                                        values['loginbase'] = d_site['login_base']
                                except Exception:
                                        traceback.print_exc()
                        finally:
                                plc_lock.release()

                        values['plc_site_stats'] = d_site
                        values['date_checked'] = datetime.now()
                except (Exception, SystemExit):
                        # Best effort: report the problem and hand back what was
                        # gathered so far.  SystemExit is listed because of the
                        # sys.exit(1) in the ssh step above.
                        traceback.print_exc()

                return (nodename, values)
378
379
def internalprobe(hostname):
        """
                Run the full internal scan (ScanNodeInternal.collectInternal)
                for one host and store the result under the current round.

                The round is read from the "global" FindbadNodeRecordSync
                row, which is created with round 1 if it does not exist.

                Returns True on success, False if anything raised (the
                traceback is printed).
        """
        fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                        if_new_set={'round' : 1})
        scannode = ScanNodeInternal(fbsync.round)
        try:
                (nodename, values) = scannode.collectInternal(hostname, {})
                scannode.record(None, (nodename, values))
                # NOTE(review): 'session' is not defined in this file; presumably
                # it comes from the star import of the database model.
                session.flush()
                return True
        except Exception:
                # print_exc() writes the traceback itself and returns None, so
                # its result must not be printed.
                traceback.print_exc()
                return False
392
def externalprobe(hostname):
        """
                Run the external nmap scan (ScanNodeInternal.collectNMAP) for
                one host and store the result under the current round.

                The round is read from the "global" FindbadNodeRecordSync
                row, which is created with round 1 if it does not exist.

                Returns True on success, False if anything raised (the
                traceback is printed).
        """
        fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                        if_new_set={'round' : 1})
        scannode = ScanNodeInternal(fbsync.round)
        try:
                (nodename, values) = scannode.collectNMAP(hostname, {})
                scannode.record(None, (nodename, values))
                # NOTE(review): 'session' is not defined in this file; presumably
                # it comes from the star import of the database model.
                session.flush()
                return True
        except Exception:
                # print_exc() writes the traceback itself and returns None, so
                # its result must not be printed.
                traceback.print_exc()
                return False
405
class ScanPCU(ScanInterface):
        """
                Scanner for power control units, keyed by plc_pcuid.
        """
        recordclass = FindbadPCURecord
        syncclass = FindbadPCURecordSync
        primarykey = 'plc_pcuid'

        def collectInternal(self, pcuname, cohash):
                """
                        Gather everything known about one PCU.

                        pcuname : PCU id to look up
                        cohash  : not used

                        Steps: fetch the PCU record, nmap its control ports,
                        note which required fields are missing, compare DNS
                        with the registered IP, then do a dry-run reboot.

                        Returns (pcuname, values), or (None, None) when the
                        PCU record could not be obtained at all.  The pair
                        shape matches what ScanInterface.record() unpacks.
                """
                # NOTE(review): continue_probe is maintained through the steps
                # below, but the dry run does not consult it.
                continue_probe = True
                values = {'reboot_trial_status' : 'novalue'}
                ### GET PCU ######################
                try:
                        have_pcu = False
                        try:
                                v = get_plc_pcu_values(pcuname)
                                # Check for None before touching any field.
                                if v is not None:
                                        if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
                                        if v['ip'] is not None: v['ip'] = v['ip'].strip()
                                        values['plc_pcu_stats'] = v
                                        have_pcu = True
                        except Exception:
                                traceback.print_exc()

                        if not have_pcu: return (None, None)

                        pcu = values['plc_pcu_stats']

                        def blank(field):
                                # Compare by value; "is" against a string literal
                                # only tests object identity.
                                return pcu[field] is None or pcu[field] == ""

                        #### RUN NMAP ###############################
                        nmap = util.command.CMD()
                        nmap_cmd = "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(pcu)
                        print(nmap_cmd)
                        (oval, errval) = nmap.run_noexcept(nmap_cmd)
                        # NOTE: an empty / error value for oval, will still work.
                        (values['port_status'], continue_probe) = nmap_port_status(oval)

                        #### COMPLETE ENTRY   #######################

                        values['entry_complete'] = []
                        #if values['protocol'] is None or values['protocol'] is "":
                        #       values['entry_complete'] += ["protocol"]
                        if blank('model'):
                                values['entry_complete'] += ["model"]
                                # Cannot continue due to this condition
                                continue_probe = False

                        if blank('password'):
                                values['entry_complete'] += ["password"]
                                # Cannot continue due to this condition
                                continue_probe = False

                        if len(values['entry_complete']) > 0:
                                continue_probe = False

                        # A missing hostname or ip is recorded but is not, on its
                        # own, a reason to stop.
                        if blank('hostname'):
                                values['entry_complete'] += ["hostname"]
                        if blank('ip'):
                                values['entry_complete'] += ["ip"]

                        # If there are no nodes associated with this PCU, then we cannot continue.
                        if len(pcu['node_ids']) == 0:
                                continue_probe = False
                                values['entry_complete'] += ['nodeids']


                        #### DNS and IP MATCH #######################
                        if not blank('hostname') and not blank('ip'):
                                try:
                                        ipaddr = socket.gethostbyname(pcu['hostname'])
                                        if ipaddr == pcu['ip']:
                                                values['dns_status'] = "DNS-OK"
                                        else:
                                                values['dns_status'] = "DNS-MISMATCH"
                                                continue_probe = False

                                except Exception:
                                        # Name did not resolve; fall back to the IP.
                                        values['dns_status'] = "DNS-NOENTRY"
                                        pcu['hostname'] = pcu['ip']
                        elif not blank('ip'):
                                values['dns_status'] = "NOHOSTNAME"
                                pcu['hostname'] = pcu['ip']
                        else:
                                values['dns_status'] = "NO-DNS-OR-IP"
                                pcu['hostname'] = "No_entry_in_DB"
                                continue_probe = False


                        ######  DRY RUN  ############################
                        if 'node_ids' in pcu and len(pcu['node_ids']) > 0:
                                rb_ret = reboot.reboot_test_new(pcu['nodenames'][0],
                                                                values, 1, True)
                        else:
                                rb_ret = "Not_Run" # No nodes to test"

                        values['reboot_trial_status'] = rb_ret

                except Exception:
                        print("____________________________________")
                        print(values)
                        print("____________________________________")
                        # The traceback is stored in the result itself and also
                        # reported as the outcome of the reboot trial.
                        values['traceback'] = traceback.format_exc()
                        print(values['traceback'])
                        values['reboot_trial_status'] = values['traceback']

                # A failure before the completeness step leaves no list behind;
                # treat that as "nothing recorded" rather than raising KeyError.
                values['entry_complete'] = " ".join(values.get('entry_complete', []))

                values['date_checked'] = datetime.now()
                return (pcuname, values)
521