add getNodeAPI()
[monitor.git] / monitor / scanapi.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 import socket
12 from pcucontrol import reboot
13
14 from monitor import util
15 from monitor.util import command
16 from monitor import config
17
18 from monitor.database.info.model import *
19
20 from monitor.sources import comon
21 from monitor.wrapper import plc, plccache
22
23 import traceback
24 from monitor.common import nmap_port_status
25
# CoMon tabulator query; the pieces are joined into one URL that asks for
# the listed columns of the node view in CSV form.
COMON_COTOPURL = (
        "http://summer.cs.princeton.edu/status/tabulator.cgi?"
        "table=table_nodeview&"
        "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
        "formatcsv"
)

# PLC API handle obtained once, when this module is imported.
api = plc.getAuthAPI()
# Held around the plc.* calls made by the collectors in this module.
plc_lock = threading.Lock()
# NOTE: at module scope this name hides the builtin round().
round = 1
global_round = round
count = 0

36
37
def get_pcu(pcuname):
        """
        Look up one PCU record, preferring PLC over the local cache.

        The PLC query runs while plc_lock is held.  If that query raises,
        plccache.l_pcus is searched for an entry whose 'pcu_id' equals
        pcuname (the last match wins).

        Returns the PCU record, or None when neither source has one.
        """
        # Start from None so every path below returns something defined,
        # including "PLC failed and the cache has no match".
        l_pcu = None

        plc_lock.acquire()
        try:
                try:
                        found = plc.GetPCUs({'pcu_id' : pcuname})
                        # An empty answer means "no such PCU"; report that as
                        # None rather than handing an empty list to the caller.
                        if len(found) > 0:
                                l_pcu = found[0]
                except Exception:
                        try:
                                for cached in plccache.l_pcus:
                                        if cached['pcu_id'] == pcuname:
                                                l_pcu = cached
                        except Exception:
                                traceback.print_exc()
                                l_pcu = None
        finally:
                # Release even if something unexpected escapes the handlers.
                plc_lock.release()

        return l_pcu
59
def get_nodes(node_ids):
        """
        Fetch the node records for node_ids, preferring PLC over the cache.

        The PLC query (hostname, last_contact, node_id, ports) runs while
        plc_lock is held.  If it raises, plccache.l_plcnodes is filtered by
        'node_id' instead.

        Returns a non-empty list of node records, or None when nothing
        could be found.
        """
        l_node = []

        plc_lock.acquire()
        try:
                try:
                        l_node = plc.getNodes(node_ids,
                                        ['hostname', 'last_contact', 'node_id', 'ports'])
                except Exception:
                        try:
                                l_node = [n for n in plccache.l_plcnodes
                                                        if n['node_id'] in node_ids]
                        except Exception:
                                traceback.print_exc()
                                l_node = None
        finally:
                # Release even if something unexpected escapes the handlers.
                plc_lock.release()

        # Callers only distinguish "some nodes" from None.
        if not l_node:
                return None
        return l_node
79         
80
def get_plc_pcu_values(pcuname):
        """
        Try to contact PLC to get the PCU info.
        If that fails, try a backup copy from the last run.
        If that fails, return None

        On success the result is a dict holding every field of the PCU
        record plus, when the PCU's nodes could be fetched:
          - one entry per node, keyed by hostname, with that node's first
            'ports' value,
          - 'nodenames': the list of node hostnames,
          - 'node_id': the id of the first node.
        """
        l_pcu = get_pcu(pcuname)
        # Treat both None and an empty lookup result as "no such PCU".
        if not l_pcu:
                return None

        values = {}
        l_node = get_nodes(l_pcu['node_ids'])
        if l_node:
                for node in l_node:
                        values[node['hostname']] = node['ports'][0]

                values['nodenames'] = [node['hostname'] for node in l_node]

                # NOTE: this is for a dry run later. It doesn't matter which node.
                values['node_id'] = l_node[0]['node_id']

        # PCU fields are applied last, so they win on any key collision.
        values.update(l_pcu)
        return values
110
class ScanInterface(object):
        """
        Base class for scanners that collect values and record them per round.

        Subclasses set:
          recordclass -- class providing findby_or_create(); one record is
                         kept per (round, primary key)
          syncclass   -- class providing findby_or_create(); tracks the
                         latest round seen for each primary key
          primarykey  -- name of the field identifying the scanned object
        """
        recordclass = None
        syncclass = None
        primarykey = 'hostname'

        def __init__(self, round):
                self.round = round
                # Running counter printed with each recorded entry.
                self.count = 1

        def __getattr__(self, name):
                # Python only calls __getattr__ after normal lookup has failed,
                # so the attribute is known to be missing here.  Looking it up
                # again on self would re-enter this method without end; raise
                # AttributeError so getattr()/hasattr() behave as usual.
                raise AttributeError("No such method %s" % name)

        def collect(self, nodename, data):
                """Placeholder collector; subclasses provide the real ones."""
                pass

        def record(self, request, result):
                """
                Store one collector result for the current round.

                request -- unused here; it is the first argument of the call.
                result  -- sequence whose first two items are (name, values).
                           Nothing is recorded when values is None.

                Errors raised while recording are printed, not propagated.
                """
                # Take the first two items by index so a longer tuple from a
                # collector is tolerated.
                nodename, values = result[0], result[1]
                if values is None:
                        return

                try:
                        fbnodesync = self.syncclass.findby_or_create(
                                                if_new_set={'round' : self.round},
                                                **{ self.primarykey : nodename})
                        # NOTE: This code will either add a new record for the new self.round,
                        #       OR it will find the previous value, and update it with new information.
                        #       The data that is 'lost' is not that important, b/c older
                        #       history still exists.
                        fbrec = self.recordclass.findby_or_create(
                                                **{'round':self.round, self.primarykey:nodename})

                        fbrec.set( **values )

                        fbrec.flush()
                        fbnodesync.round = self.round
                        fbnodesync.flush()

                        print("%d %s %s" % (self.count, nodename, values))
                        self.count += 1

                except Exception:
                        print("ERROR:")
                        # print_exc() writes the traceback itself and returns None.
                        traceback.print_exc()
161
class ScanNodeInternal(ScanInterface):
        """
        Node scanner: gathers ping, ssh, nmap and PLC information for one
        hostname and records it through FindbadNodeRecord.
        """
        recordclass = FindbadNodeRecord
        syncclass = FindbadNodeRecordSync
        primarykey = 'hostname'

        def collectNMAP(self, nodename, cohash):
                """
                Run nmap against ports 22, 80 and 806 of nodename.

                cohash is accepted for signature compatibility and not used.
                Returns (nodename, values) where values holds 'port_status'
                and 'date_checked'.
                """
                #### RUN NMAP ###############################
                values = {}
                nmap = util.command.CMD()
                cmd = "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
                print(cmd)
                # The second item is unused; it is named so that it does not
                # hide the builtin eval().
                (oval, errval) = nmap.run_noexcept(cmd)
                # NOTE: an empty / error value for oval, will still work.
                (values['port_status'], continue_probe) = nmap_port_status(oval)

                values['date_checked'] = datetime.now()

                return (nodename, values)

        def collectInternal(self, nodename, cohash):
                """
                Probe nodename over ping and ssh, then add PLC and nmap data.

                cohash maps hostnames to CoMon statistics; a default entry is
                used when nodename is not in it.

                Returns (nodename, values).  If anything fails part-way,
                the traceback is printed and values holds whatever was
                gathered up to that point.
                """
                values = {}

                ### RUN PING ######################
                ping = command.CMD()
                (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

                try:
                        # No "rtt" line in the output means the ping failed.
                        values['ping_status'] = (oval != "")

                        try:
                                for port in [22, 806]:
                                        ssh = command.SSH('root', nodename, port)

                                        (oval, errval) = ssh.run_noexcept2(""" <<\EOF
                                                echo "{"
                                                echo '  "kernel_version":"'`uname -a`'",'
                                                echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                                echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
                                                echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
                                                echo '  "fs_status":"'`touch /var/log/monitor 2>&1`'",'
                                                echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
                                                echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'

                                                ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                                echo "}"
        EOF                             """)

                                        values['ssh_error'] = errval
                                        if len(oval) > 0:
                                                # SECURITY NOTE(review): eval() executes whatever text
                                                # the remote node sent back.  A misbehaving node can
                                                # run arbitrary code here; consider a restricted
                                                # parser.  Left as-is to keep the accepted format.
                                                values.update(eval(oval))
                                                values['ssh_portused'] = port
                                                break
                                        else:
                                                values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '',
                                                                                'nm_status' : '',
                                                                                'fs_status' : '',
                                                                                'dns_status' : '',
                                                                                'princeton_comon_dir' : "",
                                                                                'princeton_comon_running' : "",
                                                                                'princeton_comon_procs' : "", 'ssh_portused' : None})
                        except Exception:
                                traceback.print_exc()
                                # NOTE: the SystemExit raised here is caught by the bare
                                # 'except' at the end of this method, so this abandons
                                # the rest of the collection instead of ending the process.
                                sys.exit(1)

                        ### RUN SSH ######################
                        # Classify the node from the kernel string.  The order of the
                        # tests matters because the patterns are plain substrings.
                        kernel = values['kernel_version']
                        if "2.6.17" in kernel or "2.6.2" in kernel:
                                category = 'PROD'
                        elif "2.6.12" in kernel or "2.6.10" in kernel:
                                category = 'OLDPROD'
                        # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot
                        #       command fails.  I have no idea why.
                        elif "2.4" in kernel or "2.6.8" in kernel:
                                category = 'OLDBOOTCD'
                        elif kernel != "":
                                category = 'UNKNOWN'
                        else:
                                category = 'ERROR'

                        b_getbootcd_id = True
                        values['ssh_status'] = (category != 'ERROR')
                        values['observed_category'] = category
                        if category == 'ERROR':
                                # An error occurred.
                                b_getbootcd_id = False
                                values['observed_status'] = 'DOWN'
                                values['ssh_error'] = errval.strip()
                                values['kernel_version'] = ""
                        elif category == 'OLDBOOTCD':
                                b_getbootcd_id = False
                                values['observed_status'] = 'DEBUG'
                        elif "bm.log" in values['bmlog']:
                                values['observed_status'] = 'DEBUG'
                        else:
                                values['observed_status'] = 'BOOT'

                        # try to get BootCD for all nodes that are not 2.4 nor inaccessible
                        if b_getbootcd_id and "BootCD" in values['bootcd_version']:
                                # Compare hostnames by value; the two UNC nodes are exempt
                                # from the v2 downgrade.
                                if "v2" in values['bootcd_version'] and \
                                        nodename not in ("planetlab1.cs.unc.edu",
                                                                         "planetlab2.cs.unc.edu"):
                                        values['observed_category'] = 'OLDBOOTCD'
                        else:
                                values['bootcd_version'] = ""

                        if "nm.py" in values['nm_status']:
                                values['nm_status'] = "Y"
                        else:
                                values['nm_status'] = "N"

                        # The remote command is 'ls -d /vservers/princeton_comon', so a
                        # successful listing contains "princeton_comon".
                        continue_slice_check = "princeton_comon" in values['princeton_comon_dir']
                        values['princeton_comon_dir'] = continue_slice_check

                        if continue_slice_check:
                                # Anything longer than the bare directory prefix means an
                                # entry was listed under /proc/virtual/.
                                continue_slice_check = \
                                        len(values['princeton_comon_running']) > len('/proc/virtual/')
                        values['princeton_comon_running'] = continue_slice_check

                        if continue_slice_check:
                                try:
                                        values['princeton_comon_procs'] = int(values['princeton_comon_procs'])
                                except ValueError:
                                        # Unparseable count: record "unknown" rather than
                                        # abandoning the rest of the collection.
                                        values['princeton_comon_procs'] = None
                        else:
                                values['princeton_comon_procs'] = None

                        if nodename in cohash:
                                values['comon_stats'] = cohash[nodename]
                        else:
                                values['comon_stats'] = {'resptime':  '-1',
                                                                                'uptime':    '-1',
                                                                                'sshstatus': '-1',
                                                                                'lastcotop': '-1',
                                                                                'cpuspeed' : "null",
                                                                                'disksize' : 'null',
                                                                                'memsize'  : 'null'}
                        # include output value
                        ### GET PLC NODE ######################
                        d_node = None
                        plc_lock.acquire()
                        try:
                                try:
                                        d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id',
                                                                                        'date_created', 'last_updated',
                                                                                        'last_contact', 'boot_state', 'nodegroup_ids'])[0]
                                except Exception:
                                        traceback.print_exc()
                        finally:
                                plc_lock.release()
                        values['plc_node_stats'] = d_node

                        ##### NMAP  ###################
                        (n, v) = self.collectNMAP(nodename, None)
                        values.update(v)

                        ### GET PLC PCU ######################
                        site_id = -1
                        d_pcu = None
                        if d_node:
                                pcu = d_node['pcu_ids']
                                if len(pcu) > 0:
                                        d_pcu = pcu[0]

                                site_id = d_node['site_id']

                        values['plc_pcuid'] = d_pcu

                        ### GET PLC SITE ######################
                        d_site = None
                        values['loginbase'] = ""
                        plc_lock.acquire()
                        try:
                                try:
                                        d_site = plc.getSites({'site_id': site_id},
                                                                                ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
                                        values['loginbase'] = d_site['login_base']
                                except Exception:
                                        traceback.print_exc()
                        finally:
                                plc_lock.release()

                        values['plc_site_stats'] = d_site
                        values['date_checked'] = datetime.now()
                except:
                        # Deliberately bare: this also catches the SystemExit raised by
                        # the ssh section above so that partial values are returned.
                        traceback.print_exc()

                return (nodename, values)
377
def internalprobe(hostname):
        """
        Run the full internal scan of hostname and record the result.

        The round number is taken from the "global" FindbadNodeRecordSync
        entry (created with round 1 if missing).

        Returns True when the scan, record and session flush all complete,
        False when any of them raises (the traceback is printed).
        """
        fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                                                                        if_new_set={'round' : 1})
        scannode = ScanNodeInternal(fbsync.round)
        try:
                (nodename, values) = scannode.collectInternal(hostname, {})
                scannode.record(None, (nodename, values))
                session.flush()
                return True
        except Exception:
                # print_exc() writes the traceback itself and returns None.
                traceback.print_exc()
                return False
390
def externalprobe(hostname):
        """
        Run only the nmap scan of hostname and record the result.

        The round number is taken from the "global" FindbadNodeRecordSync
        entry (created with round 1 if missing).

        Returns True when the scan, record and session flush all complete,
        False when any of them raises (the traceback is printed).
        """
        fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                                                                        if_new_set={'round' : 1})
        scannode = ScanNodeInternal(fbsync.round)
        try:
                (nodename, values) = scannode.collectNMAP(hostname, {})
                scannode.record(None, (nodename, values))
                session.flush()
                return True
        except Exception:
                # print_exc() writes the traceback itself and returns None.
                traceback.print_exc()
                return False
403
class ScanPCU(ScanInterface):
        """
        PCU scanner: gathers PLC, nmap, DNS and dry-run reboot information
        for one PCU and records it through FindbadPCURecord.
        """
        recordclass = FindbadPCURecord
        syncclass = FindbadPCURecordSync
        primarykey = 'plc_pcuid'

        def collectInternal(self, pcuname, cohash):
                """
                Collect the status of the PCU identified by pcuname.

                cohash is accepted for signature compatibility and not used.

                Returns (pcuname, values) on success.  When the PCU record
                cannot be obtained at all, returns (None, None) so that the
                result still has the (name, values) shape record() expects.
                If a later step raises, the traceback is stored in
                values['traceback'] and values['reboot_trial_status'].
                """
                continue_probe = True
                # 'entry_complete' exists from the start so the final join works
                # even when an early step raises.
                values = {'reboot_trial_status' : 'novalue',
                                  'entry_complete' : []}

                ### GET PCU ######################
                try:
                        pcu = get_plc_pcu_values(pcuname)
                        # Only touch the fields once we know a record came back.
                        if pcu is not None:
                                if pcu['hostname'] is not None: pcu['hostname'] = pcu['hostname'].strip()
                                if pcu['ip'] is not None: pcu['ip'] = pcu['ip'].strip()
                except Exception:
                        traceback.print_exc()
                        pcu = None

                if pcu is None:
                        return (None, None)
                values['plc_pcu_stats'] = pcu

                def blank(field):
                        # True when the PCU field is unset or an empty string.
                        return pcu[field] is None or pcu[field] == ""

                try:
                        #### RUN NMAP ###############################
                        nmap = util.command.CMD()
                        cmd = "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(pcu)
                        print(cmd)
                        # The second item is unused; it is named so that it does not
                        # hide the builtin eval().
                        (oval, errval) = nmap.run_noexcept(cmd)
                        # NOTE: an empty / error value for oval, will still work.
                        (values['port_status'], continue_probe) = nmap_port_status(oval)

                        #### COMPLETE ENTRY   #######################
                        # NOTE(review): continue_probe is maintained below but the dry
                        # run at the end does not consult it -- confirm whether it should.
                        missing = values['entry_complete']
                        #if values['protocol'] is None or values['protocol'] is "":
                        #       values['entry_complete'] += ["protocol"]
                        if blank('model'):
                                missing.append("model")
                                # Cannot continue due to this condition
                                continue_probe = False

                        if blank('password'):
                                missing.append("password")
                                # Cannot continue due to this condition
                                continue_probe = False

                        if len(missing) > 0:
                                continue_probe = False

                        if blank('hostname'):
                                missing.append("hostname")
                        if blank('ip'):
                                missing.append("ip")

                        # If there are no nodes associated with this PCU, then we cannot continue.
                        if len(pcu['node_ids']) == 0:
                                continue_probe = False
                                missing.append('nodeids')

                        #### DNS and IP MATCH #######################
                        if not blank('hostname') and not blank('ip'):
                                try:
                                        ipaddr = socket.gethostbyname(pcu['hostname'])
                                        if ipaddr == pcu['ip']:
                                                values['dns_status'] = "DNS-OK"
                                        else:
                                                values['dns_status'] = "DNS-MISMATCH"
                                                continue_probe = False

                                except Exception:
                                        # Resolution failed: fall back to the IP address.
                                        values['dns_status'] = "DNS-NOENTRY"
                                        pcu['hostname'] = pcu['ip']
                        elif not blank('ip'):
                                values['dns_status'] = "NOHOSTNAME"
                                pcu['hostname'] = pcu['ip']
                        else:
                                values['dns_status'] = "NO-DNS-OR-IP"
                                pcu['hostname'] = "No_entry_in_DB"
                                continue_probe = False

                        ######  DRY RUN  ############################
                        if 'node_ids' in pcu and len(pcu['node_ids']) > 0:
                                rb_ret = reboot.reboot_test_new(pcu['nodenames'][0],
                                                                                                values, 1, True)
                        else:
                                rb_ret = "Not_Run" # No nodes to test"

                        values['reboot_trial_status'] = rb_ret

                except Exception:
                        print("____________________________________")
                        print(values)
                        print("____________________________________")
                        # The traceback is kept in the recorded values.
                        values['traceback'] = traceback.format_exc()
                        print(values['traceback'])
                        values['reboot_trial_status'] = values['traceback']

                values['entry_complete']=" ".join(values['entry_complete'])

                values['date_checked'] = datetime.now()
                return (pcuname, values)
519