Modified findbad and findbadpcu to use scanapi. These files still need to be combined.
[monitor.git] / monitor / scanapi.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 import socket
12 from pcucontrol import reboot
13
14 from monitor import util
15 from monitor.util import command
16 from monitor import config
17
18 from monitor.database.info.model import *
19
20 from monitor.sources import comon
21 from monitor.wrapper import plc, plccache
22
23 from nodequery import verify,query_to_dict,node_select
24 import traceback
25 from nodecommon import nmap_port_status
26
# CoMon tabulator query; the parameters select table_nodeview, the listed
# dump columns, and the "formatcsv" option.
COMON_COTOPURL = (
    "http://summer.cs.princeton.edu/status/tabulator.cgi?"
    "table=table_nodeview&"
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
    "formatcsv"
)

# PLC API handle, created once when this module is imported.
api = plc.getAuthAPI()

# Lock held around the PLC calls made by the functions and classes below.
plc_lock = threading.Lock()

# Module-level counters.  NOTE: 'round' shadows the builtin of the same name;
# the name is kept because other modules may read it.
round = 1
global_round = round
count = 0
37
38
def get_pcu(pcuname):
    """Return the PCU record whose 'pcu_id' equals pcuname, or None.

    PLC is asked first (plc.GetPCUs).  If that call raises, the cached list
    plccache.l_pcus is searched instead.  None is returned when neither
    source yields a record, so callers can rely on a single
    ``is not None`` check.

    plc_lock is held for the whole lookup and is always released.
    """
    l_pcu = None
    plc_lock.acquire()
    try:
        try:
            found = plc.GetPCUs({'pcu_id': pcuname})
            # An empty answer means "no such PCU": report None rather than
            # handing the empty list back to callers that index it like a dict.
            if len(found) > 0:
                l_pcu = found[0]
        except Exception:
            # PLC unavailable: fall back to the cached copy.
            try:
                for cached in plccache.l_pcus:
                    if cached['pcu_id'] == pcuname:
                        l_pcu = cached
            except Exception:
                traceback.print_exc()
                l_pcu = None
    finally:
        plc_lock.release()

    return l_pcu
60
def get_nodes(node_ids):
    """Return the node records for node_ids, or None if there are none.

    PLC is asked first (plc.getNodes) for the fields 'hostname',
    'last_contact', 'node_id' and 'ports'.  If that call raises, the cached
    list plccache.l_plcnodes is filtered by 'node_id' instead.  An empty
    result, or a failure of the cache lookup, is reported as None.

    plc_lock is held for the whole lookup and is always released.
    """
    l_node = []
    plc_lock.acquire()
    try:
        try:
            l_node = plc.getNodes(node_ids,
                                  ['hostname', 'last_contact', 'node_id', 'ports'])
        except Exception:
            # PLC unavailable: fall back to the cached copy.
            try:
                for cached in plccache.l_plcnodes:
                    if cached['node_id'] in node_ids:
                        l_node.append(cached)
            except Exception:
                traceback.print_exc()
                l_node = None
    finally:
        plc_lock.release()

    if l_node == []:
        l_node = None
    return l_node
80         
81
def get_plc_pcu_values(pcuname):
    """
        Build the combined PCU/node dictionary for pcuname.

        The PCU record comes from get_pcu() and its nodes from get_nodes();
        both try PLC first and a cached copy second.  Returns None when no
        PCU record is available.
    """
    pcu = get_pcu(pcuname)
    if pcu is None:
        return None

    # Not used below; the lookup is kept so that a record lacking 'site_id'
    # still raises KeyError here, as before.
    unused_site_id = pcu['site_id']
    nodes = get_nodes(pcu['node_ids'])

    result = {}
    if nodes is not None:
        hostnames = []
        for entry in nodes:
            # Map each hostname to the first element of its 'ports' list.
            result[entry['hostname']] = entry['ports'][0]
            hostnames.append(entry['hostname'])

        result['nodenames'] = hostnames

        # NOTE: this is for a dry run later. It doesn't matter which node.
        result['node_id'] = nodes[0]['node_id']

    # PCU fields are merged last, so they win over same-named keys above.
    result.update(pcu)
    return result
111
class ScanInterface(object):
    """Base class for scanners that collect values and store them per round.

    Subclasses set:
      recordclass -- class whose findby_or_create() yields the per-round record
      syncclass   -- class whose findby_or_create() yields the sync record
                     that remembers the latest round for a key
      primarykey  -- name of the keyword used to look records up
    """
    recordclass = None
    syncclass = None
    primarykey = 'hostname'

    def __init__(self, round):
        self.round = round
        self.count = 1

    def __getattr__(self, name):
        # Python only calls __getattr__ after normal attribute lookup has
        # already failed, so the attribute is known to be missing here.
        # Looking it up again with getattr(self, ...) would re-enter this
        # method without end.  AttributeError (a subclass of Exception, so
        # existing 'except Exception' callers still match) keeps hasattr()
        # and getattr(obj, name, default) working.
        raise AttributeError("No such method %s" % name)

    def collect(self, nodename, data):
        """Placeholder collector; does nothing and returns None."""
        pass

    def record(self, request, result):
        """Store one collected result.

        request -- not used by this method.
        result  -- a (nodename, values) pair; nothing is stored when
                   values is None.

        Errors raised while storing are printed and swallowed.
        """
        # Unpacked outside the try, as the former tuple parameter was: a
        # result that is not a 2-sequence still raises to the caller.
        (nodename, values) = result

        try:
            if values is None:
                return

            fbnodesync = self.syncclass.findby_or_create(
                                if_new_set={'round': self.round},
                                **{self.primarykey: nodename})
            # NOTE: This code will either add a new record for the new self.round,
            #       OR it will find the previous value, and update it with new information.
            #       The data that is 'lost' is not that important, b/c older
            #       history still exists.
            fbrec = self.recordclass.findby_or_create(
                                **{'round': self.round, self.primarykey: nodename})

            fbrec.set(**values)

            fbrec.flush()
            fbnodesync.round = self.round
            fbnodesync.flush()

            print("%d %s %s" % (self.count, nodename, values))
            self.count += 1

        except Exception:
            print("ERROR:")
            # print_exc() writes the traceback itself and returns None.
            traceback.print_exc()
162
class ScanNodeInternal(ScanInterface):
    """Scanner for a single node, stored as FindbadNodeRecord rows.

    collectNMAP gathers only the nmap port status.  collectInternal also
    pings the node, runs a status script over ssh, classifies the result,
    and adds the PLC node and site records.
    """
    recordclass = FindbadNodeRecord
    syncclass = FindbadNodeRecordSync
    primarykey = 'hostname'

    def collectNMAP(self, nodename, cohash):
        """Run nmap against nodename.

        cohash is accepted but not used.  Returns (nodename, values) with
        the keys 'port_status' and 'date_checked'.
        """
        #### RUN NMAP ###############################
        values = {}
        nmap = util.command.CMD()
        cmd = "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
        print(cmd)
        # The second result was previously bound to a local named 'eval',
        # hiding the builtin; it is not used.
        (oval, errval) = nmap.run_noexcept(cmd)
        # NOTE: an empty / error value for oval, will still work.
        (values['port_status'], continue_probe) = nmap_port_status(oval)

        values['date_checked'] = datetime.now()

        return (nodename, values)

    def collectInternal(self, nodename, cohash):
        """Probe nodename and return (nodename, values).

        cohash maps node names to CoMon statistics; a placeholder record is
        used for nodes it does not contain.

        Any failure after the ping is printed and swallowed, in which case
        values holds only the keys filled in before the failure.
        """
        ### RUN PING ######################
        ping = command.CMD()
        (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

        values = {}
        try:
            # Empty output from the 'grep rtt' pipeline is treated as failure.
            values['ping_status'] = (oval != "")

            try:
                for port in [22, 806]:
                    ssh = command.SSH('root', nodename, port)

                    # "\\E" spells the same two characters as the former
                    # "\E"; the text handed to ssh is unchanged.
                    (oval, errval) = ssh.run_noexcept2(""" <<\\EOF
                                                echo "{"
                                                echo '  "kernel_version":"'`uname -a`'",'
                                                echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                                echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
                                                echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
                                                echo '  "fs_status":"'`touch /var/log/monitor 2>&1`'",'
                                                echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
                                                echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'

                                                ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                                echo "}"
        EOF                             """)

                    values['ssh_error'] = errval
                    if len(oval) > 0:
                        #print "OVAL: %s" % oval
                        # SECURITY NOTE(review): this evaluates text produced
                        # on the remote node as a Python expression.  Left
                        # as-is; consider a parser that cannot execute code.
                        values.update(eval(oval))
                        values['ssh_portused'] = port
                        break
                    else:
                        values.update({'kernel_version': "", 'bmlog': "", 'bootcd_version': '',
                                       'nm_status': '',
                                       'fs_status': '',
                                       'dns_status': '',
                                       'princeton_comon_dir': "",
                                       'princeton_comon_running': "",
                                       'princeton_comon_procs': "", 'ssh_portused': None})
            except:
                traceback.print_exc()
                # NOTE(review): the SystemExit raised here is intercepted by
                # the bare 'except' that closes this method, so the process
                # keeps running and a partial result is returned.  Kept
                # as-is to preserve behaviour.
                sys.exit(1)

            ### RUN SSH ######################
            b_getbootcd_id = True

            kernel = values['kernel_version']
            if "2.6.17" in kernel or "2.6.2" in kernel:
                values['ssh_status'] = True
                values['observed_category'] = 'PROD'
                if "bm.log" in values['bmlog']:
                    values['observed_status'] = 'DEBUG'
                else:
                    values['observed_status'] = 'BOOT'
            elif "2.6.12" in kernel or "2.6.10" in kernel:
                values['ssh_status'] = True
                values['observed_category'] = 'OLDPROD'
                if "bm.log" in values['bmlog']:
                    values['observed_status'] = 'DEBUG'
                else:
                    values['observed_status'] = 'BOOT'

            # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot
            #       command fails.  I have no idea why.
            elif "2.4" in kernel or "2.6.8" in kernel:
                b_getbootcd_id = False
                values['ssh_status'] = True
                values['observed_category'] = 'OLDBOOTCD'
                values['observed_status'] = 'DEBUG'
            elif kernel != "":
                values['ssh_status'] = True
                values['observed_category'] = 'UNKNOWN'
                if "bm.log" in values['bmlog']:
                    values['observed_status'] = 'DEBUG'
                else:
                    values['observed_status'] = 'BOOT'
            else:
                # An error occurred.
                b_getbootcd_id = False
                values['ssh_status'] = False
                values['observed_category'] = 'ERROR'
                values['observed_status'] = 'DOWN'
                values['ssh_error'] = errval.strip()
                values['kernel_version'] = ""

            if b_getbootcd_id:
                # try to get BootCD for all nodes that are not 2.4 nor inaccessible
                bootcd = values['bootcd_version']
                if "BootCD" in bootcd:
                    # Compared by value: the former 'is not' identity test
                    # against string literals did not reliably recognise
                    # these two hosts.
                    exempt = ("planetlab1.cs.unc.edu", "planetlab2.cs.unc.edu")
                    if "v2" in bootcd and nodename not in exempt:
                        values['observed_category'] = 'OLDBOOTCD'
                else:
                    values['bootcd_version'] = ""
            else:
                values['bootcd_version'] = ""

            if "nm.py" in values['nm_status']:
                values['nm_status'] = "Y"
            else:
                values['nm_status'] = "N"

            # The value is the output of 'ls -d /vservers/princeton_comon',
            # so the path is what must be looked for.  The former test
            # searched for the dict key name "princeton_comon_dir", which
            # that output never contains.
            continue_slice_check = True
            if "princeton_comon" in values['princeton_comon_dir']:
                values['princeton_comon_dir'] = True
            else:
                values['princeton_comon_dir'] = False
                continue_slice_check = False

            if continue_slice_check:
                # Anything longer than the bare directory prefix means the
                # 'ls -d /proc/virtual/$ID' call printed an entry.
                if len(values['princeton_comon_running']) > len('/proc/virtual/'):
                    values['princeton_comon_running'] = True
                else:
                    values['princeton_comon_running'] = False
                    continue_slice_check = False
            else:
                values['princeton_comon_running'] = False

            if continue_slice_check:
                try:
                    values['princeton_comon_procs'] = int(values['princeton_comon_procs'])
                except ValueError:
                    # Unparsable count: report "unknown" instead of
                    # abandoning the rest of the probe.
                    values['princeton_comon_procs'] = None
            else:
                values['princeton_comon_procs'] = None

            if nodename in cohash:
                values['comon_stats'] = cohash[nodename]
            else:
                values['comon_stats'] = {'resptime':  '-1',
                                         'uptime':    '-1',
                                         'sshstatus': '-1',
                                         'lastcotop': '-1',
                                         'cpuspeed':  "null",
                                         'disksize':  'null',
                                         'memsize':   'null'}

            ### GET PLC NODE ######################
            d_node = None
            plc_lock.acquire()
            try:
                d_node = plc.getNodes({'hostname': nodename},
                                      ['pcu_ids', 'site_id',
                                       'date_created', 'last_updated',
                                       'last_contact', 'boot_state',
                                       'nodegroup_ids'])[0]
            except Exception:
                traceback.print_exc()
            finally:
                plc_lock.release()
            values['plc_node_stats'] = d_node

            ##### NMAP  ###################
            (n, v) = self.collectNMAP(nodename, None)
            values.update(v)

            ### GET PLC PCU ######################
            site_id = -1
            d_pcu = None
            if d_node:
                pcu_ids = d_node['pcu_ids']
                if len(pcu_ids) > 0:
                    d_pcu = pcu_ids[0]

                site_id = d_node['site_id']

            values['plc_pcuid'] = d_pcu

            ### GET PLC SITE ######################
            d_site = None
            values['loginbase'] = ""
            plc_lock.acquire()
            try:
                d_site = plc.getSites({'site_id': site_id},
                                      ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
                values['loginbase'] = d_site['login_base']
            except Exception:
                traceback.print_exc()
            finally:
                plc_lock.release()

            values['plc_site_stats'] = d_site
            values['date_checked'] = datetime.now()
        except:
            # Deliberately bare: it also intercepts the SystemExit raised by
            # the ssh error path above.  print_exc() returns None, so it is
            # called directly rather than printed.
            traceback.print_exc()

        return (nodename, values)
378
def internalprobe(hostname):
    """Run ScanNodeInternal.collectInternal on hostname and store the result.

    The round number is taken from the FindbadNodeRecordSync row keyed
    hostname="global" (created with round 1 if missing).

    Returns True on success; on any error the traceback is printed and
    False is returned.
    """
    fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                    if_new_set={'round': 1})
    scannode = ScanNodeInternal(fbsync.round)
    try:
        (nodename, values) = scannode.collectInternal(hostname, {})
        scannode.record(None, (nodename, values))
        session.flush()
        return True
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result is not printed.
        traceback.print_exc()
        return False
391
def externalprobe(hostname):
    """Run ScanNodeInternal.collectNMAP on hostname and store the result.

    The round number is taken from the FindbadNodeRecordSync row keyed
    hostname="global" (created with round 1 if missing).

    Returns True on success; on any error the traceback is printed and
    False is returned.
    """
    fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                    if_new_set={'round': 1})
    scannode = ScanNodeInternal(fbsync.round)
    try:
        (nodename, values) = scannode.collectNMAP(hostname, {})
        scannode.record(None, (nodename, values))
        session.flush()
        return True
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result is not printed.
        traceback.print_exc()
        return False
404
class ScanPCU(ScanInterface):
    """Scanner for a single PCU, stored as FindbadPCURecord rows."""
    recordclass = FindbadPCURecord
    syncclass = FindbadPCURecordSync
    primarykey = 'plc_pcuid'

    @staticmethod
    def _is_blank(value):
        """Return True when value is None or the empty string."""
        # Compared by value; an identity test against "" is not reliable.
        return value is None or value == ""

    def collectInternal(self, pcuname, cohash):
        """Probe the PCU identified by pcuname.

        cohash is accepted but not used.

        Returns (pcuname, values) after a probe.  When the PCU record cannot
        be obtained at all, returns (None, None, None).
        """
        continue_probe = True
        values = {'reboot_trial_status': 'novalue'}
        try:
            ### GET PCU ######################
            try:
                v = get_plc_pcu_values(pcuname)
                # Check for None before touching any field.
                if v is not None:
                    if v['hostname'] is not None:
                        v['hostname'] = v['hostname'].strip()
                    if v['ip'] is not None:
                        v['ip'] = v['ip'].strip()
                    values['plc_pcu_stats'] = v
                else:
                    continue_probe = False
            except Exception:
                traceback.print_exc()
                continue_probe = False

            if not continue_probe:
                # NOTE(review): this is a 3-tuple while the normal return is
                # a 2-tuple; ScanInterface.record() unpacks two items.  The
                # shape is left unchanged because callers are not visible
                # here.
                return (None, None, None)

            stats = values['plc_pcu_stats']

            #### RUN NMAP ###############################
            nmap = util.command.CMD()
            cmd = ("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:"
                   % reboot.pcu_name(stats))
            print(cmd)
            (oval, errval) = nmap.run_noexcept(cmd)
            # NOTE: an empty / error value for oval, will still work.
            # NOTE(review): continue_probe is updated from here on but is
            # not consulted again before the dry run below.
            (values['port_status'], continue_probe) = nmap_port_status(oval)

            #### COMPLETE ENTRY   #######################
            # Names of the fields that are missing from the PCU record.
            values['entry_complete'] = []
            #if values['protocol'] is None or values['protocol'] is "":
            #       values['entry_complete'] += ["protocol"]
            if self._is_blank(stats['model']):
                values['entry_complete'].append("model")
                # Cannot continue due to this condition
                continue_probe = False

            if self._is_blank(stats['password']):
                values['entry_complete'].append("password")
                # Cannot continue due to this condition
                continue_probe = False

            if self._is_blank(stats['hostname']):
                values['entry_complete'].append("hostname")
            if self._is_blank(stats['ip']):
                values['entry_complete'].append("ip")

            # If there are no nodes associated with this PCU, then we cannot continue.
            if len(stats['node_ids']) == 0:
                continue_probe = False
                values['entry_complete'].append('nodeids')

            #### DNS and IP MATCH #######################
            have_ip = not self._is_blank(stats['ip'])
            if not self._is_blank(stats['hostname']) and have_ip:
                try:
                    ipaddr = socket.gethostbyname(stats['hostname'])
                    if ipaddr == stats['ip']:
                        values['dns_status'] = "DNS-OK"
                    else:
                        values['dns_status'] = "DNS-MISMATCH"
                        continue_probe = False
                except Exception:
                    # Lookup failed: use the IP address as the hostname.
                    values['dns_status'] = "DNS-NOENTRY"
                    stats['hostname'] = stats['ip']
            elif have_ip:
                values['dns_status'] = "NOHOSTNAME"
                stats['hostname'] = stats['ip']
            else:
                values['dns_status'] = "NO-DNS-OR-IP"
                stats['hostname'] = "No_entry_in_DB"
                continue_probe = False

            ######  DRY RUN  ############################
            if 'node_ids' in stats and len(stats['node_ids']) > 0:
                rb_ret = reboot.reboot_test_new(stats['nodenames'][0],
                                                values, 1, True)
            else:
                rb_ret = "Not_Run" # No nodes to test"

            values['reboot_trial_status'] = rb_ret

        except Exception:
            print("____________________________________")
            print(values)
            print("____________________________________")
            # The traceback is stored under both keys, as before.
            trace = traceback.format_exc()
            values['traceback'] = trace
            print(trace)
            values['reboot_trial_status'] = trace

        # A failure before the completeness checks leaves the key unset;
        # treat that as "nothing recorded" instead of raising KeyError.
        values['entry_complete'] = " ".join(values.get('entry_complete', []))

        values['date_checked'] = datetime.now()
        return (pcuname, values)
520