modified *list templates with abbreviated information
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14
15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
16
17 from monitor.sources import comon
18 from monitor.wrapper import plc, plccache
19
20 from nodequery import verify,query_to_dict,node_select
21 import traceback
22 from nodecommon import nmap_port_status
23
24 #print "starting sqlfindbad.py"
25 # QUERY all nodes.
# CoMon tabulator query: dumps the listed per-node columns from the
# table_nodeview table in CSV form.
COMON_COTOPURL = (
    "http://summer.cs.princeton.edu/status/tabulator.cgi?"
    "table=table_nodeview&"
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
    "formatcsv"
)

# Authenticated PLC API handle used by main() for node group / site queries.
api = plc.getAuthAPI()

# Held around the plc.getNodes / plc.getSites calls made from worker threads.
plc_lock = threading.Lock()

# Default round number; main() replaces global_round with the stored value.
# NOTE: 'round' shadows the builtin of the same name; the name is kept because
# other code may read it from this module.
round = 1
global_round = round

# Running tally of nodes processed or skipped, used only for progress output.
count = 0
38
def collectNMAP(nodename, cohash):
    """Scan ports 22, 80 and 806 of *nodename* with nmap.

    ``cohash`` is accepted but not used by this function.

    Returns a ``(nodename, values)`` tuple where ``values`` contains
    ``'port_status'`` (as computed by ``nmap_port_status``) and
    ``'date_checked'`` (the local time of the scan).
    """
    scanner = util.command.CMD()
    scan_cmd = "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
    print(scan_cmd)
    (scan_out, scan_err) = scanner.run_noexcept(scan_cmd)

    # An empty / error value for scan_out is still handled by
    # nmap_port_status; its second return value is not needed here.
    (port_status, _ignored) = nmap_port_status(scan_out)

    report = {}
    report['port_status'] = port_status
    report['date_checked'] = datetime.now()
    return (nodename, report)
51
def _empty_ssh_report():
    """Return the placeholder fields recorded when SSH yields no output."""
    return {'kernel_version': "", 'bmlog': "", 'bootcd_version': '',
            'nm_status': '',
            'fs_status': '',
            'dns_status': '',
            'princeton_comon_dir': "",
            'princeton_comon_running': "",
            'princeton_comon_procs': "", 'ssh_portused': None}


def _run_ssh_report(nodename, values):
    """Run the self-report script on the node, trying port 22 and then 806.

    Merges the raw report (or empty placeholders) into ``values`` and returns
    the stderr text of the last SSH attempt.
    """
    errval = ""
    for port in [22, 806]:
        ssh = command.SSH('root', nodename, port)

        (oval, errval) = ssh.run_noexcept2(""" <<\\EOF
                                        echo "{"
                                        echo '  "kernel_version":"'`uname -a`'",'
                                        echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                        echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
                                        echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
                                        echo '  "fs_status":"'`touch /var/log/monitor 2>&1`'",'
                                        echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
                                        echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'

                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo "}"
EOF                             """)

        values['ssh_error'] = errval
        if len(oval) > 0:
            # NOTE(security): eval() executes whatever text the node sends
            # back, so a misbehaving node can run code in this process.  The
            # report ends with a trailing comma before "}", so it is a Python
            # dict literal rather than strict JSON; a literal-only parser
            # would be the safer replacement.
            values.update(eval(oval))
            values['ssh_portused'] = port
            break
        values.update(_empty_ssh_report())
    return errval


def _classify_from_kernel(values, errval):
    """Set ssh_status / observed_category / observed_status in ``values``.

    The decision is based on substrings of the reported kernel version.
    Returns True when the reported BootCD version should be kept.
    """
    kernel = values['kernel_version']
    if "2.6.17" in kernel or "2.6.2" in kernel:
        category = 'PROD'
    elif "2.6.12" in kernel or "2.6.10" in kernel:
        category = 'OLDPROD'
    # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.
    elif "2.4" in kernel or "2.6.8" in kernel:
        values['ssh_status'] = True
        values['observed_category'] = 'OLDBOOTCD'
        values['observed_status'] = 'DEBUG'
        return False
    elif kernel != "":
        category = 'UNKNOWN'
    else:
        # No kernel string at all: the SSH probe failed.
        values['ssh_status'] = False
        values['observed_category'] = 'ERROR'
        values['observed_status'] = 'DOWN'
        values['ssh_error'] = errval.strip()
        values['kernel_version'] = ""
        return False

    values['ssh_status'] = True
    values['observed_category'] = category
    # A /tmp/bm.log file on the node is reported as DEBUG, otherwise BOOT.
    if "bm.log" in values['bmlog']:
        values['observed_status'] = 'DEBUG'
    else:
        values['observed_status'] = 'BOOT'
    return True


def _apply_bootcd_policy(nodename, values, keep_bootcd):
    """Blank unusable BootCD strings and flag v2 BootCDs as OLDBOOTCD."""
    if keep_bootcd and "BootCD" in values['bootcd_version']:
        # Compare hostnames by value.  The previous identity test
        # ('is not') was true for any string that was not the very same
        # object, so these two hosts were never actually exempted.
        exempt = ("planetlab1.cs.unc.edu", "planetlab2.cs.unc.edu")
        if "v2" in values['bootcd_version'] and nodename not in exempt:
            values['observed_category'] = 'OLDBOOTCD'
    else:
        values['bootcd_version'] = ""


def _summarize_comon_slice(values):
    """Convert the raw princeton_comon_* strings into booleans / a count."""
    # The raw value is the output of 'ls -d /vservers/princeton_comon', so
    # look for the directory name itself (not the dict key name, which can
    # never appear in that output).
    has_dir = "princeton_comon" in values['princeton_comon_dir']
    running = has_dir and \
        len(values['princeton_comon_running']) > len('/proc/virtual/')

    procs = None
    if running:
        try:
            procs = int(values['princeton_comon_procs'])
        except ValueError:
            # Unparseable 'wc -l' output: leave the count unknown.
            procs = None

    values['princeton_comon_dir'] = has_dir
    values['princeton_comon_running'] = running
    values['princeton_comon_procs'] = procs


def collectPingAndSSH(nodename, cohash):
    """Probe one node with ping, SSH, PLC lookups and nmap.

    ``cohash`` maps hostnames to CoMon statistics; when ``nodename`` is not
    in it, placeholder statistics are recorded instead.

    Returns ``(nodename, values)``.  Collection is best-effort: if any step
    raises, the traceback is printed and whatever was gathered so far is
    returned, so ``values`` may be incomplete.
    """
    ### RUN PING ######################
    ping = command.CMD()
    (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

    values = {}
    try:
        # 'grep rtt' leaves no output when the ping got no reply.
        values['ping_status'] = (oval != "")

        ### RUN SSH ######################
        # A failure here (for example an unparseable report) is handled by
        # the outer handler below, which returns the partial result.
        errval = _run_ssh_report(nodename, values)

        keep_bootcd = _classify_from_kernel(values, errval)
        _apply_bootcd_policy(nodename, values, keep_bootcd)

        # TODO: get bm.log for debug nodes.
        # 'zcat /tmp/bm.log'

        if "nm.py" in values['nm_status']:
            values['nm_status'] = "Y"
        else:
            values['nm_status'] = "N"

        _summarize_comon_slice(values)

        if nodename in cohash:
            values['comon_stats'] = cohash[nodename]
        else:
            values['comon_stats'] = {'resptime':  '-1',
                                     'uptime':    '-1',
                                     'sshstatus': '-1',
                                     'lastcotop': '-1',
                                     'cpuspeed':  "null",
                                     'disksize':  'null',
                                     'memsize':   'null'}

        ### GET PLC NODE ######################
        d_node = None
        plc_lock.acquire()
        try:
            try:
                d_node = plc.getNodes({'hostname': nodename},
                                      ['pcu_ids', 'site_id', 'date_created',
                                       'last_updated', 'last_contact',
                                       'boot_state', 'nodegroup_ids'])[0]
            except Exception:
                traceback.print_exc()
        finally:
            # Always release, even if the handler itself fails.
            plc_lock.release()
        values['plc_node_stats'] = d_node

        ##### NMAP  ###################
        (n, v) = collectNMAP(nodename, None)
        values.update(v)

        ### GET PLC PCU ######################
        d_pcu = None
        if d_node:
            pcu = d_node['pcu_ids']
            if len(pcu) > 0:
                d_pcu = pcu[0]
        values['plc_pcuid'] = d_pcu

        ### GET PLC SITE ######################
        d_site = None
        values['loginbase'] = ""
        if d_node:
            # Without a node record there is no site id to look up.
            plc_lock.acquire()
            try:
                try:
                    d_site = plc.getSites({'site_id': d_node['site_id']},
                                          ['max_slices', 'slice_ids',
                                           'node_ids', 'login_base'])[0]
                    values['loginbase'] = d_site['login_base']
                except Exception:
                    traceback.print_exc()
            finally:
                plc_lock.release()

        values['plc_site_stats'] = d_site
        values['date_checked'] = datetime.now()
    except Exception:
        traceback.print_exc()

    return (nodename, values)
261
def recordPingAndSSH(request, result):
    """Store one node's probe result in the database for the current round.

    Used as the success callback of the thread-pool work requests and also
    called directly by probe() / externalprobe().

    ``request`` is not used.  ``result`` is a ``(nodename, values)`` tuple;
    nothing is recorded when ``values`` is None.  Any error while storing is
    reported on the console and otherwise ignored, so one bad record does not
    stop the run.
    """
    global global_round
    global count
    (nodename, values) = result

    try:
        if values is not None:
            fbnodesync = FindbadNodeRecordSync.findby_or_create(
                hostname=nodename, if_new_set={'round': global_round})

            # NOTE: This will either add a new record for the new
            #       global_round, OR find the previous one and update it
            #       with new information.  The data that is 'lost' is not
            #       that important, b/c older history still exists.
            fbrec = FindbadNodeRecord.findby_or_create(
                round=global_round, hostname=nodename)
            fbrec.set(**values)

            fbrec.flush()
            fbnodesync.round = global_round
            fbnodesync.flush()

            count += 1
            print("%d %s %s" % (count, nodename, values))
    except Exception:
        print("ERROR:")
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()
321
322 # this will be called when an exception occurs within a thread
def handle_exception(request, result):
    """Report a failed work request on the console.

    Prints the request's ``requestID`` followed by one line per element of
    ``result``.
    """
    print("Exception occured in request %s" % request.requestID)
    for detail in result:
        print("Result: %s" % detail)
327
def externalprobe(hostname):
    """Run only the nmap port scan for ``hostname`` and record the result.

    Returns True when the scan was recorded and the session flushed, False
    if any step raised (the traceback is printed).
    """
    try:
        (nodename, values) = collectNMAP(hostname, {})
        recordPingAndSSH(None, (nodename, values))
        session.flush()
        return True
    except Exception:
        # print_exc() already writes the traceback; printing its None
        # return value would only add a stray "None" line.
        traceback.print_exc()
        return False
337
def probe(hostname):
    """Run the full ping/SSH collection for ``hostname`` and record it.

    Returns True when the result was recorded and the session flushed, False
    if any step raised (the traceback is printed).
    """
    try:
        (nodename, values) = collectPingAndSSH(hostname, {})
        recordPingAndSSH(None, (nodename, values))
        session.flush()
        return True
    except Exception:
        # print_exc() already writes the traceback; printing its None
        # return value would only add a stray "None" line.
        traceback.print_exc()
        return False
347                 
348
def checkAndRecordState(l_nodes, cohash):
    """Probe every node in ``l_nodes`` that is behind the current round.

    Nodes whose stored round is already at ``global_round`` are skipped
    unless ``config.force`` is set.  The rest are queued on a 20-thread pool
    that runs collectPingAndSSH and hands results to recordPingAndSSH.  The
    function then polls the pool once a second until all results are in, the
    user interrupts, or 1.5 hours have passed (in which case the whole
    process is terminated with status 1).
    """
    global global_round
    global count

    pool = threadpool.ThreadPool(20)

    # Queue one work request per node that needs refreshing.
    for nodename in l_nodes:
        sync_row = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round': 0})
        node_round = sync_row.round
        sync_row.flush()

        if not (node_round < global_round or config.force):
            # Already recorded for this round: just report it.
            count += 1
            print("%d %s %s" % (count, nodename, node_round))
            continue

        work = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
                                      None, recordPingAndSSH, handle_exception)
        pool.putRequest(work)

    # Poll until the pool has nothing left to report.
    time_limit = 60*60*1.5
    started = time.time()
    while True:
        try:
            time.sleep(1)
            pool.poll()
            if time.time() - started > time_limit:
                print("findbad.py has run out of time!!!!!!")
                os._exit(1)
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break

    print(FindbadNodeRecordSync.query.count())
    print(FindbadNodeRecord.query.count())
    session.flush()
393
def main():
    """Select the nodes to probe from the command-line options and probe them.

    The round number is read from the "global" sync record and, with
    ``config.increment``, bumped before probing and written back afterwards.
    Returns 0.
    """
    global global_round

    fbsync = FindbadNodeRecordSync.findby_or_create(
        hostname="global", if_new_set={'round': global_round})
    global_round = fbsync.round

    if config.increment:
        # A higher global round makes every node look stale, forcing a refresh.
        global_round += 1

    # Neither of these two values is used further down in this function.
    # lastcotop measures whether cotop is actually running, which is a better
    # metric than sshstatus or other values from CoMon.
    cotop = comon.Comon()
    cotop_url = COMON_COTOPURL

    # CoMon history is currently not fetched; every node gets placeholders.
    cohash = {}

    candidates = plccache.l_nodes
    if config.nodelist:
        wanted = util.file.getListFromFile(config.nodelist)
        candidates = [rec for rec in candidates if rec['hostname'] in wanted]
    elif config.node:
        wanted = [config.node]
        candidates = [rec for rec in candidates if rec['hostname'] in wanted]
    elif config.nodegroup:
        groups = api.GetNodeGroups({'name': config.nodegroup})
        candidates = api.GetNodes(groups[0]['node_ids'])
    elif config.site:
        sites = api.GetSites(config.site)
        candidates = api.GetNodes(sites[0]['node_ids'], ['hostname'])

    hostnames = [rec['hostname'] for rec in candidates]

    # Run this query after the options above so that their filters, which
    # expect node records rather than plain hostnames, do not break.
    if config.nodeselect:
        local_nodes = api.GetNodes({'peer_id': None}, ['hostname'])
        local_names = [rec['hostname'] for rec in local_nodes]
        hostnames = node_select(config.nodeselect, local_names, None)

    print("fetching %s hosts" % len(hostnames))

    checkAndRecordState(hostnames, cohash)

    if config.increment:
        # Persist the bumped round number now that the run has finished.
        fbsync.round = global_round
        fbsync.flush()

    return 0
446
447
if __name__ == '__main__':
    # Script entry point: build the option parser, parse the command line and
    # run main().
    from monitor import parser as parsermodule

    parser = parsermodule.getParser(['nodesets'])

    parser.set_defaults(increment=False, dbname="findbad", cachenodes=False,
                        force=False)
    parser.add_option("", "--cachenodes", action="store_true",
                      help="Cache node lookup from PLC")
    parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                      help="Specify the name of the database to which the information is saved")
    parser.add_option("-i", "--increment", action="store_true", dest="increment",
                      help="Increment round number to force refresh or retry")
    parser.add_option("", "--force", action="store_true", dest="force",
                      help="Force probe without incrementing global 'round'.")

    parser = parsermodule.getParser(['defaults'], parser)

    cfg = parsermodule.parse_args(parser)

    try:
        main()
    except Exception:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so the syntax is valid on every Python version.
        err = sys.exc_info()[1]
        # print_exc() writes the traceback itself and returns None.
        traceback.print_exc()
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        # NOTE(review): the exit status is 0 even though the run failed;
        # kept as-is in case callers rely on it -- confirm.
        sys.exit(0)
    print("sleeping")
476         #print "final commit"
477         #time.sleep(10)