7 from datetime import datetime,timedelta
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
17 from monitor.sources import comon
18 from monitor.wrapper import plc, plccache
20 from nodequery import verify,query_to_dict,node_select
22 from nodecommon import nmap_port_status
24 #print "starting sqlfindbad.py"
# Base query URL for the CoMon tabulator CGI; dumpcols names the per-node
# columns requested from table_nodeview.
# NOTE(review): the expression ends on a line continuation and its final
# operand(s) are not visible in this excerpt -- confirm against the full file.
26 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
27 "table=table_nodeview&" + \
28 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
31 #"select='lastcotop!=0'"
# PLC API handle obtained from the plc wrapper; the node-selection code
# further down calls GetNodeGroups / GetNodes / GetSites on it.
33 api = plc.getAuthAPI()
# Lock created next to the API handle; none of the visible lines acquire it.
# NOTE(review): `threading` is not among the visible imports -- confirm it is
# imported in the elided header lines.
34 plc_lock = threading.Lock()
# collectNMAP: probe ports 22, 80 and 806 on `nodename` with nmap.
# Returns (nodename, values); the visible lines store 'port_status' (as
# parsed by nmap_port_status from the nmap output) and 'date_checked' (the
# time of the probe) in `values`.
# `cohash` is not used in the visible lines.
# NOTE(review): the statement that creates `values` is elided from this
# excerpt -- presumably an empty dict; confirm against the full file.
39 def collectNMAP(nodename, cohash):
40 #### RUN NMAP ###############################
42 nmap = util.command.CMD()
# Echo the exact command line before running it.
43 print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
# NOTE(review): the local name `eval` shadows the builtin inside this function.
44 (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
45 # NOTE: an empty / error value for oval, will still work.
# continue_probe is assigned here but not read in the visible lines.
46 (values['port_status'], continue_probe) = nmap_port_status(oval)
48 values['date_checked'] = datetime.now()
50 return (nodename, values)
# collectPingAndSSH: gather the status record for one node.
# Visible steps, in order: ping the host; run a diagnostic shell script over
# SSH (ports 22 and 806 are iterated); classify the node from its kernel
# version string and the presence of bm.log; normalise the bootcd,
# node-manager and princeton_comon fields; attach CoMon statistics from
# `cohash`; query PLC for the node, its PCU ids and its site; run an nmap
# probe through collectNMAP.
# Returns (nodename, values).
# NOTE(review): many control-flow lines (try/except/if/else headers) and the
# statements that create `values`, `ping`, `d_node`, `d_pcu` and `d_site`
# are elided from this excerpt; the comments below describe only what the
# visible lines show.
52 def collectPingAndSSH(nodename, cohash):
53 ### RUN PING ######################
# One ping packet in quiet mode; only the "rtt" summary line is kept.
55 (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
# The test that selects between the next two assignments is elided.
62 values['ping_status'] = False
64 values['ping_status'] = True
# Iterate over the candidate SSH ports. The command passed to run_noexcept2
# is a here-document whose echo lines print "key":"value", pairs; the output
# is later passed to eval() and merged into `values`.
67 for port in [22, 806]:
68 ssh = command.SSH('root', nodename, port)
70 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
72 echo ' "kernel_version":"'`uname -a`'",'
73 echo ' "bmlog":"'`ls /tmp/bm.log`'",'
74 echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
75 echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
76 echo ' "fs_status":"'`touch /var/log/monitor 2>&1`'",'
77 echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
78 echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'
80 ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
81 echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
82 echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
86 values['ssh_error'] = errval
88 #print "OVAL: %s" % oval
# NOTE(review): eval() executes text produced by the remote node. A
# misbehaving or compromised node could inject arbitrary Python here;
# consider a non-executing parser.
89 values.update(eval(oval))
# Remember which port produced this result.
90 values['ssh_portused'] = port
# Empty defaults for the SSH-derived fields; the branch header guarding this
# call and part of the dict literal are elided from this excerpt.
93 values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '',
97 'princeton_comon_dir' : "",
98 'princeton_comon_running' : "",
99 'princeton_comon_procs' : "", 'ssh_portused' : None})
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so wrapping it in a print statement additionally prints "None".
101 print traceback.print_exc()
104 ### RUN SSH ######################
# Flag set to False on the OLDBOOTCD and ERROR paths below; the statement
# that reads it is not visible in this excerpt.
105 b_getbootcd_id = True
106 #ssh = command.SSH('root', nodename)
109 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Classify the node by substring tests on the reported kernel version.
111 oval = values['kernel_version']
# "2.6.2" is a substring test, so it matches any version text containing
# "2.6.2" (for example 2.6.20 through 2.6.29).
112 if "2.6.17" in oval or "2.6.2" in oval:
113 values['ssh_status'] = True
114 values['observed_category'] = 'PROD'
# bm.log present -> DEBUG; the BOOT assignment is presumably the else arm
# (the `else:` line is elided).
115 if "bm.log" in values['bmlog']:
116 values['observed_status'] = 'DEBUG'
118 values['observed_status'] = 'BOOT'
119 elif "2.6.12" in oval or "2.6.10" in oval:
120 values['ssh_status'] = True
121 values['observed_category'] = 'OLDPROD'
122 if "bm.log" in values['bmlog']:
123 values['observed_status'] = 'DEBUG'
125 values['observed_status'] = 'BOOT'
127 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why.
128 elif "2.4" in oval or "2.6.8" in oval:
129 b_getbootcd_id = False
130 values['ssh_status'] = True
131 values['observed_category'] = 'OLDBOOTCD'
132 values['observed_status'] = 'DEBUG'
# Branch header elided: category UNKNOWN, status again chosen by the
# presence of bm.log.
134 values['ssh_status'] = True
135 values['observed_category'] = 'UNKNOWN'
136 if "bm.log" in values['bmlog']:
137 values['observed_status'] = 'DEBUG'
139 values['observed_status'] = 'BOOT'
# Branch header elided: SSH is recorded as failed and the node as ERROR/DOWN.
142 b_getbootcd_id = False
143 values['ssh_status'] = False
144 values['observed_category'] = 'ERROR'
145 values['observed_status'] = 'DOWN'
# NOTE(review): `val` is not assigned anywhere in the visible lines --
# confirm it is defined on an elided line.
147 values['ssh_error'] = val
148 values['kernel_version'] = ""
150 #values['kernel_version'] = val
153 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
154 #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
155 oval = values['bootcd_version']
157 values['bootcd_version'] = oval
# A "v2" boot CD re-categorises the node as OLDBOOTCD, except for the two
# named hosts.
# NOTE(review): `is not` compares object identity, not string contents, so
# these two hostname tests are not reliable equality checks; `!=` was
# probably intended.
158 if "v2" in oval and \
159 ( nodename is not "planetlab1.cs.unc.edu" and \
160 nodename is not "planetlab2.cs.unc.edu" ):
161 values['observed_category'] = 'OLDBOOTCD'
# The conditions selecting the two resets below are elided.
163 values['bootcd_version'] = ""
165 values['bootcd_version'] = ""
167 # TODO: get bm.log for debug nodes.
170 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
# Collapse the raw `ps` output for nm.py into "Y"/"N"; the test itself is
# elided.
171 oval = values['nm_status']
173 values['nm_status'] = "Y"
175 values['nm_status'] = "N"
# continue_slice_check gates the two follow-up princeton_comon checks below.
177 continue_slice_check = True
178 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
179 oval = values['princeton_comon_dir']
# NOTE(review): this looks for the substring "princeton_comon_dir", while the
# remote command lists /vservers/princeton_comon; that path does not contain
# "_dir", so this test may never succeed -- confirm the intended substring.
180 if "princeton_comon_dir" in oval:
181 values['princeton_comon_dir'] = True
183 values['princeton_comon_dir'] = False
184 continue_slice_check = False
186 if continue_slice_check:
187 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
188 oval = values['princeton_comon_running']
# Output longer than the bare '/proc/virtual/' prefix is treated as running.
189 if len(oval) > len('/proc/virtual/'):
190 values['princeton_comon_running'] = True
192 values['princeton_comon_running'] = False
193 continue_slice_check = False
195 values['princeton_comon_running'] = False
197 if continue_slice_check:
198 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
199 oval = values['princeton_comon_procs']
# Convert the `wc -l` text to an integer process count.
200 values['princeton_comon_procs'] = int(oval)
202 values['princeton_comon_procs'] = None
# Use this node's CoMon row when `cohash` has one; otherwise a default dict
# is stored (the rest of that literal is elided).
205 if nodename in cohash:
206 values['comon_stats'] = cohash[nodename]
208 values['comon_stats'] = {'resptime': '-1',
215 # include output value
216 ### GET PLC NODE ######################
# Fetch the listed fields for this hostname and take the first match.
220 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
221 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
223 traceback.print_exc()
225 values['plc_node_stats'] = d_node
227 ##### NMAP ###################
# Port probe for the same host; what is done with (n, v) is on elided lines.
228 (n, v) = collectNMAP(nodename, None)
231 ### GET PLC PCU ######################
235 pcu = d_node['pcu_ids']
239 site_id = d_node['site_id']
# NOTE(review): the assignment of d_pcu is elided from this excerpt.
241 values['plc_pcuid'] = d_pcu
243 ### GET PLC SITE ######################
# loginbase starts as "" and is overwritten below once the site lookup
# returns.
246 values['loginbase'] = ""
248 d_site = plc.getSites({'site_id': site_id},
249 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
250 values['loginbase'] = d_site['login_base']
252 traceback.print_exc()
255 values['plc_site_stats'] = d_site
256 values['date_checked'] = datetime.now()
# NOTE(review): as above, print of traceback.print_exc() also prints "None".
258 print traceback.print_exc()
260 return (nodename, values)
# recordPingAndSSH: store one node's probe result in the database.
# It is passed to threadpool.WorkRequest in checkAndRecordState and is also
# called directly by externalprobe with request=None.
#   request -- the originating work request; not read in the visible lines.
#   result  -- a (nodename, values) tuple as returned by collectNMAP or
#              collectPingAndSSH.
# When `values` is not None the per-node sync record and the node record are
# looked up or created, every key in `values` is applied to the node record,
# and the sync record's round is set to global_round.
# NOTE(review): the try/except headers, the arguments of the
# FindbadNodeRecord lookup and the handling of `count` / `global_round` are
# elided from this excerpt.
262 def recordPingAndSSH(request, result):
265 (nodename, values) = result
268 if values is not None:
269 #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
270 # if_new_set={'round' : global_round})
271 #global_round = fbsync.round
# Per-node sync row; a new row starts at the current global_round.
272 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
273 if_new_set={'round' : global_round})
275 # NOTE: This code will either add a new record for the new global_round,
276 # OR it will find the previous value, and update it
277 # with new information.
278 # The data that is 'lost' is not that important, b/c older
279 # history still exists.
280 fbrec = FindbadNodeRecord.findby_or_create(
# Apply every collected key/value to the record in one call; the
# commented-out keyword list below is the older explicit form.
284 fbrec.set( **values )
285 #date_checked=values['date_checked'],
286 #loginbase=values['loginbase'],
287 #kernel_version=values['kernel_version'],
288 #bootcd_version=values['bootcd_version'],
289 #nm_status=values['nm_status'],
290 #fs_status=values['fs_status'],
291 #dns_status=values['dns_status'],
292 #princeton_comon_dir=values['princeton_comon_dir'],
293 #princeton_comon_running=values['princeton_comon_running'],
294 #princeton_comon_procs=values['princeton_comon_procs'],
295 #plc_node_stats = values['plc_node_stats'],
296 #plc_site_stats = values['plc_site_stats'],
297 #plc_pcuid = values['plc_pcuid'],
298 #comon_stats = values['comon_stats'],
299 #ping_status = values['ping_status'],
300 #ssh_portused = values['ssh_portused'],
301 #ssh_status = values['ssh_status'],
302 #ssh_error = values['ssh_error'],
303 #observed_status = values['observed_status'],
304 #observed_category = values['observed_category'])
306 #for v in before.keys():
307 # if before[v] == after[v]:
308 # print "SAME FOR KEY %s" % v
309 # print "%s : %s\t%s" % ( v, before[v], after[v] )
# Mark this node as refreshed for the current round.
312 fbnodesync.round = global_round
# Progress line: running count, hostname and the full values dict.
317 print "%d %s %s" % (count, nodename, values)
# NOTE(review): traceback.print_exc() returns None, so this print statement
# also emits "None" after the traceback.
320 print traceback.print_exc()
322 # this will be called when an exception occurs within a thread
# handle_exception: passed as the last argument to threadpool.WorkRequest in
# checkAndRecordState. Prints the failing request's requestID and then
# items of `result`.
# NOTE(review): the loop header that binds `i` is elided from this excerpt
# -- presumably an iteration over `result`; confirm.
323 def handle_exception(request, result):
324 print "Exception occured in request %s" % request.requestID
326 print "Result: %s" % i
# externalprobe: probe a single host outside the thread pool and record the
# result immediately, passing None as the request to recordPingAndSSH.
# Both collectors are called with an empty dict for `cohash`.
# NOTE(review): the try/except headers and any return statements are elided.
# Several source lines are also missing between the nmap sequence and the
# ping/SSH sequence below, so the second sequence may belong to a separate
# definition whose header is not visible -- confirm against the full file.
328 def externalprobe(hostname):
330 (nodename, values) = collectNMAP(hostname, {})
331 recordPingAndSSH(None, (nodename, values))
# Prints the traceback; the print statement additionally emits "None"
# because traceback.print_exc() returns None.
335 print traceback.print_exc()
340 (nodename, values) = collectPingAndSSH(hostname, {})
341 recordPingAndSSH(None, (nodename, values))
345 print traceback.print_exc()
# checkAndRecordState: probe every hostname in `l_nodes` using a thread pool.
#   l_nodes -- iterable of hostnames.
#   cohash  -- CoMon statistics mapping handed through to collectPingAndSSH.
# For each node the per-node sync record is looked up (created with round 0
# if new). A work request is built when the node's round is behind
# global_round or config.force is set; otherwise the node is reported as up
# to date. The function then waits for results and prints the record counts.
# NOTE(review): the lines that enqueue the request, maintain `count` and
# `begin`, and drive the wait loop are elided from this excerpt.
349 def checkAndRecordState(l_nodes, cohash):
# ThreadPool constructed with an argument of 20.
353 tp = threadpool.ThreadPool(20)
355 # CREATE all the work requests
356 for nodename in l_nodes:
357 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
358 node_round = fbnodesync.round
361 if node_round < global_round or config.force:
362 # recreate node stats when refreshed
363 #print "%s" % nodename
# collectPingAndSSH runs as the work item; recordPingAndSSH and
# handle_exception are passed as the trailing two arguments.
364 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
365 None, recordPingAndSSH, handle_exception)
368 # We just skip it, since it's "up to date"
370 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
371 print "%d %s %s" % (count, nodename, node_round)
373 # WAIT while all the work requests are processed.
# NOTE(review): the limit below is 60*60*1.5 seconds, i.e. 1.5 hours, not
# the two hours mentioned in the existing comment.
379 # if more than two hours
380 if time.time() - begin > (60*60*1.5):
381 print "findbad.py has run out of time!!!!!!"
383 except KeyboardInterrupt:
# NoResultsPending is handled by reporting that all results are in.
386 except threadpool.NoResultsPending:
387 print "All results collected."
# Report how many sync rows and node rows the database now holds.
390 print FindbadNodeRecordSync.query.count()
391 print FindbadNodeRecord.query.count()
# Driver sequence: load the global round, build the list of hostnames to
# probe from the config options, run checkAndRecordState, then store the
# round back on the "global" sync record.
# NOTE(review): the enclosing `def` line, several branch headers and the
# assignment of `cohash` are elided from this excerpt.
# The "global" sync row holds the current round number; a new row is created
# with the present value of global_round.
397 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
398 if_new_set={'round' : global_round})
399 global_round = fbsync.round
402 # update global round number to force refreshes across all nodes
# The statements this comment refers to are not visible here.
405 cotop = comon.Comon()
406 # lastcotop measures whether cotop is actually running. this is a better
407 # metric than sshstatus, or other values from CoMon
408 cotop_url = COMON_COTOPURL
410 # history information for all nodes
# The CoMon fetch is commented out, so `cohash` must be set on an elided line.
412 #cohash = cotop.coget(cotop_url)
# Start from the cached PLC node list, then narrow it by the config options.
413 l_nodes = plccache.l_nodes
# Keep only nodes whose hostname appears in the config.nodelist file.
415 f_nodes = util.file.getListFromFile(config.nodelist)
416 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
# Keep only the single hostname given in config.node.
418 f_nodes = [config.node]
419 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
420 elif config.nodegroup:
# First node group matching the name; its node_ids are fetched from PLC.
421 ng = api.GetNodeGroups({'name' : config.nodegroup})
422 l_nodes = api.GetNodes(ng[0]['node_ids'])
# Site lookup (branch header elided); only 'hostname' is requested.
424 site = api.GetSites(config.site)
425 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
# Reduce the node dicts to plain hostnames.
427 l_nodes = [node['hostname'] for node in l_nodes]
429 # perform this query after the above options, so that the filter above
# node_select is given the hostnames of nodes with peer_id None, and its
# result replaces l_nodes.
431 if config.nodeselect:
432 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
433 plcnodes = [ node['hostname'] for node in plcnodes ]
434 l_nodes = node_select(config.nodeselect, plcnodes, None)
436 print "fetching %s hosts" % len(l_nodes)
438 checkAndRecordState(l_nodes, cohash)
441 # update global round number to force refreshes across all nodes
442 fbsync.round = global_round
# Script entry point: build the option parser on top of the shared monitor
# parser, parse the arguments, and report any exception that escapes.
# NOTE(review): the `try:` header, the call it guards and the statements
# after the final print are elided from this excerpt.
448 if __name__ == '__main__':
449 from monitor import parser as parsermodule
# Start from the shared 'nodesets' option group.
451 parser = parsermodule.getParser(['nodesets'])
# Defaults for the options declared below (the rest of this call is elided).
453 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False,
455 parser.add_option("", "--cachenodes", action="store_true",
456 help="Cache node lookup from PLC")
457 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
458 help="Specify the name of the database to which the information is saved")
459 parser.add_option("-i", "--increment", action="store_true", dest="increment",
460 help="Increment round number to force refresh or retry")
461 parser.add_option("", "--force", action="store_true", dest="force",
462 help="Force probe without incrementing global 'round'.")
# Layer the shared 'defaults' option group on top of the options above.
464 parser = parsermodule.getParser(['defaults'], parser)
466 cfg = parsermodule.parse_args(parser)
# Python 2 except syntax; catches any Exception raised in the elided try body.
470 except Exception, err:
# traceback.print_exc() returns None, so this print also emits "None".
471 print traceback.print_exc()
472 print "Exception: %s" % err
473 print "Saving data... exitting."
476 #print "final commit"