7 from datetime import datetime,timedelta
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
17 from monitor.sources import comon
18 from monitor.wrapper import plc, plccache
20 from nodequery import verify,query_to_dict,node_select
23 #print "starting sqlfindbad.py"
# Module-level constants and shared PLC handles.
# NOTE(review): this file appears to be a sampled extract — each line carries
# its original line number and intermediate lines are missing, so several
# statements below are visibly truncated.
#
# CoMon "cotop" tabulator URL used to fetch per-node status columns
# (response time, ssh status, uptime, lastcotop, hardware specs).
25 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
26 "table=table_nodeview&" + \
27 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
# NOTE(review): the continuation above ends with '\' but the next visible
# line is a comment — the final select-clause string is missing from this
# extract (it likely matched the commented alternative below).
30 #"select='lastcotop!=0'"
# Authenticated PLC API handle shared by the whole module.
32 api = plc.getAuthAPI()
# Serializes access to the PLC API from worker threads (used around the
# plc.getNodes / plc.getSites calls; acquisition lines not visible here).
33 plc_lock = threading.Lock()
# Probe a single node and build a 'values' dict describing its health.
# Visible steps: (1) ping once; (2) ssh on port 22 then 806 running a remote
# heredoc script that reports kernel, bm.log, bootcd ID, node-manager, a
# read-only-fs test, DNS, and princeton_comon slice facts; (3) classify
# 'state' (BOOT/DEBUG/DOWN) and 'category' from kernel/bootcd strings;
# (4) merge CoMon stats from 'cohash' and PLC node/pcu/site records.
# Returns (nodename, values).
# NOTE(review): many intermediate lines (try/except headers, else branches,
# the initialization of 'values' and of the 'ping' helper) are missing from
# this extract, so the control flow below is incomplete as shown.
38 def collectPingAndSSH(nodename, cohash):
39 ### RUN PING ######################
# 'ping' is a command-runner helper created on a line not visible here.
41 (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
# Empty output from 'grep rtt' means no reply (branch keywords not visible).
48 values['ping'] = "NOPING"
50 values['ping'] = "PING"
# Try the standard ssh port first, then 806 (PlanetLab's alternate port).
53 for port in [22, 806]:
54 ssh = command.SSH('root', nodename, port)
# Remote heredoc script: each echo emits a '"key":"value",' line so the
# combined output can be parsed as a Python dict literal below.
56 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
58 echo ' "kernel":"'`uname -a`'",'
59 echo ' "bmlog":"'`ls /tmp/bm.log`'",'
60 echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
61 echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
62 echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
63 echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",'
64 echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
# Look up the princeton_comon slice uid (>500) to inspect its vserver context.
66 ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
67 echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
68 echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
72 values['ssherror'] = errval
74 #print "OVAL: %s" % oval
# NOTE(review): eval() of text fetched from the remote node is unsafe if the
# node is compromised — a strict parser (e.g. JSON) would be safer here.
75 values.update(eval(oval))
76 values['sshport'] = port
# Fallback defaults when the ssh probe produced no parseable output.
79 values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '',
83 'princeton_comon' : "",
84 'princeton_comon_running' : "",
85 'princeton_comon_procs' : "", 'sshport' : None})
# NOTE(review): traceback.print_exc() returns None, so this prints an extra
# 'None' after the traceback; a bare traceback.print_exc() would suffice.
87 print traceback.print_exc()
90 ### RUN SSH ######################
92 #ssh = command.SSH('root', nodename)
95 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Classify by kernel version string: 2.6.17 / 2.6.2x => current production.
97 oval = values['kernel']
98 if "2.6.17" in oval or "2.6.2" in oval:
100 values['category'] = 'PROD'
# Presence of bm.log indicates the node booted into BootManager debug mode.
101 if "bm.log" in values['bmlog']:
102 values['state'] = 'DEBUG'
104 values['state'] = 'BOOT'
105 elif "2.6.12" in oval or "2.6.10" in oval:
106 values['ssh'] = 'SSH'
107 values['category'] = 'OLDPROD'
108 if "bm.log" in values['bmlog']:
109 values['state'] = 'DEBUG'
111 values['state'] = 'BOOT'
113 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why.
114 elif "2.4" in oval or "2.6.8" in oval:
115 b_getbootcd_id = False
116 values['ssh'] = 'SSH'
117 values['category'] = 'OLDBOOTCD'
118 values['state'] = 'DEBUG'
# Unrecognized kernel but ssh worked: category UNKNOWN (else keyword missing
# from this extract).
120 values['ssh'] = 'SSH'
121 values['category'] = 'UNKNOWN'
122 if "bm.log" in values['bmlog']:
123 values['state'] = 'DEBUG'
125 values['state'] = 'BOOT'
# No ssh response at all on either port: mark the node DOWN.
128 b_getbootcd_id = False
129 values['ssh'] = 'NOSSH'
130 values['category'] = 'ERROR'
131 values['state'] = 'DOWN'
# NOTE(review): 'val' is not bound on any visible line — presumably set by a
# missing except/assignment line in this extract.
133 values['ssherror'] = val
134 values['kernel'] = ""
136 #values['kernel'] = val
139 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
140 #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
141 oval = values['bootcd']
143 values['bootcd'] = oval
# BUG(review): 'is not' on string literals compares object *identity*, not
# equality — whether this excludes the two UNC nodes is implementation-
# dependent. The intended comparison is '!='.
144 if "v2" in oval and \
145 ( nodename is not "planetlab1.cs.unc.edu" and \
146 nodename is not "planetlab2.cs.unc.edu" ):
147 values['category'] = 'OLDBOOTCD'
149 values['bootcd'] = ""
151 values['bootcd'] = ""
153 # TODO: get bm.log for debug nodes.
156 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
# Check the princeton_comon slice in three stages, aborting early when a
# prerequisite fails: slice dir exists -> vserver context running -> procs.
163 continue_slice_check = True
164 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
165 oval = values['princeton_comon']
166 if "princeton_comon" in oval:
167 values['princeton_comon'] = True
169 values['princeton_comon'] = False
170 continue_slice_check = False
172 if continue_slice_check:
173 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
174 oval = values['princeton_comon_running']
# Output longer than '/proc/virtual/' means the remote ls matched a context.
175 if len(oval) > len('/proc/virtual/'):
176 values['princeton_comon_running'] = True
178 values['princeton_comon_running'] = False
179 continue_slice_check = False
181 values['princeton_comon_running'] = False
183 if continue_slice_check:
184 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
185 oval = values['princeton_comon_procs']
# int() raises on non-numeric output; the except line is missing here.
186 values['princeton_comon_procs'] = int(oval)
188 values['princeton_comon_procs'] = None
# Attach CoMon stats collected earlier; '-1' sentinel when the node is absent
# (remaining keys of the default dict are on missing lines).
191 if nodename in cohash:
192 values['comonstats'] = cohash[nodename]
194 values['comonstats'] = {'resptime': '-1',
201 # include output value
202 ### GET PLC NODE ######################
# The PLC lookups below presumably run under plc_lock / try-except on lines
# not visible in this extract.
206 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
207 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
209 traceback.print_exc()
211 values['plcnode'] = d_node
213 ### GET PLC PCU ######################
217 pcu = d_node['pcu_ids']
221 site_id = d_node['site_id']
# NOTE(review): 'd_pcu' is not assigned on any visible line.
223 values['pcu'] = d_pcu
225 ### GET PLC SITE ######################
228 values['loginbase'] = ""
230 d_site = plc.getSites({'site_id': site_id},
231 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
232 values['loginbase'] = d_site['login_base']
234 traceback.print_exc()
237 values['plcsite'] = d_site
# Timestamp of probe completion (seconds since epoch).
238 values['date_checked'] = time.time()
# NOTE(review): prints 'None' after the traceback (print_exc returns None).
240 print traceback.print_exc()
242 return (nodename, values)
# threadpool result callback: persist the probe 'values' for one node into
# FindbadNodeRecord and bump the node's FindbadNodeRecordSync round marker.
# NOTE(review): 'global_round' and 'count' are read but never assigned on
# any visible line — presumably declared/initialized on lines missing from
# this extract (e.g. a 'global global_round; global count' statement).
244 def recordPingAndSSH(request, result):
247 (nodename, values) = result
250 if values is not None:
# Fetch (or create) the "global" sync row to learn the current round.
251 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
252 if_new_set={'round' : global_round})
253 global_round = fbsync.round
254 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
255 if_new_set={'round' : global_round})
257 # NOTE: This code will either add a new record for the new global_round,
258 # OR it will find the previous value, and update it
259 # with new information.
260 # The data that is 'lost' is not that important, b/c older
261 # history still exists.
262 fbrec = FindbadNodeRecord.findby_or_create(
265 before = fbrec.to_dict()
266 print "BEFORE, ", before
269 print "Setting VALUES"
# Map the raw probe dict onto the record's typed columns; ping/ssh strings
# are converted to booleans, the epoch timestamp to a datetime.
270 fbrec.set( date_checked=datetime.fromtimestamp(values['date_checked']),
271 loginbase=values['loginbase'],
272 kernel_version=values['kernel'],
273 bootcd_version=values['bootcd'],
274 nm_status=values['nm'],
275 fs_status=values['readonlyfs'],
276 dns_status=values['dns'],
277 princeton_comon_dir=values['princeton_comon'],
278 princeton_comon_running=values['princeton_comon_running'],
279 princeton_comon_procs=values['princeton_comon_procs'],
280 plc_node_stats = values['plcnode'],
281 plc_site_stats = values['plcsite'],
282 plc_pcuid = values['pcu'],
283 comon_stats = values['comonstats'],
284 ping_status = (values['ping'] == "PING"),
285 ssh_portused = values['sshport'],
286 ssh_status = (values['ssh'] == "SSH"),
287 ssh_error = values['ssherror'],
288 observed_status = values['state'],
289 observed_category = values['category'])
290 after = fbrec.to_dict()
291 print "AFTER , ", after
# Debug dump: report which columns the update left unchanged.
293 for v in before.keys():
294 if before[v] == after[v]:
295 print "SAME FOR KEY %s" % v
296 print "%s : %s\t%s" % ( v, before[v], after[v] )
# Mark this node as processed for the current round.
299 fbnodesync.round = global_round
304 print "%d %s %s" % (count, nodename, values)
# NOTE(review): prints 'None' after the traceback (print_exc returns None).
307 print traceback.print_exc()
309 # this will be called when an exception occurs within a thread
# threadpool exception callback: logs the failing request id and result.
310 def handle_exception(request, result):
311 print "Exception occured in request %s" % request.requestID
# NOTE(review): 'i' is not bound on any visible line — presumably set by a
# loop over 'result' on a line missing from this extract.
313 print "Result: %s" % i
# NOTE(review): the def line of this helper is missing from this extract; it
# appears to probe a single 'hostname' synchronously (empty CoMon hash) and
# record the result, printing a traceback on failure.
317 (nodename, values) = collectPingAndSSH(hostname, {})
318 recordPingAndSSH(None, (nodename, values))
322 print traceback.print_exc()
# Fan node probes out over a 20-thread pool; each work request runs
# collectPingAndSSH and feeds its result to recordPingAndSSH. Nodes whose
# per-node sync round already matches the global round are skipped.
# NOTE(review): 'global_round', 'count', 'begin', and the try/poll loop
# around the timeout check are on lines missing from this extract.
326 def checkAndRecordState(l_nodes, cohash):
330 tp = threadpool.ThreadPool(20)
332 # CREATE all the work requests
333 for nodename in l_nodes:
334 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
335 node_round = fbnodesync.round
# Only (re)probe nodes that have not yet been seen in the current round.
338 if node_round < global_round:
339 # recreate node stats when refreshed
340 #print "%s" % nodename
341 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
342 None, recordPingAndSSH, handle_exception)
345 # We just skip it, since it's "up to date"
347 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
348 print "%d %s %s" % (count, nodename, node_round)
350 # WAIT while all the work requests are processed.
356 # if more than two hours
# NOTE(review): the comment above says "two hours" but the check below is
# 60*60*1.5, i.e. a 1.5-hour watchdog — one of the two is stale.
357 if time.time() - begin > (60*60*1.5):
358 print "findbad.py has run out of time!!!!!!"
360 except KeyboardInterrupt:
363 except threadpool.NoResultsPending:
364 print "All results collected."
# Sanity output: row counts of the sync and record tables after the sweep.
367 print FindbadNodeRecordSync.query.count()
368 print FindbadNodeRecord.query.count()
# NOTE(review): these lines appear to be the body of a main()-style driver
# whose def line is missing from this extract.
# Load (or create) the global round counter from the sync table.
374 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
375 if_new_set={'round' : global_round})
376 global_round = fbsync.round
379 # update global round number to force refreshes across all nodes
# Presumably guarded by 'if config.increment:' on a missing line.
381 fbsync.round = global_round
385 cotop = comon.Comon()
386 # lastcotop measures whether cotop is actually running. this is a better
387 # metric than sshstatus, or other values from CoMon
388 cotop_url = COMON_COTOPURL
390 # history information for all nodes
392 #cohash = cotop.coget(cotop_url)
# Start from the cached full node list, then narrow per CLI options:
# nodelist file, single node, nodegroup, or site (elif chain partly missing).
393 l_nodes = plccache.l_nodes
395 f_nodes = util.file.getListFromFile(config.nodelist)
396 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
398 f_nodes = [config.node]
399 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
400 elif config.nodegroup:
401 ng = api.GetNodeGroups({'name' : config.nodegroup})
402 l_nodes = api.GetNodes(ng[0]['node_ids'])
404 site = api.GetSites(config.site)
405 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
# Flatten node dicts to bare hostnames for the probe loop.
407 l_nodes = [node['hostname'] for node in l_nodes]
409 # perform this query after the above options, so that the filter above
411 if config.nodeselect:
# Only local (non-peer) nodes are candidates for --nodeselect queries.
412 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
413 plcnodes = [ node['hostname'] for node in plcnodes ]
414 l_nodes = node_select(config.nodeselect, plcnodes, None)
416 print "fetching %s hosts" % len(l_nodes)
418 checkAndRecordState(l_nodes, cohash)
# Script entry point: assemble the option parser, parse config, then run the
# probe sweep (the actual call into the driver is on a missing line; the
# try header matching the except below is also missing from this extract).
423 if __name__ == '__main__':
424 from monitor import parser as parsermodule
426 parser = parsermodule.getParser(['nodesets'])
428 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
429 parser.add_option("", "--cachenodes", action="store_true",
430 help="Cache node lookup from PLC")
431 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
432 help="Specify the name of the database to which the information is saved")
433 parser.add_option("-i", "--increment", action="store_true", dest="increment",
434 help="Increment round number to force refresh or retry")
436 parser = parsermodule.getParser(['defaults'], parser)
438 cfg = parsermodule.parse_args(parser)
# Python 2 except syntax; best-effort shutdown message on any error.
442 except Exception, err:
# NOTE(review): prints 'None' after the traceback (print_exc returns None).
443 print traceback.print_exc()
444 print "Exception: %s" % err
445 print "Saving data... exitting."
448 #print "final commit"