7 from datetime import datetime,timedelta
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
17 from monitor.sources import comon
18 from monitor.wrapper import plc, plccache
20 from nodequery import verify,query_to_dict,node_select
# Start-up banner; executed at module level (Python 2 print statement).
23 print "starting sqlfindbad.py"
# Query URL for the CoMon tabulator CGI. The visible pieces select the
# table_nodeview table and the columns named in dumpcols.
# NOTE(review): the concatenation continues on lines not shown in this listing.
25 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
26 "table=table_nodeview&" + \
27 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
30 #"select='lastcotop!=0'"
# PLC API handle from plc.getAuthAPI(); used further down for the
# GetNodeGroups, GetSites and GetNodes calls.
32 api = plc.getAuthAPI()
# NOTE(review): no acquire/release of plc_lock appears in the visible lines -
# confirm it is still needed.
33 plc_lock = threading.Lock()
# collectPingAndSSH(nodename, cohash): probe one node and gather what is
# learned about it into the `values` dict.
#
# NOTE(review): this listing omits lines of the function (several
# if/else/try/except headers and some assignments are not shown), so the
# notes below describe only what the visible statements do.
#
# Visible steps, in order:
#   1. run a single ping and record values['ping'] as "PING" or "NOPING";
#   2. run a shell script on the node over SSH as root, trying ports 22 and
#      806, and merge the script output into `values`;
#   3. derive values['ssh'], values['category'] and values['state'] from the
#      kernel string and the bm.log listing;
#   4. normalise the bootcd and princeton_comon* fields;
#   5. attach the CoMon entry from `cohash` and the PLC node, PCU and site
#      records;
#   6. stamp values['date_checked'] with time.time().
#
# Parameters:
#   nodename -- hostname to probe; interpolated into the ping command, passed
#               to command.SSH, and used as the key for `cohash` and the PLC
#               node query.
#   cohash   -- container tested with `nodename in cohash`; its entry for the
#               node becomes values['comonstats'].
#
# Returns the tuple (nodename, values).
38 def collectPingAndSSH(nodename, cohash):
39 ### RUN PING ######################
# One echo request in quiet mode; grep keeps only the line containing "rtt".
# NOTE(review): `ping` is created on a line not shown in this listing.
41 (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
48 values['ping'] = "NOPING"
50 values['ping'] = "PING"
# Try SSH on port 22, then on port 806.
# The heredoc sent to the node echoes one "key":"value", pair per probe:
# kernel (uname -a), bmlog (ls /tmp/bm.log), bootcd (/mnt/cdrom/bootme/ID),
# nm (ps for nm.py), readonlyfs (touch under /var/log), dns (host lookup) and
# three princeton_comon checks keyed on the uid found in /etc/passwd.
# NOTE(review): the opening/closing of the emitted literal and the heredoc
# terminator are on lines not shown here.
53 for port in [22, 806]:
54 ssh = command.SSH('root', nodename, port)
56 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
58 echo ' "kernel":"'`uname -a`'",'
59 echo ' "bmlog":"'`ls /tmp/bm.log`'",'
60 echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
61 echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
62 echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
63 echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",'
64 echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
66 ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
67 echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
68 echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
72 values['ssherror'] = errval
74 #print "OVAL: %s" % oval
# NOTE(review): eval() executes whatever text the remote node printed, so a
# compromised or misbehaving node can run arbitrary Python in this process.
# Consider a literal-only parser (e.g. ast.literal_eval) instead.
75 values.update(eval(oval))
# Remember which port produced the output.
76 values['sshport'] = port
# Empty placeholders for every SSH-derived field; the condition selecting
# this branch is not visible in this listing.
79 values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '',
83 'princeton_comon' : "",
84 'princeton_comon_running' : "",
85 'princeton_comon_procs' : "", 'sshport' : None})
# NOTE(review): traceback.print_exc() prints the traceback itself and returns
# None, so wrapping it in `print` additionally emits a line reading "None".
87 print traceback.print_exc()
90 ### RUN SSH ######################
92 #ssh = command.SSH('root', nodename)
95 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Classify the node from the `uname -a` text stored under 'kernel'.
97 oval = values['kernel']
# Substring tests: "2.6.2" also matches 2.6.20 through 2.6.29.
98 if "2.6.17" in oval or "2.6.2" in oval:
100 values['category'] = 'PROD'
# A bm.log entry in the `ls /tmp/bm.log` output marks the node as DEBUG; the
# other visible assignment in this branch is BOOT.
101 if "bm.log" in values['bmlog']:
102 values['state'] = 'DEBUG'
104 values['state'] = 'BOOT'
105 elif "2.6.12" in oval or "2.6.10" in oval:
106 values['ssh'] = 'SSH'
107 values['category'] = 'OLDPROD'
108 if "bm.log" in values['bmlog']:
109 values['state'] = 'DEBUG'
111 values['state'] = 'BOOT'
113 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why.
114 elif "2.4" in oval or "2.6.8" in oval:
# These kernels are always recorded as OLDBOOTCD / DEBUG and the BootCD ID
# lookup flag is cleared.
115 b_getbootcd_id = False
116 values['ssh'] = 'SSH'
117 values['category'] = 'OLDBOOTCD'
118 values['state'] = 'DEBUG'
# Some other kernel string: SSH worked but the category is UNKNOWN.
# NOTE(review): the branch header for this case is not shown.
120 values['ssh'] = 'SSH'
121 values['category'] = 'UNKNOWN'
122 if "bm.log" in values['bmlog']:
123 values['state'] = 'DEBUG'
125 values['state'] = 'BOOT'
# Failure case: recorded as NOSSH / ERROR / DOWN with an empty kernel string.
128 b_getbootcd_id = False
129 values['ssh'] = 'NOSSH'
130 values['category'] = 'ERROR'
131 values['state'] = 'DOWN'
# NOTE(review): `val` is assigned on a line not shown in this listing.
133 values['ssherror'] = val
134 values['kernel'] = ""
136 #values['kernel'] = val
139 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
140 #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
141 oval = values['bootcd']
143 values['bootcd'] = oval
# A BootCD ID containing "v2" downgrades the category to OLDBOOTCD, except for
# the two named UNC hosts.
# NOTE(review): `is not` compares object identity, not string contents, so the
# two hostname exclusions below are unreliable; `!=` is the value comparison.
144 if "v2" in oval and \
145 ( nodename is not "planetlab1.cs.unc.edu" and \
146 nodename is not "planetlab2.cs.unc.edu" ):
147 values['category'] = 'OLDBOOTCD'
149 values['bootcd'] = ""
151 values['bootcd'] = ""
153 # TODO: get bm.log for debug nodes.
156 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
# The three princeton_comon checks are chained: continue_slice_check is
# cleared as soon as one of them fails.
163 continue_slice_check = True
164 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
# Replace the raw `ls -d /vservers/princeton_comon` output with a boolean.
165 oval = values['princeton_comon']
166 if "princeton_comon" in oval:
167 values['princeton_comon'] = True
169 values['princeton_comon'] = False
170 continue_slice_check = False
172 if continue_slice_check:
173 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
174 oval = values['princeton_comon_running']
# Output longer than the bare '/proc/virtual/' prefix counts as "running".
175 if len(oval) > len('/proc/virtual/'):
176 values['princeton_comon_running'] = True
178 values['princeton_comon_running'] = False
179 continue_slice_check = False
181 values['princeton_comon_running'] = False
183 if continue_slice_check:
184 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
# Convert the `wc -l` text to an int; None is the other visible assignment.
185 oval = values['princeton_comon_procs']
186 values['princeton_comon_procs'] = int(oval)
188 values['princeton_comon_procs'] = None
# Use the CoMon entry for this host when cohash has one; otherwise a
# placeholder dict is built (only its first key, resptime '-1', is shown).
191 if nodename in cohash:
192 values['comonstats'] = cohash[nodename]
194 values['comonstats'] = {'resptime': '-1',
201 # include output value
202 ### GET PLC NODE ######################
# Look the node up in PLC by hostname, requesting only the listed fields;
# [0] takes the first returned record.
206 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
207 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
209 traceback.print_exc()
211 values['plcnode'] = d_node
213 ### GET PLC PCU ######################
217 pcu = d_node['pcu_ids']
221 site_id = d_node['site_id']
# NOTE(review): `d_pcu` is assigned on a line not shown in this listing.
223 values['pcu'] = d_pcu
225 ### GET PLC SITE ######################
# loginbase starts as "" and is overwritten with the site's login_base when
# the site lookup below runs.
228 values['loginbase'] = ""
230 d_site = plc.getSites({'site_id': site_id},
231 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
232 values['loginbase'] = d_site['login_base']
234 traceback.print_exc()
237 values['plcsite'] = d_site
# Time of this check as returned by time.time(); recordPingAndSSH converts it
# with datetime.fromtimestamp.
238 values['date_checked'] = time.time()
# NOTE(review): as above, `print` of print_exc() also emits "None".
240 print traceback.print_exc()
242 return (nodename, values)
# recordPingAndSSH(request, result): store one node's collected values as a
# FindbadNodeRecord and mark the node as checked for the current round.
#
# checkAndRecordState passes this function to threadpool.WorkRequest together
# with collectPingAndSSH (presumably as the completion callback - confirm
# against the threadpool API).
#
# Parameters:
#   request -- not used by any visible line of this function.
#   result  -- unpacked as (nodename, values), the shape collectPingAndSSH
#              returns.
#
# NOTE(review): this listing omits lines of the function (try/except headers
# and some keyword arguments are not shown).
244 def recordPingAndSSH(request, result):
247 (nodename, values) = result
# Nothing is recorded when values is None.
250 if values is not None:
# Read the current round from the "global" sync row.
# NOTE(review): global_round is read here and assigned two lines below; unless
# a `global global_round` declaration exists on a line not shown, the read
# would raise UnboundLocalError.
251 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
252 if_new_set={'round' : global_round})
253 global_round = fbsync.round
# Per-node sync row, created at the current round if it does not exist yet.
254 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
255 if_new_set={'round' : global_round})
# Build the record from the collected values. ping_status and ssh_status are
# stored as booleans derived from the "PING" / "SSH" marker strings.
257 fbrec = FindbadNodeRecord(
258 date_checked=datetime.fromtimestamp(values['date_checked']),
261 loginbase=values['loginbase'],
262 kernel_version=values['kernel'],
263 bootcd_version=values['bootcd'],
264 nm_status=values['nm'],
265 fs_status=values['readonlyfs'],
266 dns_status=values['dns'],
267 princeton_comon_dir=values['princeton_comon'],
268 princeton_comon_running=values['princeton_comon_running'],
269 princeton_comon_procs=values['princeton_comon_procs'],
270 plc_node_stats = values['plcnode'],
271 plc_site_stats = values['plcsite'],
272 plc_pcuid = values['pcu'],
273 comon_stats = values['comonstats'],
274 ping_status = (values['ping'] == "PING"),
275 ssh_portused = values['sshport'],
276 ssh_status = (values['ssh'] == "SSH"),
277 ssh_error = values['ssherror'],
278 observed_status = values['state'],
279 observed_category = values['category'],
# Bring the node's round up to the global round; checkAndRecordState only
# queues nodes whose round is lower than global_round.
281 fbnodesync.round = global_round
# NOTE(review): `count` is not assigned on any visible line of this function.
287 print "%d %s %s" % (count, nodename, values)
# NOTE(review): traceback.print_exc() returns None, so `print` also emits a
# line reading "None" after the traceback.
290 print traceback.print_exc()
292 # this will be called when an exception occurs within a thread
# handle_exception(request, result): report a failed work request on stdout.
# checkAndRecordState passes it as the last argument to threadpool.WorkRequest.
#   request -- the failed request; its requestID attribute is printed.
#   result  -- printed item by item.
# NOTE(review): the loop header that binds `i` is on a line not shown in this
# listing.
293 def handle_exception(request, result):
294 print "Exception occured in request %s" % request.requestID
296 print "Result: %s" % i
# checkAndRecordState(l_nodes, cohash): queue a collectPingAndSSH work request
# for every node whose recorded round is behind global_round, then wait for
# the pool to finish and print the row counts of both tables.
#
# Parameters:
#   l_nodes -- iterable of hostnames.
#   cohash  -- passed through unchanged to collectPingAndSSH.
#
# NOTE(review): this listing omits lines of the function (the else/try/while
# headers and the lines that submit requests and poll the pool are not
# shown).
299 def checkAndRecordState(l_nodes, cohash):
# ThreadPool is constructed with the argument 20.
303 tp = threadpool.ThreadPool(20)
305 # CREATE all the work requests
306 for nodename in l_nodes:
# A node seen for the first time starts at round 0.
307 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
308 node_round = fbnodesync.round
311 if node_round < global_round:
312 # recreate node stats when refreshed
313 #print "%s" % nodename
# recordPingAndSSH and handle_exception are the two handlers given to the
# request; the request ID argument is None.
314 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
315 None, recordPingAndSSH, handle_exception)
318 # We just skip it, since it's "up to date"
320 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
321 print "%d %s %s" % (count, nodename, node_round)
323 # WAIT while all the work requests are processed.
# NOTE(review): the comment below says two hours, but 60*60*1.5 seconds is
# 1.5 hours.
329 # if more than two hours
330 if time.time() - begin > (60*60*1.5):
331 print "findbad.py has run out of time!!!!!!"
333 except KeyboardInterrupt:
# threadpool.NoResultsPending is handled by reporting that collection is done.
336 except threadpool.NoResultsPending:
337 print "All results collected."
# Row counts of the sync table and the record table.
340 print FindbadNodeRecordSync.query.count()
341 print FindbadNodeRecord.query.count()
# NOTE(review): the `def` header enclosing these statements is not visible in
# this listing (presumably the script's main routine - confirm).
#
# Visible behaviour: load the global round from the database, build the list
# of hostnames to check from the config options, and hand that list to
# checkAndRecordState.
#
# Load (or create) the "global" sync row and take its round number.
347 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
348 if_new_set={'round' : global_round})
349 global_round = fbsync.round
352 # update global round number to force refreshes across all nodes
# The new round is written back to the "global" row.
# NOTE(review): the guarding condition and the increment itself are on lines
# not shown.
354 fbsync.round = global_round
358 cotop = comon.Comon()
359 # lastcotop measures whether cotop is actually running. this is a better
360 # metric than sshstatus, or other values from CoMon
361 cotop_url = COMON_COTOPURL
363 # history information for all nodes
# NOTE(review): the coget call is commented out, and no assignment of `cohash`
# appears in the visible lines even though it is passed to
# checkAndRecordState below.
365 #cohash = cotop.coget(cotop_url)
# Start from the cached PLC node list, then narrow it by whichever option is
# set. The if/elif headers of the first two branches and of the site branch
# are not shown; the visible elif tests config.nodegroup.
366 l_nodes = plccache.l_nodes
# Keep only nodes whose hostname appears in the file named by config.nodelist.
368 f_nodes = util.file.getListFromFile(config.nodelist)
369 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
# Keep only the single host named by config.node.
371 f_nodes = [config.node]
372 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
373 elif config.nodegroup:
# Nodes of the first node group returned for that name.
374 ng = api.GetNodeGroups({'name' : config.nodegroup})
375 l_nodes = api.GetNodes(ng[0]['node_ids'])
# Nodes of the first site returned for config.site.
377 site = api.GetSites(config.site)
378 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
# Reduce the node records to plain hostnames.
380 l_nodes = [node['hostname'] for node in l_nodes]
382 # perform this query after the above options, so that the filter above
# NOTE(review): node_select is given every PLC hostname with peer_id None, and
# its result replaces l_nodes, so the selections made above are discarded
# when config.nodeselect is set - confirm that is intended.
384 if config.nodeselect:
385 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
386 plcnodes = [ node['hostname'] for node in plcnodes ]
387 l_nodes = node_select(config.nodeselect, plcnodes, None)
389 print "fetching %s hosts" % len(l_nodes)
391 checkAndRecordState(l_nodes, cohash)
# Script entry point: build the option parser, parse the arguments, and run
# the job.
# NOTE(review): the try body that precedes the except clause below is on lines
# not shown in this listing.
396 if __name__ == '__main__':
397 from monitor import parser as parsermodule
# Start from the shared 'nodesets' option group.
399 parser = parsermodule.getParser(['nodesets'])
# Defaults for the three options added below.
401 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
402 parser.add_option("", "--cachenodes", action="store_true",
403 help="Cache node lookup from PLC")
404 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
405 help="Specify the name of the database to which the information is saved")
406 parser.add_option("-i", "--increment", action="store_true", dest="increment",
407 help="Increment round number to force refresh or retry")
# Extend the same parser with the 'defaults' option group.
409 parser = parsermodule.getParser(['defaults'], parser)
411 cfg = parsermodule.parse_args(parser)
# Python 2 exception syntax; any Exception is reported rather than re-raised
# by the visible lines.
415 except Exception, err:
# NOTE(review): traceback.print_exc() returns None, so `print` also emits a
# line reading "None" after the traceback.
416 print traceback.print_exc()
417 print "Exception: %s" % err
418 print "Saving data... exitting."
421 #print "final commit"