7 from datetime import datetime,timedelta
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14 from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
15 from monitor.sources import comon
16 from monitor.wrapper import plc
19 from nodequery import verify,query_to_dict,node_select
# Startup banner (top-level statement).
22 print "starting sqlfindbad.py"
# CoMon tabulator query for the node-view table, restricted to the columns
# named in dumpcols.
# NOTE(review): the last visible line of this expression ends in a
# continuation ("+ \") directly before a comment, so its final operand is not
# shown in this listing -- confirm against the complete file.
24 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
25 "table=table_nodeview&" + \
26 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
29 #"select='lastcotop!=0'"
# API handle returned by plc.getAuthAPI(); used further down for the
# nodegroup / site / node lookups.
31 api = plc.getAuthAPI()
# Module-level lock; not referenced in the visible lines of this listing.
32 plc_lock = threading.Lock()
37 def collectPingAndSSH(nodename, cohash):
38 ### RUN PING ######################
40 (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
47 values['ping'] = "NOPING"
49 values['ping'] = "PING"
52 for port in [22, 806]:
53 ssh = command.SSH('root', nodename, port)
55 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
57 echo ' "kernel":"'`uname -a`'",'
58 echo ' "bmlog":"'`ls /tmp/bm.log`'",'
59 echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
60 echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
61 echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
62 echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",'
63 echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
65 ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
66 echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
67 echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
71 values['ssherror'] = errval
73 #print "OVAL: %s" % oval
74 values.update(eval(oval))
75 values['sshport'] = port
78 values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '',
82 'princeton_comon' : "",
83 'princeton_comon_running' : "",
84 'princeton_comon_procs' : "", 'sshport' : None})
86 print traceback.print_exc()
89 ### RUN SSH ######################
91 #ssh = command.SSH('root', nodename)
94 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
96 oval = values['kernel']
97 if "2.6.17" in oval or "2.6.2" in oval:
99 values['category'] = 'PROD'
100 if "bm.log" in values['bmlog']:
101 values['state'] = 'DEBUG'
103 values['state'] = 'BOOT'
104 elif "2.6.12" in oval or "2.6.10" in oval:
105 values['ssh'] = 'SSH'
106 values['category'] = 'OLDPROD'
107 if "bm.log" in values['bmlog']:
108 values['state'] = 'DEBUG'
110 values['state'] = 'BOOT'
112 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why.
113 elif "2.4" in oval or "2.6.8" in oval:
114 b_getbootcd_id = False
115 values['ssh'] = 'SSH'
116 values['category'] = 'OLDBOOTCD'
117 values['state'] = 'DEBUG'
119 values['ssh'] = 'SSH'
120 values['category'] = 'UNKNOWN'
121 if "bm.log" in values['bmlog']:
122 values['state'] = 'DEBUG'
124 values['state'] = 'BOOT'
127 b_getbootcd_id = False
128 values['ssh'] = 'NOSSH'
129 values['category'] = 'ERROR'
130 values['state'] = 'DOWN'
132 values['ssherror'] = val
133 values['kernel'] = ""
135 #values['kernel'] = val
138 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
139 #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
140 oval = values['bootcd']
142 values['bootcd'] = oval
143 if "v2" in oval and \
144 ( nodename is not "planetlab1.cs.unc.edu" and \
145 nodename is not "planetlab2.cs.unc.edu" ):
146 values['category'] = 'OLDBOOTCD'
148 values['bootcd'] = ""
150 values['bootcd'] = ""
152 # TODO: get bm.log for debug nodes.
155 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
162 continue_slice_check = True
163 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
164 oval = values['princeton_comon']
165 if "princeton_comon" in oval:
166 values['princeton_comon'] = True
168 values['princeton_comon'] = False
169 continue_slice_check = False
171 if continue_slice_check:
172 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
173 oval = values['princeton_comon_running']
174 if len(oval) > len('/proc/virtual/'):
175 values['princeton_comon_running'] = True
177 values['princeton_comon_running'] = False
178 continue_slice_check = False
180 values['princeton_comon_running'] = False
182 if continue_slice_check:
183 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
184 oval = values['princeton_comon_procs']
185 values['princeton_comon_procs'] = int(oval)
187 values['princeton_comon_procs'] = None
190 if nodename in cohash:
191 values['comonstats'] = cohash[nodename]
193 values['comonstats'] = {'resptime': '-1',
200 # include output value
201 ### GET PLC NODE ######################
205 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
206 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
208 traceback.print_exc()
210 values['plcnode'] = d_node
212 ### GET PLC PCU ######################
216 pcu = d_node['pcu_ids']
220 site_id = d_node['site_id']
222 values['pcu'] = d_pcu
224 ### GET PLC SITE ######################
227 values['loginbase'] = ""
229 d_site = plc.getSites({'site_id': site_id},
230 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
231 values['loginbase'] = d_site['login_base']
233 traceback.print_exc()
236 values['plcsite'] = d_site
237 values['date_checked'] = time.time()
239 print traceback.print_exc()
241 return (nodename, values)
243 def recordPingAndSSH(request, result):
246 (nodename, values) = result
249 if values is not None:
250 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
251 if_new_set={'round' : global_round})
252 global_round = fbsync.round
253 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
254 if_new_set={'round' : global_round})
256 fbrec = FindbadNodeRecord(
257 date_checked=datetime.fromtimestamp(values['date_checked']),
259 loginbase=values['loginbase'],
260 kernel_version=values['kernel'],
261 bootcd_version=values['bootcd'],
262 nm_status=values['nm'],
263 fs_status=values['readonlyfs'],
264 dns_status=values['dns'],
265 princeton_comon_dir=values['princeton_comon'],
266 princeton_comon_running=values['princeton_comon_running'],
267 princeton_comon_procs=values['princeton_comon_procs'],
268 plc_node_stats = values['plcnode'],
269 plc_site_stats = values['plcsite'],
270 plc_pcuid = values['pcu'],
271 comon_stats = values['comonstats'],
272 ping_status = (values['ping'] == "PING"),
273 ssh_portused = values['sshport'],
274 ssh_status = (values['ssh'] == "SSH"),
275 ssh_error = values['ssherror'],
276 observed_status = values['state'],
278 fbnodesync.round = global_round
281 print "%d %s %s" % (count, nodename, values)
284 print traceback.print_exc()
286 # this will be called when an exception occurs within a thread
# Passed as the last argument of threadpool.WorkRequest in
# checkAndRecordState.
#
# request -- the failed work request; its requestID is printed.
# result  -- printed item by item.
# NOTE(review): the line that binds `i` is not part of this listing
# (presumably a loop over `result`) -- confirm against the complete file.
287 def handle_exception(request, result):
288 print "Exception occured in request %s" % request.requestID
290 print "Result: %s" % i
# Build a collectPingAndSSH work request for every node whose stored round is
# behind global_round, then wait for the thread pool's results, giving up
# after a time limit.
#
# l_nodes -- iterable of hostnames.
# cohash  -- CoMon data handed through unchanged to collectPingAndSSH.
#
# NOTE(review): lines are missing from this listing (the embedded numbers
# skip), including whatever submits `req` to the pool and the wait-loop
# header; `count`, `begin` and `global_round` are bound outside the visible
# lines.
293 def checkAndRecordState(l_nodes, cohash):
297 tp = threadpool.ThreadPool(20)
299 # CREATE all the work requests
300 for nodename in l_nodes:
301 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
303 node_round = fbnodesync.round
304 if node_round < global_round:
305 # recreate node stats when refreshed
306 #print "%s" % nodename
# recordPingAndSSH and handle_exception are supplied as the two trailing
# callback arguments of the request.
307 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
308 None, recordPingAndSSH, handle_exception)
311 # We just skip it, since it's "up to date"
313 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
314 print "%d %s %s" % (count, nodename, node_round)
316 # WAIT while all the work requests are processed.
322 # give up after 90 minutes (60*60*1.5 seconds)
323 if time.time() - begin > (60*60*1.5):
324 print "findbad.py has run out of time!!!!!!"
326 except KeyboardInterrupt:
329 except threadpool.NoResultsPending:
330 print "All results collected."
# Report how many sync rows and node records exist after the run.
333 print FindbadNodeRecordSync.query.count()
334 print FindbadNodeRecord.query.count()
# NOTE(review): the `def` line that owns the statements below is not part of
# this listing (the embedded numbering jumps from 334 to 339); they are
# presumably the body of the script's main entry point -- confirm against the
# complete file.
#
# Visible behaviour: load (or create) the "global" sync row to obtain the
# current round, fetch CoMon data, build the list of hostnames according to
# the config options, and run checkAndRecordState on it.
339 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
340 if_new_set={'round' : global_round})
341 global_round = fbsync.round
344 # update global round number to force refreshes across all nodes
346 fbsync.round = global_round
348 cotop = comon.Comon()
349 # lastcotop measures whether cotop is actually running. this is a better
350 # metric than sshstatus, or other values from CoMon
351 cotop_url = COMON_COTOPURL
353 # history information for all nodes
355 cohash = cotop.coget(cotop_url)
356 l_nodes = syncplcdb.create_plcdb()
# The headers of the first two and the last selection branch (356-366) are
# elided from this listing; only `elif config.nodegroup:` is visible.
358 f_nodes = util.file.getListFromFile(config.nodelist)
359 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
361 f_nodes = [config.node]
362 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
363 elif config.nodegroup:
364 ng = api.GetNodeGroups({'name' : config.nodegroup})
365 l_nodes = api.GetNodes(ng[0]['node_ids'])
367 site = api.GetSites(config.site)
368 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
# From here on l_nodes holds plain hostname strings rather than node dicts.
370 l_nodes = [node['hostname'] for node in l_nodes]
372 # perform this query after the above options, so that the filter above
# NOTE(review): l_nodes is reassigned from node_select over plcnodes; the
# l_nodes value computed above is not passed to that call.
374 if config.nodeselect:
375 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
376 plcnodes = [ node['hostname'] for node in plcnodes ]
377 l_nodes = node_select(config.nodeselect, plcnodes, None)
379 print "fetching %s hosts" % len(l_nodes)
381 checkAndRecordState(l_nodes, cohash)
386 if __name__ == '__main__':
387 from monitor import parser as parsermodule
389 parser = parsermodule.getParser(['nodesets'])
391 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
392 parser.add_option("", "--cachenodes", action="store_true",
393 help="Cache node lookup from PLC")
394 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
395 help="Specify the name of the database to which the information is saved")
396 parser.add_option("-i", "--increment", action="store_true", dest="increment",
397 help="Increment round number to force refresh or retry")
399 parser = parsermodule.getParser(['defaults'], parser)
401 cfg = parsermodule.parse_args(parser)
405 except Exception, err:
406 print traceback.print_exc()
407 print "Exception: %s" % err
408 print "Saving data... exitting."
411 #print "final commit"