8 from config import config
9 from optparse import OptionParser
# Command-line options for this script.  Defaults: no node-list file, no
# round increment, database name "findbadnodes", node caching off.
10 parser = OptionParser()
11 parser.set_defaults(filename=None, increment=False, dbname="findbadnodes", cachenodes=False)
# -f/--nodelist FILE: input file holding the node list (dest "filename").
12 parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE",
13 help="Provide the input file for the node list")
# --cachenodes: boolean flag; no dest is given, so optparse derives
# "cachenodes" from the long option name.
14 parser.add_option("", "--cachenodes", action="store_true",
15 help="Cache node lookup from PLC")
# --dbname FILE: name of the database the collected state is dumped to.
16 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
17 help="Specify the name of the database to which the information is saved")
# -i/--increment: boolean flag (dest "increment").
18 parser.add_option("-i", "--increment", action="store_true", dest="increment",
19 help="Increment round number to force refresh or retry")
# NOTE(review): this rebinds the imported name `config` to the object returned
# by calling it with the parser; from here on `config` is that object (the
# rest of the file reads config.dbname and config.filename from it), and the
# original imported callable is no longer reachable under this name.
20 config = config(parser)
# Base CoMon query URL: asks tabulator.cgi for the table_nodeview table and
# the name, resptime, sshstatus, uptime and lastcotop columns.
24 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
25 "table=table_nodeview&" + \
26 "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \
29 #"select='lastcotop!=0'"
# Module-level lock; presumably serialises PLC API access between worker
# threads -- TODO confirm against the code that acquires it.
32 plc_lock = threading.Lock()
# Shared state that is persisted with soltesz.dbDump: the global round number
# plus one entry per hostname under 'nodes' (each holding 'round' and
# 'values').
# NOTE(review): `round` is also the name of a Python builtin; confirm that a
# module-level `round` is assigned before this line, otherwise the builtin
# function object is what gets stored under 'round'.
34 externalState = {'round': round, 'nodes': {}}
# Probe a single node and gather what is known about it.
#
# nodename -- hostname of the node to check.
# cohash   -- CoMon statistics keyed by hostname; when nodename is present
#             its entry is copied into values['comonstats'].
#
# Returns (nodename, values).  values is filled with keys such as 'ping',
# 'ssh', 'category', 'state', 'kernel', 'bootcd', 'comonstats', 'pcu',
# 'plcnode', 'plcsite' and 'checked'.  Returns (None, None) instead when
# b_except is set after one of the PLC lookups.
44 def collectPingAndSSH(nodename, cohash):
45 ### RUN PING ######################
# One ping, keeping only the "rtt" summary line of its output;
# values['ping'] ends up as either "NOPING" or "PING".
# NOTE(review): the local name `eval` shadows the Python builtin here and in
# the later run_noexcept calls.
47 (oval,eval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
53 values['ping'] = "NOPING"
55 values['ping'] = "PING"
57 #uptime = soltesz.SSH('root', nodename)
58 #(oval,eval) = uptime.run_noexcept("uptime | awk '{print $3,$4}' | tr , ' '")
60 ### RUN SSH ######################
# Log in as root and capture `uname -a` together with a listing of
# /tmp/bm.log in a single output string.
62 ssh = soltesz.SSH('root', nodename)
65 (oval, eval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Classify by kernel version substring: 2.6.17 / 2.6.2* -> 'ALPHA',
# 2.6.12 / 2.6.10 -> 'PROD'; the remaining categories are 'OLDBOOTCD' and
# 'UNKNOWN'.  'state' is 'DEBUG' or 'BOOT' -- presumably depending on whether
# bm.log showed up in the output; TODO confirm.
67 if "2.6.17" in oval or "2.6.2" in oval:
69 values['category'] = 'ALPHA'
71 values['state'] = 'DEBUG'
73 values['state'] = 'BOOT'
74 elif "2.6.12" in oval or "2.6.10" in oval:
76 values['category'] = 'PROD'
78 values['state'] = 'DEBUG'
80 values['state'] = 'BOOT'
# b_getbootcd_id = False marks cases where the BootCD ID is not to be fetched.
82 b_getbootcd_id = False
84 values['category'] = 'OLDBOOTCD'
85 values['state'] = 'DEBUG'
88 values['category'] = 'UNKNOWN'
90 values['state'] = 'DEBUG'
92 values['state'] = 'BOOT'
# Node not reachable over SSH: recorded as NOSSH / ERROR / DOWN.
95 b_getbootcd_id = False
96 values['ssh'] = 'NOSSH'
97 values['category'] = 'ERROR'
98 values['state'] = 'DOWN'
101 values['kernel'] = val
# BootCD lookup: reads /mnt/cdrom/bootme/ID over the same SSH connection and
# stores the result in values['bootcd'] (empty string as the fallback).
# NOTE(review): the hostname comparisons below use `is not` against string
# literals.  `is not` tests object identity, not equality, so they can be
# true even when nodename equals the literal; `!=` looks like the intent.
104 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
105 (oval, eval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
108 values['bootcd'] = val
110 ( nodename is not "planetlab1.cs.unc.edu" and \
111 nodename is not "planetlab2.cs.unc.edu" ):
112 values['category'] = 'OLDBOOTCD'
114 values['bootcd'] = ""
116 values['bootcd'] = ""
118 # TODO: get bm.log for debug nodes.
# CoMon statistics: use the node's cohash entry when there is one, otherwise
# a placeholder dict (its 'resptime' is the string '-1').
121 if nodename in cohash:
122 values['comonstats'] = cohash[nodename]
124 values['comonstats'] = {'resptime': '-1',
128 # include output value
129 ### GET PLC NODE ######################
# Ask PLC for this hostname's pcu_ids, site_id, last_contact, boot_state and
# nodegroup_ids.
134 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'last_contact', 'boot_state', 'nodegroup_ids'])
140 traceback.print_exc()
# Abandon this node; callers receive (None, None).
143 if b_except: return (None, None)
# A non-empty reply yields values['pcu'] ("PCU" or "NOPCU") and a 'plcnode'
# record with status 'SUCCESS'; the fallback is pcu "UNKNOWN" and plcnode
# status "GN_FAILED".
146 if d_node and len(d_node) > 0:
147 pcu = d_node[0]['pcu_ids']
149 values['pcu'] = "PCU"
151 values['pcu'] = "NOPCU"
152 site_id = d_node[0]['site_id']
153 last_contact = d_node[0]['last_contact']
154 nodegroups = d_node[0]['nodegroup_ids']
155 values['plcnode'] = {'status' : 'SUCCESS',
157 'boot_state' : d_node[0]['boot_state'],
159 'nodegroups' : nodegroups,
160 'last_contact': last_contact}
162 values['pcu'] = "UNKNOWN"
163 values['plcnode'] = {'status' : "GN_FAILED"}
166 ### GET PLC SITE ######################
# NOTE(review): site_id is only assigned from d_node[0] above; confirm it is
# bound when the node lookup returned nothing before it is used here.
171 d_site = plc.getSites({'site_id': site_id},
172 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
176 traceback.print_exc()
# Abandon this node; callers receive (None, None).
179 if b_except: return (None, None)
# A non-empty reply yields a 'plcsite' record with node/slice counts, the
# slice limit and login_base; the fallback is status "GS_FAILED".
181 if d_site and len(d_site) > 0:
182 max_slices = d_site[0]['max_slices']
183 num_slices = len(d_site[0]['slice_ids'])
184 num_nodes = len(d_site[0]['node_ids'])
185 loginbase = d_site[0]['login_base']
186 values['plcsite'] = {'num_nodes' : num_nodes,
187 'max_slices' : max_slices,
188 'num_slices' : num_slices,
189 'login_base' : loginbase,
190 'status' : 'SUCCESS'}
192 values['plcsite'] = {'status' : "GS_FAILED"}
# Timestamp of this check (seconds since the epoch, from time.time()).
194 values['checked'] = time.time()
196 return (nodename, values)
# Result callback for collectPingAndSSH work requests (it is handed to
# threadpool.WorkRequest in checkAndRecordState).
#
# request -- the work request that produced the result; not referenced below.
# result  -- the (nodename, values) tuple returned by collectPingAndSSH.
198 def recordPingAndSSH(request, result):
201 (nodename, values) = result
# values is None when collectPingAndSSH gave up on the node; otherwise store
# the values and stamp the node with the current global round.
203 if values is not None:
204 global_round = externalState['round']
205 externalState['nodes'][nodename]['values'] = values
206 externalState['nodes'][nodename]['round'] = global_round
# Print the node's record and write externalState to the database named by
# config.dbname.  `count` is presumably a module-level progress counter --
# TODO confirm where it is updated.
209 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
210 soltesz.dbDump(config.dbname, externalState)
212 # this will be called when an exception occurs within a thread
#
# request -- the work request that failed; its requestID is printed.
# result  -- failure details supplied by the thread pool; presumably iterated
#            item by item, each printed as "Result: ..." -- TODO confirm.
213 def handle_exception(request, result):
214 print "Exception occured in request %s" % request.requestID
216 print "Result: %s" % i
# Refresh the recorded state of every node in l_nodes that is behind the
# current global round.
#
# l_nodes -- iterable of hostnames.
# cohash  -- CoMon statistics keyed by hostname, forwarded unchanged to
#            collectPingAndSSH.
219 def checkAndRecordState(l_nodes, cohash):
222 global_round = externalState['round']
# Thread pool created with an argument of 20.
224 tp = threadpool.ThreadPool(20)
226 # CREATE all the work requests
227 for nodename in l_nodes:
# Nodes seen for the first time start at round 0 with no values, so they are
# always behind the global round.
228 if nodename not in externalState['nodes']:
229 externalState['nodes'][nodename] = {'round': 0, 'values': []}
231 node_round = externalState['nodes'][nodename]['round']
# Stale node: build a request that runs collectPingAndSSH(nodename, cohash),
# reporting results to recordPingAndSSH and failures to handle_exception.
232 if node_round < global_round:
233 # recreate node stats when refreshed
234 #print "%s" % nodename
235 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
236 None, recordPingAndSSH, handle_exception)
239 # We just skip it, since it's "up to date"
241 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
244 # WAIT while all the work requests are processed.
# KeyboardInterrupt and threadpool.NoResultsPending are both handled here; the
# latter is reported as "All results collected."
249 except KeyboardInterrupt:
252 except threadpool.NoResultsPending:
253 print "All results collected."
# Obtain externalState through soltesz.if_cached_else, passing config.dbname
# and a lambda that returns the in-module default -- presumably the cached
# copy is used when one exists; TODO confirm if_cached_else semantics.
261 externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
264 # update global round number to force refreshes across all nodes
265 externalState['round'] += 1
267 cotop = comon.Comon()
268 # lastcotop measures whether cotop is actually running. this is a better
269 # metric than sshstatus, or other values from CoMon
270 cotop_url = COMON_COTOPURL
272 # history information for all nodes
273 cohash = cotop.coget(cotop_url)
# Node records from the PLC database; each record is indexed by 'hostname'
# below.
274 l_nodes = syncplcdb.create_plcdb()
# Keep only the nodes whose hostname appears in the list read from
# config.filename.
276 f_nodes = config.getListFromFile(config.filename)
277 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
# From here on l_nodes is a plain list of hostnames.
279 l_nodes = [node['hostname'] for node in l_nodes]
281 print "fetching %s hosts" % len(l_nodes)
283 checkAndRecordState(l_nodes, cohash)
# Script entry point.  On any Exception the error is reported and
# externalState is dumped to config.dbname.
288 if __name__ == '__main__':
291 except Exception, err:
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so wrapping it in `print` additionally prints the text "None".
293 print traceback.print_exc()
294 print "Exception: %s" % err
295 print "Saving data... exitting."
296 soltesz.dbDump(config.dbname, externalState)