# Query URL for the CoMon "tabulator" CGI, built by concatenating the base
# address with its query parameters (table=table_nodeview and a dumpcols
# list of name, resptime, sshstatus, uptime, lastcotop).
# NOTE(review): the tail of this expression is not visible in this listing;
# confirm how the concatenation is terminated.
10 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
11 "table=table_nodeview&" + \
12 "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \
15 #"select='lastcotop!=0'"
# Module-level lock.  Its acquire/release sites are not visible in this
# listing -- presumably it serialises PLC API calls made from the worker
# threads; TODO confirm.
18 plc_lock = threading.Lock()
# Shared state for the whole run: a global 'round' value plus a per-node
# dict keyed by hostname (each entry holds that node's 'round' and 'values',
# see checkAndRecordState / recordPingAndSSH).
# NOTE(review): `round` here is either a variable defined on a line that is
# not visible or the Python builtin function.  The round is later compared
# with `<` and incremented with `+= 1`, so it needs to be an integer --
# confirm.
20 externalState = {'round': round, 'nodes': {}}
# PLC API handle built from auth.auth and auth.plc; used for the nodegroup
# lookups (api.GetNodeGroups / api.GetNodes).
31 api = plc.PLC(auth.auth, auth.plc)
# collectPingAndSSH(nodename, cohash): probe a single node and collect its
# status into a `values` mapping.
#   nodename -- hostname; interpolated into the ping command, handed to
#               soltesz.SSH, and used as the key for the PLC node lookup.
#   cohash   -- mapping of hostname -> CoMon statistics (tested with `in`
#               and indexed by nodename below).
# Keys written to `values` in the visible lines: 'ping', 'ssh', 'category',
# 'state', 'kernel', 'bootcd', 'comonstats', 'pcu', 'plcnode', 'plcsite'
# and 'checked'.
# Returns (nodename, values), or (None, None) when b_except is set in the
# PLC node / PLC site sections.
# NOTE(review): many lines of this function (the `values` initialisation,
# if/else and try/except headers) are not visible in this listing, so the
# exact conditions selecting each value below cannot be confirmed from here.
33 def collectPingAndSSH(nodename, cohash):
34 ### RUN PING ######################
# One quiet ping (-c 1 -q), filtered through `grep rtt`.
# NOTE(review): `eval` shadows the Python builtin of the same name.
36 (oval,eval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
# 'ping' is set to "NOPING" or "PING"; the test choosing between them is
# not visible here (presumably on whether the grep produced output).
42 values['ping'] = "NOPING"
44 values['ping'] = "PING"
46 #uptime = soltesz.SSH('root', nodename)
47 #(oval,eval) = uptime.run_noexcept("uptime | awk '{print $3,$4}' | tr , ' '")
49 ### RUN SSH ######################
51 ssh = soltesz.SSH('root', nodename)
# Ask the node for `uname -a` and a listing of /tmp/bm.log in one output
# string; the kernel-version substrings are matched against it below.
54 (oval, eval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Kernel substring -> category: "2.6.17" / "2.6.2" => ALPHA,
# "2.6.12" / "2.6.10" => PROD.  These are plain substring tests, so
# "2.6.2" also matches 2.6.20 through 2.6.29.
# Each category then sets 'state' to DEBUG or BOOT; the deciding condition
# is not visible (presumably whether bm.log appears in the output -- confirm).
56 if "2.6.17" in oval or "2.6.2" in oval:
58 values['category'] = 'ALPHA'
60 values['state'] = 'DEBUG'
62 values['state'] = 'BOOT'
63 elif "2.6.12" in oval or "2.6.10" in oval:
65 values['category'] = 'PROD'
67 values['state'] = 'DEBUG'
69 values['state'] = 'BOOT'
# b_getbootcd_id is cleared on this path and on the NOSSH path below;
# presumably it gates the BootCD ID lookup further down -- confirm.
71 b_getbootcd_id = False
73 values['category'] = 'OLDBOOTCD'
74 values['state'] = 'DEBUG'
77 values['category'] = 'UNKNOWN'
79 values['state'] = 'DEBUG'
81 values['state'] = 'BOOT'
# No SSH access: mark the node as NOSSH / ERROR / DOWN.
84 b_getbootcd_id = False
85 values['ssh'] = 'NOSSH'
86 values['category'] = 'ERROR'
87 values['state'] = 'DOWN'
# NOTE(review): the assignment of `val` is not visible in this listing.
90 values['kernel'] = val
93 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
94 (oval, eval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
97 values['bootcd'] = val
99 ( nodename is not "planetlab1.cs.unc.edu" and \
100 nodename is not "planetlab2.cs.unc.edu" ):
101 values['category'] = 'OLDBOOTCD'
# NOTE(review): the condition above compares nodename with string literals
# using `is not` (object identity) rather than `!=` (equality).  Equal
# strings are not guaranteed to be the same object, so the two UNC hosts may
# not actually be excluded -- confirm intent and consider `!=`.
103 values['bootcd'] = ""
105 values['bootcd'] = ""
107 # TODO: get bm.log for debug nodes.
# Attach the CoMon statistics for this node when present in cohash;
# otherwise a placeholder dict is used (only its first entry,
# 'resptime': '-1', is visible in this listing).
110 if nodename in cohash:
111 values['comonstats'] = cohash[nodename]
113 values['comonstats'] = {'resptime': '-1',
117 # include output value
118 ### GET PLC NODE ######################
# NOTE(review): this call goes through `plc.getNodes`, not the module-level
# `api` handle used elsewhere in the file -- confirm which is intended.
123 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'last_contact', 'boot_state', 'nodegroup_ids'])
129 traceback.print_exc()
# Abandon this node: the (None, None) result makes recordPingAndSSH skip it.
132 if b_except: return (None, None)
135 if d_node and len(d_node) > 0:
136 pcu = d_node[0]['pcu_ids']
# 'pcu' becomes "PCU" or "NOPCU"; the test on `pcu` is not visible here.
138 values['pcu'] = "PCU"
140 values['pcu'] = "NOPCU"
141 site_id = d_node[0]['site_id']
142 last_contact = d_node[0]['last_contact']
143 nodegroups = d_node[0]['nodegroup_ids']
144 values['plcnode'] = {'status' : 'SUCCESS',
146 'boot_state' : d_node[0]['boot_state'],
148 'nodegroups' : nodegroups,
149 'last_contact': last_contact}
# Node lookup returned nothing usable.
151 values['pcu'] = "UNKNOWN"
152 values['plcnode'] = {'status' : "GN_FAILED"}
155 ### GET PLC SITE ######################
# site_id is only assigned in the successful node-lookup lines above.
# NOTE(review): confirm the site query is skipped when that lookup failed.
160 d_site = plc.getSites({'site_id': site_id},
161 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
165 traceback.print_exc()
168 if b_except: return (None, None)
170 if d_site and len(d_site) > 0:
171 max_slices = d_site[0]['max_slices']
172 num_slices = len(d_site[0]['slice_ids'])
173 num_nodes = len(d_site[0]['node_ids'])
174 loginbase = d_site[0]['login_base']
175 values['plcsite'] = {'num_nodes' : num_nodes,
176 'max_slices' : max_slices,
177 'num_slices' : num_slices,
178 'login_base' : loginbase,
179 'status' : 'SUCCESS'}
181 values['plcsite'] = {'status' : "GS_FAILED"}
# Timestamp (time.time(), seconds since the epoch) of this check.
183 values['checked'] = time.time()
185 return (nodename, values)
# recordPingAndSSH(request, result): result callback handed to
# threadpool.WorkRequest in checkAndRecordState.
#   request -- the work request (not used in the lines visible here).
#   result  -- the (nodename, values) tuple returned by collectPingAndSSH.
# When values is not None, the node's entry in externalState['nodes'] is
# updated with the new values and stamped with the current global round, so
# checkAndRecordState treats it as up to date for this round.
187 def recordPingAndSSH(request, result):
190 (nodename, values) = result
# collectPingAndSSH returns (None, None) when a PLC call failed; such
# results are not recorded.
192 if values is not None:
193 global_round = externalState['round']
194 externalState['nodes'][nodename]['values'] = values
195 externalState['nodes'][nodename]['round'] = global_round
# NOTE(review): `count` is not defined in the lines visible here --
# presumably a module-level counter; confirm.
198 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
# Persist the whole state under config.dbname.
# NOTE(review): indentation is lost in this listing, so it is unclear
# whether this dump runs for every result or only for recorded ones.
199 soltesz.dbDump(config.dbname, externalState)
201 # this will be called when an exception occurs within a thread
# handle_exception(request, result): exception callback handed to
# threadpool.WorkRequest in checkAndRecordState.
#   request -- the failed work request; its requestID is printed.
#   result  -- exception details supplied by the thread pool.
# Only prints diagnostics; nothing is written to externalState here.
202 def handle_exception(request, result):
203 print "Exception occured in request %s" % request.requestID
# NOTE(review): `i` comes from a loop header that is not visible in this
# listing (presumably iterating over `result`); confirm.
205 print "Result: %s" % i
# checkAndRecordState(l_nodes, cohash): refresh the recorded state of every
# node whose stored round is older than the global round.
#   l_nodes -- iterable of hostnames to check.
#   cohash  -- CoMon statistics, passed through to collectPingAndSSH.
# For each stale node a WorkRequest running collectPingAndSSH is created,
# with recordPingAndSSH as the result callback and handle_exception as the
# exception callback.  Nodes already at the current round are only printed.
# NOTE(review): the lines that queue the requests and poll the pool are not
# visible in this listing.
208 def checkAndRecordState(l_nodes, cohash):
211 global_round = externalState['round']
# Thread pool constructed with 20 (worker count, per the threadpool
# module's ThreadPool signature -- confirm).
213 tp = threadpool.ThreadPool(20)
215 # CREATE all the work requests
216 for nodename in l_nodes:
# First sighting of a node: round 0 marks it as never checked.
# NOTE(review): the placeholder 'values' is a list, while recorded values
# are a mapping indexed by string keys; readers of 'values' must cope with
# both -- confirm.
217 if nodename not in externalState['nodes']:
218 externalState['nodes'][nodename] = {'round': 0, 'values': []}
220 node_round = externalState['nodes'][nodename]['round']
221 if node_round < global_round:
222 # recreate node stats when refreshed
223 #print "%s" % nodename
224 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
225 None, recordPingAndSSH, handle_exception)
228 # We just skip it, since it's "up to date"
# NOTE(review): `count` is not defined in the lines visible here.
230 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
233 # WAIT while all the work requests are processed.
# The wait stops on Ctrl-C or when the pool raises NoResultsPending; the
# bodies of the try and of the KeyboardInterrupt handler are not visible.
238 except KeyboardInterrupt:
241 except threadpool.NoResultsPending:
242 print "All results collected."
# Driver body: load cached state, bump the round, fetch CoMon data, build
# the node list, and run the checks.
# NOTE(review): the enclosing `def` header is not visible in this listing.
# If these statements are inside a function, rebinding externalState below
# requires a `global externalState` declaration -- confirm it is present.
# Load state via soltesz.if_cached_else, with the current in-memory
# externalState supplied by the lambda (presumably the fallback when no
# cached copy exists -- confirm against the helper).
250 externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
253 # update global round number to force refreshes across all nodes
# NOTE(review): the --increment option's help text describes the same
# action; whether this line is guarded by config.increment is not visible.
254 externalState['round'] += 1
256 cotop = comon.Comon()
257 # lastcotop measures whether cotop is actually running. this is a better
258 # metric than sshstatus, or other values from CoMon
259 cotop_url = COMON_COTOPURL
261 # history information for all nodes
262 cohash = cotop.coget(cotop_url)
263 l_nodes = syncplcdb.create_plcdb()
# Narrow the node list by one of the command-line selectors: a file of
# hostnames, a single hostname, or a PLC nodegroup name.  The `if` headers
# of the first two branches are not visible in this listing.
265 f_nodes = config.getListFromFile(config.filename)
266 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
268 f_nodes = [config.node]
269 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
270 elif config.nodegroup:
# NOTE(review): ng[0] raises IndexError if no nodegroup matches the name.
271 ng = api.GetNodeGroups({'name' : config.nodegroup})
272 l_nodes = api.GetNodes(ng[0]['node_ids'])
# Reduce the node records to plain hostnames for checkAndRecordState.
274 l_nodes = [node['hostname'] for node in l_nodes]
276 print "fetching %s hosts" % len(l_nodes)
278 checkAndRecordState(l_nodes, cohash)
# Script entry point: define the command-line options, build the global
# `config` object, run, and save the collected state if an exception escapes.
283 if __name__ == '__main__':
284 from config import config
285 from optparse import OptionParser
286 parser = OptionParser()
# Defaults: no node filter, no round increment, database "findbadnodes",
# node caching off.
287 parser.set_defaults(filename=None, node=None, nodegroup=None, increment=False, dbname="findbadnodes", cachenodes=False)
288 parser.add_option("", "--node", dest="node", metavar="hostname",
289 help="Provide a single node to operate on")
290 parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE",
291 help="Provide the input file for the node list")
# NOTE(review): metavar="FILE" looks copied from --nodelist; the value is
# used as a nodegroup name in the api.GetNodeGroups lookup.
292 parser.add_option("", "--nodegroup", dest="nodegroup", metavar="FILE",
293 help="Provide the nodegroup for the list of nodes.")
295 parser.add_option("", "--cachenodes", action="store_true",
296 help="Cache node lookup from PLC")
297 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
298 help="Specify the name of the database to which the information is saved")
299 parser.add_option("-i", "--increment", action="store_true", dest="increment",
300 help="Increment round number to force refresh or retry")
# Rebinds `config` from the imported callable to the object it returns;
# the functions above read config.dbname, config.filename, config.node and
# config.nodegroup from this module-level name.
301 config = config(parser)
# Handler for anything raised by the run (the `try` body is not visible).
306 except Exception, err:
# NOTE(review): traceback.print_exc() prints the traceback itself and
# returns None, so wrapping it in `print` also emits a line reading "None".
308 print traceback.print_exc()
309 print "Exception: %s" % err
310 print "Saving data... exitting."
# Persist whatever was collected before the failure.
311 soltesz.dbDump(config.dbname, externalState)