12 from config import config
13 from optparse import OptionParser
# Command-line options for this script. Defaults: empty node-list filename,
# no round increment, and "findbadnodes" as the database name.
14 parser = OptionParser()
15 parser.set_defaults(filename="", increment=False, dbname="findbadnodes")
# -f / --nodes: file containing the node list (stored as `filename`).
16 parser.add_option("-f", "--nodes", dest="filename", metavar="FILE",
17 help="Provide the input file for the node list")
# --dbname: name of the database the collected state is saved under.
18 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
19 help="Specify the name of the database to which the information is saved")
# -i / --increment: boolean flag (store_true) stored as `increment`.
20 parser.add_option("-i", "--increment", action="store_true", dest="increment",
21 help="Increment round number to force refresh or retry")
# Rebinds the imported name `config` to the object returned by calling it with
# the parser; later code reads config.dbname and config.filename from it.
22 config = config(parser)
# CoMon query URL: asks tabulator.cgi for the table_nodeview table and the
# columns name, resptime, sshstatus, uptime and lastcotop.
# NOTE(review): the expression ends on a line continuation; the remaining
# pieces of the URL are not visible in this excerpt.
26 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
27 "table=table_nodeview&" + \
28 "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \
31 #"select='lastcotop!=0'"
# Shared state saved via soltesz.dbDump: a 'round' value plus per-node records
# under 'nodes', keyed by node name.
# NOTE(review): `round` is the Python builtin unless it is rebound on a line
# not visible here -- confirm what value is intended.
34 externalState = {'round': round, 'nodes': {}}
# Probe one node (ping, SSH, PLC node and site lookups) and collect the
# results into a dictionary.
#
# nodename -- node to probe; used as the ping and SSH target, as the key into
#             cohash, and as the PLC 'hostname' filter.
# cohash   -- mapping indexed by node name; cohash[nodename] is stored under
#             values['comonstats'].
#
# Returns the tuple (nodename, values). Keys written in the visible code:
# 'ping', 'ssh', 'category', 'state', 'kernel', 'bootcd', 'comonstats',
# 'pcu', 'plcnode' and 'plcsite'.
# NOTE(review): the creation of `values`, `ping`, `val` and most of the
# branch conditions are not visible in this excerpt.
37 def collectPingAndSSH(nodename, cohash):
38 ### RUN PING ######################
# Single ping (-c 1) with quiet output (-q); grep keeps only lines
# containing "rtt".
# NOTE(review): `eval` shadows the Python builtin of the same name.
40 (oval,eval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
# The outcome is recorded as "NOPING" or "PING"; the selecting condition is
# not visible here.
46 values['ping'] = "NOPING"
48 values['ping'] = "PING"
50 ### RUN SSH ######################
# Connect as root and fetch `uname -a` plus a listing of /tmp/bm.log in one
# remote command.
52 ssh = soltesz.SSH('root', nodename)
55 (oval, eval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Classify the node by kernel version substring found in the command output.
# Kernels 2.6.17 / 2.6.20 are categorised ALPHA, with state DEBUG or BOOT.
57 if "2.6.17" in oval or "2.6.20" in oval:
60 values['category'] = 'ALPHA'
61 values['state'] = 'DEBUG'
63 values['category'] = 'ALPHA'
64 values['state'] = 'BOOT'
# Kernels 2.6.12 / 2.6.10 are categorised PROD, with state DEBUG or BOOT.
65 elif "2.6.12" in oval or "2.6.10" in oval:
67 values['category'] = 'PROD'
69 values['state'] = 'DEBUG'
71 values['state'] = 'BOOT'
# Remaining branches (conditions not visible): OLDBOOTCD and UNKNOWN
# categories; b_getbootcd_id is cleared in some of them.
73 b_getbootcd_id = False
75 values['category'] = 'OLDBOOTCD'
76 values['state'] = 'DEBUG'
79 values['category'] = 'UNKNOWN'
81 values['state'] = 'DEBUG'
83 values['state'] = 'BOOT'
# SSH failure path: the node is marked NOSSH / ERROR / DOWN and the BootCD
# ID lookup flag is cleared.
86 b_getbootcd_id = False
87 values['ssh'] = 'NOSSH'
88 values['category'] = 'ERROR'
89 values['state'] = 'DOWN'
92 values['kernel'] = val
95 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
96 (oval, eval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
99 values['bootcd'] = val
101 values['category'] = 'OLDBOOTCD'
103 values['bootcd'] = ""
105 values['bootcd'] = ""
107 # TODO: get bm.log for debug nodes.
# Attach the CoMon record for this node.
110 values['comonstats'] = cohash[nodename]
111 # include output value
112 ### GET PLC NODE ######################
113 d_node = plc.getNodes({'hostname': nodename})
# Only the first returned node record is used.
115 if d_node and len(d_node) > 0:
116 pcu = d_node[0]['pcu_ids']
118 values['pcu'] = "PCU"
120 values['pcu'] = "NOPCU"
121 site_id = d_node[0]['site_id']
122 values['plcnode'] = {'status' : 'SUCCESS', 'pcu_ids': pcu, 'site_id': site_id}
# Failure path for the node lookup.
124 values['pcu'] = "UNKNOWN"
125 values['plcnode'] = {'status' : "GN_FAILED"}
128 ### GET PLC SITE ######################
# NOTE(review): in the visible lines site_id is only assigned when the node
# lookup succeeded; confirm this call is guarded against the GN_FAILED path.
129 d_site = plc.getSites({'site_id': site_id})
130 if d_site and len(d_site) > 0:
131 max_slices = d_site[0]['max_slices']
132 num_slices = len(d_site[0]['slice_ids'])
133 num_nodes = len(d_site[0]['node_ids'])
134 loginbase = d_site[0]['login_base']
135 values['plcsite'] = {'num_nodes' : num_nodes,
136 'max_slices' : max_slices,
137 'num_slices' : num_slices,
138 'login_base' : loginbase,
139 'status' : 'SUCCESS'}
# Failure path for the site lookup.
141 values['plcsite'] = {'status' : "GS_FAILED"}
143 return (nodename, values)
# Store the result of a finished probe in externalState and save the state.
# Passed to threadpool.WorkRequest in checkAndRecordState alongside
# collectPingAndSSH.
#
# request -- the work request that produced the result (not used in the
#            visible lines).
# result  -- the (nodename, values) tuple returned by collectPingAndSSH.
145 def recordPingAndSSH(request, result):
148 (nodename, values) = result
# Stamp the node with the current global round so it counts as up to date.
150 global_round = externalState['round']
151 externalState['nodes'][nodename]['values'] = values
152 externalState['nodes'][nodename]['round'] = global_round
# NOTE(review): `count` is defined and updated on lines not visible here.
155 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
# The whole state is dumped after every recorded result.
156 soltesz.dbDump(config.dbname, externalState)
158 # this will be called when an exception occurs within a thread
# request -- the failed work request; its requestID is printed.
# result  -- passed in by the thread pool; contents are not shown in this
#            excerpt.
159 def handle_exception(request, result):
160 print "Exception occured in request %s" % request.requestID
# NOTE(review): `i` is presumably a loop variable over `result`; the loop
# header is not visible here -- confirm.
162 print "Result: %s" % i
# Build a probe work request for every node whose stored round is older than
# the global round; nodes already at the current round are skipped.
#
# l_nodes -- iterable of node names to check.
# cohash  -- mapping passed through to collectPingAndSSH for each node.
#
# NOTE(review): the lines that queue the requests, the wait loop and the `try`
# header are not visible in this excerpt.
165 def checkAndRecordState(l_nodes, cohash):
168 global_round = externalState['round']
# Pool constructed with an argument of 20.
170 tp = threadpool.ThreadPool(20)
172 # CREATE all the work requests
173 for nodename in l_nodes:
# First time this node is seen: start it at round 0 with no values.
174 if nodename not in externalState['nodes']:
175 externalState['nodes'][nodename] = {'round': 0, 'values': []}
177 node_round = externalState['nodes'][nodename]['round']
178 if node_round < global_round:
179 # recreate node stats when refreshed
180 #print "%s" % nodename
# recordPingAndSSH and handle_exception are supplied as the trailing
# callback arguments of the request.
181 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
182 None, recordPingAndSSH, handle_exception)
185 # We just skip it, since it's "up to date"
# NOTE(review): `count` is defined on lines not visible here.
187 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
190 # WAIT while all the work requests are processed.
# Handlers for a user interrupt and for the pool reporting no pending results.
195 except KeyboardInterrupt:
198 except threadpool.NoResultsPending:
199 print "All results collected."
# Main flow: load the saved state, bump the round, fetch CoMon data, pick the
# node list, then probe the nodes.
# NOTE(review): the enclosing definition header and any condition guarding the
# round increment are not visible in this excerpt.
# Load state through soltesz.if_cached_else, supplying the in-memory default
# via the lambda.
207 externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
210 # update global round number to force refreshes across all nodes
211 externalState['round'] += 1
213 cotop = comon.Comon()
214 # lastcotop measures whether cotop is actually running. this is a better
215 # metric than sshstatus, or other values from CoMon
216 cotop_url = COMON_COTOPURL
# cohash is indexed by node name below and inside collectPingAndSSH.
218 cohash = cotop.coget(cotop_url)
# With no node file given, every node reported by CoMon is checked;
# otherwise the list is read from the file named by the filename option.
220 if config.filename == "":
221 l_nodes = cohash.keys()
223 l_nodes = config.getListFromFile(config.filename)
225 checkAndRecordState(l_nodes, cohash)
# Script entry point. On an exception, the error is printed and the state
# collected so far is saved.
# NOTE(review): the `try` header and the call it guards are not visible in
# this excerpt.
230 if __name__ == '__main__':
233 except Exception, err:
234 print "Exception: %s" % err
235 print "Saving data... exitting."
236 soltesz.dbDump(config.dbname, externalState)