7 from datetime import datetime,timedelta
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14 from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
15 from monitor.sources import comon
16 from monitor.wrapper import plc
19 from nodequery import verify,query_to_dict,node_select
# Startup banner printed as soon as the module is executed.
22 print "starting sqlfindbad.py"
# Base CoMon tabulator query: asks for the table_nodeview table and the
# columns listed in dumpcols.
# NOTE(review): the expression ends on a line continuation and its final
# operand is not visible in this excerpt -- confirm against the full file.
24 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
25 "table=table_nodeview&" + \
26 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
29 #"select='lastcotop!=0'"
# PLC API handle returned by plc.getAuthAPI(); used further down for the
# GetNodes / GetNodeGroups / GetSites calls.
31 api = plc.getAuthAPI()
# NOTE(review): `threading` is not among the import lines visible in this
# excerpt, and no use of plc_lock is visible either -- confirm both.
32 plc_lock = threading.Lock()
# Probe a single node and gather everything known about it into `values`.
#
# Visible steps: ping the node, run a data-gathering script over SSH (ports
# 22 and 806), classify the node from its kernel string and bm.log presence,
# interpret the bootcd / princeton_comon results, attach CoMon stats from
# `cohash`, then look up the node, its PCU ids and its site through `plc`.
#
# Parameters:
#   nodename -- hostname; interpolated into the ping command, passed to
#               command.SSH, and used as the key into `cohash`.
#   cohash   -- mapping of hostname to CoMon statistics.
# Returns:
#   the tuple (nodename, values).
#
# NOTE(review): this excerpt omits lines (the try/except/else structure, the
# initialisation of `values`, the heredoc terminator), so the exact control
# flow between the visible statements cannot be confirmed from here.
37 def collectPingAndSSH(nodename, cohash):
38 ### RUN PING ######################
40 (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
# 'ping' is recorded as either "NOPING" or "PING"; the condition choosing
# between them is not visible in this excerpt.
47 values['ping'] = "NOPING"
49 values['ping'] = "PING"
# The remote script below echoes '"key":"value",' fragments, one per probe;
# its collected output is later passed to eval() and merged into `values`.
52 for port in [22, 806]:
53 ssh = command.SSH('root', nodename, port)
55 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
57 echo ' "kernel":"'`uname -a`'",'
58 echo ' "bmlog":"'`ls /tmp/bm.log`'",'
59 echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
60 echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
61 echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
62 echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",'
63 echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
65 ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
67 echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
68 echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
72 values['ssherror'] = errval
74 #print "OVAL: %s" % oval
# SECURITY NOTE(review): eval() is applied to text produced on the remote
# node, so a misbehaving node could inject arbitrary Python here. A literal
# parser (e.g. ast.literal_eval) would be safer -- flagging, not changing.
75 values.update(eval(oval))
76 values['sshport'] = port
# Empty defaults for the SSH-derived keys; the condition under which this
# update runs is not visible in this excerpt.
79 values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '',
83 'princeton_comon' : "",
84 'princeton_comon_running' : "",
85 'princeton_comon_procs' : "", 'sshport' : None})
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so this statement additionally prints the text "None".
87 print traceback.print_exc()
90 ### RUN SSH ######################
92 #ssh = command.SSH('root', nodename)
95 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Classify the node by substrings of the `uname -a` output: the kernel version
# picks the category, and a bm.log listing picks between 'DEBUG' and 'BOOT'.
97 oval = values['kernel']
98 if "2.6.17" in oval or "2.6.2" in oval:
100 values['category'] = 'ALPHA'
101 if "bm.log" in values['bmlog']:
102 values['state'] = 'DEBUG'
104 values['state'] = 'BOOT'
105 elif "2.6.12" in oval or "2.6.10" in oval:
106 values['ssh'] = 'SSH'
107 values['category'] = 'PROD'
108 if "bm.log" in values['bmlog']:
109 values['state'] = 'DEBUG'
111 values['state'] = 'BOOT'
113 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why.
114 elif "2.4" in oval or "2.6.8" in oval:
115 b_getbootcd_id = False
116 values['ssh'] = 'SSH'
117 values['category'] = 'OLDBOOTCD'
118 values['state'] = 'DEBUG'
120 values['ssh'] = 'SSH'
121 values['category'] = 'UNKNOWN'
122 if "bm.log" in values['bmlog']:
123 values['state'] = 'DEBUG'
125 values['state'] = 'BOOT'
# Failure branch: the node is recorded as NOSSH / ERROR / DOWN.
# NOTE(review): `val` on the ssherror line below is not defined in any
# visible line -- confirm where it comes from.
128 b_getbootcd_id = False
129 values['ssh'] = 'NOSSH'
130 values['category'] = 'ERROR'
131 values['state'] = 'DOWN'
133 values['ssherror'] = val
134 values['kernel'] = ""
136 #values['kernel'] = val
139 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
140 #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
141 oval = values['bootcd']
143 values['bootcd'] = oval
# NOTE(review): `is not` compares object identity, not string contents, so
# these two hostname exclusions are unreliable; `!=` is almost certainly what
# was intended.
144 if "v2" in oval and \
145 ( nodename is not "planetlab1.cs.unc.edu" and \
146 nodename is not "planetlab2.cs.unc.edu" ):
147 values['category'] = 'OLDBOOTCD'
149 values['bootcd'] = ""
151 values['bootcd'] = ""
153 # TODO: get bm.log for debug nodes.
156 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
# The three princeton_comon checks are chained: `continue_slice_check` is
# cleared as soon as one of them comes back negative.
163 continue_slice_check = True
164 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
165 oval = values['princeton_comon']
166 if "princeton_comon" in oval:
167 values['princeton_comon'] = True
169 values['princeton_comon'] = False
170 continue_slice_check = False
172 if continue_slice_check:
173 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
174 oval = values['princeton_comon_running']
# Counted as running only when the listing is longer than the bare
# '/proc/virtual/' prefix.
175 if len(oval) > len('/proc/virtual/'):
176 values['princeton_comon_running'] = True
178 values['princeton_comon_running'] = False
179 continue_slice_check = False
181 values['princeton_comon_running'] = False
183 if continue_slice_check:
184 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
185 oval = values['princeton_comon_procs']
# The `wc -l` output is converted to an int; the fallback below stores None.
186 values['princeton_comon_procs'] = int(oval)
188 values['princeton_comon_procs'] = None
# Attach the CoMon stats for this node when `cohash` has them; otherwise a
# placeholder dict is stored (only its first key is visible in this excerpt).
191 if nodename in cohash:
192 values['comonstats'] = cohash[nodename]
194 values['comonstats'] = {'resptime': '-1',
201 # include output value
202 ### GET PLC NODE ######################
# First record returned by plc.getNodes for this hostname, limited to the
# listed fields.
206 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
207 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
209 traceback.print_exc()
211 values['plcnode'] = d_node
213 ### GET PLC PCU ######################
# NOTE(review): how `d_pcu` is derived from `pcu` is not visible here.
217 pcu = d_node['pcu_ids']
221 site_id = d_node['site_id']
223 values['pcu'] = d_pcu
225 ### GET PLC SITE ######################
# loginbase starts as "" and is overwritten from the site record's login_base.
228 values['loginbase'] = ""
230 d_site = plc.getSites({'site_id': site_id},
231 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
232 values['loginbase'] = d_site['login_base']
234 traceback.print_exc()
237 values['plcsite'] = d_site
# Timestamp of this check, as seconds since the epoch.
238 values['date_checked'] = time.time()
# NOTE(review): as above, this prints the traceback and then "None".
240 print traceback.print_exc()
242 return (nodename, values)
# Store the outcome of one collectPingAndSSH run in the database.
#
# Parameters:
#   request -- the work request this result belongs to (not used in the
#              visible lines).
#   result  -- the (nodename, values) tuple returned by collectPingAndSSH.
#
# This function is handed to threadpool.WorkRequest together with
# collectPingAndSSH, presumably as the result callback.
#
# NOTE(review): lines are missing from this excerpt (try/except structure and
# any `global` declarations), so the notes below are hedged accordingly.
244 def recordPingAndSSH(request, result):
247 (nodename, values) = result
250 if values is not None:
# NOTE(review): `global_round` is read here before the assignment two lines
# below; unless it is declared `global` in an omitted line this would raise
# UnboundLocalError -- confirm.
251 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
252 if_new_set={'round' : global_round})
253 global_round = fbsync.round
254 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
255 if_new_set={'round' : global_round})
# Build one FindbadNodeRecord from the collected values. The ping and ssh
# results are stored as booleans via the == "PING" / == "SSH" comparisons.
257 fbrec = FindbadNodeRecord(
258 date_checked=datetime.fromtimestamp(values['date_checked']),
260 loginbase=values['loginbase'],
261 kernel_version=values['kernel'],
262 bootcd_version=values['bootcd'],
263 nm_status=values['nm'],
264 fs_status=values['readonlyfs'],
265 dns_status=values['dns'],
266 princeton_comon_dir=values['princeton_comon'],
267 princeton_comon_running=values['princeton_comon_running'],
268 princeton_comon_procs=values['princeton_comon_procs'],
269 plc_node_stats = values['plcnode'],
270 plc_site_stats = values['plcsite'],
271 plc_pcuid = values['pcu'],
272 comon_stats = values['comonstats'],
273 ping_status = (values['ping'] == "PING"),
274 ssh_portused = values['sshport'],
275 ssh_status = (values['ssh'] == "SSH"),
276 ssh_error = values['ssherror'],
277 observed_status = values['state'],
# Mark this node as refreshed for the current round.
279 fbnodesync.round = global_round
# NOTE(review): `count` is not assigned in any visible line of this function;
# presumably a module-level counter -- confirm.
282 print "%d %s %s" % (count, nodename, values)
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so this statement additionally prints the text "None".
285 print traceback.print_exc()
287 # this will be called when an exception occurs within a thread
# Parameters:
#   request -- the failed work request; its requestID is printed.
#   result  -- the exception information supplied for that request.
# NOTE(review): `i` below is not bound by any visible line; the loop header
# that iterates over `result` appears to be missing from this excerpt.
288 def handle_exception(request, result):
289 print "Exception occured in request %s" % request.requestID
291 print "Result: %s" % i
# Queue a collectPingAndSSH work request for every node whose stored round is
# behind the global round, then wait for the thread pool to finish.
#
# Parameters:
#   l_nodes -- iterable of hostnames to consider.
#   cohash  -- CoMon statistics, passed through to collectPingAndSSH.
#
# NOTE(review): this excerpt omits lines (the request submission, the wait
# loop header, the `begin` timestamp, `global_round` / `count` handling), so
# only the visible statements are described.
294 def checkAndRecordState(l_nodes, cohash):
# Pool constructed with the argument 20 (presumably the worker count).
298 tp = threadpool.ThreadPool(20)
300 # CREATE all the work requests
301 for nodename in l_nodes:
# A node seen for the first time starts at round 0.
302 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
304 node_round = fbnodesync.round
305 if node_round < global_round:
306 # recreate node stats when refreshed
307 #print "%s" % nodename
# recordPingAndSSH and handle_exception are supplied alongside the worker
# function when the request is built.
308 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
309 None, recordPingAndSSH, handle_exception)
312 # We just skip it, since it's "up to date"
314 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
315 print "%d %s %s" % (count, nodename, node_round)
317 # WAIT while all the work requests are processed.
323 # if more than an hour and a half (60*60*1.5 seconds)
324 if time.time() - begin > (60*60*1.5):
325 print "findbad.py has run out of time!!!!!!"
327 except KeyboardInterrupt:
# threadpool.NoResultsPending is treated as the normal "all done" signal.
330 except threadpool.NoResultsPending:
331 print "All results collected."
# Report how many rows each table holds at the end of the run.
334 print FindbadNodeRecordSync.query.count()
335 print FindbadNodeRecord.query.count()
# NOTE(review): the `def` line of this function is not visible in this
# excerpt (presumably the script's main routine). The visible statements load
# the global round, fetch CoMon data, choose the node list from the `config`
# options, and hand that list to checkAndRecordState.
340 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
341 if_new_set={'round' : global_round})
342 global_round = fbsync.round
345 # update global round number to force refreshes across all nodes
# NOTE(review): the increment itself, and the condition guarding it, are not
# visible here; only the write-back to fbsync is.
347 fbsync.round = global_round
349 cotop = comon.Comon()
350 # lastcotop measures whether cotop is actually running. this is a better
351 # metric than sshstatus, or other values from CoMon
352 cotop_url = COMON_COTOPURL
354 # history information for all nodes
356 cohash = cotop.coget(cotop_url)
357 l_nodes = syncplcdb.create_plcdb()
# Node selection: the candidate list is narrowed by a node-list file, a single
# node, a node group, or a site. Only the `elif config.nodegroup` condition is
# visible; the other branch conditions are omitted from this excerpt.
359 f_nodes = util.file.getListFromFile(config.nodelist)
360 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
362 f_nodes = [config.node]
363 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
364 elif config.nodegroup:
365 ng = api.GetNodeGroups({'name' : config.nodegroup})
366 l_nodes = api.GetNodes(ng[0]['node_ids'])
368 site = api.GetSites(config.site)
369 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
# From here on l_nodes holds plain hostnames rather than node records.
371 l_nodes = [node['hostname'] for node in l_nodes]
373 # perform this query after the above options, so that the filter above
# With config.nodeselect set, l_nodes is replaced by whatever node_select
# returns for the hostnames of nodes whose peer_id is None.
375 if config.nodeselect:
376 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
377 plcnodes = [ node['hostname'] for node in plcnodes ]
378 l_nodes = node_select(config.nodeselect, plcnodes, None)
380 print "fetching %s hosts" % len(l_nodes)
382 checkAndRecordState(l_nodes, cohash)
# Script entry point: build the option parser, parse the command line, and
# report any exception that escapes the run.
# NOTE(review): the `try:` line and the statement(s) it guards are omitted
# from this excerpt.
387 if __name__ == '__main__':
388 from monitor import parser as parsermodule
# Start from the shared 'nodesets' option group, then add the options
# specific to this script.
390 parser = parsermodule.getParser(['nodesets'])
392 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
393 parser.add_option("", "--cachenodes", action="store_true",
394 help="Cache node lookup from PLC")
395 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
396 help="Specify the name of the database to which the information is saved")
397 parser.add_option("-i", "--increment", action="store_true", dest="increment",
398 help="Increment round number to force refresh or retry")
# Layer the 'defaults' option group on top of the parser built above.
400 parser = parsermodule.getParser(['defaults'], parser)
402 cfg = parsermodule.parse_args(parser)
406 except Exception, err:
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so this statement additionally prints the text "None".
407 print traceback.print_exc()
408 print "Exception: %s" % err
409 print "Saving data... exitting."
412 #print "final commit"