7 from datetime import datetime,timedelta
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14 from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
15 from monitor.sources import comon
16 from monitor.wrapper import plc, plccache
18 from nodequery import verify,query_to_dict,node_select
# Banner printed as soon as the module is executed.
21 print "starting sqlfindbad.py"
# Base URL of the CoMon tabulator query: it asks for table_nodeview and the
# columns named in dumpcols.
# NOTE(review): the expression ends on a line continuation whose final operand
# is not visible in this listing.
23 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
24 "table=table_nodeview&" + \
25 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
28 #"select='lastcotop!=0'"
# PLC API handle from plc.getAuthAPI(); the node-selection code further down
# calls GetNodeGroups / GetSites / GetNodes on it.
30 api = plc.getAuthAPI()
# Module-level lock.
# NOTE(review): no acquire/release of plc_lock appears in the lines visible
# here -- confirm it is still used.
31 plc_lock = threading.Lock()
# Probe one node and collect the facts the monitor stores about it.
#   nodename -- hostname to probe; it is interpolated into the ping command,
#               handed to command.SSH and used for the PLC node lookup.
#   cohash   -- container tested with `nodename in cohash`; the matching entry
#               becomes values['comonstats'].
# Returns the tuple (nodename, values), where values is the dict filled in
# below (ping, ssh, sshport, ssherror, kernel, bootcd, category, state,
# princeton_comon*, comonstats, plcnode, pcu, plcsite, loginbase,
# date_checked).
# NOTE(review): several lines of this function (try/except/else headers and
# some conditions) are not visible in this listing, so the exact nesting of
# the statements below cannot be read off from here.
36 def collectPingAndSSH(nodename, cohash):
37 ### RUN PING ######################
# One echo request (-c 1) in quiet mode (-q); `grep rtt` keeps only the
# round-trip summary line of the ping output.
39 (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
# The condition choosing between the two outcomes is not visible here;
# presumably empty ping output means NOPING -- confirm.
46 values['ping'] = "NOPING"
48 values['ping'] = "PING"
# Try the SSH probe on port 22 first, then on port 806.
51 for port in [22, 806]:
52 ssh = command.SSH('root', nodename, port)
# The remote script echoes one '"key":"value",' line per fact; its output is
# passed to eval() further down and merged into values.
54 (oval, errval) = ssh.run_noexcept2(""" <<\EOF
56 echo ' "kernel":"'`uname -a`'",'
57 echo ' "bmlog":"'`ls /tmp/bm.log`'",'
58 echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
59 echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
60 echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
61 echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",'
62 echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
64 ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
65 echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
66 echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
# Keep whatever the SSH command wrote as its error output.
70 values['ssherror'] = errval
72 #print "OVAL: %s" % oval
# NOTE(review): eval() is applied to text produced on the remote node, so a
# misbehaving or compromised node can make this process evaluate arbitrary
# Python expressions. Flagged for replacement with a safe parser.
73 values.update(eval(oval))
# Remember which port produced the output.
74 values['sshport'] = port
# Blank defaults for the SSH-derived keys, so later lookups such as
# values['kernel'] and values['bmlog'] do not fail.
77 values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '',
81 'princeton_comon' : "",
82 'princeton_comon_running' : "",
83 'princeton_comon_procs' : "", 'sshport' : None})
# NOTE(review): traceback.print_exc() prints the traceback itself and returns
# None, so this statement additionally prints the text "None".
85 print traceback.print_exc()
88 ### RUN SSH ######################
90 #ssh = command.SSH('root', nodename)
93 #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
# Classify the node from the `uname -a` text captured above.
95 oval = values['kernel']
# Substring tests: "2.6.2" also matches 2.6.20 through 2.6.29 kernel strings.
96 if "2.6.17" in oval or "2.6.2" in oval:
98 values['category'] = 'PROD'
# A listed /tmp/bm.log marks the node as DEBUG; the other assignment is BOOT.
99 if "bm.log" in values['bmlog']:
100 values['state'] = 'DEBUG'
102 values['state'] = 'BOOT'
103 elif "2.6.12" in oval or "2.6.10" in oval:
104 values['ssh'] = 'SSH'
105 values['category'] = 'OLDPROD'
106 if "bm.log" in values['bmlog']:
107 values['state'] = 'DEBUG'
109 values['state'] = 'BOOT'
111 # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why.
112 elif "2.4" in oval or "2.6.8" in oval:
# b_getbootcd_id is cleared on this path; the test that reads the flag is not
# visible in this listing.
113 b_getbootcd_id = False
114 values['ssh'] = 'SSH'
115 values['category'] = 'OLDBOOTCD'
116 values['state'] = 'DEBUG'
# Kernel string matched none of the patterns above: SSH answered, but the
# category is recorded as UNKNOWN.
118 values['ssh'] = 'SSH'
119 values['category'] = 'UNKNOWN'
120 if "bm.log" in values['bmlog']:
121 values['state'] = 'DEBUG'
123 values['state'] = 'BOOT'
# Failure path: the node is recorded as NOSSH / ERROR / DOWN. The condition
# selecting this path is not visible here.
126 b_getbootcd_id = False
127 values['ssh'] = 'NOSSH'
128 values['category'] = 'ERROR'
129 values['state'] = 'DOWN'
# NOTE(review): `val` is not assigned in any line visible in this listing --
# confirm where it comes from.
131 values['ssherror'] = val
132 values['kernel'] = ""
134 #values['kernel'] = val
137 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
138 #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
139 oval = values['bootcd']
141 values['bootcd'] = oval
# NOTE(review): `is not` compares object identity, not string contents. Against
# a string literal it does not test equality, so the two UNC hosts are likely
# not exempted as intended; `!=` is the value comparison.
142 if "v2" in oval and \
143 ( nodename is not "planetlab1.cs.unc.edu" and \
144 nodename is not "planetlab2.cs.unc.edu" ):
145 values['category'] = 'OLDBOOTCD'
147 values['bootcd'] = ""
149 values['bootcd'] = ""
151 # TODO: get bm.log for debug nodes.
154 #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
# The princeton_comon checks below are chained: each later check runs only
# while continue_slice_check is still True.
161 continue_slice_check = True
162 #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
163 oval = values['princeton_comon']
# Replace the raw `ls -d /vservers/princeton_comon` output with a boolean.
164 if "princeton_comon" in oval:
165 values['princeton_comon'] = True
167 values['princeton_comon'] = False
168 continue_slice_check = False
170 if continue_slice_check:
171 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
172 oval = values['princeton_comon_running']
# True only when the listing is longer than the bare '/proc/virtual/' prefix,
# i.e. an ID was appended to the path.
173 if len(oval) > len('/proc/virtual/'):
174 values['princeton_comon_running'] = True
176 values['princeton_comon_running'] = False
177 continue_slice_check = False
179 values['princeton_comon_running'] = False
181 if continue_slice_check:
182 #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
183 oval = values['princeton_comon_procs']
# Convert the `wc -l` text into an integer process count.
184 values['princeton_comon_procs'] = int(oval)
186 values['princeton_comon_procs'] = None
# Attach the CoMon entry for this node when one exists; otherwise a placeholder
# dict is stored (only its first key is visible in this listing).
189 if nodename in cohash:
190 values['comonstats'] = cohash[nodename]
192 values['comonstats'] = {'resptime': '-1',
199 # include output value
200 ### GET PLC NODE ######################
# Look the node up by hostname, requesting only the listed fields, and take
# the first element of the result.
204 d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created',
205 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
207 traceback.print_exc()
209 values['plcnode'] = d_node
211 ### GET PLC PCU ######################
# NOTE(review): the lines that derive d_pcu from `pcu` are not visible here.
215 pcu = d_node['pcu_ids']
219 site_id = d_node['site_id']
221 values['pcu'] = d_pcu
223 ### GET PLC SITE ######################
# loginbase starts out empty and is overwritten once the site lookup succeeds.
226 values['loginbase'] = ""
228 d_site = plc.getSites({'site_id': site_id},
229 ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
230 values['loginbase'] = d_site['login_base']
232 traceback.print_exc()
235 values['plcsite'] = d_site
# Epoch timestamp of this probe; recordPingAndSSH converts it with
# datetime.fromtimestamp().
236 values['date_checked'] = time.time()
# NOTE(review): same pattern as above -- print_exc() returns None, so "None"
# is printed after the traceback.
238 print traceback.print_exc()
240 return (nodename, values)
# Store one collectPingAndSSH result as a FindbadNodeRecord and stamp the node
# with the current round.
#   request -- the work request; it is not referenced in the lines visible here.
#   result  -- the (nodename, values) tuple returned by collectPingAndSSH.
# checkAndRecordState passes this function to threadpool.WorkRequest together
# with collectPingAndSSH (presumably as the result callback -- confirm against
# the threadpool API).
# NOTE(review): some lines of this function are not visible in this listing.
242 def recordPingAndSSH(request, result):
245 (nodename, values) = result
# Nothing is recorded when the probe produced no values dict.
248 if values is not None:
# The record with hostname "global" carries the shared round number.
# NOTE(review): global_round is read here and assigned on the next statement;
# without a `global global_round` declaration (none is visible in this
# listing) that would raise UnboundLocalError -- confirm one exists.
249 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
250 if_new_set={'round' : global_round})
251 global_round = fbsync.round
# Per-node sync record; presumably created with the current round when the
# node has none yet.
252 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
253 if_new_set={'round' : global_round})
# Build the history record from the collected values. ping_status and
# ssh_status are stored as booleans derived from the "PING" / "SSH" markers.
255 fbrec = FindbadNodeRecord(
256 date_checked=datetime.fromtimestamp(values['date_checked']),
259 loginbase=values['loginbase'],
260 kernel_version=values['kernel'],
261 bootcd_version=values['bootcd'],
262 nm_status=values['nm'],
263 fs_status=values['readonlyfs'],
264 dns_status=values['dns'],
265 princeton_comon_dir=values['princeton_comon'],
266 princeton_comon_running=values['princeton_comon_running'],
267 princeton_comon_procs=values['princeton_comon_procs'],
268 plc_node_stats = values['plcnode'],
269 plc_site_stats = values['plcsite'],
270 plc_pcuid = values['pcu'],
271 comon_stats = values['comonstats'],
272 ping_status = (values['ping'] == "PING"),
273 ssh_portused = values['sshport'],
274 ssh_status = (values['ssh'] == "SSH"),
275 ssh_error = values['ssherror'],
276 observed_status = values['state'],
277 observed_category = values['category'],
# Stamp the node with the current round, so the `node_round < global_round`
# test in checkAndRecordState no longer selects it.
279 fbnodesync.round = global_round
# Progress line; `count` is not assigned in the lines visible here.
282 print "%d %s %s" % (count, nodename, values)
# NOTE(review): traceback.print_exc() returns None, so this also prints "None".
285 print traceback.print_exc()
287 # this will be called when an exception occurs within a thread
#   request -- the work request that failed; only its requestID is printed.
#   result  -- presumably the exception information supplied by the thread
#              pool; each item `i` is printed on its own line.
# NOTE(review): the loop header that binds `i` is not visible in this listing.
288 def handle_exception(request, result):
289 print "Exception occured in request %s" % request.requestID
291 print "Result: %s" % i
# Queue a collectPingAndSSH work request for every node whose stored round is
# behind global_round, then wait until the thread pool has delivered all
# results.
#   l_nodes -- iterable of hostnames to consider.
#   cohash  -- passed through unchanged to collectPingAndSSH.
# NOTE(review): some lines of this function (loop/try headers, the statements
# that enqueue the request and poll the pool) are not visible in this listing.
294 def checkAndRecordState(l_nodes, cohash):
# Pool constructed with the argument 20 (the worker count in the threadpool
# library's API -- confirm).
298 tp = threadpool.ThreadPool(20)
300 # CREATE all the work requests
301 for nodename in l_nodes:
# Sync record for this node; presumably a node seen for the first time starts
# at round 0, which is below any incremented global round.
302 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
304 node_round = fbnodesync.round
# Only nodes not yet checked in the current round get a work request.
305 if node_round < global_round:
306 # recreate node stats when refreshed
307 #print "%s" % nodename
# recordPingAndSSH and handle_exception are handed over alongside the worker
# function and its positional arguments [nodename, cohash].
308 req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
309 None, recordPingAndSSH, handle_exception)
312 # We just skip it, since it's "up to date"
314 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
315 print "%d %s %s" % (count, nodename, node_round)
317 # WAIT while all the work requests are processed.
323 # if more than two hours
# NOTE(review): the limit tested below is 60*60*1.5 seconds, i.e. 1.5 hours,
# not the two hours stated in the comment above.
324 if time.time() - begin > (60*60*1.5):
325 print "findbad.py has run out of time!!!!!!"
327 except KeyboardInterrupt:
# threadpool.NoResultsPending is the signal that nothing is left to collect.
330 except threadpool.NoResultsPending:
331 print "All results collected."
# Print the row counts of both tables once collection has finished.
334 print FindbadNodeRecordSync.query.count()
335 print FindbadNodeRecord.query.count()
# NOTE(review): the `def` header owning the statements below is not visible in
# this listing (presumably the script's main routine -- confirm). Several
# `if`/`elif` headers in the node-selection part are not visible either.
#
# Load the shared round number from the sync record with hostname "global".
340 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
341 if_new_set={'round' : global_round})
342 global_round = fbsync.round
345 # update global round number to force refreshes across all nodes
# NOTE(review): the statement that changes global_round and its guarding
# condition are not visible here; presumably this is tied to the --increment
# option defined in the __main__ block.
347 fbsync.round = global_round
349 cotop = comon.Comon()
350 # lastcotop measures whether cotop is actually running. this is a better
351 # metric than sshstatus, or other values from CoMon
352 cotop_url = COMON_COTOPURL
354 # history information for all nodes
# CoMon data fetched once; checkAndRecordState hands it to every
# collectPingAndSSH call as cohash.
356 cohash = cotop.coget(cotop_url)
# Start from the node list held by plccache, then narrow it according to the
# configured selection.
357 l_nodes = plccache.l_nodes
# Keep only the nodes whose hostname appears in the file named by
# config.nodelist.
359 f_nodes = util.file.getListFromFile(config.nodelist)
360 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
# Single-node selection: keep only config.node.
362 f_nodes = [config.node]
363 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
# Node-group selection: fetch the group by name and load its member nodes.
364 elif config.nodegroup:
365 ng = api.GetNodeGroups({'name' : config.nodegroup})
366 l_nodes = api.GetNodes(ng[0]['node_ids'])
# Site selection: load the hostnames of the first matching site's nodes.
368 site = api.GetSites(config.site)
369 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
# From here on l_nodes is a list of hostname strings instead of node dicts.
371 l_nodes = [node['hostname'] for node in l_nodes]
373 # perform this query after the above options, so that the filter above
# When config.nodeselect is set, l_nodes is replaced by the result of
# node_select() run over the hostnames of all nodes with peer_id None.
375 if config.nodeselect:
376 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
377 plcnodes = [ node['hostname'] for node in plcnodes ]
378 l_nodes = node_select(config.nodeselect, plcnodes, None)
380 print "fetching %s hosts" % len(l_nodes)
# Probe every selected node and record the results.
382 checkAndRecordState(l_nodes, cohash)
# Script entry point: build the option parser, parse the command line, and
# report any exception raised by the run.
# NOTE(review): the statements executed inside the `try` are not visible in
# this listing.
387 if __name__ == '__main__':
388 from monitor import parser as parsermodule
# Start from the shared 'nodesets' option group.
390 parser = parsermodule.getParser(['nodesets'])
# Defaults for the three options added below.
392 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
393 parser.add_option("", "--cachenodes", action="store_true",
394 help="Cache node lookup from PLC")
395 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
396 help="Specify the name of the database to which the information is saved")
397 parser.add_option("-i", "--increment", action="store_true", dest="increment",
398 help="Increment round number to force refresh or retry")
# Extend the same parser with the 'defaults' option group.
400 parser = parsermodule.getParser(['defaults'], parser)
402 cfg = parsermodule.parse_args(parser)
# Python 2 `except Exception, err` syntax.
406 except Exception, err:
# NOTE(review): traceback.print_exc() returns None, so this also prints "None".
407 print traceback.print_exc()
408 print "Exception: %s" % err
409 print "Saving data... exitting."
412 #print "final commit"