7 from datetime import datetime,timedelta
11 from monitor.util import file
12 from monitor.util import command
13 from monitor import config
15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
17 from monitor.sources import comon
18 from monitor.wrapper import plc, plccache
19 from monitor.scanapi import *
21 from nodequery import verify,query_to_dict,node_select
23 from monitor.common import nmap_port_status
# NOTE(review): lines below carry embedded original-file line numbers from a
# mangled paste; gaps in that numbering mean content is elided here.
25 #print "starting sqlfindbad.py"
# CoMon "tabulator" CGI URL used to fetch per-node status columns
# (name, resptime, sshstatus, uptime, lastcotop, cpuspeed, memsize, disksize).
# The trailing select clause is commented out, so all rows are fetched.
27 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
28 "table=table_nodeview&" + \
29 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
32 #"select='lastcotop!=0'"
# Authenticated PLCAPI handle shared module-wide.
34 api = plc.getAuthAPI()
# Lock presumably serializing PLC API access from worker threads -- TODO
# confirm: its users are not visible in this chunk.
35 plc_lock = threading.Lock()
40 # this will be called when an exception occurs within a thread
41 def handle_exception(request, result):
42 print "Exception occured in request %s" % request.requestID
44 print "Result: %s" % i
47 def checkAndRecordState(l_nodes, cohash):
# Scan every node in l_nodes on a 20-worker thread pool and record results.
#   l_nodes -- iterable of hostnames to probe
#   cohash  -- CoMon-derived status map handed to the scanner (presumably
#              hostname-keyed -- TODO confirm against comon.Comon)
# NOTE(review): this body is elided (gaps in embedded numbering); the
# try/except below is missing its `try:` line, and `begin`/`count` are set
# on lines not shown here.
51 tp = threadpool.ThreadPool(20)
52 scannode = ScanNodeInternal(global_round)
54 # CREATE all the work requests
55 for nodename in l_nodes:
# Per-node sync record tracks the last completed scan round; new records
# start at round 0 so they are always behind and get scanned.
56 fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
57 node_round = fbnodesync.round
# Only (re)scan nodes behind the current global round, unless --force.
60 if node_round < global_round or config.force:
61 # recreate node stats when refreshed
62 #print "%s" % nodename
# Queue collectInternal(nodename, cohash); scannode.record is the success
# callback, handle_exception the failure callback.
63 req = threadpool.WorkRequest(scannode.collectInternal, [nodename, cohash], {},
64 None, scannode.record, handle_exception)
67 # We just skip it, since it's "up to date"
69 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
70 print "%d %s %s" % (count, nodename, node_round)
72 # WAIT while all the work requests are processed.
# Abort if the whole scan exceeds 1.5 hours (60*60*1.5 seconds; the old
# comment said "two hours" but the threshold is 1.5h). `begin` is assigned
# on an elided line above -- TODO confirm.
79 if time.time() - begin > (60*60*1.5):
80 print "findbad.py has run out of time!!!!!!"
82 except KeyboardInterrupt:
# threadpool signals completion of all queued requests with NoResultsPending.
85 except threadpool.NoResultsPending:
86 print "All results collected."
# Final record counts: quick sanity check of what was written to the DB.
89 print FindbadNodeRecordSync.query.count()
90 print FindbadNodeRecord.query.count()
# --- main scan flow (enclosing def, if any, is not visible in this chunk) ---
# Global sync record holds the scan round number shared by all nodes.
96 fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
97 if_new_set={'round' : global_round})
98 global_round = fbsync.round
101 # update global round number to force refreshes across all nodes
104 cotop = comon.Comon()
105 # lastcotop measures whether cotop is actually running. this is a better
106 # metric than sshstatus, or other values from CoMon
107 cotop_url = COMON_COTOPURL
109 # history information for all nodes
111 #cohash = cotop.coget(cotop_url)
# Start from the cached full node list, then narrow it by CLI options.
112 l_nodes = plccache.l_nodes
# --nodelist: keep only hostnames listed in the given file (guarding `if`
# line is elided).
114 f_nodes = file.getListFromFile(config.nodelist)
115 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
# --node: restrict to a single host (guarding `if`/`elif` line is elided).
117 f_nodes = [config.node]
118 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
119 elif config.nodegroup:
# --nodegroup: every node in the named PLC node group.
120 ng = api.GetNodeGroups({'name' : config.nodegroup})
121 l_nodes = api.GetNodes(ng[0]['node_ids'])
# --site: every node of one site (guarding `elif` line is elided).
123 site = api.GetSites(config.site)
124 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
125 elif config.sitelist:
# --sitelist: comma-separated sites; union of their node_ids (the `for s in
# sites:` line is elided).
126 site_list = config.sitelist.split(',')
127 sites = api.GetSites(site_list)
130 node_ids += s['node_ids']
131 l_nodes = api.GetNodes(node_ids, ['hostname'])
# Normalize node dicts down to bare hostname strings.
133 l_nodes = [node['hostname'] for node in l_nodes]
135 # perform this query after the above options, so that the filter above
# --nodeselect: replace the list with a node_select() query evaluated over
# all non-peer ({'peer_id': None}) PLC nodes.
137 if config.nodeselect:
138 plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
139 plcnodes = [ node['hostname'] for node in plcnodes ]
140 l_nodes = node_select(config.nodeselect, plcnodes, None)
142 print "fetching %s hosts" % len(l_nodes)
# Run the threaded scan. NOTE(review): the live `cohash = cotop.coget(...)`
# assignment appears to be on an elided line (the visible one is commented
# out) -- TODO confirm cohash's origin.
144 checkAndRecordState(l_nodes, cohash)
147 # update global round number to force refreshes across all nodes
# Persist the round number (possibly incremented on elided lines above).
148 fbsync.round = global_round
154 if __name__ == '__main__':
# Build the CLI: start from the shared 'nodesets' option group, add
# findbad-specific flags, then layer the common 'defaults' group on top.
155 from monitor import parser as parsermodule
157 parser = parsermodule.getParser(['nodesets'])
# NOTE(review): this set_defaults(...) call is truncated -- its closing
# arguments are on elided lines.
159 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False,
161 parser.add_option("", "--cachenodes", action="store_true",
162 help="Cache node lookup from PLC")
163 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
164 help="Specify the name of the database to which the information is saved")
165 parser.add_option("-i", "--increment", action="store_true", dest="increment",
166 help="Increment round number to force refresh or retry")
167 parser.add_option("", "--force", action="store_true", dest="force",
168 help="Force probe without incrementing global 'round'.")
170 parser = parsermodule.getParser(['defaults'], parser)
172 cfg = parsermodule.parse_args(parser)
# Top-level catch-all (its `try:` body is elided): print the traceback and
# the error, then report that data is being saved before exiting.
176 except Exception, err:
177 print traceback.print_exc()
178 print "Exception: %s" % err
179 print "Saving data... exitting."