7 from datetime import datetime,timedelta
11 from monitor.util import file
12 from pcucontrol.util import command
13 from monitor import config
15 from monitor.database.info.model import FindbadNodeRecord, session
17 from monitor.sources import comon
18 from monitor.wrapper import plc, plccache
19 from monitor.scanapi import *
21 from nodequery import verify,query_to_dict,node_select
23 from monitor.common import nmap_port_status
25 #print "starting sqlfindbad.py"
# CoMon "tabulator" URL used to fetch per-node status columns
# (name, resptime, sshstatus, uptime, lastcotop, ...).
# NOTE(review): the trailing '+ \' on the dumpcols line continues into a
# commented-out line, so this statement looks truncated in this copy of
# the file — confirm the final 'select=...' term against the full source.
27 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
28 "table=table_nodeview&" + \
29 "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
32 #"select='lastcotop!=0'"
# Authenticated handle to the PLC XML-RPC API (see monitor.wrapper.plc).
34 api = plc.getAuthAPI()
# Module-level lock — presumably serializes PLC API access across the
# worker threads created below; its acquire/release sites are not
# visible in this chunk, so confirm usage against the full source.
35 plc_lock = threading.Lock()
40 # this will be called when an exception occurs within a thread
41 def handle_exception(request, result):
# Thread-pool error callback: logs the id of the failing WorkRequest
# (wired up as the exc_callback in checkAndRecordState below).
42 print "Exception occured in request %s" % request.requestID
# NOTE(review): 'i' is undefined here — a loop header such as
# 'for i in result:' appears to be missing from this copy of the file;
# confirm against the full source before relying on this function.
44 print "Result: %s" % i
47 def checkAndRecordState(l_nodes, cohash):
# Scan every hostname in l_nodes using a 20-worker thread pool and
# persist the results through ScanNodeInternal; 'cohash' carries CoMon
# status data passed to each scan.  Relies on a module-level
# 'global_round' that is not visible in this chunk.
51 tp = threadpool.ThreadPool(20)
52 scannode = ScanNodeInternal(global_round)
54 # CREATE all the work requests
55 for nodename in l_nodes:
56 #fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
57 #node_round = fbnodesync.round
# Per-node round tracking was removed (see commented code above); every
# node is treated as one round behind so it is always rescanned.
58 node_round = global_round - 1
61 if node_round < global_round or config.force:
62 # recreate node stats when refreshed
63 #print "%s" % nodename
# Queue the scan; scannode.record handles success, handle_exception
# (above) handles failures.
64 req = threadpool.WorkRequest(scannode.collectInternal, [nodename, cohash], {},
65 None, scannode.record, handle_exception)
68 # We just skip it, since it's "up to date"
70 #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
# NOTE(review): 'count' is undefined in this copy — its initialization
# (and the tp.putRequest call for the request built above) appear to be
# among the lines missing from this view.
71 print "%d %s %s" % (count, nodename, node_round)
73 # WAIT while all the work requests are processed.
# NOTE(review): the 'try:'/loop that should enclose the wait, and the
# initialization of 'begin' (a time.time() start stamp), are missing
# from this copy — the bare 'except' clauses below confirm a 'try'
# exists in the full source.
79 # if more than 1.5 hours
80 if time.time() - begin > (60*60*1.5):
81 print "findbad.py has run out of time!!!!!!"
83 except KeyboardInterrupt:
# threadpool raises NoResultsPending once every queued request has been
# collected — this is the normal exit from the wait loop.
86 except threadpool.NoResultsPending:
87 print "All results collected."
90 #print FindbadNodeRecordSync.query.count()
# Report how many node records are now in the database.
91 print FindbadNodeRecord.query.count()
# NOTE(review): this span reads as the body of a main()/driver function
# whose 'def' line is outside this copy, along with several statements:
# the 'cohash' initialization, the 'if config.nodelist:' /
# 'elif config.node:' headers for the branches below, the 'node_ids = []'
# init and 'for s in sites:' header, and the 'elif config.site:' header.
# Confirm structure against the full source before editing.
97 #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
98 # if_new_set={'round' : global_round})
99 #global_round = fbsync.round
102 # update global round number to force refreshes across all nodes
105 cotop = comon.Comon()
106 # lastcotop measures whether cotop is actually running. this is a better
107 # metric than sshstatus, or other values from CoMon
108 cotop_url = COMON_COTOPURL
110 # history information for all nodes
112 #cohash = cotop.coget(cotop_url)
# Start from the cached full node list, then narrow it by whichever
# config option is set (nodelist file, single node, nodegroup, site(s)).
113 l_nodes = plccache.l_nodes
115 f_nodes = file.getListFromFile(config.nodelist)
116 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
118 f_nodes = [config.node]
119 l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
120 elif config.nodegroup:
121 ng = api.GetNodeGroups({'name' : config.nodegroup})
122 l_nodes = plccache.GetNodesByIds(ng[0]['node_ids'])
124 site = plccache.GetSitesByName([config.site])
125 l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
126 elif config.sitelist:
# Comma-separated list of site names; collect node ids from every site.
127 site_list = config.sitelist.split(',')
128 sites = plccache.GetSitesByName(site_list)
131 node_ids += s['node_ids']
132 l_nodes = plccache.GetNodesByIds(node_ids)
# Reduce node dicts to bare hostnames for the scan below.
134 l_nodes = [node['hostname'] for node in l_nodes]
136 # perform this query after the above options, so that the filter above
# --nodeselect replaces (rather than narrows) the hostname list, using
# the node_select query helper over the full cached node set.
138 if config.nodeselect:
139 plcnodes = plccache.l_nodes
140 plcnodes = [ node['hostname'] for node in plcnodes ]
141 l_nodes = node_select(config.nodeselect, plcnodes, None)
143 print "fetching %s hosts" % len(l_nodes)
145 checkAndRecordState(l_nodes, cohash)
148 # update global round number to force refreshes across all nodes
149 #fbsync.round = global_round
156 if __name__ == '__main__':
# Build the command-line parser (nodeset options plus findbad-specific
# flags), parse into the shared monitor config, then run the scan.
157 from monitor import parser as parsermodule
159 parser = parsermodule.getParser(['nodesets'])
# NOTE(review): this set_defaults call has no visible closing paren —
# its remaining default values are missing from this copy of the file.
161 parser.set_defaults( increment=False, dbname="findbad", cachenodes=False,
163 parser.add_option("", "--cachenodes", action="store_true",
164 help="Cache node lookup from PLC")
165 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
166 help="Specify the name of the database to which the information is saved")
167 parser.add_option("-i", "--increment", action="store_true", dest="increment",
168 help="Increment round number to force refresh or retry")
169 parser.add_option("", "--force", action="store_true", dest="force",
170 help="Force probe without incrementing global 'round'.")
172 parser = parsermodule.getParser(['defaults'], parser)
174 cfg = parsermodule.parse_args(parser)
# NOTE(review): the 'try:' block this except belongs to (presumably
# wrapping the main() call) is missing from this copy of the file.
178 except Exception, err:
# NOTE(review): traceback.print_exc() writes to stderr and returns
# None, so this line also prints 'None' — likely meant to be a bare
# traceback.print_exc() call; left unchanged pending confirmation.
179 print traceback.print_exc()
180 from monitor.common import email_exception
182 print "Exception: %s" % err
183 print "Saving data... exitting."