1e412bc83799a70d76ffbca8a122abad7d6b9823
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14 from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
15 from monitor.sources import comon
16 from monitor.wrapper import plc, plccache
17
18 from nodequery import verify,query_to_dict,node_select
19 import traceback
20
print("starting sqlfindbad.py")

# CoMon "tabulator" query used by main() to fetch per-node statistics as CSV.
# Only the columns listed in `dumpcols` are requested.  No `select=` filter is
# applied (a `select='lastcotop!=0'` filter was tried in the past and left
# disabled), so the query covers every node CoMon reports.
COMON_COTOPURL = (
    "http://summer.cs.princeton.edu/status/tabulator.cgi?"
    "table=table_nodeview&"
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
    "formatcsv"
)

# Authenticated PLC API handle, used by main() for node/site/nodegroup queries.
api = plc.getAuthAPI()

# Held around the plc.getNodes()/plc.getSites() calls made in collectPingAndSSH.
plc_lock = threading.Lock()

# NOTE: `round` shadows the builtin of the same name at module scope; the name
# is kept because it is part of this module's existing namespace.
round = 1

# Current refresh round; replaced by the value stored in the database once
# main()/recordPingAndSSH() have looked up the "global" sync record.
global_round = round

# Running count of nodes handled (skipped or recorded) during this run.
count = 0
35
# Hostnames excluded from the "BootCD v2 => OLDBOOTCD" downgrade below.
_BOOTCD_V2_EXEMPT_NODES = ("planetlab1.cs.unc.edu", "planetlab2.cs.unc.edu")


def _state_from_bmlog(bmlog):
    """Return 'DEBUG' if the `ls /tmp/bm.log` output names the file, else 'BOOT'."""
    if "bm.log" in bmlog:
        return 'DEBUG'
    return 'BOOT'


def collectPingAndSSH(nodename, cohash):
    """Probe one node with ping and SSH and collect its CoMon/PLC context.

    Parameters:
        nodename -- hostname of the node to probe.
        cohash   -- mapping of hostname -> CoMon statistics; nodes missing
                    from it get a placeholder entry of '-1'/'null' values.

    Returns:
        (nodename, values), where `values` is a dict holding, on a complete
        run: 'ping', 'ssh', 'sshport', 'ssherror', 'kernel', 'bmlog',
        'bootcd', 'nm', 'readonlyfs', 'dns', 'princeton_comon',
        'princeton_comon_running', 'princeton_comon_procs', 'category',
        'state', 'comonstats', 'plcnode', 'pcu', 'loginbase', 'plcsite' and
        'date_checked'.

    Any unexpected error after the ping is printed as a traceback and the
    (then incomplete) `values` dict is still returned; this function does
    not raise for such errors.
    """
    values = {}

    ### RUN PING ######################
    ping = command.CMD()
    (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

    try:
        # Empty output means the `grep rtt` filter matched nothing.
        if oval == "":
            values['ping'] = "NOPING"
        else:
            values['ping'] = "PING"

        ### RUN SSH ######################
        try:
            # Try each port in order and stop at the first one that answers.
            for port in [22, 806]:
                ssh = command.SSH('root', nodename, port)

                # The remote script prints a Python dict literal, one key per
                # probe.  The text of the script is kept exactly as it was.
                # NOTE(review): the closing EOF line carries trailing
                # whitespace; confirm the shell still treats it as the
                # here-document terminator.
                (oval, errval) = ssh.run_noexcept2(""" <<\\EOF
                                        echo "{"
                                        echo '  "kernel":"'`uname -a`'",'
                                        echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                        echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
                                        echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
                                        echo '  "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
                                        echo '  "dns":"'`host boot.planet-lab.org 2>&1`'",'
                                        echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'

                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo "}"
EOF                             """)

                values['ssherror'] = errval
                if len(oval) > 0:
                    # SECURITY: eval() executes whatever text the node sent
                    # back, so a compromised node can run arbitrary code in
                    # this process.  Left in place to preserve behaviour;
                    # a literal-only parser should replace it.
                    values.update(eval(oval))
                    values['sshport'] = port
                    break
                else:
                    # No output on this port: blank every probed field.
                    values.update({'kernel': "", 'bmlog': "", 'bootcd': '',
                                   'nm': '',
                                   'readonlyfs': '',
                                   'dns': '',
                                   'princeton_comon': "",
                                   'princeton_comon_running': "",
                                   'princeton_comon_procs': "",
                                   'sshport': None})
        except:
            traceback.print_exc()
            # NOTE: the SystemExit raised here is caught by the enclosing
            # bare `except` at the end of this function, so it abandons the
            # rest of the probe rather than terminating the process.
            sys.exit(1)

        ### CLASSIFY BY KERNEL ######################
        # The order of these tests matters: the substrings overlap.
        b_getbootcd_id = True
        kernel = values['kernel']
        if "2.6.17" in kernel or "2.6.2" in kernel:
            values['ssh'] = 'SSH'
            values['category'] = 'PROD'
            values['state'] = _state_from_bmlog(values['bmlog'])
        elif "2.6.12" in kernel or "2.6.10" in kernel:
            values['ssh'] = 'SSH'
            values['category'] = 'OLDPROD'
            values['state'] = _state_from_bmlog(values['bmlog'])
        # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command
        # fails.  I have no idea why.
        elif "2.4" in kernel or "2.6.8" in kernel:
            b_getbootcd_id = False
            values['ssh'] = 'SSH'
            values['category'] = 'OLDBOOTCD'
            values['state'] = 'DEBUG'
        elif kernel != "":
            values['ssh'] = 'SSH'
            values['category'] = 'UNKNOWN'
            values['state'] = _state_from_bmlog(values['bmlog'])
        else:
            # No kernel string at all: treat the node as unreachable by SSH.
            b_getbootcd_id = False
            values['ssh'] = 'NOSSH'
            values['category'] = 'ERROR'
            values['state'] = 'DOWN'
            values['ssherror'] = errval.strip()
            values['kernel'] = ""

        ### BOOTCD ######################
        # Only nodes that are neither old-kernel nor inaccessible keep their
        # BootCD string, and only when it actually looks like one.
        if b_getbootcd_id and "BootCD" in values['bootcd']:
            # Compare hostnames by value; the previous identity test
            # (`is not "..."`) was always true, so the exemption never applied.
            if "v2" in values['bootcd'] and \
                    nodename not in _BOOTCD_V2_EXEMPT_NODES:
                values['category'] = 'OLDBOOTCD'
        else:
            values['bootcd'] = ""

        # TODO: get bm.log for debug nodes.
        # 'zcat /tmp/bm.log'

        ### NODE MANAGER ######################
        if "nm.py" in values['nm']:
            values['nm'] = "Y"
        else:
            values['nm'] = "N"

        ### princeton_comon SLICE ######################
        # Each check is only meaningful if the previous one succeeded.
        has_slice_dir = "princeton_comon" in values['princeton_comon']
        values['princeton_comon'] = has_slice_dir

        # A bare '/proc/virtual/' (no id appended) means no running context.
        slice_running = has_slice_dir and \
            len(values['princeton_comon_running']) > len('/proc/virtual/')
        values['princeton_comon_running'] = slice_running

        if slice_running:
            try:
                values['princeton_comon_procs'] = int(values['princeton_comon_procs'])
            except ValueError:
                # An unparsable count should not discard the whole record.
                values['princeton_comon_procs'] = None
        else:
            values['princeton_comon_procs'] = None

        ### COMON STATS ######################
        if nodename in cohash:
            values['comonstats'] = cohash[nodename]
        else:
            values['comonstats'] = {'resptime':  '-1',
                                    'uptime':    '-1',
                                    'sshstatus': '-1',
                                    'lastcotop': '-1',
                                    'cpuspeed':  "null",
                                    'disksize':  'null',
                                    'memsize':   'null'}

        ### GET PLC NODE ######################
        d_node = None
        plc_lock.acquire()
        try:
            try:
                d_node = plc.getNodes({'hostname': nodename},
                                      ['pcu_ids', 'site_id', 'date_created',
                                       'last_updated', 'last_contact',
                                       'boot_state', 'nodegroup_ids'])[0]
            except Exception:
                traceback.print_exc()
        finally:
            # Always give the lock back, whatever happened above.
            plc_lock.release()
        values['plcnode'] = d_node

        ### GET PLC PCU ######################
        site_id = -1
        d_pcu = None
        if d_node:
            pcu = d_node['pcu_ids']
            if len(pcu) > 0:
                d_pcu = pcu[0]
            site_id = d_node['site_id']
        values['pcu'] = d_pcu

        ### GET PLC SITE ######################
        d_site = None
        values['loginbase'] = ""
        plc_lock.acquire()
        try:
            try:
                d_site = plc.getSites({'site_id': site_id},
                                      ['max_slices', 'slice_ids', 'node_ids',
                                       'login_base'])[0]
                values['loginbase'] = d_site['login_base']
            except Exception:
                traceback.print_exc()
        finally:
            plc_lock.release()

        values['plcsite'] = d_site
        values['date_checked'] = time.time()
    except:
        # Best effort: report the problem and return what was gathered.
        traceback.print_exc()

    return (nodename, values)
241
def recordPingAndSSH(request, result):
    """Store the outcome of one collectPingAndSSH() call.

    Registered as the result callback of the work requests created in
    checkAndRecordState().

    Parameters:
        request -- the work request that produced `result` (unused here).
        result  -- the (nodename, values) tuple returned by
                   collectPingAndSSH(); nothing is recorded when `values`
                   is None.

    Side effects: refreshes the module-level `global_round` from the
    "global" sync record, creates a FindbadNodeRecord for the node, marks
    the node's sync record with the current round, increments the
    module-level `count` and prints a progress line.  Errors (for example a
    key missing from an incomplete `values` dict) are reported on the
    console and otherwise ignored.
    """
    global global_round
    global count
    (nodename, values) = result

    try:
        if values is not None:
            fbsync = FindbadNodeRecordSync.findby_or_create(
                hostname="global",
                if_new_set={'round': global_round})
            global_round = fbsync.round
            fbnodesync = FindbadNodeRecordSync.findby_or_create(
                hostname=nodename,
                if_new_set={'round': global_round})

            # The new record object is not used further in this function;
            # presumably constructing it is what registers it with the
            # database layer -- TODO confirm against monitor.database.
            FindbadNodeRecord(
                date_checked=datetime.fromtimestamp(values['date_checked']),
                round=global_round,
                hostname=nodename,
                loginbase=values['loginbase'],
                kernel_version=values['kernel'],
                bootcd_version=values['bootcd'],
                nm_status=values['nm'],
                fs_status=values['readonlyfs'],
                dns_status=values['dns'],
                princeton_comon_dir=values['princeton_comon'],
                princeton_comon_running=values['princeton_comon_running'],
                princeton_comon_procs=values['princeton_comon_procs'],
                plc_node_stats=values['plcnode'],
                plc_site_stats=values['plcsite'],
                plc_pcuid=values['pcu'],
                comon_stats=values['comonstats'],
                ping_status=(values['ping'] == "PING"),
                ssh_portused=values['sshport'],
                ssh_status=(values['ssh'] == "SSH"),
                ssh_error=values['ssherror'],
                observed_status=values['state'],
                observed_category=values['category'],
            )
            # Mark this node as up to date for the current round.
            fbnodesync.round = global_round

            count += 1
            print("%d %s %s" % (count, nodename, values))
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt/SystemExit are not swallowed here.
        print("ERROR:")
        # print_exc() writes the traceback itself and returns None, so its
        # return value must not be printed.
        traceback.print_exc()
286
def handle_exception(request, result):
    """Report a work request that raised inside a worker thread.

    Registered as the exception callback of the work requests created in
    checkAndRecordState().

    Parameters:
        request -- the failed work request; only its `requestID` is used.
        result  -- iterable describing the failure (presumably the
                   exception info supplied by the thread pool -- TODO
                   confirm); each element is printed on its own line.
    """
    print("Exception occured in request %s" % request.requestID)
    for item in result:
        print("Result: %s" % item)
292
293
def checkAndRecordState(l_nodes, cohash, max_runtime=60*60*1.5):
    """Probe every out-of-date node in `l_nodes` using a pool of 20 threads.

    Parameters:
        l_nodes     -- iterable of hostnames to consider.
        cohash      -- CoMon statistics mapping, passed through unchanged to
                       collectPingAndSSH().
        max_runtime -- wall-clock limit in seconds for the polling phase
                       (default 1.5 hours).  When it is exceeded the whole
                       process is terminated with exit status 1.

    A node is probed only if the round stored in its sync record is lower
    than the module-level `global_round`; other nodes are counted and
    reported as already up to date.  Results are delivered to
    recordPingAndSSH() and worker exceptions to handle_exception().
    """
    global count

    tp = threadpool.ThreadPool(20)

    # CREATE all the work requests
    for nodename in l_nodes:
        fbnodesync = FindbadNodeRecordSync.findby_or_create(
            hostname=nodename, if_new_set={'round': 0})

        node_round = fbnodesync.round
        if node_round < global_round:
            # Stale entry: schedule a fresh probe for this node.
            req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
                                         None, recordPingAndSSH, handle_exception)
            tp.putRequest(req)
        else:
            # We just skip it, since it's "up to date"
            count += 1
            print("%d %s %s" % (count, nodename, node_round))

    # WAIT while all the work requests are processed.
    begin = time.time()
    while True:
        try:
            time.sleep(1)
            tp.poll()
            if time.time() - begin > max_runtime:
                print("findbad.py has run out of time!!!!!!")
                # os._exit() does not flush stdio buffers, so push the
                # message out before the hard exit.
                sys.stdout.flush()
                os._exit(1)
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break

    # Summary: number of sync rows and of stored node records.
    print(FindbadNodeRecordSync.query.count())
    print(FindbadNodeRecord.query.count())
336
def main():
    """Select the nodes to examine and run one findbad pass over them.

    Reads the "global" sync record to obtain the current round (bumping it
    first when config.increment is set), downloads the CoMon statistics,
    narrows the node list according to the first of config.nodelist,
    config.node, config.nodegroup or config.site that is set (config.nodeselect,
    when set, replaces that list with the result of node_select()), and then
    hands the hostnames to checkAndRecordState().

    Returns 0.
    """
    global global_round

    sync_row = FindbadNodeRecordSync.findby_or_create(
        hostname="global", if_new_set={'round': global_round})
    global_round = sync_row.round

    if config.increment:
        # update global round number to force refreshes across all nodes
        global_round += 1
        sync_row.round = global_round

    # lastcotop measures whether cotop is actually running.  this is a better
    # metric than sshstatus, or other values from CoMon
    cohash = comon.Comon().coget(COMON_COTOPURL)

    candidates = plccache.l_nodes
    if config.nodelist:
        wanted = util.file.getListFromFile(config.nodelist)
        candidates = [entry for entry in candidates if entry['hostname'] in wanted]
    elif config.node:
        candidates = [entry for entry in candidates if entry['hostname'] == config.node]
    elif config.nodegroup:
        groups = api.GetNodeGroups({'name': config.nodegroup})
        candidates = api.GetNodes(groups[0]['node_ids'])
    elif config.site:
        sites = api.GetSites(config.site)
        candidates = api.GetNodes(sites[0]['node_ids'], ['hostname'])

    hostnames = [entry['hostname'] for entry in candidates]

    # perform this query after the above options, so that the filter above
    # does not break.
    if config.nodeselect:
        all_plc = api.GetNodes({'peer_id': None}, ['hostname'])
        all_plc_names = [entry['hostname'] for entry in all_plc]
        hostnames = node_select(config.nodeselect, all_plc_names, None)

    print("fetching %s hosts" % len(hostnames))

    checkAndRecordState(hostnames, cohash)

    return 0
385
386
# Script entry point: build the option parser, parse the command line and
# run one findbad pass.
if __name__ == '__main__':
    from monitor import parser as parsermodule

    parser = parsermodule.getParser(['nodesets'])

    parser.set_defaults(increment=False, dbname="findbad", cachenodes=False)
    parser.add_option("", "--cachenodes", action="store_true",
                      help="Cache node lookup from PLC")
    parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                      help="Specify the name of the database to which the information is saved")
    parser.add_option("-i", "--increment", action="store_true", dest="increment",
                      help="Increment round number to force refresh or retry")

    parser = parsermodule.getParser(['defaults'], parser)

    cfg = parsermodule.parse_args(parser)

    try:
        main()
    except Exception:
        # sys.exc_info() is used instead of `except Exception, err` so the
        # same source parses on every Python version.
        err = sys.exc_info()[1]
        # print_exc() writes the traceback itself and returns None, so its
        # return value must not be printed.
        traceback.print_exc()
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        # NOTE(review): a failed run still exits with status 0; kept as-is
        # because callers may rely on it -- confirm whether that is intended.
        sys.exit(0)
    print("sleeping")