merge from:
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14 from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord
15 from monitor.sources import comon
16 from monitor.wrapper import plc
17
18 import syncplcdb
19 from nodequery import verify,query_to_dict,node_select
20 import traceback
21
print("starting sqlfindbad.py")

# CoMon tabulator query covering all nodes: a CSV dump of the columns named in
# `dumpcols`.  An extra "select='lastcotop!=0'" clause was tried here in the
# past and is currently disabled.
COMON_COTOPURL = "http://summer.cs.princeton.edu/status/tabulator.cgi?" + "&".join([
    "table=table_nodeview",
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'",
    "formatcsv",
])

# Authenticated PLC API handle used by main() for node group / site lookups.
api = plc.getAuthAPI()
# Held by collectPingAndSSH() around each of its PLC lookups.
plc_lock = threading.Lock()

# Round counters: `global_round` starts out equal to `round` and is later
# overwritten with the value stored in the database.
# NOTE: `round` shadows the builtin; the name is kept because it is a
# module-level name that other code may read.
round = global_round = 1
# Number of nodes handled so far (recorded or skipped); used for progress output.
count = 0
36
# Shell snippet handed to ssh.run_noexcept2(); it prints a Python/JSON-style
# dict literal describing the node.  The text is whitespace-sensitive only for
# the shell, but is kept exactly as it was.
_NODE_PROBE_SCRIPT = """ <<\\EOF
                                        echo "{"
                                        echo '  "kernel":"'`uname -a`'",'
                                        echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                        echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
                                        echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
                                        echo '  "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
                                        echo '  "dns":"'`host boot.planet-lab.org 2>&1`'",'
                                        echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'

                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo "}"
EOF                             """

# Values recorded for every probe field when an SSH attempt returns no output.
_EMPTY_PROBE = {'kernel': "", 'bmlog': "", 'bootcd': '',
                'nm': '',
                'readonlyfs': '',
                'dns': '',
                'princeton_comon': "",
                'princeton_comon_running': "",
                'princeton_comon_procs': "", 'sshport': None}

# Nodes that must not be re-categorised as OLDBOOTCD even though they report
# a "v2" BootCD.
_BOOTCD_V2_EXEMPT = ("planetlab1.cs.unc.edu", "planetlab2.cs.unc.edu")


def collectPingAndSSH(nodename, cohash):
    """Probe one node with ping and SSH and collect its PLC and CoMon data.

    Parameters:
        nodename -- hostname of the node to probe.
        cohash   -- mapping of hostname to CoMon statistics; a node missing
                    from it gets a placeholder statistics dict.

    Returns:
        A (nodename, values) tuple.  `values` is a dict holding, among others,
        'ping', 'ssh', 'sshport', 'ssherror', 'kernel', 'category', 'state',
        'bootcd', 'nm', 'readonlyfs', 'dns', the 'princeton_comon*' fields,
        'comonstats', 'plcnode', 'pcu', 'plcsite', 'loginbase' and
        'date_checked'.

        If anything fails part-way through, the traceback is printed and the
        dict is returned with only the keys filled in so far.
    """
    ### RUN PING ######################
    ping = command.CMD()
    (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

    values = {}
    try:
        # grep produced nothing => no rtt summary line => treat as unreachable.
        if oval == "":
            values['ping'] = "NOPING"
        else:
            values['ping'] = "PING"

        ### RUN SSH PROBE ######################
        # Try each SSH port in turn and keep the first one that answers.
        # Errors here are handled by the outer handler below.  (The previous
        # inner handler called sys.exit(1), but the resulting SystemExit was
        # swallowed by the enclosing bare except, so it never exited.)
        for port in [22, 806]:
            ssh = command.SSH('root', nodename, port)
            (oval, errval) = ssh.run_noexcept2(_NODE_PROBE_SCRIPT)

            values['ssherror'] = errval
            if len(oval) > 0:
                # SECURITY NOTE(review): this eval()s text produced on the
                # remote node.  A compromised or misbehaving node could
                # inject arbitrary Python here; consider a restricted parser.
                values.update(eval(oval))
                values['sshport'] = port
                break
            else:
                values.update(_EMPTY_PROBE)

        ### CLASSIFY BY KERNEL ######################
        def _boot_or_debug():
            # 'DEBUG' when the probe found /tmp/bm.log on the node, else 'BOOT'.
            if "bm.log" in values['bmlog']:
                return 'DEBUG'
            return 'BOOT'

        b_getbootcd_id = True
        oval = values['kernel']
        # NOTE: "2.6.2" is a substring test, so it also matches 2.6.2x kernels.
        if "2.6.17" in oval or "2.6.2" in oval:
            values['ssh'] = 'SSH'
            values['category'] = 'PROD'
            values['state'] = _boot_or_debug()
        elif "2.6.12" in oval or "2.6.10" in oval:
            values['ssh'] = 'SSH'
            values['category'] = 'OLDPROD'
            values['state'] = _boot_or_debug()
        # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.  I have no idea why.
        elif "2.4" in oval or "2.6.8" in oval:
            b_getbootcd_id = False
            values['ssh'] = 'SSH'
            values['category'] = 'OLDBOOTCD'
            values['state'] = 'DEBUG'
        elif oval != "":
            values['ssh'] = 'SSH'
            values['category'] = 'UNKNOWN'
            values['state'] = _boot_or_debug()
        else:
            # No kernel string at all: the SSH probe failed on every port.
            b_getbootcd_id = False
            values['ssh'] = 'NOSSH'
            values['category'] = 'ERROR'
            values['state'] = 'DOWN'
            values['ssherror'] = errval.strip()
            values['kernel'] = ""

        ### BOOTCD ######################
        if b_getbootcd_id:
            # Only for nodes that are neither 2.4/2.6.8 nor inaccessible.
            oval = values['bootcd']
            if "BootCD" in oval:
                # Compare hostnames by value.  The previous `is not` test
                # compared object identity against string literals, so the
                # exemption for these two nodes practically never applied.
                if "v2" in oval and nodename not in _BOOTCD_V2_EXEMPT:
                    values['category'] = 'OLDBOOTCD'
            else:
                values['bootcd'] = ""
        else:
            values['bootcd'] = ""

        # TODO: get bm.log for debug nodes.
        # 'zcat /tmp/bm.log'

        ### NODE MANAGER ######################
        if "nm.py" in values['nm']:
            values['nm'] = "Y"
        else:
            values['nm'] = "N"

        ### PRINCETON_COMON SLICE ######################
        # Each check is only meaningful when the previous one succeeded:
        # slice directory exists -> slice is running -> count its processes.
        comon_dir = "princeton_comon" in values['princeton_comon']
        values['princeton_comon'] = comon_dir

        # `ls -d /proc/virtual/$ID` must have printed more than the bare prefix.
        comon_running = comon_dir and \
            len(values['princeton_comon_running']) > len('/proc/virtual/')
        values['princeton_comon_running'] = comon_running

        if comon_running:
            values['princeton_comon_procs'] = int(values['princeton_comon_procs'])
        else:
            values['princeton_comon_procs'] = None

        ### COMON STATS ######################
        if nodename in cohash:
            values['comonstats'] = cohash[nodename]
        else:
            values['comonstats'] = {'resptime':  '-1',
                                    'uptime':    '-1',
                                    'sshstatus': '-1',
                                    'lastcotop': '-1',
                                    'cpuspeed':  "null",
                                    'disksize':  'null',
                                    'memsize':   'null'}

        ### GET PLC NODE ######################
        d_node = None
        plc_lock.acquire()
        try:
            try:
                d_node = plc.getNodes({'hostname': nodename},
                                      ['pcu_ids', 'site_id', 'date_created',
                                       'last_updated', 'last_contact',
                                       'boot_state', 'nodegroup_ids'])[0]
            except Exception:
                traceback.print_exc()
        finally:
            # Always release, so one failed lookup cannot stall other workers.
            plc_lock.release()
        values['plcnode'] = d_node

        ### GET PLC PCU ######################
        site_id = -1
        d_pcu = None
        if d_node:
            pcu = d_node['pcu_ids']
            if len(pcu) > 0:
                d_pcu = pcu[0]

            site_id = d_node['site_id']

        values['pcu'] = d_pcu

        ### GET PLC SITE ######################
        d_site = None
        values['loginbase'] = ""
        plc_lock.acquire()
        try:
            try:
                d_site = plc.getSites({'site_id': site_id},
                                      ['max_slices', 'slice_ids', 'node_ids',
                                       'login_base'])[0]
                values['loginbase'] = d_site['login_base']
            except Exception:
                traceback.print_exc()
        finally:
            plc_lock.release()

        values['plcsite'] = d_site
        values['date_checked'] = time.time()
    except Exception:
        # Best effort: report the problem and return whatever was collected.
        traceback.print_exc()

    return (nodename, values)
242
def recordPingAndSSH(request, result):
    """Store one collectPingAndSSH() result in the findbad database.

    Passed to threadpool.WorkRequest as the result callback.

    Parameters:
        request -- the work request that produced `result` (unused).
        result  -- the (nodename, values) tuple returned by
                   collectPingAndSSH(); nothing is recorded when `values`
                   is None.

    Side effects:
        Refreshes the module-level `global_round` from the "global" sync
        record, creates a FindbadNodeRecord, sets the node's sync record to
        the current round, increments the module-level `count` and prints a
        progress line.

    Any ordinary exception (for instance a key missing from `values` because
    collection failed part-way) is reported and swallowed so that one bad
    node does not abort the whole run.
    """
    global global_round
    global count
    (nodename, values) = result

    try:
        if values is not None:
            fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                            if_new_set={'round' : global_round})
            global_round = fbsync.round
            fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename,
                                                                if_new_set={'round' : global_round})

            # NOTE(review): the new record object is not used any further
            # here; presumably constructing it is what registers the row
            # with the ORM session -- confirm against monitor.database.
            FindbadNodeRecord(
                date_checked=datetime.fromtimestamp(values['date_checked']),
                hostname=nodename,
                loginbase=values['loginbase'],
                kernel_version=values['kernel'],
                bootcd_version=values['bootcd'],
                nm_status=values['nm'],
                fs_status=values['readonlyfs'],
                dns_status=values['dns'],
                princeton_comon_dir=values['princeton_comon'],
                princeton_comon_running=values['princeton_comon_running'],
                princeton_comon_procs=values['princeton_comon_procs'],
                plc_node_stats=values['plcnode'],
                plc_site_stats=values['plcsite'],
                plc_pcuid=values['pcu'],
                comon_stats=values['comonstats'],
                ping_status=(values['ping'] == "PING"),
                ssh_portused=values['sshport'],
                ssh_status=(values['ssh'] == "SSH"),
                ssh_error=values['ssherror'],
                observed_status=values['state'],
            )
            # Mark the node as up to date for this round.
            fbnodesync.round = global_round

            count += 1
            print("%d %s %s" % (count, nodename, values))
    except Exception:
        # `except Exception` rather than a bare except, so that Ctrl-C
        # (KeyboardInterrupt) is not swallowed here and can reach the
        # caller's polling loop.
        print("ERROR:")
        # print_exc() writes the traceback itself and returns None; printing
        # its return value only added a stray "None" line.
        traceback.print_exc()
285
# Registered as the exception callback of every work request: invoked when an
# exception occurs within a worker thread.
def handle_exception(request, result):
    """Report a failed work request on stdout.

    Parameters:
        request -- the failed work request; its `requestID` is printed.
        result  -- iterable describing the failure; each item is printed on
                   its own "Result:" line.
    """
    headline = "Exception occured in request %s" % request.requestID
    print(headline)
    for entry in result:
        print("Result: %s" % entry)
291
292
def checkAndRecordState(l_nodes, cohash):
    """Probe every out-of-date node through a 20-worker thread pool.

    Parameters:
        l_nodes -- iterable of hostnames to consider.
        cohash  -- CoMon statistics passed through to collectPingAndSSH().

    A node is probed only when the round stored in its sync record is lower
    than the module-level `global_round`; otherwise it is counted as up to
    date and skipped.  The function then polls the pool once per second until
    all results are in, the user interrupts, or 1.5 hours have elapsed (in
    which case the process is terminated with os._exit(1)).  Finally the row
    counts of both tables are printed.
    """
    global global_round
    global count

    pool = threadpool.ThreadPool(20)

    # Queue one work request per stale node.
    for hostname in l_nodes:
        sync_record = FindbadNodeRecordSync.findby_or_create(hostname=hostname, if_new_set={'round':0})
        last_round = sync_record.round

        if last_round < global_round:
            # Stale: collect fresh stats, then record them via the callback.
            work = threadpool.WorkRequest(collectPingAndSSH, [hostname, cohash], {},
                                          None, recordPingAndSSH, handle_exception)
            pool.putRequest(work)
            continue

        # Already refreshed in this round; just report it.
        count += 1
        print("%d %s %s" % (count, hostname, last_round))

    # Drain the pool, giving up after the time budget is exhausted.
    time_budget = 60*60*1.5
    started = time.time()
    while True:
        try:
            time.sleep(1)
            pool.poll()
            elapsed = time.time() - started
            if elapsed > time_budget:
                print("findbad.py has run out of time!!!!!!")
                os._exit(1)
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break

    print(FindbadNodeRecordSync.query.count())
    print(FindbadNodeRecord.query.count())
335
def main():
    """Select the nodes to examine and run one findbad pass over them.

    Reads its options from `config` (increment, nodelist, node, nodegroup,
    site, nodeselect).  Loads the current round from the "global" sync
    record, bumping it first when `config.increment` is set, fetches the
    CoMon table, builds the list of hostnames and hands it to
    checkAndRecordState().

    Returns:
        0 (always).
    """
    global global_round

    sync_global = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                         if_new_set={'round' : global_round})
    global_round = sync_global.round

    if config.increment:
        # update global round number to force refreshes across all nodes
        global_round += 1
        sync_global.round = global_round

    # lastcotop measures whether cotop is actually running.  this is a better
    # metric than sshstatus, or other values from CoMon
    # history information for all nodes
    cohash = comon.Comon().coget(COMON_COTOPURL)

    def keep_hostnames(records, wanted):
        # Keep only the node records whose hostname appears in `wanted`.
        return [rec for rec in records if rec['hostname'] in wanted]

    node_records = syncplcdb.create_plcdb()
    if config.nodelist:
        node_records = keep_hostnames(node_records,
                                      util.file.getListFromFile(config.nodelist))
    elif config.node:
        node_records = keep_hostnames(node_records, [config.node])
    elif config.nodegroup:
        groups = api.GetNodeGroups({'name' : config.nodegroup})
        node_records = api.GetNodes(groups[0]['node_ids'])
    elif config.site:
        sites = api.GetSites(config.site)
        node_records = api.GetNodes(sites[0]['node_ids'], ['hostname'])

    l_nodes = [rec['hostname'] for rec in node_records]

    # perform this query after the above options, so that the filter above
    # does not break.
    if config.nodeselect:
        all_hostnames = [rec['hostname']
                         for rec in api.GetNodes({'peer_id' : None}, ['hostname'])]
        l_nodes = node_select(config.nodeselect, all_hostnames, None)

    print("fetching %s hosts" % len(l_nodes))

    checkAndRecordState(l_nodes, cohash)

    return 0
384
385
if __name__ == '__main__':
    from monitor import parser as parsermodule

    # Build the option parser: the shared 'nodesets' options, then this
    # script's own options, then the shared 'defaults' options.
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(increment=False,
                        dbname="findbad",
                        cachenodes=False)

    script_options = [
        (("", "--cachenodes"),
         {'action': "store_true",
          'help': "Cache node lookup from PLC"}),
        (("", "--dbname"),
         {'dest': "dbname", 'metavar': "FILE",
          'help': "Specify the name of the database to which the information is saved"}),
        (("-i", "--increment"),
         {'action': "store_true", 'dest': "increment",
          'help': "Increment round number to force refresh or retry"}),
    ]
    for flags, settings in script_options:
        parser.add_option(*flags, **settings)

    parser = parsermodule.getParser(['defaults'], parser)

    cfg = parsermodule.parse_args(parser)

    try:
        main()
    except Exception:
        # Report the failure but still exit with status 0.
        err = sys.exc_info()[1]
        print(traceback.print_exc())
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        sys.exit(0)
    print("sleeping")
411         #print "final commit"
412         #time.sleep(10)