completed updates to the info model.
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14
15 from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
16
17 from monitor.sources import comon
18 from monitor.wrapper import plc, plccache
19
20 from nodequery import verify,query_to_dict,node_select
21 import traceback
22
print("starting sqlfindbad.py")

# CoMon query covering all nodes: the node-view table, limited to the columns
# named in dumpcols, returned as CSV.  A variant that additionally appended
# "&select='lastcotop!=0'" after "formatcsv" is disabled.
COMON_COTOPURL = (
    "http://summer.cs.princeton.edu/status/tabulator.cgi?"
    "table=table_nodeview&"
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
    "formatcsv"
)

# PLC API handle; main() uses it for node, site and nodegroup queries.
api = plc.getAuthAPI()
# Held around the plc.getNodes()/plc.getSites() calls made by worker threads.
plc_lock = threading.Lock()
# Initial round number.  NOTE: this name shadows the builtin round() here.
round = 1
# Current round; main() overwrites it from the "global" sync record.
global_round = round
# Running count of nodes reported so far (recorded or skipped).
count = 0
37
def collectPingAndSSH(nodename, cohash):
    """Probe one node with ping, ssh and PLC lookups and collect the findings.

    Parameters:
        nodename -- hostname of the node to probe.
        cohash   -- dict of CoMon statistics keyed by hostname; a node that
                    is missing from it gets placeholder '-1'/'null' values.

    Returns:
        (nodename, values), where values is a dict holding everything that
        was observed: ping and ssh status, kernel, bootcd, node manager and
        princeton_comon slice state, CoMon stats, the PLC node/site data,
        the first PCU id and 'date_checked'.  If an unexpected error
        interrupts the collection, its traceback is printed and values
        contains only the keys gathered up to that point.
    """
    ### RUN PING ######################
    ping = command.CMD()
    (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

    # Shell script fed to the node over ssh.  It prints a Python dict literal
    # with one entry per probe.  "\\E" yields the same two characters that the
    # unescaped "\E" did, without relying on an unrecognised escape sequence.
    probe_script = """ <<\\EOF
                                        echo "{"
                                        echo '  "kernel":"'`uname -a`'",'
                                        echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                        echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
                                        echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
                                        echo '  "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
                                        echo '  "dns":"'`host boot.planet-lab.org 2>&1`'",'
                                        echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'

                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo "}"
EOF                             """

    try:
        values = {}

        if oval == "":
            # An error occurred
            values['ping'] = "NOPING"
        else:
            values['ping'] = "PING"

        try:
            # Try the standard ssh port first, then the alternate one.
            for port in [22, 806]:
                ssh = command.SSH('root', nodename, port)

                (oval, errval) = ssh.run_noexcept2(probe_script)

                values['ssherror'] = errval
                if len(oval) > 0:
                    # SECURITY NOTE(review): eval() executes whatever the
                    # remote node printed.  A misbehaving or compromised node
                    # can run arbitrary code in this process; consider a
                    # restricted literal parser instead.
                    values.update(eval(oval))
                    values['sshport'] = port
                    break
                else:
                    values.update({'kernel': "", 'bmlog': "", 'bootcd': '',
                                   'nm': '',
                                   'readonlyfs': '',
                                   'dns': '',
                                   'princeton_comon': "",
                                   'princeton_comon_running': "",
                                   'princeton_comon_procs': "", 'sshport': None})
        except:
            traceback.print_exc()
            # The SystemExit raised here is caught by the enclosing bare
            # "except:" below, so this only abandons the rest of the
            # collection for this node.
            sys.exit(1)

        ### RUN SSH ######################
        b_getbootcd_id = True

        # Classify the node from the kernel string reported by uname.
        oval = values['kernel']
        if "2.6.17" in oval or "2.6.2" in oval:
            values['ssh'] = 'SSH'
            values['category'] = 'PROD'
            if "bm.log" in values['bmlog']:
                values['state'] = 'DEBUG'
            else:
                values['state'] = 'BOOT'
        elif "2.6.12" in oval or "2.6.10" in oval:
            values['ssh'] = 'SSH'
            values['category'] = 'OLDPROD'
            if "bm.log" in values['bmlog']:
                values['state'] = 'DEBUG'
            else:
                values['state'] = 'BOOT'

        # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.  I have no idea why.
        elif "2.4" in oval or "2.6.8" in oval:
            b_getbootcd_id = False
            values['ssh'] = 'SSH'
            values['category'] = 'OLDBOOTCD'
            values['state'] = 'DEBUG'
        elif oval != "":
            values['ssh'] = 'SSH'
            values['category'] = 'UNKNOWN'
            if "bm.log" in values['bmlog']:
                values['state'] = 'DEBUG'
            else:
                values['state'] = 'BOOT'
        else:
            # An error occurred.
            b_getbootcd_id = False
            values['ssh'] = 'NOSSH'
            values['category'] = 'ERROR'
            values['state'] = 'DOWN'
            val = errval.strip()
            values['ssherror'] = val
            values['kernel'] = ""

        if b_getbootcd_id:
            # try to get BootCD for all nodes that are not 2.4 nor inaccessible
            oval = values['bootcd']
            if "BootCD" in oval:
                values['bootcd'] = oval
                # The two UNC nodes are exempt from the v2 => OLDBOOTCD rule.
                # Compare by value: the previous identity test ("is not")
                # against string literals did not reliably match them.
                if "v2" in oval and nodename not in ("planetlab1.cs.unc.edu",
                                                     "planetlab2.cs.unc.edu"):
                    values['category'] = 'OLDBOOTCD'
            else:
                values['bootcd'] = ""
        else:
            values['bootcd'] = ""

        # TODO: get bm.log for debug nodes.
        # 'zcat /tmp/bm.log'

        # Node manager is considered running when nm.py shows up in ps.
        oval = values['nm']
        if "nm.py" in oval:
            values['nm'] = "Y"
        else:
            values['nm'] = "N"

        # The slice checks are chained: each one only runs if the previous
        # one succeeded.
        continue_slice_check = True
        oval = values['princeton_comon']
        if "princeton_comon" in oval:
            values['princeton_comon'] = True
        else:
            values['princeton_comon'] = False
            continue_slice_check = False

        if continue_slice_check:
            oval = values['princeton_comon_running']
            # Anything longer than the bare directory prefix means ls found
            # an entry for the slice's ID.
            if len(oval) > len('/proc/virtual/'):
                values['princeton_comon_running'] = True
            else:
                values['princeton_comon_running'] = False
                continue_slice_check = False
        else:
            values['princeton_comon_running'] = False

        if continue_slice_check:
            oval = values['princeton_comon_procs']
            try:
                values['princeton_comon_procs'] = int(oval)
            except ValueError:
                # A missing or non-numeric count should not abort the rest
                # of the record; store it as unknown.
                values['princeton_comon_procs'] = None
        else:
            values['princeton_comon_procs'] = None

        if nodename in cohash:
            values['comonstats'] = cohash[nodename]
        else:
            values['comonstats'] = {'resptime':  '-1',
                                    'uptime':    '-1',
                                    'sshstatus': '-1',
                                    'lastcotop': '-1',
                                    'cpuspeed':  "null",
                                    'disksize':  'null',
                                    'memsize':   'null'}

        ### GET PLC NODE ######################
        d_node = None
        plc_lock.acquire()
        try:
            try:
                d_node = plc.getNodes({'hostname': nodename},
                                      ['pcu_ids', 'site_id', 'date_created',
                                       'last_updated', 'last_contact',
                                       'boot_state', 'nodegroup_ids'])[0]
            except Exception:
                traceback.print_exc()
        finally:
            # Always release, whatever happened during the lookup.
            plc_lock.release()
        values['plcnode'] = d_node

        ### GET PLC PCU ######################
        site_id = -1
        d_pcu = None
        if d_node:
            pcu = d_node['pcu_ids']
            if pcu:
                # Only the first PCU id is recorded.
                d_pcu = pcu[0]

            site_id = d_node['site_id']

        values['pcu'] = d_pcu

        ### GET PLC SITE ######################
        d_site = None
        values['loginbase'] = ""
        plc_lock.acquire()
        try:
            try:
                d_site = plc.getSites({'site_id': site_id},
                                      ['max_slices', 'slice_ids', 'node_ids',
                                       'login_base'])[0]
                values['loginbase'] = d_site['login_base']
            except Exception:
                traceback.print_exc()
        finally:
            plc_lock.release()

        values['plcsite'] = d_site
        values['date_checked'] = time.time()
    except:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be passed to print.
        traceback.print_exc()

    return (nodename, values)
243
def recordPingAndSSH(request, result):
    """Store the findings for one node; used as the work-request callback.

    Parameters:
        request -- the work request that produced the result (unused).
        result  -- the (nodename, values) tuple returned by
                   collectPingAndSSH().

    Creates a FindbadNodeRecord for the node in the current global round,
    marks the node's sync record with that round, flushes all three objects
    and prints a progress line.  Nothing is stored when values is None.
    Any error (for example a key missing from a partially filled values
    dict) is reported with "ERROR:" and a traceback instead of being raised.
    """
    global global_round
    global count
    (nodename, values) = result

    if values is None:
        return

    try:
        fbsync = FindbadNodeRecordSync.findby_or_create(
            hostname="global", if_new_set={'round': global_round})
        # Adopt whatever round the stored "global" record carries.
        global_round = fbsync.round
        fbnodesync = FindbadNodeRecordSync.findby_or_create(
            hostname=nodename, if_new_set={'round': global_round})

        fbrec = FindbadNodeRecord(
            date_checked=datetime.fromtimestamp(values['date_checked']),
            round=global_round,
            hostname=nodename,
            loginbase=values['loginbase'],
            kernel_version=values['kernel'],
            bootcd_version=values['bootcd'],
            nm_status=values['nm'],
            fs_status=values['readonlyfs'],
            dns_status=values['dns'],
            princeton_comon_dir=values['princeton_comon'],
            princeton_comon_running=values['princeton_comon_running'],
            princeton_comon_procs=values['princeton_comon_procs'],
            plc_node_stats=values['plcnode'],
            plc_site_stats=values['plcsite'],
            plc_pcuid=values['pcu'],
            comon_stats=values['comonstats'],
            ping_status=(values['ping'] == "PING"),
            ssh_portused=values['sshport'],
            ssh_status=(values['ssh'] == "SSH"),
            ssh_error=values['ssherror'],
            observed_status=values['state'],
            observed_category=values['category'],
        )
        # Marking the node with the current round makes later passes in the
        # same round skip it.
        fbnodesync.round = global_round
        fbnodesync.flush()
        fbsync.flush()
        fbrec.flush()

        count += 1
        print("%d %s %s" % (count, nodename, values))
    except Exception:
        # "except Exception" rather than a bare except, so that an interrupt
        # arriving during this callback is not swallowed here.
        print("ERROR:")
        # print_exc() emits the traceback itself and returns None; printing
        # its return value would add a stray "None" line.
        traceback.print_exc()
291
292 # this will be called when an exception occurs within a thread
def handle_exception(request, result):
    """Report a work request whose callable raised an exception.

    Prints the request's requestID, then one "Result:" line for every
    element of result.
    """
    print("Exception occured in request %s" % request.requestID)
    for item in result:
        print("Result: %s" % item)
297
298
def checkAndRecordState(l_nodes, cohash):
    """Probe every node that is behind the current round and record it.

    Parameters:
        l_nodes -- iterable of hostnames to consider.
        cohash  -- CoMon statistics keyed by hostname, handed through to
                   collectPingAndSSH().

    A node whose sync record is already at (or past) global_round is only
    counted and printed.  Every other node is queued on a 20-worker thread
    pool running collectPingAndSSH, with recordPingAndSSH as the result
    callback and handle_exception as the exception callback.  The pool is
    then polled once per second until it reports no pending results, the
    user interrupts, or 1.5 hours have passed, in which case the whole
    process is terminated with os._exit(1).
    """
    global global_round
    global count

    pool = threadpool.ThreadPool(20)

    # Queue one work request per node that still needs probing.
    for hostname in l_nodes:
        node_sync = FindbadNodeRecordSync.findby_or_create(hostname=hostname, if_new_set={'round': 0})
        node_round = node_sync.round
        node_sync.flush()

        if not (node_round < global_round):
            # Already up to date for this round: just report it.
            count += 1
            print("%d %s %s" % (count, hostname, node_round))
            continue

        work = threadpool.WorkRequest(collectPingAndSSH, [hostname, cohash], {},
                                      None, recordPingAndSSH, handle_exception)
        pool.putRequest(work)

    # Wait for the queued requests, giving up after an hour and a half.
    time_limit = 60 * 60 * 1.5
    started = time.time()
    while True:
        try:
            time.sleep(1)
            pool.poll()
            if time.time() - started > time_limit:
                print("findbad.py has run out of time!!!!!!")
                os._exit(1)
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break

    print(FindbadNodeRecordSync.query.count())
    print(FindbadNodeRecord.query.count())
    session.flush()
343
def main():
    """Select the nodes to check from the configuration and probe them.

    Loads (or creates) the "global" sync record, optionally bumps the round
    number when config.increment is set, builds the hostname list and hands
    it to checkAndRecordState().  The list starts from plccache.l_nodes and
    is narrowed by the first of config.nodelist, config.node,
    config.nodegroup or config.site that is set; config.nodeselect, when
    set, replaces the list with the result of node_select().

    Returns 0.
    """
    global global_round

    sync = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                  if_new_set={'round': global_round})
    global_round = sync.round

    if config.increment:
        # A higher global round forces every node to be refreshed.
        global_round += 1
        sync.round = global_round

    sync.flush()

    # lastcotop measures whether cotop is actually running; it is a better
    # metric than sshstatus or the other CoMon values.  The CoMon fetch
    # (cotop.coget(cotop_url)) is disabled, so these two names are unused
    # and cohash stays empty.
    cotop = comon.Comon()
    cotop_url = COMON_COTOPURL

    # history information for all nodes
    cohash = {}

    l_nodes = plccache.l_nodes
    if config.nodelist:
        f_nodes = util.file.getListFromFile(config.nodelist)
        l_nodes = [entry for entry in l_nodes if entry['hostname'] in f_nodes]
    elif config.node:
        f_nodes = [config.node]
        l_nodes = [entry for entry in l_nodes if entry['hostname'] in f_nodes]
    elif config.nodegroup:
        ng = api.GetNodeGroups({'name': config.nodegroup})
        l_nodes = api.GetNodes(ng[0]['node_ids'])
    elif config.site:
        site = api.GetSites(config.site)
        l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])

    # From here on only the hostnames are needed.
    l_nodes = [node['hostname'] for node in l_nodes]

    # This query runs after the options above so that their filtering of
    # node records is not disturbed.
    if config.nodeselect:
        plcnodes = api.GetNodes({'peer_id': None}, ['hostname'])
        plcnodes = [node['hostname'] for node in plcnodes]
        l_nodes = node_select(config.nodeselect, plcnodes, None)

    print("fetching %s hosts" % len(l_nodes))

    checkAndRecordState(l_nodes, cohash)

    return 0
394
395
if __name__ == '__main__':
    # Script entry point: build the option parser, parse the command line
    # and run main().
    from monitor import parser as parsermodule

    parser = parsermodule.getParser(['nodesets'])

    parser.set_defaults(increment=False, dbname="findbad", cachenodes=False)
    parser.add_option("", "--cachenodes", action="store_true",
                      help="Cache node lookup from PLC")
    parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                      help="Specify the name of the database to which the information is saved")
    parser.add_option("-i", "--increment", action="store_true", dest="increment",
                      help="Increment round number to force refresh or retry")

    parser = parsermodule.getParser(['defaults'], parser)

    # The return value is not used below; presumably parse_args() also
    # populates monitor.config, which main() reads -- TODO confirm.
    cfg = parsermodule.parse_args(parser)

    try:
        main()
    except Exception:
        # sys.exc_info() gives the active exception on every Python version.
        err = sys.exc_info()[1]
        # print_exc() writes the traceback itself and returns None; printing
        # its return value would add a stray "None" line.
        traceback.print_exc()
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        # NOTE(review): the exit status is 0 even though main() failed;
        # confirm whether callers rely on that before changing it.
        sys.exit(0)
    print("sleeping")
421         #print "final commit"
422         #time.sleep(10)