setup both monitor and zabbix databases in /etc/plc.d/monitor
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 from datetime import datetime,timedelta
8 import threadpool
9 import threading
10
11 from monitor import util
12 from monitor.util import command
13 from monitor import config
14
15 from monitor.database.infovacuum import FindbadNodeRecordSync, FindbadNodeRecord
16 from monitor.database.dborm import mon_session as session
17
18 from monitor.sources import comon
19 from monitor.wrapper import plc, plccache
20
21 from nodequery import verify,query_to_dict,node_select
22 import traceback
23
print("starting sqlfindbad.py")

# CoMon tabulator query: one CSV row per node, limited to the columns named
# in dumpcols.  Ending the URL with "formatcsv&" + "select='lastcotop!=0'"
# instead of "formatcsv" would add a filter; that variant is not in use.
COMON_COTOPURL = (
    "http://summer.cs.princeton.edu/status/tabulator.cgi?"
    "table=table_nodeview&"
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&"
    "formatcsv"
)

# API handle returned by plc.getAuthAPI(); main() issues GetNodes, GetSites
# and GetNodeGroups calls through it.
api = plc.getAuthAPI()

# Held around the PLC lookups performed in collectPingAndSSH().
plc_lock = threading.Lock()

# Initial round number; global_round is replaced by the stored value once
# the "global" sync record has been read.
global_round = round = 1

# Number of nodes reported so far (printed in front of every node line).
count = 0
38
# Shell snippet handed to command.SSH.run_noexcept2().  It echoes a Python
# dict literal describing the node, which _probe_node_over_ssh() parses.
# The raw-string prefix only makes the literal backslash in "\EOF" explicit;
# the script text, including its leading whitespace, is sent verbatim.
_NODE_PROBE_SCRIPT = r""" <<\EOF
                                        echo "{"
                                        echo '  "kernel":"'`uname -a`'",'
                                        echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                        echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
                                        echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
                                        echo '  "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
                                        echo '  "dns":"'`host boot.planet-lab.org 2>&1`'",'
                                        echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'

                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo "}"
EOF                             """

# Hosts that must not be re-categorized as OLDBOOTCD even though their boot
# CD identifies itself as "v2".
_BOOTCD_V2_EXEMPT_HOSTS = ("planetlab1.cs.unc.edu", "planetlab2.cs.unc.edu")


def _empty_probe_values():
    """Return the placeholder probe fields used when SSH gave no output."""
    return {'kernel': "", 'bmlog': "", 'bootcd': '',
            'nm': '',
            'readonlyfs': '',
            'dns': '',
            'princeton_comon': "",
            'princeton_comon_running': "",
            'princeton_comon_procs': "", 'sshport': None}


def _probe_node_over_ssh(nodename, values):
    """Run the probe script on port 22, then 806, and merge the output into values.

    Stops at the first port that yields output.  Returns the stderr text of
    the last attempt; it is also stored under values['ssherror'].
    """
    errval = ""
    for port in [22, 806]:
        ssh = command.SSH('root', nodename, port)
        (oval, errval) = ssh.run_noexcept2(_NODE_PROBE_SCRIPT)

        values['ssherror'] = errval
        if len(oval) > 0:
            # SECURITY NOTE(review): oval is text produced on the remote node
            # and eval() executes it as Python on this host.  A misbehaving
            # or compromised node can therefore run arbitrary code here;
            # consider a literal-only parser.
            values.update(eval(oval))
            values['sshport'] = port
            break
        values.update(_empty_probe_values())
    return errval


def _debug_or_boot(values):
    """Return 'DEBUG' when the node listed /tmp/bm.log, otherwise 'BOOT'."""
    if "bm.log" in values['bmlog']:
        return 'DEBUG'
    return 'BOOT'


def _classify_node(nodename, values, errval):
    """Turn the raw probe strings in values into ssh/category/state fields.

    Also normalizes 'bootcd', 'nm' and the three princeton_comon entries from
    command output into the values that get recorded.
    """
    kernel = values['kernel']
    check_bootcd = True

    if "2.6.17" in kernel or "2.6.2" in kernel:
        values['ssh'] = 'SSH'
        values['category'] = 'PROD'
        values['state'] = _debug_or_boot(values)
    elif "2.6.12" in kernel or "2.6.10" in kernel:
        values['ssh'] = 'SSH'
        values['category'] = 'OLDPROD'
        values['state'] = _debug_or_boot(values)
    # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.
    elif "2.4" in kernel or "2.6.8" in kernel:
        check_bootcd = False
        values['ssh'] = 'SSH'
        values['category'] = 'OLDBOOTCD'
        values['state'] = 'DEBUG'
    elif kernel != "":
        values['ssh'] = 'SSH'
        values['category'] = 'UNKNOWN'
        values['state'] = _debug_or_boot(values)
    else:
        # No kernel string at all: the SSH probe returned nothing usable.
        check_bootcd = False
        values['ssh'] = 'NOSSH'
        values['category'] = 'ERROR'
        values['state'] = 'DOWN'
        values['ssherror'] = errval.strip()
        values['kernel'] = ""

    # Boot CD id is kept only for nodes that are neither old-kernel nor
    # unreachable, and only when it actually looks like a BootCD string.
    bootcd = values['bootcd']
    if check_bootcd and "BootCD" in bootcd:
        # Value comparison is required here: an identity test ("is not")
        # against a string literal is true for ordinary strings, which
        # would defeat the exemption.
        if "v2" in bootcd and nodename not in _BOOTCD_V2_EXEMPT_HOSTS:
            values['category'] = 'OLDBOOTCD'
    else:
        values['bootcd'] = ""

    # TODO: get bm.log for debug nodes ('zcat /tmp/bm.log').

    if "nm.py" in values['nm']:
        values['nm'] = "Y"
    else:
        values['nm'] = "N"

    # Each princeton_comon check is only meaningful if the previous one passed.
    has_slice_dir = "princeton_comon" in values['princeton_comon']
    values['princeton_comon'] = has_slice_dir

    slice_running = (has_slice_dir and
                     len(values['princeton_comon_running']) > len('/proc/virtual/'))
    values['princeton_comon_running'] = slice_running

    if slice_running:
        values['princeton_comon_procs'] = int(values['princeton_comon_procs'])
    else:
        values['princeton_comon_procs'] = None


def _lookup_plc_records(nodename, values):
    """Fetch the node's PLC node and site records and store them in values.

    Each lookup runs with plc_lock held.  A failed lookup prints its
    traceback and leaves the corresponding entry as None ('loginbase'
    stays "").
    """
    d_node = None
    plc_lock.acquire()
    try:
        d_node = plc.getNodes({'hostname': nodename},
                              ['pcu_ids', 'site_id', 'date_created',
                               'last_updated', 'last_contact', 'boot_state',
                               'nodegroup_ids'])[0]
    except Exception:
        traceback.print_exc()
    finally:
        # Release even if reporting the failure itself goes wrong.
        plc_lock.release()
    values['plcnode'] = d_node

    site_id = -1
    d_pcu = None
    if d_node:
        pcu_ids = d_node['pcu_ids']
        if len(pcu_ids) > 0:
            d_pcu = pcu_ids[0]
        site_id = d_node['site_id']
    values['pcu'] = d_pcu

    d_site = None
    values['loginbase'] = ""
    plc_lock.acquire()
    try:
        d_site = plc.getSites({'site_id': site_id},
                              ['max_slices', 'slice_ids', 'node_ids',
                               'login_base'])[0]
        values['loginbase'] = d_site['login_base']
    except Exception:
        traceback.print_exc()
    finally:
        plc_lock.release()
    values['plcsite'] = d_site


def collectPingAndSSH(nodename, cohash):
    """Probe one node with ping and SSH and gather its PLC records.

    nodename -- hostname of the node to probe.
    cohash   -- dict of CoMon statistics keyed by hostname; nodes missing
                from it get placeholder statistics.

    Returns a (nodename, values) tuple.  values is a dict holding the keys
    'ping', 'ssh', 'sshport', 'ssherror', 'kernel', 'bmlog', 'bootcd', 'nm',
    'readonlyfs', 'dns', 'princeton_comon', 'princeton_comon_running',
    'princeton_comon_procs', 'category', 'state', 'comonstats', 'plcnode',
    'pcu', 'plcsite', 'loginbase' and 'date_checked'.  If collection fails
    part-way the traceback is printed and values is None, so that
    recordPingAndSSH() skips the node instead of failing on a half-filled
    dict.
    """
    ping = command.CMD()
    (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)

    try:
        values = {}

        # grep leaves nothing behind when ping printed no "rtt" summary.
        if oval == "":
            values['ping'] = "NOPING"
        else:
            values['ping'] = "PING"

        errval = _probe_node_over_ssh(nodename, values)
        _classify_node(nodename, values, errval)

        if nodename in cohash:
            values['comonstats'] = cohash[nodename]
        else:
            values['comonstats'] = {'resptime':  '-1',
                                    'uptime':    '-1',
                                    'sshstatus': '-1',
                                    'lastcotop': '-1',
                                    'cpuspeed' : "null",
                                    'disksize' : 'null',
                                    'memsize'  : 'null'}

        _lookup_plc_records(nodename, values)
        values['date_checked'] = time.time()
    except Exception:
        traceback.print_exc()
        return (nodename, None)

    return (nodename, values)
244
def recordPingAndSSH(request, result):
    """Store one node's probe results; success callback for the work requests.

    request -- the work request that produced the result (unused here).
    result  -- the (nodename, values) tuple returned by collectPingAndSSH().
               Nothing is recorded when values is None.

    Creates a FindbadNodeRecord for the current round, stamps the node's
    sync record with that round, flushes all three records and bumps the
    global count.  Any failure is reported on the console and otherwise
    ignored so that one bad node does not stop the run.
    """
    global global_round
    global count
    (nodename, values) = result

    if values is None:
        return

    try:
        fbsync = FindbadNodeRecordSync.findby_or_create(
                        hostname="global",
                        if_new_set={'round' : global_round})
        # The stored global round wins over the in-memory default.
        global_round = fbsync.round
        fbnodesync = FindbadNodeRecordSync.findby_or_create(
                        hostname=nodename,
                        if_new_set={'round' : global_round})

        fbrec = FindbadNodeRecord(
                    date_checked=datetime.fromtimestamp(values['date_checked']),
                    round=global_round,
                    hostname=nodename,
                    loginbase=values['loginbase'],
                    kernel_version=values['kernel'],
                    bootcd_version=values['bootcd'],
                    nm_status=values['nm'],
                    fs_status=values['readonlyfs'],
                    dns_status=values['dns'],
                    princeton_comon_dir=values['princeton_comon'],
                    princeton_comon_running=values['princeton_comon_running'],
                    princeton_comon_procs=values['princeton_comon_procs'],
                    plc_node_stats = values['plcnode'],
                    plc_site_stats = values['plcsite'],
                    plc_pcuid = values['pcu'],
                    comon_stats = values['comonstats'],
                    ping_status = (values['ping'] == "PING"),
                    ssh_portused = values['sshport'],
                    ssh_status = (values['ssh'] == "SSH"),
                    ssh_error = values['ssherror'],
                    observed_status = values['state'],
                    observed_category = values['category'],
                )
        # Mark the node as done for this round so it is skipped on a re-run.
        fbnodesync.round = global_round
        fbnodesync.flush()
        fbsync.flush()
        fbrec.flush()

        count += 1
        print("%d %s %s" % (count, nodename, values))
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        print("ERROR:")
        traceback.print_exc()
292
# Registered as the exception callback of every work request, so it runs
# when an exception occurs within a thread.
def handle_exception(request, result):
    """Print the failing request's ID followed by each item of result."""
    print("Exception occured in request %s" % request.requestID)
    for detail in result:
        print("Result: %s" % detail)
298
299
def checkAndRecordState(l_nodes, cohash):
    """Probe every node that is behind the current round and record the result.

    l_nodes -- iterable of hostnames to consider.
    cohash  -- CoMon statistics by hostname, passed through to
               collectPingAndSSH().

    Nodes whose sync record already carries the current round are only
    counted and printed.  All others are queued on a 20-thread pool, with
    recordPingAndSSH() as the result callback and handle_exception() as the
    exception callback.  The function then polls once per second until no
    results are pending or the user interrupts; if 1.5 hours pass first the
    whole process is terminated with os._exit(1).
    """
    global global_round
    global count

    # Hard limit for the polling loop below, in seconds (1.5 hours).
    time_limit = 60*60*1.5
    pool = threadpool.ThreadPool(20)

    # Queue one work request per node that still needs this round.
    for hostname in l_nodes:
        node_sync = FindbadNodeRecordSync.findby_or_create(hostname=hostname, if_new_set={'round':0})
        last_round = node_sync.round
        node_sync.flush()

        if last_round >= global_round:
            # Already up to date for this round: just report it.
            count += 1
            print("%d %s %s" % (count, hostname, last_round))
            continue

        work = threadpool.WorkRequest(collectPingAndSSH, [hostname, cohash], {},
                                      None, recordPingAndSSH, handle_exception)
        pool.putRequest(work)

    # Poll until every queued request has been processed.
    started_at = time.time()
    while True:
        try:
            time.sleep(1)
            pool.poll()
            if time.time() - started_at > time_limit:
                print("findbad.py has run out of time!!!!!!")
                os._exit(1)
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break

    print(FindbadNodeRecordSync.query.count())
    print(FindbadNodeRecord.query.count())
    session.flush()
344
def main():
    """Pick the set of nodes to examine from config and run the check on them.

    Reads (and, with config.increment, advances) the stored global round,
    builds the hostname list from the first matching option among
    config.nodelist, config.node, config.nodegroup and config.site
    (defaulting to every node in plccache.l_nodes), lets config.nodeselect
    override that list, and hands the result to checkAndRecordState().
    Returns 0.
    """
    global global_round

    sync_row = FindbadNodeRecordSync.findby_or_create(hostname="global",
                                                      if_new_set={'round' : global_round})
    global_round = sync_row.round

    if config.increment:
        # A higher global round makes every node look stale, forcing a refresh.
        global_round += 1
        sync_row.round = global_round

    sync_row.flush()

    cotop = comon.Comon()
    # lastcotop measures whether cotop is actually running.  this is a better
    # metric than sshstatus, or other values from CoMon
    cotop_url = COMON_COTOPURL

    # The CoMon fetch (cotop.coget(cotop_url)) is disabled, so no per-node
    # history is passed along.
    cohash = {}

    candidates = plccache.l_nodes
    if config.nodelist:
        wanted = util.file.getListFromFile(config.nodelist)
        candidates = [entry for entry in candidates if entry['hostname'] in wanted]
    elif config.node:
        wanted = [config.node]
        candidates = [entry for entry in candidates if entry['hostname'] in wanted]
    elif config.nodegroup:
        groups = api.GetNodeGroups({'name' : config.nodegroup})
        candidates = api.GetNodes(groups[0]['node_ids'])
    elif config.site:
        sites = api.GetSites(config.site)
        candidates = api.GetNodes(sites[0]['node_ids'], ['hostname'])

    l_nodes = [entry['hostname'] for entry in candidates]

    # perform this query after the above options, so that the filter above
    # does not break.
    if config.nodeselect:
        plc_hostnames = [entry['hostname']
                         for entry in api.GetNodes({'peer_id' : None}, ['hostname'])]
        l_nodes = node_select(config.nodeselect, plc_hostnames, None)

    print("fetching %s hosts" % len(l_nodes))

    checkAndRecordState(l_nodes, cohash)

    return 0
395
396
if __name__ == '__main__':
    from monitor import parser as parsermodule

    # Start from the shared 'nodesets' options, add the ones specific to
    # this script, then layer the 'defaults' options on top.
    parser = parsermodule.getParser(['nodesets'])

    parser.set_defaults(increment=False, dbname="findbad", cachenodes=False)
    parser.add_option("", "--cachenodes", action="store_true",
                      help="Cache node lookup from PLC")
    parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                      help="Specify the name of the database to which the information is saved")
    parser.add_option("-i", "--increment", action="store_true", dest="increment",
                      help="Increment round number to force refresh or retry")

    parser = parsermodule.getParser(['defaults'], parser)

    cfg = parsermodule.parse_args(parser)

    try:
        main()
    except Exception:
        # Fetch the active exception explicitly so the handler does not
        # depend on version-specific "except ... , name" syntax.
        err = sys.exc_info()[1]
        # print_exc() returns None, so this also prints a "None" line.
        print(traceback.print_exc())
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        # NOTE(review): exits with status 0 even though main() failed --
        # confirm that callers do not rely on the exit code.
        sys.exit(0)
    # The time.sleep(10) that this message used to announce is disabled.
    print("sleeping")