-m lists running tests on testmaster
[infrastructure.git] / scripts / manage-infrastructure.py
1 #!/usr/bin/python
2
3 import os.path, sys
4 import re
5 import subprocess
6 from optparse import OptionParser
7
8 class BuildBoxes:
9
10     # everything in the onelab.eu domain
11     domain = 'pl.sophia.inria.fr'
12     testmaster = 'testmaster'
13     build_boxes = [ "mirror", "liquid", "reed", "velvet", ]
14     plc_boxes = [ "testplc" ]
15     qemu_boxes = \
16         [ "qemu64-%d"%i for i in range (1,4) ] + \
17         [ "qemu32-%d"%i for i in range (1,6) ]
18     test_boxes = plc_boxes + qemu_boxes
19     testmaster_boxes = [ testmaster ]
20
21     def __init__ (self):
22         # dummy defaults
23         self.boxes = []
24         self.do_tracker_qemus = False
25         self.do_tracker_plcs = False
26
27     def fqdn (self, box):
28         return "%s.%s"%(box,self.domain)
29
30     ssh_command=['ssh','-o','ConnectTimeout=3']
31     @staticmethod
32     def root (box): return "root@%s"%box
33
34     @staticmethod
35     def ssh(box):
36         return BuildBoxes.ssh_command + [ BuildBoxes.root(box) ]
37
38     def header (self,message):
39         print "===============",message
40         sys.stdout.flush()
41
42     def run (self,argv,message, trash_err=False):
43         if self.options.dry_run:
44             print 'DRY_RUN:',
45             print " ".join(argv)
46             return 0
47         else:
48             if message: self.header(message)
49             if not trash_err:
50                 return subprocess.call(argv)
51             else:
52                 return subprocess.call(argv,stderr=file('/dev/null','w'))
53                 
54     def run_ssh (self, box, argv, message, trash_err=False):
55         result=self.run (self.ssh(box) + argv, message, trash_err)
56         if result!=0:
57             print "WARNING: failed to run %s on %s"%(" ".join(argv),box)
58         return result
59
60     def backquote (self, argv, trash_err=False):
61         if not trash_err:
62             return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
63         else:
64             return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
65
66     def backquote_ssh (self, box, argv, trash_err=False):
67         # first probe the ssh link
68         hostname=self.backquote ( self.ssh(box) + [ "hostname"], trash_err=True )
69         if not hostname:
70             print "%s unreachable"%self.root(box)
71             return ''
72         else:
73             return self.backquote( ['ssh',self.root(box)] + argv, trash_err)
74
75     def reboot (self,box):
76         command=['ssh',self.root(box),'shutdown','-r','now']
77         self.run (command,"Rebooting %s"%box)
78
79     def handle_tracker_plcs (self):
80         box = self.fqdn (self.testmaster)
81         filename="tracker-plcs"
82         if not self.options.probe:
83             command=["rm","-rf",filename]
84             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
85         else:
86             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
87             read_command = ["cat",filename]
88             trackers=self.backquote_ssh(box,read_command)
89             for tracker in trackers.split('\n'):
90                 if not tracker: continue
91                 try:
92                     tracker=tracker.strip()
93                     [hostname,buildname]=tracker.split('@')
94                     [left,plcname]=buildname.rsplit('-',1)
95                     print self.margin_outline(plcname),tracker
96                 except:
97                     print self.margin(""),tracker
98
99     def handle_tracker_qemus (self):
100         box = self.fqdn (self.testmaster)
101         filename="tracker-qemus"
102         if not self.options.probe:
103             command=["rm","-rf",filename]
104             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
105         else:
106             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
107             read_command = ["cat",filename]
108             trackers=self.backquote_ssh(box,read_command)
109             for tracker in trackers.split('\n'):
110                 if not tracker: continue
111                 try:
112                     tracker=tracker.strip()
113                     [hostname,buildname,nodename]=tracker.split('@')
114                     nodename=nodename.split('.')[0]
115                     print self.margin_outline(nodename),tracker
116                 except:
117                     print self.margin(""),tracker
118
119     def handle_build_box (self,box):
120         if not self.options.probe:
121             self.reboot(box)
122         else:
123             command=['uptime']
124             uptime=self.backquote_ssh(box,command,True).strip()
125
126             command=['pgrep','build']
127             if self.options.dry_run:
128                 self.run_ssh(box,command,None)
129             else:
130                 pids=self.backquote_ssh(box,command,True)
131                 if not pids:
132                     self.header ('No build process on %s (%s)'%(box,uptime))
133                 else:
134                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
135                     self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
136
137     vplc_matcher = re.compile(".*(vplc[0-9]+$)")
138     def vplcname (self, vservername):
139         match = self.vplc_matcher.match(vservername)
140         if match: return match.groups(0)
141         else: return ""
142
143     margin_format="%-14s"
144     def margin(self,string): return self.margin_format%string
145     def outline (self, string): return '== %s =='%string
146     def margin_outline (self, string): return self.margin(self.outline(string))
147
148     def handle_plc_box (self,box):
149         if not self.options.probe:
150             self.reboot(box)
151         else:
152             command=['vserver-stat']
153             if self.options.dry_run:
154                 self.run_ssh(box,command,"Active vservers on %s"%box)
155             else:
156                 # try to find fullname (vserver_stat truncates to a ridiculously short name)
157                 try:
158                     self.header ("vserver map on %s"%box)
159                     # fetch the contexts for all vservers on that box
160                     map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
161                     context_map=self.backquote_ssh (box,map_command)
162                     # at this point we have a set of lines like
163                     # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
164                     ctx_dict={}
165                     for map_line in context_map.split("\n"):
166                         if not map_line: continue
167                         [path,xid] = map_line.split(':')
168                         ctx_dict[xid]=os.path.basename(os.path.dirname(path))
169                     # at this point ctx_id maps context id to vservername
170
171                     vserver_stat = self.backquote_ssh (box,command)
172                     for vserver_line in vserver_stat.split("\n"):
173                         if not vserver_line: continue
174                         context=vserver_line.split()[0]
175                         if context=="CTX": 
176                             print self.margin(""),vserver_line
177                             continue
178                         longname=ctx_dict[context]
179                         print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
180                 except:
181                     self.run_ssh(box,command,"Fine-grained method failed - fallback to plain vserver-stat")
182
183     vnode_matcher = re.compile(".*(vnode[0-9]+)")
184     def vnodename (self, ps_line):
185         match = self.vnode_matcher.match(ps_line)
186         if match: return match.groups(0)
187         else: return ""
188
189     def handle_qemu_box (self,box):
190         if not self.options.probe:
191             self.reboot(box)
192         else:
193             command=['lsmod']
194             modules=self.backquote_ssh(box,command).split('\n')
195             kqemu_msg='*NO kqemu MODULE LOADED*'
196             for module in modules:
197                 if module.find('kqemu')==0:
198                     kqemu_msg='kqemu OK'
199             
200             command=['pgrep','qemu']
201             if self.options.dry_run:
202                 self.run_ssh(box,command,None)
203             else:
204                 pids=self.backquote_ssh(box,command)
205                 if not pids:
206                     self.header ('No qemu process on %s'%box)
207                 else:
208                     self.header ("Active qemu processes on %s (%s)"%(box,kqemu_msg))
209                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
210                     ps_lines = self.backquote_ssh (box,command).split("\n")
211                     for ps_line in ps_lines:
212                         if not ps_line or ps_line.find('PID') >=0 : continue
213                         print self.margin_outline(self.vnodename(ps_line)), ps_line
214
215     # the ouput of ps -o pid,command gives us <pid> bash <buildname>/run_log
216     def testmaster_buildname (self, ps_line):
217         chunks=ps_line.split()
218         path=chunks[2]
219         [buildname,command]=path.split('/')
220         return buildname
221
222     def handle_testmaster_box (self, box):
223             command=['pgrep','run_log']
224             if self.options.dry_run:
225                 self.run_ssh(box,command,None)
226             else:
227                 pids=self.backquote_ssh(box,command)
228                 if not pids:
229                     self.header ('No run_log process on %s'%box)
230                 else:
231                     self.header ("Active run_log processes on %s"%(box))
232                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
233                     ps_lines = self.backquote_ssh (box,command).split("\n")
234                     for ps_line in ps_lines:
235                         if not ps_line or ps_line.find('PID') >=0 : continue
236                         print self.margin_outline(self.testmaster_buildname(ps_line)), ps_line
237         
238
239     def handle_box(self,box,type):
240         if box in self.qemu_boxes:
241             if type=="qemu": self.handle_qemu_box(self.fqdn(box))
242         elif box in self.plc_boxes:
243             if type=="plc":  self.handle_plc_box(self.fqdn(box))
244         elif box in self.testmaster_boxes:
245             if type=='testmaster': self.handle_testmaster_box(self.fqdn(box))
246         elif type=="build":
247             self.handle_build_box(self.fqdn(box))
248
249     def handle_disk (self,box):
250         box=self.fqdn(box)
251         return self.run_ssh(box,["df","-h",],"Disk space on %s"%box)
252
253     def main (self):
254         usage="""%prog [options] [hostname..(s)]
255 Default is to act on test boxes only"""
256         parser = OptionParser (usage=usage)
257         parser.add_option ("-n","--dry-run",action="store_true",dest="dry_run",default=False,
258                            help="Dry run")
259         parser.add_option ("-r","--reboot", action="store_false",dest="probe",default=True,
260                            help="Actually reset/reboot stuff instead of just probing it")
261         # no need for -p = probe, as this is the default
262         parser.add_option ("-p","--plc", action="store_true",dest="plc_only",default=False,
263                            help="Acts on the plc box only")
264
265         parser.add_option ("-a","--all",action="store_true",dest="all_boxes",default=False,
266                            help="Acts on build and test boxes")
267         parser.add_option ("-b","--build",action="store_true",dest="build_only",default=False,
268                            help="Acts on build boxes only")
269         parser.add_option ("-q","--qemu",action="store_true",dest="qemu_only",default=False,
270                            help="Only acts on the qemu boxes")
271         parser.add_option ("-t","--trackers",action="store_true",dest="trackers_only",default=False,
272                            help="Only wipes trackers")
273         parser.add_option ("-m","--master",action="store_true",dest="testmaster_only",default=False,
274                            help="Display the testmaster status")
275         parser.add_option ("-d","--disk",action="store_true",dest="show_disk",default=False,
276                            help="Only inspects disk status")
277
278         (self.options,args) = parser.parse_args()
279
280         # use given hostnames if provided
281         if args:
282             self.boxes=args
283             # if hostnames are specified, let's stay on the safe side and don't reset trackers
284             self.do_tracker_plcs = False
285             self.do_tracker_qemus = False
286         elif self.options.all_boxes:
287             self.boxes=self.test_boxes + self.build_boxes
288             self.do_tracker_plcs = True
289             self.do_tracker_qemus = True
290         elif self.options.build_only:
291             self.boxes=self.build_boxes
292             self.do_tracker_plcs = False
293             self.do_tracker_qemus = False
294         elif self.options.qemu_only:
295             self.boxes=self.qemu_boxes
296             self.do_tracker_plcs = False
297             self.do_tracker_qemus = True
298         elif self.options.plc_only:
299             self.boxes=self.plc_boxes
300             self.do_tracker_plcs = True
301             self.do_tracker_qemus = False
302         elif self.options.testmaster_only:
303             self.boxes=self.testmaster_boxes
304             self.do_tracker_plcs = False
305             self.do_tracker_qemus = False
306         elif self.options.trackers_only:
307             self.boxes = []
308             self.do_tracker_plcs = True
309             self.do_tracker_qemus = True
310         # default
311         else:
312             self.boxes = self.test_boxes
313             self.do_tracker_plcs = True
314             self.do_tracker_qemus = True
315
316         if self.options.show_disk:
317             for box in self.boxes: self.handle_disk(box)
318             return
319
320         # ALL OTHERS
321         for box in self.boxes:  self.handle_box (box,"build")
322         # TESTMASTER
323         for box in self.boxes:  self.handle_box (box,"testmaster")
324         # PLCS
325         if self.do_tracker_plcs:self.handle_tracker_plcs ()
326         for box in self.boxes:  self.handle_box (box,"plc")
327         # QEMU
328         if self.do_tracker_qemus:self.handle_tracker_qemus ()
329         for box in self.boxes:  self.handle_box (box,"qemu")
330
if __name__ == "__main__":
    # script entry point: build the manager, then let it parse the command line
    manager = BuildBoxes()
    manager.main()