remove check file just after pushing to branches
[infrastructure.git] / scripts / manage-infrastructure.py
1 #!/usr/bin/python
2
3 import os.path, sys
4 import re
5 import subprocess
6 from optparse import OptionParser
7
8 class BuildBoxes:
9
    # DNS domain that fqdn() appends to every short box name
    # NOTE(review): this comment used to mention onelab.eu, which does not match the value below - confirm
    domain = 'pl.sophia.inria.fr'
    # short name of the box where the tracker files are inspected / removed
    testmaster = 'testmaster'
    # short names of the build boxes (selected by -b and -a in main())
    build_boxes = [ "mirror", "liquid", "reed", "velvet", ]
    # short names of the boxes that handle_box() dispatches to handle_plc_box()
    plc_boxes = [ "testplc" ]
    # qemu32-5 is officially dead
    # i.e. qemu64-1 .. qemu64-3 and qemu32-1 .. qemu32-4 (range() excludes its upper bound)
    qemu_boxes = \
        [ "qemu64-%d"%i for i in range (1,4) ] + \
        [ "qemu32-%d"%i for i in range (1,5) ]
    # the default selection in main() when neither an option nor a hostname is given
    test_boxes = plc_boxes + qemu_boxes
    # one-element list, so the testmaster can be selected like any other group of boxes
    testmaster_boxes = [ testmaster ]
21
22     def __init__ (self):
23         # dummy defaults
24         self.boxes = []
25         self.do_tracker_qemus = False
26         self.do_tracker_plcs = False
27
28     def fqdn (self, box):
29         return "%s.%s"%(box,self.domain)
30
    # argv prefix shared by the ssh() helper; ConnectTimeout=3 is passed to ssh as an option
    # (presumably so that an unreachable box does not stall the whole run - confirm)
    ssh_command=['ssh','-o','ConnectTimeout=3']
32     @staticmethod
33     def root (box): return "root@%s"%box
34
35     @staticmethod
36     def ssh(box):
37         return BuildBoxes.ssh_command + [ BuildBoxes.root(box) ]
38
39     def header (self,message):
40         print "===============",message
41         sys.stdout.flush()
42
43     def run (self,argv,message, trash_err=False):
44         if self.options.dry_run:
45             print 'DRY_RUN:',
46             print " ".join(argv)
47             return 0
48         else:
49             if message: self.header(message)
50             if not trash_err:
51                 return subprocess.call(argv)
52             else:
53                 return subprocess.call(argv,stderr=file('/dev/null','w'))
54                 
55     def run_ssh (self, box, argv, message, trash_err=False):
56         result=self.run (self.ssh(box) + argv, message, trash_err)
57         if result!=0:
58             print "WARNING: failed to run %s on %s"%(" ".join(argv),box)
59         return result
60
61     def backquote (self, argv, trash_err=False):
62         if not trash_err:
63             return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
64         else:
65             return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
66
67     def backquote_ssh (self, box, argv, trash_err=False):
68         # first probe the ssh link
69         hostname=self.backquote ( self.ssh(box) + [ "hostname"], trash_err=True )
70         if not hostname:
71             print "%s unreachable"%self.root(box)
72             return ''
73         else:
74             return self.backquote( ['ssh',self.root(box)] + argv, trash_err)
75
76     def reboot (self,box):
77         command=['ssh',self.root(box),'shutdown','-r','now']
78         self.run (command,"Rebooting %s"%box)
79
80     def handle_tracker_plcs (self):
81         box = self.fqdn (self.testmaster)
82         filename="tracker-plcs"
83         if not self.options.probe:
84             command=["rm","-rf",filename]
85             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
86         else:
87             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
88             read_command = ["cat",filename]
89             trackers=self.backquote_ssh(box,read_command)
90             for tracker in trackers.split('\n'):
91                 if not tracker: continue
92                 try:
93                     tracker=tracker.strip()
94                     [hostname,buildname]=tracker.split('@')
95                     [left,plcname]=buildname.rsplit('-',1)
96                     print self.margin_outline(plcname),tracker
97                 except:
98                     print self.margin(""),tracker
99
100     def handle_tracker_qemus (self):
101         box = self.fqdn (self.testmaster)
102         filename="tracker-qemus"
103         if not self.options.probe:
104             command=["rm","-rf",filename]
105             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
106         else:
107             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
108             read_command = ["cat",filename]
109             trackers=self.backquote_ssh(box,read_command)
110             for tracker in trackers.split('\n'):
111                 if not tracker: continue
112                 try:
113                     tracker=tracker.strip()
114                     [hostname,buildname,nodename]=tracker.split('@')
115                     nodename=nodename.split('.')[0]
116                     print self.margin_outline(nodename),tracker
117                 except:
118                     print self.margin(""),tracker
119
120     def handle_build_box (self,box):
121         if not self.options.probe:
122             self.reboot(box)
123         else:
124             command=['uptime']
125             uptime=self.backquote_ssh(box,command,True).strip()
126
127             command=['pgrep','build']
128             if self.options.dry_run:
129                 self.run_ssh(box,command,None)
130             else:
131                 pids=self.backquote_ssh(box,command,True)
132                 if not pids:
133                     self.header ('No build process on %s (%s)'%(box,uptime))
134                 else:
135                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
136                     self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
137
138     # this one is more accurate as it locates processes in the vservers as well
139     # but it's so sloooowww
140     def handle_build_box_deep (self,box):
141         if not self.options.probe:
142             self.reboot(box)
143         else:
144             command=['uptime']
145             uptime=self.backquote_ssh(box,command,True).strip()
146
147             command=['vps','-e']
148             if self.options.dry_run:
149                 self.run_ssh(box,command,None)
150             else:
151                 # simulate grep vbuild
152                 vps_lines=[ line for line in self.backquote_ssh(box,command,True).split("\n")
153                             if line.find('vbuild') >= 0]
154                 pids=[ line.split()[0] for line in vps_lines ]
155                 if not pids:
156                     self.header ('No build process on %s (%s)'%(box,uptime))
157                 else:
158                     command=['vps','-o','pid,command'] + pids
159                     self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
160
161
162     vplc_matcher = re.compile(".*(vplc[0-9]+$)")
163     def vplcname (self, vservername):
164         match = self.vplc_matcher.match(vservername)
165         if match: return match.groups(0)
166         else: return ""
167
168     margin_format="%-14s"
169     def margin(self,string): return self.margin_format%string
170     def outline (self, string): return '== %s =='%string
171     def margin_outline (self, string): return self.margin(self.outline(string))
172
173     def handle_plc_box (self,box):
174         if not self.options.probe:
175             self.reboot(box)
176         else:
177             command=['vserver-stat']
178             if self.options.dry_run:
179                 self.run_ssh(box,command,"Active vservers on %s"%box)
180             else:
181                 # try to find fullname (vserver_stat truncates to a ridiculously short name)
182                 try:
183                     self.header ("vserver map on %s"%box)
184                     # fetch the contexts for all vservers on that box
185                     map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
186                     context_map=self.backquote_ssh (box,map_command)
187                     # at this point we have a set of lines like
188                     # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
189                     ctx_dict={}
190                     for map_line in context_map.split("\n"):
191                         if not map_line: continue
192                         [path,xid] = map_line.split(':')
193                         ctx_dict[xid]=os.path.basename(os.path.dirname(path))
194                     # at this point ctx_id maps context id to vservername
195
196                     vserver_stat = self.backquote_ssh (box,command)
197                     for vserver_line in vserver_stat.split("\n"):
198                         if not vserver_line: continue
199                         context=vserver_line.split()[0]
200                         if context=="CTX": 
201                             print self.margin(""),vserver_line
202                             continue
203                         longname=ctx_dict[context]
204                         print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
205                 except:
206                     self.run_ssh(box,command,"Fine-grained method failed - fallback to plain vserver-stat")
207
208     vnode_matcher = re.compile(".*(vnode[0-9]+)")
209     def vnodename (self, ps_line):
210         match = self.vnode_matcher.match(ps_line)
211         if match: return match.groups(0)
212         else: return ""
213
214     def handle_qemu_box (self,box):
215         if not self.options.probe:
216             self.reboot(box)
217         else:
218             command=['lsmod']
219             modules=self.backquote_ssh(box,command).split('\n')
220             kqemu_msg='*NO kqemu MODULE LOADED*'
221             for module in modules:
222                 if module.find('kqemu')==0:
223                     kqemu_msg='kqemu OK'
224             
225             command=['pgrep','qemu']
226             if self.options.dry_run:
227                 self.run_ssh(box,command,None)
228             else:
229                 pids=self.backquote_ssh(box,command)
230                 if not pids:
231                     self.header ('No qemu process on %s'%box)
232                 else:
233                     self.header ("Active qemu processes on %s (%s)"%(box,kqemu_msg))
234                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
235                     ps_lines = self.backquote_ssh (box,command).split("\n")
236                     for ps_line in ps_lines:
237                         if not ps_line or ps_line.find('PID') >=0 : continue
238                         print self.margin_outline(self.vnodename(ps_line)), ps_line
239
    # the output of ps -o pid,command gives us <pid> bash <buildname>/run_log
241     def testmaster_buildname (self, ps_line):
242         chunks=ps_line.split()
243         path=chunks[2]
244         [buildname,command]=path.split('/')
245         return buildname
246
247     def handle_testmaster_box (self, box):
248         if not self.options.probe: 
249             pass
250         else:
251             command=['pgrep','run_log']
252             if self.options.dry_run:
253                 self.run_ssh(box,command,None)
254             else:
255                 pids=self.backquote_ssh(box,command)
256                 if not pids:
257                     self.header ('No run_log process on %s'%box)
258                 else:
259                     self.header ("Active run_log processes on %s"%(box))
260                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
261                     ps_lines = self.backquote_ssh (box,command).split("\n")
262                     for ps_line in ps_lines:
263                         if not ps_line or ps_line.find('PID') >=0 : continue
264                         print self.margin_outline(self.testmaster_buildname(ps_line)), ps_line
265         
266
267     def handle_box(self,box,type):
268         if box in self.qemu_boxes:
269             if type=="qemu": self.handle_qemu_box(self.fqdn(box))
270         elif box in self.plc_boxes:
271             if type=="plc":  self.handle_plc_box(self.fqdn(box))
272         elif box in self.testmaster_boxes:
273             if type=='testmaster': self.handle_testmaster_box(self.fqdn(box))
274         elif type=="build":
275             if self.options.deep:
276                 self.handle_build_box_deep(self.fqdn(box))
277             else:
278                 self.handle_build_box(self.fqdn(box))
279
280     def handle_disk (self,box):
281         box=self.fqdn(box)
282         return self.run_ssh(box,["df","-h",],"Disk space on %s"%box)
283
284     def main (self):
285         usage="""%prog [options] [hostname..(s)]
286 Default is to act on test boxes only"""
287         parser = OptionParser (usage=usage)
288         parser.add_option ("-n","--dry-run",action="store_true",dest="dry_run",default=False,
289                            help="Dry run")
290         parser.add_option ("-r","--reboot", action="store_false",dest="probe",default=True,
291                            help="Actually reset/reboot stuff instead of just probing it")
292         # no need for -p = probe, as this is the default
293         parser.add_option ("-p","--plc", action="store_true",dest="plc_only",default=False,
294                            help="Acts on the plc box only")
295
296         parser.add_option ("-a","--all",action="store_true",dest="all_boxes",default=False,
297                            help="Acts on build and test boxes")
298         parser.add_option ("-b","--build",action="store_true",dest="build_only",default=False,
299                            help="Acts on build boxes only")
300         parser.add_option ("-e","--deep",action="store_true", dest="deep", default=False,
301                            help="on build boxes, shows vbuild processes in vservers as well; signif. slower")
302         parser.add_option ("-q","--qemu",action="store_true",dest="qemu_only",default=False,
303                            help="Only acts on the qemu boxes")
304         parser.add_option ("-t","--trackers",action="store_true",dest="trackers_only",default=False,
305                            help="Only wipes trackers")
306         parser.add_option ("-m","--master",action="store_true",dest="testmaster_only",default=False,
307                            help="Display the testmaster status")
308         parser.add_option ("-d","--disk",action="store_true",dest="show_disk",default=False,
309                            help="Only inspects disk status")
310
311         (self.options,args) = parser.parse_args()
312
313         # use given hostnames if provided
314         if args:
315             self.boxes=args
316             # if hostnames are specified, let's stay on the safe side and don't reset trackers
317             self.do_tracker_plcs = False
318             self.do_tracker_qemus = False
319         elif self.options.all_boxes:
320             self.boxes=self.test_boxes + self.build_boxes + self.testmaster_boxes
321             self.do_tracker_plcs = True
322             self.do_tracker_qemus = True
323         elif self.options.build_only:
324             self.boxes=self.build_boxes
325             self.do_tracker_plcs = False
326             self.do_tracker_qemus = False
327         elif self.options.qemu_only:
328             self.boxes=self.qemu_boxes
329             self.do_tracker_plcs = False
330             self.do_tracker_qemus = True
331         elif self.options.plc_only:
332             self.boxes=self.plc_boxes
333             self.do_tracker_plcs = True
334             self.do_tracker_qemus = False
335         elif self.options.testmaster_only:
336             self.boxes=self.testmaster_boxes
337             self.do_tracker_plcs = False
338             self.do_tracker_qemus = False
339         elif self.options.trackers_only:
340             self.boxes = []
341             self.do_tracker_plcs = True
342             self.do_tracker_qemus = True
343         # default
344         else:
345             self.boxes = self.test_boxes
346             self.do_tracker_plcs = True
347             self.do_tracker_qemus = True
348
349         if self.options.show_disk:
350             for box in self.boxes: self.handle_disk(box)
351             return
352
353         # PLCS
354         if self.do_tracker_plcs:self.handle_tracker_plcs ()
355         for box in self.boxes:  self.handle_box (box,"plc")
356         # QEMU
357         if self.do_tracker_qemus:self.handle_tracker_qemus ()
358         for box in self.boxes:  self.handle_box (box,"qemu")
359         # ALL OTHERS
360         for box in self.boxes:  self.handle_box (box,"build")
361         # TESTMASTER
362         for box in self.boxes:  self.handle_box (box,"testmaster")
363
# when run as a script: create a BuildBoxes and let its main() parse the command line
if __name__ == "__main__":
    BuildBoxes().main()