f4fd9ad2dbd773ec7988c2d5b0c845d100be42a3
[infrastructure.git] / scripts / manage-infrastructure.py
1 #!/usr/bin/python
2
3 import os.path, sys
4 import re
5 import subprocess
6 from optparse import OptionParser
7
class BuildBoxes:
    """Probe, or reset/reboot, the build and test boxes over ssh.

    The boxes are split into three families, each with its own handler:
    build boxes (running build processes), plc boxes (hosting vservers) and
    qemu boxes (hosting qemu processes).  Two tracker files kept on the
    ``testmaster`` box ("tracker-plcs" and "tracker-qemus") are inspected in
    probe mode and removed in reboot mode.

    Command-line options are parsed by `main` and stored in ``self.options``;
    most other methods read ``self.options.probe`` / ``self.options.dry_run``
    and therefore expect `main` (or the caller) to have set it first.
    """

    # every short box name below is qualified with this DNS domain (see fqdn)
    domain = 'pl.sophia.inria.fr'
    # box that holds the tracker files
    testmaster = 'testmaster'
    build_boxes = ["mirror", "liquid", "reed", "velvet"]
    plc_boxes = ["testplc"]
    # qemu64-1 .. qemu64-3 and qemu32-1 .. qemu32-5
    qemu_boxes = \
        ["qemu64-%d" % i for i in range(1, 4)] + \
        ["qemu32-%d" % i for i in range(1, 6)]
    test_boxes = plc_boxes + qemu_boxes

    # common ssh prefix; the short timeout keeps a dead box from stalling us
    ssh_command = ['ssh', '-o', 'ConnectTimeout=3']

    # compiled once: trailing "vplcNN" of a vserver name / "vnodeNN" in a ps line
    vplc_matcher = re.compile(".*(vplc[0-9]+$)")
    vnode_matcher = re.compile(".*(vnode[0-9]+)")

    # width of the left-hand column used when listing trackers and processes
    margin_format = "%-14s"

    def __init__(self):
        # dummy defaults, overwritten by main()
        self.boxes = []
        self.do_tracker_qemus = False
        self.do_tracker_plcs = False
        # set by main() from the parsed command line
        self.options = None

    def fqdn(self, box):
        """Return the fully qualified name of the short hostname `box`."""
        return "%s.%s" % (box, self.domain)

    @staticmethod
    def root(box):
        """Return the ssh target used to log into `box` as root."""
        return "root@%s" % box

    @staticmethod
    def ssh(box):
        """Return the argv prefix that opens a root ssh session on `box`."""
        return BuildBoxes.ssh_command + [BuildBoxes.root(box)]

    def header(self, message):
        """Print a section banner and flush, so it precedes child output."""
        print("=============== %s" % (message,))
        sys.stdout.flush()

    def run(self, argv, message, trash_err=False):
        """Run `argv` locally and return its exit status.

        In dry-run mode the command is only printed and 0 is returned.
        Otherwise `message` (when non-empty) is printed as a header first.
        With `trash_err` set, the command's stderr is discarded.
        """
        if self.options.dry_run:
            print("DRY_RUN: %s" % " ".join(argv))
            return 0
        if message:
            self.header(message)
        if not trash_err:
            return subprocess.call(argv)
        # the context manager closes the null device once the child is done
        with open(os.devnull, 'w') as devnull:
            return subprocess.call(argv, stderr=devnull)

    def run_ssh(self, box, argv, message, trash_err=False):
        """Run `argv` on `box` through ssh; warn on a non-zero exit status.

        Returns the exit status reported by `run`.
        """
        result = self.run(self.ssh(box) + argv, message, trash_err)
        if result != 0:
            print("WARNING: failed to run %s on %s" % (" ".join(argv), box))
        return result

    @staticmethod
    def _capture(argv, stderr):
        """Run `argv` and return its stdout as text; `stderr` goes to Popen."""
        # universal_newlines makes the output text on every python version
        process = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=stderr,
                                   universal_newlines=True)
        return process.communicate()[0]

    def backquote(self, argv, trash_err=False):
        """Run `argv` locally and return whatever it wrote on stdout.

        With `trash_err` set, the command's stderr is discarded.
        Note that dry-run mode is not taken into account here.
        """
        if not trash_err:
            return self._capture(argv, None)
        with open(os.devnull, 'w') as devnull:
            return self._capture(argv, devnull)

    def backquote_ssh(self, box, argv, trash_err=False):
        """Run `argv` on `box` through ssh and return its stdout.

        The link is probed first with a plain ``hostname``; when that yields
        nothing the box is reported as unreachable and '' is returned.
        """
        # first probe the ssh link
        hostname = self.backquote(self.ssh(box) + ["hostname"], trash_err=True)
        if not hostname:
            print("%s unreachable" % self.root(box))
            return ''
        # same ssh options (connection timeout) as every other remote command
        return self.backquote(self.ssh(box) + argv, trash_err)

    def reboot(self, box):
        """Ask `box` to reboot right away (honours dry-run through `run`)."""
        command = self.ssh(box) + ['shutdown', '-r', 'now']
        self.run(command, "Rebooting %s" % box)

    @staticmethod
    def _plc_tracker_key(tracker):
        """Return the part after the last '-' of a 'hostname@buildname' line.

        Raises ValueError when the line does not have that shape.
        """
        hostname, buildname = tracker.split('@')
        left, plcname = buildname.rsplit('-', 1)
        return plcname

    @staticmethod
    def _qemu_tracker_key(tracker):
        """Return the short node name of a 'hostname@buildname@nodename' line.

        Raises ValueError when the line does not have three '@' fields.
        """
        hostname, buildname, nodename = tracker.split('@')
        return nodename.split('.')[0]

    def _handle_tracker(self, filename, extract_key):
        """Wipe (reboot mode) or list (probe mode) `filename` on testmaster.

        `extract_key` maps one tracker line to the label shown in the margin
        and raises ValueError on lines it cannot parse; such lines are still
        listed, with an empty margin.
        """
        box = self.fqdn(self.testmaster)
        if not self.options.probe:
            command = ["rm", "-rf", filename]
            self.run_ssh(box, command, "Cleaning up %s on %s" % (filename, box))
            return
        self.header("++++++++++ Inspecting %s on %s" % (filename, box))
        trackers = self.backquote_ssh(box, ["cat", filename])
        for tracker in trackers.splitlines():
            tracker = tracker.strip()
            if not tracker:
                continue
            try:
                label = self.margin_outline(extract_key(tracker))
            except ValueError:
                # unexpected format: show the raw line anyway
                label = self.margin("")
            print("%s %s" % (label, tracker))

    def handle_tracker_plcs(self):
        """Wipe or list the 'tracker-plcs' file on testmaster."""
        self._handle_tracker("tracker-plcs", self._plc_tracker_key)

    def handle_tracker_qemus(self):
        """Wipe or list the 'tracker-qemus' file on testmaster."""
        self._handle_tracker("tracker-qemus", self._qemu_tracker_key)

    def handle_build_box(self, box):
        """Reboot the build box `box`, or list its running build processes."""
        if not self.options.probe:
            self.reboot(box)
            return
        uptime = self.backquote_ssh(box, ['uptime'], True).strip()

        command = ['pgrep', 'build']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command, True)
        if not pids:
            self.header('No build process on %s (%s)' % (box, uptime))
            return
        # pgrep prints one pid per line; hand them all to a single ps
        command = ['ps', '-o', 'pid,command'] + pids.split()
        self.run_ssh(box, command,
                     "Active build processes on %s (%s)" % (box, uptime), True)

    def vplcname(self, vservername):
        """Return the trailing 'vplcNN' of `vservername`, or '' if absent."""
        match = self.vplc_matcher.match(vservername)
        if match:
            return match.group(1)
        return ""

    def margin(self, string):
        """Left-justify `string` in the fixed-width margin column."""
        return self.margin_format % (string,)

    def outline(self, string):
        """Decorate `string` as '== string =='."""
        return '== %s ==' % (string,)

    def margin_outline(self, string):
        """Return `string` outlined, then padded to the margin width."""
        return self.margin(self.outline(string))

    def _vserver_report(self, box, command):
        """Return the annotated lines of `command` (vserver-stat) on `box`.

        Each data line is prefixed with the vplc name and suffixed with the
        full vserver name, found through the context id in its first column.
        Raises KeyError / ValueError when the two outputs cannot be matched.
        """
        # fetch the contexts for all vservers on that box; the extra /dev/null
        # operand makes grep print file names even with a single match
        map_command = ['grep', '.', '/etc/vservers/*/context', '/dev/null']
        context_map = self.backquote_ssh(box, map_command)
        # at this point we have a set of lines like
        # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
        ctx_dict = {}
        for map_line in context_map.splitlines():
            if not map_line:
                continue
            path, xid = map_line.rsplit(':', 1)
            ctx_dict[xid.strip()] = os.path.basename(os.path.dirname(path))
        # at this point ctx_dict maps context id to vservername

        report = []
        for vserver_line in self.backquote_ssh(box, command).splitlines():
            if not vserver_line.strip():
                continue
            context = vserver_line.split()[0]
            if context == "CTX":
                # column titles
                report.append("%s %s" % (self.margin(""), vserver_line))
                continue
            longname = ctx_dict[context]
            report.append("%s %s [=%s]" % (
                self.margin_outline(self.vplcname(longname)),
                vserver_line, longname))
        return report

    def handle_plc_box(self, box):
        """Reboot the plc box `box`, or list the vservers it is running."""
        if not self.options.probe:
            self.reboot(box)
            return
        command = ['vserver-stat']
        if self.options.dry_run:
            self.run_ssh(box, command, "Active vservers on %s" % box)
            return
        # try to find fullname (vserver-stat truncates to a ridiculously short name)
        self.header("vserver map on %s" % box)
        try:
            # built in full before printing, so a failure half-way through
            # does not leave partial output ahead of the fallback listing
            report = self._vserver_report(box, command)
        except (KeyError, ValueError):
            self.run_ssh(box, command,
                         "Fine-grained method failed - fallback to plain vserver-stat")
            return
        for line in report:
            print(line)

    def vnodename(self, ps_line):
        """Return the 'vnodeNN' found in `ps_line`, or '' if there is none."""
        match = self.vnode_matcher.match(ps_line)
        if match:
            return match.group(1)
        return ""

    def handle_qemu_box(self, box):
        """Reboot the qemu box `box`, or list the qemu processes it runs."""
        if not self.options.probe:
            self.reboot(box)
            return
        # lsmod lines start with the module name
        modules = self.backquote_ssh(box, ['lsmod']).splitlines()
        if any(module.startswith('kqemu') for module in modules):
            kqemu_msg = 'kqemu OK'
        else:
            kqemu_msg = '*NO kqemu MODULE LOADED*'

        command = ['pgrep', 'qemu']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command)
        if not pids:
            self.header('No qemu process on %s' % box)
            return
        self.header("Active qemu processes on %s (%s)" % (box, kqemu_msg))
        command = ['ps', '-o', 'pid,command'] + pids.split()
        for ps_line in self.backquote_ssh(box, command).splitlines():
            # skip blanks and the column titles
            if not ps_line or 'PID' in ps_line:
                continue
            print("%s %s" % (self.margin_outline(self.vnodename(ps_line)), ps_line))

    def handle_box(self, box, type):
        """Run the handler of family `type` on `box` if it is of that family.

        `type` is one of "build", "plc" or "qemu".  Any box that is neither a
        known qemu box nor a known plc box is treated as a build box.
        (The parameter name shadows the builtin but is kept for callers.)
        """
        if box in self.qemu_boxes:
            if type == "qemu":
                self.handle_qemu_box(self.fqdn(box))
        elif box in self.plc_boxes:
            if type == "plc":
                self.handle_plc_box(self.fqdn(box))
        elif type == "build":
            self.handle_build_box(self.fqdn(box))

    def handle_disk(self, box):
        """Show disk usage (df -h) on `box`; returns the exit status."""
        box = self.fqdn(box)
        return self.run_ssh(box, ["df", "-h"], "Disk space on %s" % box)

    def main(self):
        """Parse the command line and act on the selected boxes and trackers."""
        usage = """%prog [options] [hostname..(s)]
Default is to act on test boxes only (with trackers clean)"""
        parser = OptionParser(usage=usage)
        parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run", default=False,
                          help="Dry run")
        parser.add_option("-r", "--reboot", action="store_false", dest="probe", default=True,
                          help="Actually reset/reboot stuff instead of just probing it")
        # no need for -p = probe, as this is the default
        parser.add_option("-p", "--plc", action="store_true", dest="plc_only", default=False,
                          help="Acts on the plc box only")

        parser.add_option("-a", "--all", action="store_true", dest="all_boxes", default=False,
                          help="Acts on build and test boxes")
        parser.add_option("-b", "--build", action="store_true", dest="build_only", default=False,
                          help="Acts on build boxes only")
        parser.add_option("-q", "--qemu", action="store_true", dest="qemu_only", default=False,
                          help="Only acts on the qemu boxes")
        parser.add_option("-t", "--trackers", action="store_true", dest="trackers_only", default=False,
                          help="Only wipes trackers")
        parser.add_option("-d", "--disk", action="store_true", dest="show_disk", default=False,
                          help="Only inspects disk status")

        (self.options, args) = parser.parse_args()
        options = self.options

        # (boxes, handle plc tracker, handle qemu tracker); first match wins
        if args:
            # use given hostnames if provided; in that case let's stay on
            # the safe side and don't touch the trackers
            selection = (args, False, False)
        elif options.all_boxes:
            selection = (self.test_boxes + self.build_boxes, True, True)
        elif options.build_only:
            selection = (self.build_boxes, False, False)
        elif options.qemu_only:
            selection = (self.qemu_boxes, False, True)
        elif options.plc_only:
            selection = (self.plc_boxes, True, False)
        elif options.trackers_only:
            selection = ([], True, True)
        else:
            # default
            selection = (self.test_boxes, True, True)
        self.boxes, self.do_tracker_plcs, self.do_tracker_qemus = selection

        if options.show_disk:
            for box in self.boxes:
                self.handle_disk(box)
            return

        # ALL OTHERS
        for box in self.boxes:
            self.handle_box(box, "build")
        # PLCS
        if self.do_tracker_plcs:
            self.handle_tracker_plcs()
        for box in self.boxes:
            self.handle_box(box, "plc")
        # QEMU
        if self.do_tracker_qemus:
            self.handle_tracker_qemus()
        for box in self.boxes:
            self.handle_box(box, "qemu")
296
# script entry point: build the manager and let it parse the command line
if __name__ == "__main__":
    manager = BuildBoxes()
    manager.main()