probe for kvm_intel as on some boxes this does not load
[infrastructure.git] / scripts / manage-infrastructure.py
1 #!/usr/bin/python
2
3 import os.path, sys
4 import re
5 import subprocess
6 from optparse import OptionParser
7
class BuildBoxes:
    """Probe, or reset, the boxes of the build and test infrastructure.

    Boxes are designated by their short name, qualified with ``domain`` and
    reached as root over ssh.  ``main`` parses the command line, selects a
    set of boxes and hands each of them to the handler for its kind: plc,
    qemu, testmaster, or build for anything else.  Handlers only report what
    is running as long as ``options.probe`` is true (the default); otherwise
    they reboot or clean up.

    The code sticks to constructs that are valid on both Python 2 and
    Python 3: single-argument print(), open() instead of file(), and pipes
    opened in text mode.
    """

    # short box names get qualified with this domain, see fqdn()
    domain = 'pl.sophia.inria.fr'
    build_boxes = ["mirror", "liquid", "reed", "velvet"]
    plc_boxes = ["testplc"]
    testmaster = 'testmaster'
    testmaster_boxes = [testmaster]
    # the list of qemu boxes is cached in ~/.qemu-boxes
    # this can be refreshed by running -c
    qemu_boxes = []

    # every ssh invocation goes through this so the timeout always applies
    ssh_command = ['ssh', '-o', 'ConnectTimeout=3']

    # width of the left margin used to tag output lines
    margin_format = "%-14s"

    # a vserver name ends with its plc name, e.g. 2010.01.20--k27-f12-32-vplc03
    vplc_matcher = re.compile(r".*(vplc[0-9]+$)")
    # the qemu command line mentions the node name somewhere
    vnode_matcher = re.compile(r".*(vnode[0-9]+)")

    def __init__(self):
        # dummy defaults, the real values are computed in main()
        self.boxes = []
        self.do_tracker_qemus = False
        self.do_tracker_plcs = False
        self.load_cache()

    ########## small file helpers
    # try/finally rather than 'with' so that this keeps working on the
    # oldest python2 releases as well

    @staticmethod
    def _read_file(path):
        """Return the whole contents of path; the file is always closed."""
        handle = open(path)
        try:
            return handle.read()
        finally:
            handle.close()

    @staticmethod
    def _write_file(path, contents):
        """Overwrite path with contents; the file is always closed."""
        handle = open(path, 'w')
        try:
            handle.write(contents)
        finally:
            handle.close()

    ########## cache of qemu boxes

    def cache_file(self):
        """Return the path of the file that caches the qemu box names."""
        return os.path.expanduser("~/.qemu-boxes")

    def load_cache(self):
        """Set qemu_boxes from the cache file, if any, and derive test_boxes."""
        cache = self.cache_file()
        if os.path.isfile(cache):
            self.qemu_boxes = self._read_file(cache).split()
        self.test_boxes = self.plc_boxes + self.qemu_boxes

    def refresh_cache(self):
        """Rebuild the cache by running LocalTestResources.py on testmaster.

        The names printed by that script are stripped of the domain part
        and stored, space-separated, in the cache file.  When nothing could
        be retrieved (e.g. testmaster is unreachable) the current cache and
        box lists are left untouched instead of being wiped.
        """
        retrieved = self.backquote_ssh(self.fqdn(self.testmaster),
                                       ['LocalTestResources.py'],
                                       trash_err=True)
        suffix = "." + BuildBoxes.domain
        names = [name.replace(suffix, "") for name in retrieved.split()]
        cache = self.cache_file()
        if not names:
            print("WARNING: no qemu box retrieved, %s left untouched" % cache)
            return
        self.qemu_boxes = names
        # test_boxes was computed by load_cache() from the previous list
        self.test_boxes = self.plc_boxes + self.qemu_boxes
        contents = ' '.join(names) + '\n'
        self._write_file(cache, contents)
        print("New contents of %s:" % cache)
        sys.stdout.write(contents)

    ########## naming and formatting helpers

    def fqdn(self, box):
        """Return box qualified with the domain."""
        return "%s.%s" % (box, self.domain)

    @staticmethod
    def root(box):
        """Return the ssh target used to log into box as root."""
        return "root@%s" % box

    @staticmethod
    def ssh(box):
        """Return the argv prefix that opens a root ssh session on box."""
        return BuildBoxes.ssh_command + [BuildBoxes.root(box)]

    def header(self, message):
        """Print message as a section header and flush stdout."""
        print("=============== %s" % (message,))
        sys.stdout.flush()

    def margin(self, string):
        """Return string left-justified in the margin width."""
        return self.margin_format % string

    def outline(self, string):
        """Return string decorated as '== string =='."""
        return '== %s ==' % string

    def margin_outline(self, string):
        """Return the outlined string, padded to the margin width."""
        return self.margin(self.outline(string))

    def vplcname(self, vservername):
        """Return the trailing vplcNN of vservername, or "" if there is none."""
        match = self.vplc_matcher.match(vservername)
        # group(1) is the name itself; groups() would hand back a tuple
        if match:
            return match.group(1)
        return ""

    def vnodename(self, ps_line):
        """Return the last vnodeNN found in ps_line, or "" if there is none."""
        match = self.vnode_matcher.match(ps_line)
        if match:
            return match.group(1)
        return ""

    # the ouput of ps -o pid,command gives us <pid> bash <buildname>/run_log
    def testmaster_buildname(self, ps_line):
        """Return the build name found in a run_log ps line.

        This is the part before the first '/' in the third field; "" is
        returned when the line does not have that shape, so that one odd
        process cannot abort the whole probe.
        """
        chunks = ps_line.split()
        if len(chunks) < 3 or '/' not in chunks[2]:
            return ""
        return chunks[2].split('/')[0]

    ########## running commands

    def run(self, argv, message, trash_err=False):
        """Run argv locally and return its exit status.

        With options.dry_run the command is only printed and 0 is returned.
        Otherwise message, when not empty, is shown as a header first, and
        trash_err sends the stderr of the command to the null device.
        """
        if self.options.dry_run:
            print("DRY_RUN: %s" % " ".join(argv))
            return 0
        if message:
            self.header(message)
        if not trash_err:
            return subprocess.call(argv)
        devnull = open(os.devnull, 'w')
        try:
            return subprocess.call(argv, stderr=devnull)
        finally:
            devnull.close()

    def run_ssh(self, box, argv, message, trash_err=False):
        """Run argv on box through run(); warn when the status is not 0."""
        result = self.run(self.ssh(box) + argv, message, trash_err)
        if result != 0:
            print("WARNING: failed to run %s on %s" % (" ".join(argv), box))
        return result

    def backquote(self, argv, trash_err=False):
        """Run argv locally and return what it wrote on stdout.

        This does not look at options.dry_run.  trash_err sends the stderr
        of the command to the null device.
        """
        devnull = None
        if trash_err:
            devnull = open(os.devnull, 'w')
        try:
            # universal_newlines makes the output text, not bytes, on
            # python3; stderr=None means it is inherited
            child = subprocess.Popen(argv, stdout=subprocess.PIPE,
                                     stderr=devnull, universal_newlines=True)
            return child.communicate()[0]
        finally:
            if devnull is not None:
                devnull.close()

    def backquote_ssh(self, box, argv, trash_err=False):
        """Run argv on box and return its stdout, or '' if box is unreachable."""
        # first probe the ssh link
        probe = self.backquote(self.ssh(box) + ["hostname"], trash_err=True)
        if not probe:
            print("%s unreachable" % self.root(box))
            return ''
        # same ssh options as the probe, timeout included
        return self.backquote(self.ssh(box) + argv, trash_err)

    def reboot(self, box):
        """Reboot box right away (only printed in dry-run mode)."""
        command = self.ssh(box) + ['shutdown', '-r', 'now']
        self.run(command, "Rebooting %s" % box)

    ########## trackers, stored on testmaster

    @staticmethod
    def _plc_of_tracker(tracker):
        """Return the plc name of a <hostname>@<build>-<plcname> line.

        Raises ValueError when the line does not have that shape.
        """
        _hostname, buildname = tracker.split('@')
        _left, plcname = buildname.rsplit('-', 1)
        return plcname

    @staticmethod
    def _node_of_tracker(tracker):
        """Return the short node name of a <hostname>@<build>@<node> line.

        Raises ValueError when the line does not have that shape.
        """
        _hostname, _buildname, nodename = tracker.split('@')
        return nodename.split('.')[0]

    def _handle_tracker(self, filename, label_of):
        """Remove the tracker file on testmaster, or display it when probing.

        When displaying, each line is tagged in the margin with
        label_of(line); lines that label_of rejects with a ValueError get
        an empty margin.
        """
        box = self.fqdn(self.testmaster)
        if not self.options.probe:
            self.run_ssh(box, ["rm", "-rf", filename],
                         "Cleaning up %s on %s" % (filename, box))
            return
        self.header("++++++++++ Inspecting %s on %s" % (filename, box))
        trackers = self.backquote_ssh(box, ["cat", filename])
        for tracker in trackers.split('\n'):
            if not tracker:
                continue
            tracker = tracker.strip()
            try:
                tag = self.margin_outline(label_of(tracker))
            except ValueError:
                tag = self.margin("")
            print("%s %s" % (tag, tracker))

    def handle_tracker_plcs(self):
        """Remove, or display when probing, the tracker-plcs file."""
        self._handle_tracker("tracker-plcs", self._plc_of_tracker)

    def handle_tracker_qemus(self):
        """Remove, or display when probing, the tracker-qemus file."""
        self._handle_tracker("tracker-qemus", self._node_of_tracker)

    ########## build boxes

    def handle_build_box(self, box):
        """Reboot box, or when probing list its 'build' processes."""
        if not self.options.probe:
            self.reboot(box)
            return
        uptime = self.backquote_ssh(box, ['uptime'], True).strip()
        command = ['pgrep', 'build']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command, True).split()
        if not pids:
            self.header('No build process on %s (%s)' % (box, uptime))
            return
        self.run_ssh(box, ['ps', '-o', 'pid,command'] + pids,
                     "Active build processes on %s (%s)" % (box, uptime),
                     True)

    # this one is more accurate as it locates processes in the vservers as well
    # but it's so sloooowww
    def handle_build_box_deep(self, box):
        """Reboot box, or when probing list its vbuild processes using vps."""
        if not self.options.probe:
            self.reboot(box)
            return
        uptime = self.backquote_ssh(box, ['uptime'], True).strip()
        command = ['vps', '-e']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        # simulate grep vbuild
        vps_output = self.backquote_ssh(box, command, True)
        pids = [line.split()[0] for line in vps_output.split("\n")
                if 'vbuild' in line]
        if not pids:
            self.header('No build process on %s (%s)' % (box, uptime))
            return
        self.run_ssh(box, ['vps', '-o', 'pid,command'] + pids,
                     "Active build processes on %s (%s)" % (box, uptime),
                     True)

    ########## plc boxes

    def handle_plc_box(self, box):
        """Stop the vservers on box, or when probing show the running ones.

        When not probing, all vserver marks are removed, then the box is
        rebooted, or with options.soft only the util-vserver service is
        stopped.  When probing, each line of vserver-stat is shown with the
        plc name in the margin and the full vserver name appended.
        """
        if not self.options.probe:
            # just trash all marks, to avoid resurrection
            # the glob is left for the remote shell to expand
            stop_command = ['rm', '-rf', '/etc/vservers/*/apps/init/mark']
            self.run_ssh(box, stop_command,
                         "Removing all vserver marks on %s" % box)
            if not self.options.soft:
                self.reboot(box)
            else:
                self.run_ssh(box, ['service', 'util-vserver', 'stop'],
                             "Stopping all running vservers")
            return

        command = ['vserver-stat']
        if self.options.dry_run:
            self.run_ssh(box, command, "Active vservers on %s" % box)
        # the read-only queries below go through backquote_ssh and so are
        # issued in dry-run mode as well

        # try to find fullname (vserver_stat truncates to a ridiculously short name)
        self.header("vserver map on %s" % box)
        # fetch the contexts for all vservers on that box
        map_command = ['grep', '.', '/etc/vservers/*/context', '/dev/null']
        context_map = self.backquote_ssh(box, map_command)
        # at this point we have a set of lines like
        # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
        ctx_dict = {}
        for map_line in context_map.split("\n"):
            # ignore blank lines and anything that is not <path>:<xid>
            if ':' not in map_line:
                continue
            path, xid = map_line.rsplit(':', 1)
            ctx_dict[xid.strip()] = os.path.basename(os.path.dirname(path))
        # at this point ctx_dict maps context id to vservername

        vserver_stat = self.backquote_ssh(box, command)
        for vserver_line in vserver_stat.split("\n"):
            fields = vserver_line.split()
            if not fields:
                continue
            context = fields[0]
            if context == "CTX":
                # column titles
                print("%s %s" % (self.margin(""), vserver_line))
                continue
            # a context that is missing from the map is shown as '?'
            longname = ctx_dict.get(context, "?")
            print("%s %s [=%s]" % (self.margin_outline(self.vplcname(longname)),
                                   vserver_line, longname))

    ########## qemu and testmaster boxes

    def _show_processes(self, box, pids, name_of):
        """Print the ps line of each pid on box, tagged with name_of(line)."""
        command = ['ps', '-o', 'pid,command'] + pids
        for ps_line in self.backquote_ssh(box, command).split("\n"):
            # skip blank lines and the column titles
            if not ps_line or 'PID' in ps_line:
                continue
            print("%s %s" % (self.margin_outline(name_of(ps_line)), ps_line))

    def _accelerator_status(self, box):
        """Return a message telling which of kqemu / kvm_intel is loaded on box."""
        status = '*NO kqemu/kmv_intel MODULE LOADED*'
        for module in self.backquote_ssh(box, ['lsmod']).split('\n'):
            if module.startswith('kqemu'):
                status = 'kqemu module loaded'
            # kvm might be loaded without kvm_intel (we dont have AMD)
            elif module.startswith('kvm_intel'):
                status = 'kvm_intel module loaded'
        return status

    def handle_qemu_box(self, box):
        """Reset box, or when probing show its qemu processes.

        When not probing the box is rebooted, or with options.soft the qemu
        processes are just killed.  When probing, the header also tells
        whether an acceleration module is loaded.
        """
        if not self.options.probe:
            if not self.options.soft:
                self.reboot(box)
            else:
                self.run_ssh(box, ['pkill', 'qemu'], "Killing qemu instances")
            return
        kqemu_msg = self._accelerator_status(box)
        command = ['pgrep', 'qemu']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command).split()
        if not pids:
            self.header('No qemu process on %s (%s)' % (box, kqemu_msg))
            return
        self.header("Active qemu processes on %s (%s)" % (box, kqemu_msg))
        self._show_processes(box, pids, self.vnodename)

    def handle_testmaster_box(self, box):
        """Show the run_log processes on box; nothing is done unless probing."""
        if not self.options.probe:
            return
        command = ['pgrep', 'run_log']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command).split()
        if not pids:
            self.header('No run_log process on %s' % box)
            return
        self.header("Active run_log processes on %s" % (box))
        self._show_processes(box, pids, self.testmaster_buildname)

    ########## dispatching

    def handle_box(self, box, type):
        """Handle box if it is of the given type, otherwise do nothing.

        type is one of "plc", "qemu", "testmaster" or "build"; a box that is
        in none of the qemu, plc and testmaster lists counts as a build box.
        """
        if box in self.qemu_boxes:
            if type == "qemu":
                self.handle_qemu_box(self.fqdn(box))
        elif box in self.plc_boxes:
            if type == "plc":
                self.handle_plc_box(self.fqdn(box))
        elif box in self.testmaster_boxes:
            if type == 'testmaster':
                self.handle_testmaster_box(self.fqdn(box))
        elif type == "build":
            if self.options.deep:
                self.handle_build_box_deep(self.fqdn(box))
            else:
                self.handle_build_box(self.fqdn(box))

    def handle_disk(self, box):
        """Show the output of df -h on box; return the exit status."""
        box = self.fqdn(box)
        return self.run_ssh(box, ["df", "-h"], "Disk space on %s" % box)

    def _select_boxes(self, args):
        """Return (boxes, do_tracker_plcs, do_tracker_qemus) for the options.

        The first matching rule wins, in this order: explicit hostnames,
        -a, -b, -q, -p, -m, -t, and finally the default (test boxes).
        """
        options = self.options
        if args:
            # if hostnames are specified, let's stay on the safe side and
            # don't reset trackers
            return args, False, False
        if options.all_boxes:
            everything = (self.test_boxes + self.build_boxes
                          + self.testmaster_boxes)
            return everything, True, True
        if options.build_only:
            return self.build_boxes, False, False
        if options.qemu_only:
            return self.qemu_boxes, False, True
        if options.plc_only:
            return self.plc_boxes, True, False
        if options.testmaster_only:
            return self.testmaster_boxes, False, False
        if options.trackers_only:
            return [], True, True
        # default
        return self.test_boxes, True, True

    def main(self):
        """Parse the command line and probe or reset the selected boxes."""
        usage = ("%prog [options] [hostname..(s)]\n"
                 "Default is to act on test boxes only")
        parser = OptionParser(usage=usage)
        parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run", default=False,
                          help="Dry run")
        parser.add_option("-r", "--reboot", action="store_false", dest="probe", default=True,
                          help="Actually reset/reboot stuff instead of just probing it")
        parser.add_option("-s", "--soft", action="store_true", dest="soft", default=False,
                          help="Soft reset instead of hard reboot of the boxes")
        # no need for -p = probe, as this is the default
        parser.add_option("-p", "--plc", action="store_true", dest="plc_only", default=False,
                          help="Acts on the plc box only")

        parser.add_option("-e", "--deep", action="store_true", dest="deep", default=False,
                          help="on build boxes, shows vbuild processes in vservers as well; signif. slower")

        parser.add_option("-a", "--all", action="store_true", dest="all_boxes", default=False,
                          help="Acts on build and test boxes")
        parser.add_option("-b", "--build", action="store_true", dest="build_only", default=False,
                          help="Acts on build boxes only")
        parser.add_option("-q", "--qemu", action="store_true", dest="qemu_only", default=False,
                          help="Only acts on the qemu boxes")
        parser.add_option("-t", "--trackers", action="store_true", dest="trackers_only", default=False,
                          help="Only wipes trackers")
        parser.add_option("-m", "--master", action="store_true", dest="testmaster_only", default=False,
                          help="Display the testmaster status")
        parser.add_option("-d", "--disk", action="store_true", dest="show_disk", default=False,
                          help="Only inspects disk status")
        parser.add_option("-c", "--refresh-cache", action="store_true", dest="refresh_cache", default=False,
                          help="Refresh cached list of qemu boxes at testmaster - implies -q")

        (self.options, args) = parser.parse_args()

        # -c implies -q
        if self.options.refresh_cache:
            self.options.qemu_only = True
            self.refresh_cache()

        # use given hostnames if provided, otherwise rely on the options
        (self.boxes,
         self.do_tracker_plcs,
         self.do_tracker_qemus) = self._select_boxes(args)

        if self.options.show_disk:
            for box in self.boxes:
                self.handle_disk(box)
            return

        # PLCS
        if self.do_tracker_plcs:
            self.handle_tracker_plcs()
        for box in self.boxes:
            self.handle_box(box, "plc")
        # QEMU
        if self.do_tracker_qemus:
            self.handle_tracker_qemus()
        for box in self.boxes:
            self.handle_box(box, "qemu")
        # ALL OTHERS
        for box in self.boxes:
            self.handle_box(box, "build")
        # TESTMASTER
        for box in self.boxes:
            self.handle_box(box, "testmaster")
409
# script entry point: nothing is run when the file is merely imported
if __name__ == "__main__":
    tool = BuildBoxes()
    tool.main()