X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=scripts%2Fmanage-infrastructure.py;h=ffc1af6d7c4bc0ef5cd4f594c0d743e6ce1080c1;hb=6f9aada325c0c44ef0d7acd068f884b29a926a6d;hp=4be91e4ec8d9582d25e33ae9cefafe81434c2cda;hpb=7accd4f78485761c33251376919b3870a3543739;p=infrastructure.git diff --git a/scripts/manage-infrastructure.py b/scripts/manage-infrastructure.py index 4be91e4..ffc1af6 100755 --- a/scripts/manage-infrastructure.py +++ b/scripts/manage-infrastructure.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import os.path +import os.path, sys import re import subprocess from optparse import OptionParser @@ -8,66 +8,175 @@ from optparse import OptionParser class BuildBoxes: # everything in the onelab.eu domain - domain = 'onelab.eu' - testmaster = 'testmaster' + domain = 'pl.sophia.inria.fr' build_boxes = [ "mirror", "liquid", "reed", "velvet", ] plc_boxes = [ "testplc" ] - qemu_boxes = \ - [ "testqemu%d"%i for i in range (1,4) ] + \ - [ "testqemu32-%d"%i for i in range (1,6) ] - test_boxes = plc_boxes + qemu_boxes + testmaster = 'testmaster' + testmaster_boxes = [ testmaster ] + # cache the list of qemu boxes in ~/.qemu-boxes + # this can be refreshed by running -c + qemu_boxes=[] + + def cache_file (self): return os.path.expanduser("~/.qemu-boxes") + + def load_cache (self): + cache=self.cache_file() + if os.path.isfile(cache): + self.qemu_boxes=file(cache).read().split() + self.test_boxes = self.plc_boxes + self.qemu_boxes + + # run LocalTestResources on testmaster + def refresh_cache (self): + retrieved= \ + self.backquote_ssh(self.fqdn(self.testmaster),['LocalTestResources.py'],trash_err=True) + remove="."+BuildBoxes.domain + retrieved = [ x.replace(remove,"").strip() for x in retrieved.split()] + self.qemu_boxes = retrieved + cache=self.cache_file() + file(cache,'w').write(' '.join(self.qemu_boxes)+'\n') + print "New contents of %s:"%cache + print file(cache).read(), def __init__ (self): # dummy defaults self.boxes = [] - self.do_tracker = False + self.do_tracker_qemus = False + self.do_tracker_plcs = False + self.load_cache() def fqdn (self, box): return "%s.%s"%(box,self.domain) + + ssh_command=['ssh','-o','ConnectTimeout=3'] @staticmethod def root (box): return "root@%s"%box + @staticmethod + def ssh(box): + return BuildBoxes.ssh_command + [ BuildBoxes.root(box) ] + def header (self,message): print "===============",message + sys.stdout.flush() - def run (self,argv,message): + def run (self,argv,message, trash_err=False): if self.options.dry_run: print 'DRY_RUN:', print " ".join(argv) + return 0 else: if message: self.header(message) - subprocess.call(argv) + if not trash_err: + return subprocess.call(argv) + else: + return subprocess.call(argv,stderr=file('/dev/null','w')) - def backquote (self, argv): - return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0] + def run_ssh (self, box, argv, message, trash_err=False): + result=self.run (self.ssh(box) + argv, message, trash_err) + if result!=0: + print "WARNING: failed to run %s on %s"%(" ".join(argv),box) + return result + + def backquote (self, argv, trash_err=False): + if not trash_err: + return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0] + else: + return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0] + + def backquote_ssh (self, box, argv, trash_err=False): + # first probe the ssh link + hostname=self.backquote ( self.ssh(box) + [ "hostname"], trash_err=True ) + if not hostname: + print "%s unreachable"%self.root(box) + return '' + else: + return self.backquote( ['ssh',self.root(box)] + argv, trash_err) def reboot (self,box): command=['ssh',self.root(box),'shutdown','-r','now'] self.run (command,"Rebooting %s"%box) - def handle_trackers (self): + def handle_tracker_plcs (self): box = self.fqdn (self.testmaster) - if self.options.probe: - command=['ssh',self.root(box),"head","-100","tracker*"] - self.run(command,"Inspecting trackers on %s"%box) + filename="tracker-plcs" + if not self.options.probe: + command=["rm","-rf",filename] + self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box)) else: - command=['ssh',self.root(box),"rm","-rf","tracker*"] - self.run(command,"Cleaning up trackers on %s"%box) + self.header ("++++++++++ Inspecting %s on %s"%(filename,box)) + read_command = ["cat",filename] + trackers=self.backquote_ssh(box,read_command) + for tracker in trackers.split('\n'): + if not tracker: continue + try: + tracker=tracker.strip() + [hostname,buildname]=tracker.split('@') + [left,plcname]=buildname.rsplit('-',1) + print self.margin_outline(plcname),tracker + except: + print self.margin(""),tracker + + def handle_tracker_qemus (self): + box = self.fqdn (self.testmaster) + filename="tracker-qemus" + if not self.options.probe: + command=["rm","-rf",filename] + self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box)) + else: + self.header ("++++++++++ Inspecting %s on %s"%(filename,box)) + read_command = ["cat",filename] + trackers=self.backquote_ssh(box,read_command) + for tracker in trackers.split('\n'): + if not tracker: continue + try: + tracker=tracker.strip() + [hostname,buildname,nodename]=tracker.split('@') + nodename=nodename.split('.')[0] + print self.margin_outline(nodename),tracker + except: + print self.margin(""),tracker def handle_build_box (self,box): if not self.options.probe: self.reboot(box) else: - command=['ssh',self.root(box),'pgrep','build'] + command=['uptime'] + uptime=self.backquote_ssh(box,command,True).strip() + + command=['pgrep','build'] + if self.options.dry_run: + self.run_ssh(box,command,None) + else: + pids=self.backquote_ssh(box,command,True) + if not pids: + self.header ('No build process on %s (%s)'%(box,uptime)) + else: + command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] + self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True) + + # this one is more accurate as it locates processes in the vservers as well + # but it's so sloooowww + def handle_build_box_deep (self,box): + if not self.options.probe: + self.reboot(box) + else: + command=['uptime'] + uptime=self.backquote_ssh(box,command,True).strip() + + command=['vps','-e'] if self.options.dry_run: - self.run(command,None) + self.run_ssh(box,command,None) else: - pids=self.backquote(command) + # simulate grep vbuild + vps_lines=[ line for line in self.backquote_ssh(box,command,True).split("\n") + if line.find('vbuild') >= 0] + pids=[ line.split()[0] for line in vps_lines ] if not pids: - self.header ('No build process on %s'%box) + self.header ('No build process on %s (%s)'%(box,uptime)) else: - command=['ssh',self.root(box),'ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] - self.run(command,"Active build processes on %s"%box) + command=['vps','-o','pid,command'] + pids + self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True) + vplc_matcher = re.compile(".*(vplc[0-9]+$)") def vplcname (self, vservername): @@ -75,42 +184,56 @@ class BuildBoxes: if match: return match.groups(0) else: return "" + margin_format="%-14s" + def margin(self,string): return self.margin_format%string + def outline (self, string): return '== %s =='%string + def margin_outline (self, string): return self.margin(self.outline(string)) + def handle_plc_box (self,box): +# initial approach was to first scan vserver-stat, but it's not needed if not self.options.probe: - self.reboot(box) - else: - command=['ssh',self.root(box),'vserver-stat'] - if self.options.dry_run: - self.run(command,"Active vservers on %s"%box) +# # remove mark for all running servers to avoid resurrection +# if vserver_names: +# bash="; ".join( [ "rm -f /etc/vservers/%s/apps/init/mark"%vs for vs in vserver_names ] ) +# stop_command=['bash','-c',"'" + bash + "'"] +# self.run_ssh(box,stop_command,"Removing mark on running vservers on %s"%box) + # just trash all marks + stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark'] + self.run_ssh(box,stop_command,"Removing all vserver marks on %s"%box) + if not self.options.soft: + self.reboot(box) else: - # try to find fullname (vserver_stat truncates to a ridiculously short name) - try: - self.header ("vserver map on %s"%box) - # fetch the contexts for all vservers on that box - map_command=['ssh',self.root(box),'grep','.','/etc/vservers/*/context','/dev/null',] - context_map=self.backquote (map_command) - # at this point we have a set of lines like - # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144 - ctx_dict={} - for map_line in context_map.split("\n"): - if not map_line: continue - [path,xid] = map_line.split(':') - ctx_dict[xid]=os.path.basename(os.path.dirname(path)) - # at this point ctx_id maps context id to vservername - - vserver_stat = self.backquote (command) - for vserver_line in vserver_stat.split("\n"): - if not vserver_line: continue - context=vserver_line.split()[0] - if context=="CTX": - print vserver_line - continue - longname=ctx_dict[context] - plcname=self.vplcname(longname) - if plcname: print "== %s =="%plcname - print "%(vserver_line)s [=%(longname)s]"%locals() - except: - self.run(command,"Fine-grained method failed - fallback to plain vserver-stat") + self.run_ssh(box,['service','util-vserver','stop'],"Stopping all running vservers") + return + # even for rebooting we need to scan vserver-stat to stop the vservers properly + vserver_names=[] + command=['vserver-stat'] + if self.options.dry_run: + self.run_ssh(box,command,"Active vservers on %s"%box) + # try to find fullname (vserver_stat truncates to a ridiculously short name) + self.header ("vserver map on %s"%box) + # fetch the contexts for all vservers on that box + map_command=['grep','.','/etc/vservers/*/context','/dev/null',] + context_map=self.backquote_ssh (box,map_command) + # at this point we have a set of lines like + # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144 + ctx_dict={} + for map_line in context_map.split("\n"): + if not map_line: continue + [path,xid] = map_line.split(':') + ctx_dict[xid]=os.path.basename(os.path.dirname(path)) + # at this point ctx_id maps context id to vservername + + vserver_stat = self.backquote_ssh (box,command) + for vserver_line in vserver_stat.split("\n"): + if not vserver_line: continue + context=vserver_line.split()[0] + if context=="CTX": + print self.margin(""),vserver_line + continue + longname=ctx_dict[context] + vserver_names.append(longname) + print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals() vnode_matcher = re.compile(".*(vnode[0-9]+)") def vnodename (self, ps_line): @@ -118,48 +241,99 @@ class BuildBoxes: if match: return match.groups(0) else: return "" - def handle_qemu_box (self,box): if not self.options.probe: - self.reboot(box) + if not self.options.soft: + self.reboot(box) + else: + self.run_ssh(box,['pkill','qemu'],"Killing qemu instances") else: - command=['ssh',self.root(box),'pgrep','qemu'] + command=['lsmod'] + modules=self.backquote_ssh(box,command).split('\n') + kqemu_msg='*NO kqemu/kmv_intel MODULE LOADED*' + for module in modules: + if module.find('kqemu')==0: + kqemu_msg='kqemu module loaded' + # kvm might be loaded without vkm_intel (we dont have AMD) + elif module.find('kvm_intel')==0: + kqemu_msg='kvm_intel module loaded' + + command=['pgrep','qemu'] if self.options.dry_run: - self.run(command,None) + self.run_ssh(box,command,None) else: - pids=self.backquote(command) + pids=self.backquote_ssh(box,command) if not pids: - self.header ('No qemu process on %s'%box) + self.header ('No qemu process on %s (%s)'%(box,kqemu_msg)) else: - self.header ("Active qemu processes on %s"%box) - command=['ssh',self.root(box),'ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] - ps_lines = self.backquote (command).split("\n") + self.header ("Active qemu processes on %s (%s)"%(box,kqemu_msg)) + command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] + ps_lines = self.backquote_ssh (box,command).split("\n") for ps_line in ps_lines: if not ps_line or ps_line.find('PID') >=0 : continue - node=self.vnodename(ps_line) - if node: print "== %s =="%node - print ps_line + print self.margin_outline(self.vnodename(ps_line)), ps_line - def handle_box(self,box): + # the ouput of ps -o pid,command gives us bash /run_log + def testmaster_buildname (self, ps_line): + chunks=ps_line.split() + path=chunks[2] + [buildname,command]=path.split('/') + return buildname + + def handle_testmaster_box (self, box): + if not self.options.probe: + pass + else: + command=['pgrep','run_log'] + if self.options.dry_run: + self.run_ssh(box,command,None) + else: + pids=self.backquote_ssh(box,command) + if not pids: + self.header ('No run_log process on %s'%box) + else: + self.header ("Active run_log processes on %s"%(box)) + command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] + ps_lines = self.backquote_ssh (box,command).split("\n") + for ps_line in ps_lines: + if not ps_line or ps_line.find('PID') >=0 : continue + print self.margin_outline(self.testmaster_buildname(ps_line)), ps_line + + + def handle_box(self,box,type): if box in self.qemu_boxes: - self.handle_qemu_box(self.fqdn(box)) + if type=="qemu": self.handle_qemu_box(self.fqdn(box)) elif box in self.plc_boxes: - self.handle_plc_box(self.fqdn(box)) - else: - self.handle_build_box(self.fqdn(box)) + if type=="plc": self.handle_plc_box(self.fqdn(box)) + elif box in self.testmaster_boxes: + if type=='testmaster': self.handle_testmaster_box(self.fqdn(box)) + elif type=="build": + if self.options.deep: + self.handle_build_box_deep(self.fqdn(box)) + else: + self.handle_build_box(self.fqdn(box)) + + def handle_disk (self,box): + box=self.fqdn(box) + return self.run_ssh(box,["df","-h",],"Disk space on %s"%box) def main (self): usage="""%prog [options] [hostname..(s)] -Default is to act on test boxes only (with trackers clean)""" +Default is to act on test boxes only""" parser = OptionParser (usage=usage) parser.add_option ("-n","--dry-run",action="store_true",dest="dry_run",default=False, help="Dry run") parser.add_option ("-r","--reboot", action="store_false",dest="probe",default=True, help="Actually reset/reboot stuff instead of just probing it") + parser.add_option ("-s","--soft",action="store_true",dest="soft",default=False, + help="Soft reset instead of hard reboot of the boxes") # no need for -p = probe, as this is the default parser.add_option ("-p","--plc", action="store_true",dest="plc_only",default=False, help="Acts on the plc box only") + parser.add_option ("-e","--deep",action="store_true", dest="deep", default=False, + help="on build boxes, shows vbuild processes in vservers as well; signif. slower") + parser.add_option ("-a","--all",action="store_true",dest="all_boxes",default=False, help="Acts on build and test boxes") parser.add_option ("-b","--build",action="store_true",dest="build_only",default=False, @@ -168,39 +342,70 @@ Default is to act on test boxes only (with trackers clean)""" help="Only acts on the qemu boxes") parser.add_option ("-t","--trackers",action="store_true",dest="trackers_only",default=False, help="Only wipes trackers") + parser.add_option ("-m","--master",action="store_true",dest="testmaster_only",default=False, + help="Display the testmaster status") + parser.add_option ("-d","--disk",action="store_true",dest="show_disk",default=False, + help="Only inspects disk status") + parser.add_option ("-c","--refresh-cache",action="store_true",dest="refresh_cache", default=False, + help="Refresh cached list of qemu boxes at testmaster - implies -q") (self.options,args) = parser.parse_args() + # -c implies -q + if self.options.refresh_cache: + self.options.qemu_only=True + self.refresh_cache() + # use given hostnames if provided if args: self.boxes=args # if hostnames are specified, let's stay on the safe side and don't reset trackers - self.do_tracker = False + self.do_tracker_plcs = False + self.do_tracker_qemus = False elif self.options.all_boxes: - self.boxes=self.test_boxes + self.build_boxes - self.do_tracker = True + self.boxes=self.test_boxes + self.build_boxes + self.testmaster_boxes + self.do_tracker_plcs = True + self.do_tracker_qemus = True elif self.options.build_only: self.boxes=self.build_boxes - self.do_tracker = False + self.do_tracker_plcs = False + self.do_tracker_qemus = False elif self.options.qemu_only: self.boxes=self.qemu_boxes - self.do_tracker = False + self.do_tracker_plcs = False + self.do_tracker_qemus = True elif self.options.plc_only: self.boxes=self.plc_boxes - self.do_tracker = False + self.do_tracker_plcs = True + self.do_tracker_qemus = False + elif self.options.testmaster_only: + self.boxes=self.testmaster_boxes + self.do_tracker_plcs = False + self.do_tracker_qemus = False elif self.options.trackers_only: self.boxes = [] - self.do_tracker = True + self.do_tracker_plcs = True + self.do_tracker_qemus = True # default else: self.boxes = self.test_boxes - self.do_tracker = True + self.do_tracker_plcs = True + self.do_tracker_qemus = True - if self.do_tracker: - self.handle_trackers () - for box in self.boxes: - self.handle_box (box) + if self.options.show_disk: + for box in self.boxes: self.handle_disk(box) + return + # PLCS + if self.do_tracker_plcs:self.handle_tracker_plcs () + for box in self.boxes: self.handle_box (box,"plc") + # QEMU + if self.do_tracker_qemus:self.handle_tracker_qemus () + for box in self.boxes: self.handle_box (box,"qemu") + # ALL OTHERS + for box in self.boxes: self.handle_box (box,"build") + # TESTMASTER + for box in self.boxes: self.handle_box (box,"testmaster") if __name__ == "__main__": BuildBoxes().main()