#!/usr/bin/python
"""Probe or reset the boxes of the build/test infrastructure over ssh.

By default the script only *probes* the boxes (running processes, vservers,
tracker files); with -r/--reboot it actually reboots / cleans them up.

The code is written so that it runs unchanged on Python 2 and Python 3:
every ``print`` call takes one single pre-formatted string, files are
opened with ``open`` and subprocess output is requested as text.
"""

import os.path
import re
import subprocess
import sys
from optparse import OptionParser


class Infrastructure(object):
    """Knows the boxes of the infrastructure and how to probe/reset them.

    ``self.options`` is set by :meth:`main` (optparse values); most methods
    read it, so they are meant to be called after option parsing.
    """

    # all the boxes live in this DNS domain
    domain = 'pl.sophia.inria.fr'

    build_boxes = ["devel", "liquid", "reed", "velvet", ]
    plc_boxes = ["testplc"]
    testmaster = 'testmaster'
    testmaster_boxes = [testmaster]

    # the list of qemu boxes is cached in ~/.qemu-boxes
    # this can be refreshed by running -c
    qemu_boxes = []

    ssh_command = ['ssh', '-o', 'ConnectTimeout=3']

    vplc_matcher = re.compile(r".*(vplc[0-9]+$)")
    vnode_matcher = re.compile(r".*(vnode[0-9]+)")

    margin_format = "%-14s"

    def __init__(self):
        # dummy defaults, overridden in main()
        self.boxes = []
        self.do_tracker_qemus = False
        self.do_tracker_plcs = False
        self.load_cache()

    # ---------------------------------------------------------------- cache
    def cache_file(self):
        """Return the path of the file caching the list of qemu boxes."""
        return os.path.expanduser("~/.qemu-boxes")

    def load_cache(self):
        """Load the cached qemu boxes (if any) and compute ``test_boxes``."""
        cache = self.cache_file()
        if os.path.isfile(cache):
            with open(cache) as stream:
                self.qemu_boxes = stream.read().split()
        self.test_boxes = self.plc_boxes + self.qemu_boxes

    def refresh_cache(self):
        """Run LocalTestResources.py on testmaster and rewrite the cache.

        When nothing could be retrieved (e.g. testmaster is unreachable) the
        existing cache and in-memory lists are left untouched instead of
        being replaced by an empty list.
        """
        retrieved = self.backquote_ssh(self.fqdn(self.testmaster),
                                       ['LocalTestResources.py'],
                                       trash_err=True)
        remove = "." + Infrastructure.domain
        boxes = [x.replace(remove, "").strip() for x in retrieved.split()]
        cache = self.cache_file()
        if not boxes:
            print("WARNING: no qemu box retrieved from %s - %s left untouched"
                  % (self.testmaster, cache))
            return
        self.qemu_boxes = boxes
        # keep the derived list in sync with the refreshed qemu boxes
        self.test_boxes = self.plc_boxes + self.qemu_boxes
        with open(cache, 'w') as stream:
            stream.write(' '.join(self.qemu_boxes) + '\n')
        print("New contents of %s:" % cache)
        with open(cache) as stream:
            # contents already end with a newline
            sys.stdout.write(stream.read())

    # ------------------------------------------------------------ utilities
    def fqdn(self, box):
        """Return the fully qualified name of a short box name."""
        return "%s.%s" % (box, self.domain)

    @staticmethod
    def root(box):
        """Return the ssh target for root on box."""
        return "root@%s" % box

    @staticmethod
    def ssh(box):
        """Return the argv prefix for running a command as root on box."""
        return Infrastructure.ssh_command + [Infrastructure.root(box)]

    def header(self, message):
        """Print a section header; flush so it shows before child output."""
        print("=============== %s" % message)
        sys.stdout.flush()

    def margin(self, string):
        """Left-justify string in the fixed-width margin column."""
        return self.margin_format % string

    def outline(self, string):
        return '== %s ==' % string

    def margin_outline(self, string):
        return self.margin(self.outline(string))

    def vplcname(self, vservername):
        """Return the trailing 'vplcNN' part of a vserver name, or ''."""
        match = self.vplc_matcher.match(vservername)
        return match.group(1) if match else ""

    def vnodename(self, ps_line):
        """Return the 'vnodeNN' found in a ps output line, or ''."""
        match = self.vnode_matcher.match(ps_line)
        return match.group(1) if match else ""

    def testmaster_buildname(self, ps_line):
        """Return the build name found in a 'ps -o pid,command' line.

        The third field is expected to look like <buildname>/run_log; an
        empty string is returned for lines that do not have that shape.
        """
        chunks = ps_line.split()
        if len(chunks) < 3:
            return ""
        return os.path.basename(os.path.dirname(chunks[2]))

    # ------------------------------------------------------ running commands
    def run(self, argv, message, trash_err=False):
        """Run argv and return its exit code.

        In dry-run mode the command is only printed and 0 is returned.
        message, when not empty, is printed as a header first.
        trash_err sends the command's stderr to the null device.
        """
        if self.options.dry_run:
            print('DRY_RUN: ' + " ".join(argv))
            return 0
        if message:
            self.header(message)
        if not trash_err:
            return subprocess.call(argv)
        with open(os.devnull, 'w') as devnull:
            return subprocess.call(argv, stderr=devnull)

    def run_ssh(self, box, argv, message, trash_err=False):
        """Run argv as root on box through ssh; warn on non-zero exit."""
        result = self.run(self.ssh(box) + argv, message, trash_err)
        if result != 0:
            print("WARNING: failed to run %s on %s" % (" ".join(argv), box))
        return result

    def backquote(self, argv, trash_err=False):
        """Run argv and return its standard output as text."""
        # universal_newlines makes Popen return str on python3 as well
        if not trash_err:
            child = subprocess.Popen(argv, stdout=subprocess.PIPE,
                                     universal_newlines=True)
            return child.communicate()[0]
        with open(os.devnull, 'w') as devnull:
            child = subprocess.Popen(argv, stdout=subprocess.PIPE,
                                     stderr=devnull, universal_newlines=True)
            return child.communicate()[0]

    def backquote_ssh(self, box, argv, trash_err=False):
        """Return the output of argv run as root on box, '' if unreachable."""
        # first probe the ssh link
        hostname = self.backquote(self.ssh(box) + ["hostname"], trash_err=True)
        if not hostname:
            print("%s unreachable" % self.root(box))
            return ''
        return self.backquote(self.ssh(box) + argv, trash_err)

    def reboot(self, box):
        """Reboot box right away."""
        command = self.ssh(box) + ['shutdown', '-r', 'now']
        self.run(command, "Rebooting %s" % box)

    # ------------------------------------------------- files on testmaster
    def _handle_tracker_file(self, filename, label_of=None):
        """Remove (reset mode) or display (probe mode) a file on testmaster.

        label_of, when given, maps the third '@'-separated field of each
        line to the name shown in the margin; lines that do not have three
        fields are shown with a blank margin.  Without label_of the file is
        simply cat'ed.
        """
        box = self.fqdn(self.testmaster)
        if not self.options.probe:
            self.run_ssh(box, ["rm", "-rf", filename],
                         "Cleaning up %s on %s" % (filename, box))
            return
        message = "++++++++++ Inspecting %s on %s" % (filename, box)
        read_command = ["cat", filename]
        if label_of is None:
            self.run_ssh(box, read_command, message)
            return
        self.header(message)
        for tracker in self.backquote_ssh(box, read_command).split('\n'):
            tracker = tracker.strip()
            if not tracker:
                continue
            fields = tracker.split('@')
            if len(fields) == 3:
                margin = self.margin_outline(label_of(fields[2]))
            else:
                margin = self.margin("")
            print("%s %s" % (margin, tracker))

    def handle_starting(self):
        """Clean up or display the 'starting' file on testmaster."""
        self._handle_tracker_file("starting")

    def handle_tracker_plcs(self):
        """Clean up or display 'tracker-plcs' (hostname@buildname@plcname)."""
        self._handle_tracker_file("tracker-plcs", lambda plcname: plcname)

    def handle_tracker_qemus(self):
        """Clean up or display 'tracker-qemus' (hostname@buildname@nodename)."""
        self._handle_tracker_file("tracker-qemus",
                                  lambda nodename: nodename.split('.')[0])

    # ------------------------------------------------------------ build boxes
    def handle_build_box(self, box):
        """Reboot a build box, or show its build processes when probing."""
        if not self.options.probe:
            self.reboot(box)
            return
        uptime = self.backquote_ssh(box, ['uptime'], True).strip()
        command = ['pgrep', 'build']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command, True).split()
        if not pids:
            self.header('No build process on %s (%s)' % (box, uptime))
        else:
            command = ['ps', '-o', 'pid,command'] + pids
            self.run_ssh(box, command,
                         "Active build processes on %s (%s)" % (box, uptime),
                         True)

    # this one is more accurate as it locates processes in the vservers as well
    # but it's so sloooowww
    def handle_build_box_deep(self, box):
        """Same as handle_build_box, based on 'vps' instead of pgrep/ps."""
        if not self.options.probe:
            self.reboot(box)
            return
        uptime = self.backquote_ssh(box, ['uptime'], True).strip()
        command = ['vps', '-e']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        # simulate grep vbuild
        vps_lines = [line
                     for line in self.backquote_ssh(box, command, True).split("\n")
                     if 'vbuild' in line]
        pids = [line.split()[0] for line in vps_lines]
        if not pids:
            self.header('No build process on %s (%s)' % (box, uptime))
        else:
            command = ['vps', '-o', 'pid,command'] + pids
            self.run_ssh(box, command,
                         "Active build processes on %s (%s)" % (box, uptime),
                         True)

    # -------------------------------------------------------------- plc boxes
    def handle_plc_box(self, box):
        """Reset a plc box, or list its running vservers when probing."""
        if not self.options.probe:
            # just trash all marks, so that no vserver gets resurrected
            stop_command = ['rm', '-rf', '/etc/vservers/*/apps/init/mark']
            self.run_ssh(box, stop_command,
                         "Removing all vserver marks on %s" % box)
            if not self.options.soft:
                self.reboot(box)
            else:
                self.run_ssh(box, ['service', 'util-vserver', 'stop'],
                             "Stopping all running vservers")
            return
        command = ['vserver-stat']
        # NOTE(review): as far as the original layout can be told, dry-run
        # only adds this line and the read-only probing below still runs
        if self.options.dry_run:
            self.run_ssh(box, command, "Active vservers on %s" % box)
        # try to find fullname (vserver_stat truncates to a ridiculously short name)
        self.header("vserver map on %s" % box)
        # fetch the contexts for all vservers on that box
        map_command = ['grep', '.', '/etc/vservers/*/context', '/dev/null', ]
        context_map = self.backquote_ssh(box, map_command)
        # at this point we have a set of lines like
        # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
        ctx_dict = {}
        for map_line in context_map.split("\n"):
            path, colon, xid = map_line.rpartition(':')
            if not colon:
                # empty or unexpected line
                continue
            ctx_dict[xid.strip()] = os.path.basename(os.path.dirname(path))
        # at this point ctx_dict maps context id to vservername
        vserver_stat = self.backquote_ssh(box, command)
        for vserver_line in vserver_stat.split("\n"):
            fields = vserver_line.split()
            if not fields:
                continue
            longname = ctx_dict.get(fields[0])
            if longname is None:
                # the CTX heading line, or a context missing from the map
                print("%s %s" % (self.margin(""), vserver_line))
                continue
            print("%s %s [=%s]" % (self.margin_outline(self.vplcname(longname)),
                                   vserver_line, longname))

    # ------------------------------------------------------------- qemu boxes
    def handle_qemu_box(self, box):
        """Reset a qemu box, or show its qemu processes when probing."""
        if not self.options.probe:
            if not self.options.soft:
                self.reboot(box)
            else:
                self.run_ssh(box, ['pkill', 'qemu'], "Killing qemu instances")
            return
        modules = self.backquote_ssh(box, ['lsmod']).split('\n')
        kqemu_msg = '*NO kqemu/kmv_intel MODULE LOADED*'
        for module in modules:
            if module.startswith('kqemu'):
                kqemu_msg = 'kqemu module loaded'
            # kvm might be loaded without kvm_intel (we dont have AMD)
            elif module.startswith('kvm_intel'):
                kqemu_msg = 'kvm_intel module loaded'
        command = ['pgrep', 'qemu']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command).split()
        if not pids:
            self.header('No qemu process on %s (%s)' % (box, kqemu_msg))
            return
        self.header("Active qemu processes on %s (%s)" % (box, kqemu_msg))
        command = ['ps', '-o', 'pid,command'] + pids
        for ps_line in self.backquote_ssh(box, command).split("\n"):
            if not ps_line or 'PID' in ps_line:
                continue
            print("%s %s" % (self.margin_outline(self.vnodename(ps_line)),
                             ps_line))

    # -------------------------------------------------------------- testmaster
    def handle_testmaster_box(self, box):
        """Show the run_log processes on testmaster (probe mode only)."""
        if not self.options.probe:
            return
        command = ['pgrep', 'run_log']
        if self.options.dry_run:
            self.run_ssh(box, command, None)
            return
        pids = self.backquote_ssh(box, command).split()
        if not pids:
            self.header('No run_log process on %s' % box)
            return
        self.header("Active run_log processes on %s" % (box))
        command = ['ps', '-o', 'pid,command'] + pids
        for ps_line in self.backquote_ssh(box, command).split("\n"):
            if not ps_line or 'PID' in ps_line:
                continue
            print("%s %s" % (
                self.margin_outline(self.testmaster_buildname(ps_line)),
                ps_line))

    # ---------------------------------------------------------------- dispatch
    def handle_box(self, box, type):
        """Handle box if it is of the given type, do nothing otherwise.

        type is one of "plc", "qemu", "testmaster" or "build"; a box that
        is in none of the known lists is considered a build box.
        """
        if box in self.qemu_boxes:
            if type == "qemu":
                self.handle_qemu_box(self.fqdn(box))
        elif box in self.plc_boxes:
            if type == "plc":
                self.handle_plc_box(self.fqdn(box))
        elif box in self.testmaster_boxes:
            if type == 'testmaster':
                self.handle_testmaster_box(self.fqdn(box))
        elif type == "build":
            if self.options.deep:
                self.handle_build_box_deep(self.fqdn(box))
            else:
                self.handle_build_box(self.fqdn(box))

    def handle_disk(self, box):
        """Show disk usage on box; return the exit code of df."""
        box = self.fqdn(box)
        return self.run_ssh(box, ["df", "-h", ], "Disk space on %s" % box)

    # -------------------------------------------------------------------- main
    def main(self):
        """Parse the command line and act on the selected boxes."""
        usage = """%prog [options] [hostname..(s)]
Default is to act on test boxes only"""
        parser = OptionParser(usage=usage)
        parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run", default=False,
                          help="Dry run")
        parser.add_option("-r", "--reboot", action="store_false", dest="probe", default=True,
                          help="Actually reset/reboot stuff instead of just probing it")
        parser.add_option("-s", "--soft", action="store_true", dest="soft", default=False,
                          help="Soft reset instead of hard reboot of the boxes")
        # no need for -p = probe, as this is the default
        parser.add_option("-p", "--plc", action="store_true", dest="plc_only", default=False,
                          help="Acts on the plc box only")
        parser.add_option("-e", "--deep", action="store_true", dest="deep", default=False,
                          help="on build boxes, shows vbuild processes in vservers as well; signif. slower")
        parser.add_option("-a", "--all", action="store_true", dest="all_boxes", default=False,
                          help="Acts on build and test boxes")
        parser.add_option("-b", "--build", action="store_true", dest="build_only", default=False,
                          help="Acts on build boxes only")
        parser.add_option("-q", "--qemu", action="store_true", dest="qemu_only", default=False,
                          help="Only acts on the qemu boxes")
        parser.add_option("-t", "--trackers", action="store_true", dest="trackers_only", default=False,
                          help="Only wipes trackers")
        parser.add_option("-m", "--master", action="store_true", dest="testmaster_only", default=False,
                          help="Display the testmaster status")
        parser.add_option("-d", "--disk", action="store_true", dest="show_disk", default=False,
                          help="Only inspects disk status")
        parser.add_option("-c", "--refresh-cache", action="store_true", dest="refresh_cache", default=False,
                          help="Refresh cached list of qemu boxes at testmaster - implies -q")
        (self.options, args) = parser.parse_args()

        # -c implies -q
        if self.options.refresh_cache:
            self.options.qemu_only = True
            self.refresh_cache()

        # use given hostnames if provided
        if args:
            self.boxes = args
            # if hostnames are specified, let's stay on the safe side and don't reset trackers
            self.do_tracker_plcs = False
            self.do_tracker_qemus = False
        elif self.options.all_boxes:
            self.boxes = self.test_boxes + self.build_boxes + self.testmaster_boxes
            self.do_tracker_plcs = True
            self.do_tracker_qemus = True
        elif self.options.build_only:
            self.boxes = self.build_boxes
            self.do_tracker_plcs = False
            self.do_tracker_qemus = False
        elif self.options.qemu_only:
            self.boxes = self.qemu_boxes
            self.do_tracker_plcs = False
            self.do_tracker_qemus = True
        elif self.options.plc_only:
            self.boxes = self.plc_boxes
            self.do_tracker_plcs = True
            self.do_tracker_qemus = False
        elif self.options.testmaster_only:
            self.boxes = self.testmaster_boxes
            self.do_tracker_plcs = False
            self.do_tracker_qemus = False
        elif self.options.trackers_only:
            self.boxes = []
            self.do_tracker_plcs = True
            self.do_tracker_qemus = True
        # default
        else:
            self.boxes = self.test_boxes
            self.do_tracker_plcs = True
            self.do_tracker_qemus = True

        if self.options.show_disk:
            for box in self.boxes:
                self.handle_disk(box)
            return

        # PLCS
        if self.do_tracker_plcs:
            self.handle_tracker_plcs()
            # NOTE(review): the source layout does not show whether this call
            # was guarded by do_tracker_plcs; the less destructive reading
            # is used here - confirm
            self.handle_starting()
        for box in self.boxes:
            self.handle_box(box, "plc")
        # QEMU
        if self.do_tracker_qemus:
            self.handle_tracker_qemus()
        for box in self.boxes:
            self.handle_box(box, "qemu")
        # ALL OTHERS
        for box in self.boxes:
            self.handle_box(box, "build")
        # TESTMASTER
        for box in self.boxes:
            self.handle_box(box, "testmaster")


if __name__ == "__main__":
    Infrastructure().main()