when rebooting, remove all marks, not only for the running vservers
[infrastructure.git] / scripts / manage-infrastructure.py
index a994aef..4709c15 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 
-import os.path
+import os.path, sys
 import re
 import subprocess
 from optparse import OptionParser
@@ -8,66 +8,175 @@ from optparse import OptionParser
 class BuildBoxes:
 
     # everything in the onelab.eu domain
-    domain = 'onelab.eu'
-    testmaster = 'testmaster'
+    domain = 'pl.sophia.inria.fr'
     build_boxes = [ "mirror", "liquid", "reed", "velvet", ]
     plc_boxes = [ "testplc" ]
-    qemu_boxes = \
-        [ "testqemu%d"%i for i in range (1,4) ] + \
-        [ "testqemu32-%d"%i for i in range (1,6) ]
-    test_boxes = plc_boxes + qemu_boxes
+    testmaster = 'testmaster'
+    testmaster_boxes = [ testmaster ]
+    # cache the list of qemu boxes in ~/.qemu-boxes
+    # this can be refreshed by running -c
+    qemu_boxes=[]
+
+    def cache_file (self): return os.path.expanduser("~/.qemu-boxes")
+
+    def load_cache (self):
+        cache=self.cache_file()
+        if os.path.isfile(cache):
+            self.qemu_boxes=file(cache).read().split()
+        self.test_boxes = self.plc_boxes + self.qemu_boxes
+
+    # run LocalTestResources on testmaster
+    def refresh_cache (self):
+        retrieved= \
+            self.backquote_ssh(self.fqdn(self.testmaster),['LocalTestResources.py'],trash_err=True)
+        remove="."+BuildBoxes.domain
+        retrieved = [ x.replace(remove,"").strip() for x in retrieved.split()]
+        self.qemu_boxes = retrieved
+        cache=self.cache_file()
+        file(cache,'w').write(' '.join(self.qemu_boxes)+'\n')
+        print "New contents of %s:"%cache
+        print file(cache).read(),
 
     def __init__ (self):
         # dummy defaults
         self.boxes = []
-        self.do_tracker = False
+        self.do_tracker_qemus = False
+        self.do_tracker_plcs = False
+        self.load_cache()
 
     def fqdn (self, box):
         return "%s.%s"%(box,self.domain)
+
+    ssh_command=['ssh','-o','ConnectTimeout=3']
     @staticmethod
     def root (box): return "root@%s"%box
 
+    @staticmethod
+    def ssh(box):
+        return BuildBoxes.ssh_command + [ BuildBoxes.root(box) ]
+
     def header (self,message):
         print "===============",message
+        sys.stdout.flush()
 
-    def run (self,argv,message):
+    def run (self,argv,message, trash_err=False):
         if self.options.dry_run:
             print 'DRY_RUN:',
             print " ".join(argv)
+            return 0
         else:
             if message: self.header(message)
-            subprocess.call(argv)
+            if not trash_err:
+                return subprocess.call(argv)
+            else:
+                return subprocess.call(argv,stderr=file('/dev/null','w'))
                 
-    def backquote (self, argv):
-        return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
+    def run_ssh (self, box, argv, message, trash_err=False):
+        result=self.run (self.ssh(box) + argv, message, trash_err)
+        if result!=0:
+            print "WARNING: failed to run %s on %s"%(" ".join(argv),box)
+        return result
+
+    def backquote (self, argv, trash_err=False):
+        if not trash_err:
+            return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
+        else:
+            return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
+
+    def backquote_ssh (self, box, argv, trash_err=False):
+        # first probe the ssh link
+        hostname=self.backquote ( self.ssh(box) + [ "hostname"], trash_err=True )
+        if not hostname:
+            print "%s unreachable"%self.root(box)
+            return ''
+        else:
+            return self.backquote( ['ssh',self.root(box)] + argv, trash_err)
 
     def reboot (self,box):
         command=['ssh',self.root(box),'shutdown','-r','now']
         self.run (command,"Rebooting %s"%box)
 
-    def handle_trackers (self):
+    def handle_tracker_plcs (self):
+        box = self.fqdn (self.testmaster)
+        filename="tracker-plcs"
+        if not self.options.probe:
+            command=["rm","-rf",filename]
+            self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
+        else:
+            self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
+            read_command = ["cat",filename]
+            trackers=self.backquote_ssh(box,read_command)
+            for tracker in trackers.split('\n'):
+                if not tracker: continue
+                try:
+                    tracker=tracker.strip()
+                    [hostname,buildname]=tracker.split('@')
+                    [left,plcname]=buildname.rsplit('-',1)
+                    print self.margin_outline(plcname),tracker
+                except:
+                    print self.margin(""),tracker
+
+    def handle_tracker_qemus (self):
         box = self.fqdn (self.testmaster)
-        if self.options.probe:
-            command=['ssh',self.root(box),"head","-100","tracker*"]
-            self.run(command,"Inspecting trackers on %s"%box)
+        filename="tracker-qemus"
+        if not self.options.probe:
+            command=["rm","-rf",filename]
+            self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
         else:
-            command=['ssh',self.root(box),"rm","-rf","tracker*"]
-            self.run(command,"Cleaning up trackers on %s"%box)
+            self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
+            read_command = ["cat",filename]
+            trackers=self.backquote_ssh(box,read_command)
+            for tracker in trackers.split('\n'):
+                if not tracker: continue
+                try:
+                    tracker=tracker.strip()
+                    [hostname,buildname,nodename]=tracker.split('@')
+                    nodename=nodename.split('.')[0]
+                    print self.margin_outline(nodename),tracker
+                except:
+                    print self.margin(""),tracker
 
     def handle_build_box (self,box):
         if not self.options.probe:
             self.reboot(box)
         else:
-            command=['ssh',self.root(box),'pgrep','build']
+            command=['uptime']
+            uptime=self.backquote_ssh(box,command,True).strip()
+
+            command=['pgrep','build']
             if self.options.dry_run:
-                self.run(command,None)
+                self.run_ssh(box,command,None)
             else:
-                pids=self.backquote(command)
+                pids=self.backquote_ssh(box,command,True)
                 if not pids:
-                    self.header ('No build process on %s'%box)
+                    self.header ('No build process on %s (%s)'%(box,uptime))
                 else:
-                    command=['ssh',self.root(box),'ps'] + [ pid for pid in pids.split("\n") if pid]
-                    self.run(command,"Active build processes on %s"%box)
+                    command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
+                    self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
+
+    # this variant is more accurate, as it also locates build processes inside the vservers,
+    # but it is significantly slower
+    def handle_build_box_deep (self,box):
+        if not self.options.probe:
+            self.reboot(box)
+        else:
+            command=['uptime']
+            uptime=self.backquote_ssh(box,command,True).strip()
+
+            command=['vps','-e']
+            if self.options.dry_run:
+                self.run_ssh(box,command,None)
+            else:
+                # simulate grep vbuild
+                vps_lines=[ line for line in self.backquote_ssh(box,command,True).split("\n")
+                            if line.find('vbuild') >= 0]
+                pids=[ line.split()[0] for line in vps_lines ]
+                if not pids:
+                    self.header ('No build process on %s (%s)'%(box,uptime))
+                else:
+                    command=['vps','-o','pid,command'] + pids
+                    self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
+
 
     vplc_matcher = re.compile(".*(vplc[0-9]+$)")
     def vplcname (self, vservername):
@@ -75,42 +184,51 @@ class BuildBoxes:
         if match: return match.groups(0)
         else: return ""
 
+    margin_format="%-14s"
+    def margin(self,string): return self.margin_format%string
+    def outline (self, string): return '== %s =='%string
+    def margin_outline (self, string): return self.margin(self.outline(string))
+
     def handle_plc_box (self,box):
+        # even for rebooting we need to scan vserver-stat to stop the vservers properly
+        vserver_names=[]
+        command=['vserver-stat']
+        if self.options.dry_run:
+            self.run_ssh(box,command,"Active vservers on %s"%box)
+        # try to find the full name (vserver-stat truncates it to a ridiculously short one)
+        self.header ("vserver map on %s"%box)
+        # fetch the contexts for all vservers on that box
+        map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
+        context_map=self.backquote_ssh (box,map_command)
+        # at this point we have a set of lines like
+        # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
+        ctx_dict={}
+        for map_line in context_map.split("\n"):
+            if not map_line: continue
+            [path,xid] = map_line.split(':')
+            ctx_dict[xid]=os.path.basename(os.path.dirname(path))
+        # at this point ctx_dict maps each context id (xid) to its vserver name
+
+        vserver_stat = self.backquote_ssh (box,command)
+        for vserver_line in vserver_stat.split("\n"):
+            if not vserver_line: continue
+            context=vserver_line.split()[0]
+            if context=="CTX": 
+                print self.margin(""),vserver_line
+                continue
+            longname=ctx_dict[context]
+            vserver_names.append(longname)
+            print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
         if not self.options.probe:
+#            # remove mark for all running servers to avoid resurrection
+#            if vserver_names:
+#                bash="; ".join( [ "rm -f /etc/vservers/%s/apps/init/mark"%vs for vs in vserver_names ] )
+#                stop_command=['bash','-c',"'" + bash + "'"]
+#                self.run_ssh(box,stop_command,"Removing mark on running vservers on %s"%box)
+            # just trash all marks 
+            stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark']
+            self.run_ssh(box,stop_command,"Removing all vserver marks on %s"%box)
             self.reboot(box)
-        else:
-            command=['ssh',self.root(box),'vserver-stat']
-            if self.options.dry_run:
-                self.run(command,"Active vservers on %s"%box)
-            else:
-                # try to find fullname (vserver_stat truncates to a ridiculously short name)
-                try:
-                    self.header ("vserver map on %s"%box)
-                    # fetch the contexts for all vservers on that box
-                    map_command=['ssh',self.root(box),'grep','.','/etc/vservers/*/context','/dev/null',]
-                    context_map=self.backquote (map_command)
-                    # at this point we have a set of lines like
-                    # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
-                    ctx_dict={}
-                    for map_line in context_map.split("\n"):
-                        if not map_line: continue
-                        [path,xid] = map_line.split(':')
-                        ctx_dict[xid]=os.path.basename(os.path.dirname(path))
-                    # at this point ctx_id maps context id to vservername
-
-                    vserver_stat = self.backquote (command)
-                    for vserver_line in vserver_stat.split("\n"):
-                        if not vserver_line: continue
-                        context=vserver_line.split()[0]
-                        if context=="CTX": 
-                            print vserver_line
-                            continue
-                        longname=ctx_dict[context]
-                        plcname=self.vplcname(longname)
-                        if plcname: print "== %s =="%plcname
-                        print "%(vserver_line)s [=%(longname)s]"%locals()
-                except:
-                    self.run(command,"Fine-grained method failed - fallback to plain vserver-stat")
 
     vnode_matcher = re.compile(".*(vnode[0-9]+)")
     def vnodename (self, ps_line):
@@ -118,39 +236,79 @@ class BuildBoxes:
         if match: return match.groups(0)
         else: return ""
 
-
     def handle_qemu_box (self,box):
         if not self.options.probe:
             self.reboot(box)
         else:
-            command=['ssh',self.root(box),'pgrep','qemu']
+            command=['lsmod']
+            modules=self.backquote_ssh(box,command).split('\n')
+            kqemu_msg='*NO kqemu MODULE LOADED*'
+            for module in modules:
+                if module.find('kqemu')==0:
+                    kqemu_msg='kqemu OK'
+            
+            command=['pgrep','qemu']
             if self.options.dry_run:
-                self.run(command,None)
+                self.run_ssh(box,command,None)
             else:
-                pids=self.backquote(command)
+                pids=self.backquote_ssh(box,command)
                 if not pids:
                     self.header ('No qemu process on %s'%box)
                 else:
-                    self.header ("Active qemu processes on %s"%box)
-                    command=['ssh',self.root(box),'ps'] + [ pid for pid in pids.split("\n") if pid]
-                    ps_lines = self.backquote (command).split("\n")
+                    self.header ("Active qemu processes on %s (%s)"%(box,kqemu_msg))
+                    command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
+                    ps_lines = self.backquote_ssh (box,command).split("\n")
                     for ps_line in ps_lines:
                         if not ps_line or ps_line.find('PID') >=0 : continue
-                        node=self.vnodename(ps_line)
-                        if node: print "== %s =="%node
-                        print ps_line
+                        print self.margin_outline(self.vnodename(ps_line)), ps_line
 
-    def handle_box(self,box):
+    # the output of ps -o pid,command gives us <pid> bash <buildname>/run_log
+    def testmaster_buildname (self, ps_line):
+        chunks=ps_line.split()
+        path=chunks[2]
+        [buildname,command]=path.split('/')
+        return buildname
+
+    def handle_testmaster_box (self, box):
+        if not self.options.probe: 
+            pass
+        else:
+            command=['pgrep','run_log']
+            if self.options.dry_run:
+                self.run_ssh(box,command,None)
+            else:
+                pids=self.backquote_ssh(box,command)
+                if not pids:
+                    self.header ('No run_log process on %s'%box)
+                else:
+                    self.header ("Active run_log processes on %s"%(box))
+                    command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
+                    ps_lines = self.backquote_ssh (box,command).split("\n")
+                    for ps_line in ps_lines:
+                        if not ps_line or ps_line.find('PID') >=0 : continue
+                        print self.margin_outline(self.testmaster_buildname(ps_line)), ps_line
+        
+
+    def handle_box(self,box,type):
         if box in self.qemu_boxes:
-            self.handle_qemu_box(self.fqdn(box))
+            if type=="qemu": self.handle_qemu_box(self.fqdn(box))
         elif box in self.plc_boxes:
-            self.handle_plc_box(self.fqdn(box))
-        else:
-            self.handle_build_box(self.fqdn(box))
+            if type=="plc":  self.handle_plc_box(self.fqdn(box))
+        elif box in self.testmaster_boxes:
+            if type=='testmaster': self.handle_testmaster_box(self.fqdn(box))
+        elif type=="build":
+            if self.options.deep:
+                self.handle_build_box_deep(self.fqdn(box))
+            else:
+                self.handle_build_box(self.fqdn(box))
+
+    def handle_disk (self,box):
+        box=self.fqdn(box)
+        return self.run_ssh(box,["df","-h",],"Disk space on %s"%box)
 
     def main (self):
         usage="""%prog [options] [hostname..(s)]
-Default is to act on test boxes only (with trackers clean)"""
+Default is to act on test boxes only"""
         parser = OptionParser (usage=usage)
         parser.add_option ("-n","--dry-run",action="store_true",dest="dry_run",default=False,
                            help="Dry run")
@@ -164,43 +322,76 @@ Default is to act on test boxes only (with trackers clean)"""
                            help="Acts on build and test boxes")
         parser.add_option ("-b","--build",action="store_true",dest="build_only",default=False,
                            help="Acts on build boxes only")
+        parser.add_option ("-e","--deep",action="store_true", dest="deep", default=False,
+                           help="on build boxes, shows vbuild processes in vservers as well; signif. slower")
         parser.add_option ("-q","--qemu",action="store_true",dest="qemu_only",default=False,
                            help="Only acts on the qemu boxes")
         parser.add_option ("-t","--trackers",action="store_true",dest="trackers_only",default=False,
                            help="Only wipes trackers")
+        parser.add_option ("-m","--master",action="store_true",dest="testmaster_only",default=False,
+                           help="Display the testmaster status")
+        parser.add_option ("-d","--disk",action="store_true",dest="show_disk",default=False,
+                           help="Only inspects disk status")
+        parser.add_option ("-c","--refresh-cache",action="store_true",dest="refresh_cache", default=False,
+                           help="Refresh cached list of qemu boxes at testmaster - implies -q")
 
         (self.options,args) = parser.parse_args()
 
+        # -c implies -q
+        if self.options.refresh_cache:
+            self.options.qemu_only=True
+            self.refresh_cache()
+
         # use given hostnames if provided
         if args:
             self.boxes=args
             # if hostnames are specified, let's stay on the safe side and don't reset trackers
-            self.do_tracker = False
+            self.do_tracker_plcs = False
+            self.do_tracker_qemus = False
         elif self.options.all_boxes:
-            self.boxes=self.test_boxes + self.build_boxes
-            self.do_tracker = True
+            self.boxes=self.test_boxes + self.build_boxes + self.testmaster_boxes
+            self.do_tracker_plcs = True
+            self.do_tracker_qemus = True
         elif self.options.build_only:
             self.boxes=self.build_boxes
-            self.do_tracker = False
+            self.do_tracker_plcs = False
+            self.do_tracker_qemus = False
         elif self.options.qemu_only:
             self.boxes=self.qemu_boxes
-            self.do_tracker = False
+            self.do_tracker_plcs = False
+            self.do_tracker_qemus = True
         elif self.options.plc_only:
             self.boxes=self.plc_boxes
-            self.do_tracker = False
+            self.do_tracker_plcs = True
+            self.do_tracker_qemus = False
+        elif self.options.testmaster_only:
+            self.boxes=self.testmaster_boxes
+            self.do_tracker_plcs = False
+            self.do_tracker_qemus = False
         elif self.options.trackers_only:
             self.boxes = []
-            self.do_tracker = True
+            self.do_tracker_plcs = True
+            self.do_tracker_qemus = True
         # default
         else:
             self.boxes = self.test_boxes
-            self.do_tracker = True
+            self.do_tracker_plcs = True
+            self.do_tracker_qemus = True
 
-        if self.do_tracker:
-            self.handle_trackers ()
-        for box in self.boxes:
-            self.handle_box (box)
+        if self.options.show_disk:
+            for box in self.boxes: self.handle_disk(box)
+            return
 
+        # PLCS
+        if self.do_tracker_plcs:self.handle_tracker_plcs ()
+        for box in self.boxes:  self.handle_box (box,"plc")
+        # QEMU
+        if self.do_tracker_qemus:self.handle_tracker_qemus ()
+        for box in self.boxes:  self.handle_box (box,"qemu")
+        # ALL OTHERS
+        for box in self.boxes:  self.handle_box (box,"build")
+        # TESTMASTER
+        for box in self.boxes:  self.handle_box (box,"testmaster")
 
 if __name__ == "__main__":
     BuildBoxes().main()