outline boxes that are down
author thierry <thierry@41d37cc5-eb28-0410-a9bf-d37491348ade>
Fri, 26 Feb 2010 09:12:21 +0000 (09:12 +0000)
committer thierry <thierry@41d37cc5-eb28-0410-a9bf-d37491348ade>
Fri, 26 Feb 2010 09:12:21 +0000 (09:12 +0000)
scripts/manage-infrastructure.py

index 2138cae..bdd434e 100755 (executable)
@@ -25,9 +25,15 @@ class BuildBoxes:
 
     def fqdn (self, box):
         return "%s.%s"%(box,self.domain)
+
+    ssh_command=['ssh','-o','ConnectTimeout=3']
     @staticmethod
     def root (box): return "root@%s"%box
 
+    @staticmethod
+    def ssh(box):
+        return BuildBoxes.ssh_command + [ BuildBoxes.root(box) ]
+
     def header (self,message):
         print "===============",message
 
@@ -35,21 +41,35 @@ class BuildBoxes:
         if self.options.dry_run:
             print 'DRY_RUN:',
             print " ".join(argv)
+            return 0
         else:
             if message: self.header(message)
             if not trash_err:
-                subprocess.call(argv)
+                return subprocess.call(argv)
             else:
-                subprocess.call(argv,stderr=file('/dev/null','w'))
+                return subprocess.call(argv,stderr=file('/dev/null','w'))
                 
+    def run_ssh (self, box, argv, message, trash_err=False):
+        result=self.run (self.ssh(box) + argv, message, trash_err)
+        if result!=0:
+            print "WARNING: failed to run %s on %s"%(" ".join(argv),box)
+        return result
+
     def backquote (self, argv, trash_err=False):
         if not trash_err:
             return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
         else:
-            null = open('/dev/null','w')
-            result = subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=null).communicate()[0]
-            null.close()
-            return result
+            return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
+
+    def backquote_ssh (self, box, argv, trash_err=False):
+#        print 'BACKQUOTE_SSH [%s] %s'%(box,' '.join(argv))
+        # first probe the ssh link
+        hostname=self.backquote ( self.ssh(box) + [ "hostname"], trash_err=True )
+        if not hostname:
+            print "%s unreachable"%self.root(box)
+            return ''
+        else:
+            return self.backquote( ['ssh',self.root(box)] + argv, trash_err)
 
     def reboot (self,box):
         command=['ssh',self.root(box),'shutdown','-r','now']
@@ -59,12 +79,12 @@ class BuildBoxes:
         box = self.fqdn (self.testmaster)
         filename="tracker-plcs"
         if not self.options.probe:
-            command=['ssh',self.root(box),"rm","-rf",filename]
-            self.run(command,"Cleaning up %s on %s"%(filename,box))
+            command=["rm","-rf",filename]
+            self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
         else:
             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
-            read_command = ['ssh',self.root(box),"cat",filename]
-            trackers=self.backquote(read_command)
+            read_command = ["cat",filename]
+            trackers=self.backquote_ssh(box,read_command)
             for tracker in trackers.split('\n'):
                 if not tracker: continue
                 try:
@@ -79,12 +99,12 @@ class BuildBoxes:
         box = self.fqdn (self.testmaster)
         filename="tracker-qemus"
         if not self.options.probe:
-            command=['ssh',self.root(box),"rm","-rf",filename]
-            self.run(command,"Cleaning up %s on %s"%(filename,box))
+            command=["rm","-rf",filename]
+            self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
         else:
             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
-            read_command = ['ssh',self.root(box),"cat",filename]
-            trackers=self.backquote(read_command)
+            read_command = ["cat",filename]
+            trackers=self.backquote_ssh(box,read_command)
             for tracker in trackers.split('\n'):
                 if not tracker: continue
                 try:
@@ -99,19 +119,19 @@ class BuildBoxes:
         if not self.options.probe:
             self.reboot(box)
         else:
-            command=['ssh',self.root(box),'uptime']
-            uptime=self.backquote(command,True).strip()
+            command=['uptime']
+            uptime=self.backquote_ssh(box,command,True).strip()
 
-            command=['ssh',self.root(box),'pgrep','build']
+            command=['pgrep','build']
             if self.options.dry_run:
-                self.run(command,None)
+                self.run_ssh(box,command,None)
             else:
-                pids=self.backquote(command,True)
+                pids=self.backquote_ssh(box,command,True)
                 if not pids:
                     self.header ('No build process on %s (%s)'%(box,uptime))
                 else:
-                    command=['ssh',self.root(box),'ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
-                    self.run(command,"Active build processes on %s (%s)"%(box,uptime),True)
+                    command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
+                    self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
 
     vplc_matcher = re.compile(".*(vplc[0-9]+$)")
     def vplcname (self, vservername):
@@ -128,16 +148,16 @@ class BuildBoxes:
         if not self.options.probe:
             self.reboot(box)
         else:
-            command=['ssh',self.root(box),'vserver-stat']
+            command=['vserver-stat']
             if self.options.dry_run:
-                self.run(command,"Active vservers on %s"%box)
+                self.run_ssh(box,command,"Active vservers on %s"%box)
             else:
                 # try to find fullname (vserver_stat truncates to a ridiculously short name)
                 try:
                     self.header ("vserver map on %s"%box)
                     # fetch the contexts for all vservers on that box
-                    map_command=['ssh',self.root(box),'grep','.','/etc/vservers/*/context','/dev/null',]
-                    context_map=self.backquote (map_command)
+                    map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
+                    context_map=self.backquote_ssh (box,map_command)
                     # at this point we have a set of lines like
                     # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
                     ctx_dict={}
@@ -147,7 +167,7 @@ class BuildBoxes:
                         ctx_dict[xid]=os.path.basename(os.path.dirname(path))
                     # at this point ctx_id maps context id to vservername
 
-                    vserver_stat = self.backquote (command)
+                    vserver_stat = self.backquote_ssh (box,command)
                     for vserver_line in vserver_stat.split("\n"):
                         if not vserver_line: continue
                         context=vserver_line.split()[0]
@@ -157,7 +177,7 @@ class BuildBoxes:
                         longname=ctx_dict[context]
                         print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
                 except:
-                    self.run(command,"Fine-grained method failed - fallback to plain vserver-stat")
+                    self.run_ssh(box,command,"Fine-grained method failed - fallback to plain vserver-stat")
 
     vnode_matcher = re.compile(".*(vnode[0-9]+)")
     def vnodename (self, ps_line):
@@ -170,24 +190,24 @@ class BuildBoxes:
         if not self.options.probe:
             self.reboot(box)
         else:
-            command=['ssh',self.root(box),'lsmod']
-            modules=self.backquote(command).split('\n')
+            command=['lsmod']
+            modules=self.backquote_ssh(box,command).split('\n')
             kqemu_msg='*NO kqemu MODULE LOADED*'
             for module in modules:
                 if module.find('kqemu')==0:
                     kqemu_msg='kqemu OK'
             
-            command=['ssh',self.root(box),'pgrep','qemu']
+            command=['pgrep','qemu']
             if self.options.dry_run:
-                self.run(command,None)
+                self.run_ssh(box,command,None)
             else:
-                pids=self.backquote(command)
+                pids=self.backquote_ssh(box,command)
                 if not pids:
                     self.header ('No qemu process on %s'%box)
                 else:
                     self.header ("Active qemu processes on %s (%s)"%(box,kqemu_msg))
-                    command=['ssh',self.root(box),'ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
-                    ps_lines = self.backquote (command).split("\n")
+                    command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
+                    ps_lines = self.backquote_ssh (box,command).split("\n")
                     for ps_line in ps_lines:
                         if not ps_line or ps_line.find('PID') >=0 : continue
                         print self.margin_outline(self.vnodename(ps_line)), ps_line