scripts/manage-infrastructure.py

   1 #!/usr/bin/python
   2
   3 import os.path, sys
   4 import re
   5 import subprocess
   6 from optparse import OptionParser
   7
   8 class BuildBoxes:
   9
  10     # everything in the onelab.eu domain
  11     domain = 'pl.sophia.inria.fr'
  12     testmaster = 'testmaster'
  13     build_boxes = [ "mirror", "liquid", "reed", "velvet", ]
  14     plc_boxes = [ "testplc" ]
  15     qemu_boxes = \
  16         [ "qemu64-%d"%i for i in range (1,4) ] + \
  17         [ "qemu32-%d"%i for i in range (1,6) ]
  18     test_boxes = plc_boxes + qemu_boxes
  19     testmaster_boxes = [ testmaster ]
  20
  21     def __init__ (self):
  22         # dummy defaults
  23         self.boxes = []
  24         self.do_tracker_qemus = False
  25         self.do_tracker_plcs = False
  26
  27     def fqdn (self, box):
  28         return "%s.%s"%(box,self.domain)
  29
  30     ssh_command=['ssh','-o','ConnectTimeout=3']
  31     @staticmethod
  32     def root (box): return "root@%s"%box
  33
  34     @staticmethod
  35     def ssh(box):
  36         return BuildBoxes.ssh_command + [ BuildBoxes.root(box) ]
  37
  38     def header (self,message):
  39         print "===============",message
  40         sys.stdout.flush()
  41
  42     def run (self,argv,message, trash_err=False):
  43         if self.options.dry_run:
  44             print 'DRY_RUN:',
  45             print " ".join(argv)
  46             return 0
  47         else:
  48             if message: self.header(message)
  49             if not trash_err:
  50                 return subprocess.call(argv)
  51             else:
  52                 return subprocess.call(argv,stderr=file('/dev/null','w'))
  53
  54     def run_ssh (self, box, argv, message, trash_err=False):
  55         result=self.run (self.ssh(box) + argv, message, trash_err)
  56         if result!=0:
  57             print "WARNING: failed to run %s on %s"%(" ".join(argv),box)
  58         return result
  59
  60     def backquote (self, argv, trash_err=False):
  61         if not trash_err:
  62             return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
  63         else:
  64             return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
  65
  66     def backquote_ssh (self, box, argv, trash_err=False):
  67         # first probe the ssh link
  68         hostname=self.backquote ( self.ssh(box) + [ "hostname"], trash_err=True )
  69         if not hostname:
  70             print "%s unreachable"%self.root(box)
  71             return ''
  72         else:
  73             return self.backquote( ['ssh',self.root(box)] + argv, trash_err)
  74
  75     def reboot (self,box):
  76         command=['ssh',self.root(box),'shutdown','-r','now']
  77         self.run (command,"Rebooting %s"%box)
  78
  79     def handle_tracker_plcs (self):
  80         box = self.fqdn (self.testmaster)
  81         filename="tracker-plcs"
  82         if not self.options.probe:
  83             command=["rm","-rf",filename]
  84             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
  85         else:
  86             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
  87             read_command = ["cat",filename]
  88             trackers=self.backquote_ssh(box,read_command)
  89             for tracker in trackers.split('\n'):
  90                 if not tracker: continue
  91                 try:
  92                     tracker=tracker.strip()
  93                     [hostname,buildname]=tracker.split('@')
  94                     [left,plcname]=buildname.rsplit('-',1)
  95                     print self.margin_outline(plcname),tracker
  96                 except:
  97                     print self.margin(""),tracker
  98
  99     def handle_tracker_qemus (self):
 100         box = self.fqdn (self.testmaster)
 101         filename="tracker-qemus"
 102         if not self.options.probe:
 103             command=["rm","-rf",filename]
 104             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
 105         else:
 106             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
 107             read_command = ["cat",filename]
 108             trackers=self.backquote_ssh(box,read_command)
 109             for tracker in trackers.split('\n'):
 110                 if not tracker: continue
 111                 try:
 112                     tracker=tracker.strip()
 113                     [hostname,buildname,nodename]=tracker.split('@')
 114                     nodename=nodename.split('.')[0]
 115                     print self.margin_outline(nodename),tracker
 116                 except:
 117                     print self.margin(""),tracker
 118
 119     def handle_build_box (self,box):
 120         if not self.options.probe:
 121             self.reboot(box)
 122         else:
 123             command=['uptime']
 124             uptime=self.backquote_ssh(box,command,True).strip()
 125
 126             command=['pgrep','build']
 127             if self.options.dry_run:
 128                 self.run_ssh(box,command,None)
 129             else:
 130                 pids=self.backquote_ssh(box,command,True)
 131                 if not pids:
 132                     self.header ('No build process on %s (%s)'%(box,uptime))
 133                 else:
 134                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 135                     self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
 136
 137     # this one is more accurate as it locates processes in the vservers as well
 138     # but it's so sloooowww
 139     def handle_build_box_deep (self,box):
 140         if not self.options.probe:
 141             self.reboot(box)
 142         else:
 143             command=['uptime']
 144             uptime=self.backquote_ssh(box,command,True).strip()
 145
 146             command=['vps','-e']
 147             if self.options.dry_run:
 148                 self.run_ssh(box,command,None)
 149             else:
 150                 # simulate grep vbuild
 151                 vps_lines=[ line for line in self.backquote_ssh(box,command,True).split("\n")
 152                             if line.find('vbuild') >= 0]
 153                 pids=[ line.split()[0] for line in vps_lines ]
 154                 if not pids:
 155                     self.header ('No build process on %s (%s)'%(box,uptime))
 156                 else:
 157                     command=['vps','-o','pid,command'] + pids
 158                     self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
 159
 160
 161     vplc_matcher = re.compile(".*(vplc[0-9]+$)")
 162     def vplcname (self, vservername):
 163         match = self.vplc_matcher.match(vservername)
 164         if match: return match.groups(0)
 165         else: return ""
 166
 167     margin_format="%-14s"
 168     def margin(self,string): return self.margin_format%string
 169     def outline (self, string): return '== %s =='%string
 170     def margin_outline (self, string): return self.margin(self.outline(string))
 171
 172     def handle_plc_box (self,box):
 173         if not self.options.probe:
 174             self.reboot(box)
 175         else:
 176             command=['vserver-stat']
 177             if self.options.dry_run:
 178                 self.run_ssh(box,command,"Active vservers on %s"%box)
 179             else:
 180                 # try to find fullname (vserver_stat truncates to a ridiculously short name)
 181                 try:
 182                     self.header ("vserver map on %s"%box)
 183                     # fetch the contexts for all vservers on that box
 184                     map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
 185                     context_map=self.backquote_ssh (box,map_command)
 186                     # at this point we have a set of lines like
 187                     # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
 188                     ctx_dict={}
 189                     for map_line in context_map.split("\n"):
 190                         if not map_line: continue
 191                         [path,xid] = map_line.split(':')
 192                         ctx_dict[xid]=os.path.basename(os.path.dirname(path))
 193                     # at this point ctx_id maps context id to vservername
 194
 195                     vserver_stat = self.backquote_ssh (box,command)
 196                     for vserver_line in vserver_stat.split("\n"):
 197                         if not vserver_line: continue
 198                         context=vserver_line.split()[0]
 199                         if context=="CTX":
 200                             print self.margin(""),vserver_line
 201                             continue
 202                         longname=ctx_dict[context]
 203                         print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
 204                 except:
 205                     self.run_ssh(box,command,"Fine-grained method failed - fallback to plain vserver-stat")
 206
 207     vnode_matcher = re.compile(".*(vnode[0-9]+)")
 208     def vnodename (self, ps_line):
 209         match = self.vnode_matcher.match(ps_line)
 210         if match: return match.groups(0)
 211         else: return ""
 212
 213     def handle_qemu_box (self,box):
 214         if not self.options.probe:
 215             self.reboot(box)
 216         else:
 217             command=['lsmod']
 218             modules=self.backquote_ssh(box,command).split('\n')
 219             kqemu_msg='*NO kqemu MODULE LOADED*'
 220             for module in modules:
 221                 if module.find('kqemu')==0:
 222                     kqemu_msg='kqemu OK'
 223
 224             command=['pgrep','qemu']
 225             if self.options.dry_run:
 226                 self.run_ssh(box,command,None)
 227             else:
 228                 pids=self.backquote_ssh(box,command)
 229                 if not pids:
 230                     self.header ('No qemu process on %s'%box)
 231                 else:
 232                     self.header ("Active qemu processes on %s (%s)"%(box,kqemu_msg))
 233                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 234                     ps_lines = self.backquote_ssh (box,command).split("\n")
 235                     for ps_line in ps_lines:
 236                         if not ps_line or ps_line.find('PID') >=0 : continue
 237                         print self.margin_outline(self.vnodename(ps_line)), ps_line
 238
 239     # the ouput of ps -o pid,command gives us <pid> bash <buildname>/run_log
 240     def testmaster_buildname (self, ps_line):
 241         chunks=ps_line.split()
 242         path=chunks[2]
 243         [buildname,command]=path.split('/')
 244         return buildname
 245
 246     def handle_testmaster_box (self, box):
 247         if not self.options.probe:
 248             pass
 249         else:
 250             command=['pgrep','run_log']
 251             if self.options.dry_run:
 252                 self.run_ssh(box,command,None)
 253             else:
 254                 pids=self.backquote_ssh(box,command)
 255                 if not pids:
 256                     self.header ('No run_log process on %s'%box)
 257                 else:
 258                     self.header ("Active run_log processes on %s"%(box))
 259                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 260                     ps_lines = self.backquote_ssh (box,command).split("\n")
 261                     for ps_line in ps_lines:
 262                         if not ps_line or ps_line.find('PID') >=0 : continue
 263                         print self.margin_outline(self.testmaster_buildname(ps_line)), ps_line
 264
 265
 266     def handle_box(self,box,type):
 267         if box in self.qemu_boxes:
 268             if type=="qemu": self.handle_qemu_box(self.fqdn(box))
 269         elif box in self.plc_boxes:
 270             if type=="plc":  self.handle_plc_box(self.fqdn(box))
 271         elif box in self.testmaster_boxes:
 272             if type=='testmaster': self.handle_testmaster_box(self.fqdn(box))
 273         elif type=="build":
 274             if self.options.deep:
 275                 self.handle_build_box_deep(self.fqdn(box))
 276             else:
 277                 self.handle_build_box(self.fqdn(box))
 278
 279     def handle_disk (self,box):
 280         box=self.fqdn(box)
 281         return self.run_ssh(box,["df","-h",],"Disk space on %s"%box)
 282
 283     def main (self):
 284         usage="""%prog [options] [hostname..(s)]
 285 Default is to act on test boxes only"""
 286         parser = OptionParser (usage=usage)
 287         parser.add_option ("-n","--dry-run",action="store_true",dest="dry_run",default=False,
 288                            help="Dry run")
 289         parser.add_option ("-r","--reboot", action="store_false",dest="probe",default=True,
 290                            help="Actually reset/reboot stuff instead of just probing it")
 291         # no need for -p = probe, as this is the default
 292         parser.add_option ("-p","--plc", action="store_true",dest="plc_only",default=False,
 293                            help="Acts on the plc box only")
 294
 295         parser.add_option ("-a","--all",action="store_true",dest="all_boxes",default=False,
 296                            help="Acts on build and test boxes")
 297         parser.add_option ("-b","--build",action="store_true",dest="build_only",default=False,
 298                            help="Acts on build boxes only")
 299         parser.add_option ("-e","--deep",action="store_true", dest="deep", default=False,
 300                            help="on build boxes, shows vbuild processes in vservers as well; signif. slower")
 301         parser.add_option ("-q","--qemu",action="store_true",dest="qemu_only",default=False,
 302                            help="Only acts on the qemu boxes")
 303         parser.add_option ("-t","--trackers",action="store_true",dest="trackers_only",default=False,
 304                            help="Only wipes trackers")
 305         parser.add_option ("-m","--master",action="store_true",dest="testmaster_only",default=False,
 306                            help="Display the testmaster status")
 307         parser.add_option ("-d","--disk",action="store_true",dest="show_disk",default=False,
 308                            help="Only inspects disk status")
 309
 310         (self.options,args) = parser.parse_args()
 311
 312         # use given hostnames if provided
 313         if args:
 314             self.boxes=args
 315             # if hostnames are specified, let's stay on the safe side and don't reset trackers
 316             self.do_tracker_plcs = False
 317             self.do_tracker_qemus = False
 318         elif self.options.all_boxes:
 319             self.boxes=self.test_boxes + self.build_boxes + self.testmaster_boxes
 320             self.do_tracker_plcs = True
 321             self.do_tracker_qemus = True
 322         elif self.options.build_only:
 323             self.boxes=self.build_boxes
 324             self.do_tracker_plcs = False
 325             self.do_tracker_qemus = False
 326         elif self.options.qemu_only:
 327             self.boxes=self.qemu_boxes
 328             self.do_tracker_plcs = False
 329             self.do_tracker_qemus = True
 330         elif self.options.plc_only:
 331             self.boxes=self.plc_boxes
 332             self.do_tracker_plcs = True
 333             self.do_tracker_qemus = False
 334         elif self.options.testmaster_only:
 335             self.boxes=self.testmaster_boxes
 336             self.do_tracker_plcs = False
 337             self.do_tracker_qemus = False
 338         elif self.options.trackers_only:
 339             self.boxes = []
 340             self.do_tracker_plcs = True
 341             self.do_tracker_qemus = True
 342         # default
 343         else:
 344             self.boxes = self.test_boxes
 345             self.do_tracker_plcs = True
 346             self.do_tracker_qemus = True
 347
 348         if self.options.show_disk:
 349             for box in self.boxes: self.handle_disk(box)
 350             return
 351
 352         # PLCS
 353         if self.do_tracker_plcs:self.handle_tracker_plcs ()
 354         for box in self.boxes:  self.handle_box (box,"plc")
 355         # QEMU
 356         if self.do_tracker_qemus:self.handle_tracker_qemus ()
 357         for box in self.boxes:  self.handle_box (box,"qemu")
 358         # ALL OTHERS
 359         for box in self.boxes:  self.handle_box (box,"build")
 360         # TESTMASTER
 361         for box in self.boxes:  self.handle_box (box,"testmaster")
 362
 363 if __name__ == "__main__":
 364     BuildBoxes().main()