scripts/manage-infra-obso.py

   1 #!/usr/bin/python
   2
   3 import os.path, sys
   4 import re
   5 import subprocess
   6 from optparse import OptionParser
   7
   8 class Infrastructure:
   9
  10     # everything in the onelab.eu domain
  11     domain = 'pl.sophia.inria.fr'
  12     build_boxes = [ "devel", "liquid", "reed", "velvet", ]
  13     plc_boxes = [ "testplc" ]
  14     testmaster = 'testmaster'
  15     testmaster_boxes = [ testmaster ]
  16     # cache the list of qemu boxes in ~/.qemu-boxes
  17     # this can be refreshed by running -c
  18     qemu_boxes=[]
  19
  20     def cache_file (self): return os.path.expanduser("~/.qemu-boxes")
  21
  22     def load_cache (self):
  23         cache=self.cache_file()
  24         if os.path.isfile(cache):
  25             self.qemu_boxes=file(cache).read().split()
  26         self.test_boxes = self.plc_boxes + self.qemu_boxes
  27
  28     # run LocalTestResources on testmaster
  29     def refresh_cache (self):
  30         retrieved= \
  31             self.backquote_ssh(self.fqdn(self.testmaster),['LocalTestResources.py'],trash_err=True)
  32         remove="."+Infrastructure.domain
  33         retrieved = [ x.replace(remove,"").strip() for x in retrieved.split()]
  34         self.qemu_boxes = retrieved
  35         cache=self.cache_file()
  36         file(cache,'w').write(' '.join(self.qemu_boxes)+'\n')
  37         print "New contents of %s:"%cache
  38         print file(cache).read(),
  39
  40     def __init__ (self):
  41         # dummy defaults
  42         self.boxes = []
  43         self.do_tracker_qemus = False
  44         self.do_tracker_plcs = False
  45         self.load_cache()
  46
  47     def fqdn (self, box):
  48         return "%s.%s"%(box,self.domain)
  49
  50     ssh_command=['ssh','-o','ConnectTimeout=3']
  51     @staticmethod
  52     def root (box): return "root@%s"%box
  53
  54     @staticmethod
  55     def ssh(box):
  56         return Infrastructure.ssh_command + [ Infrastructure.root(box) ]
  57
  58     def header (self,message):
  59         print "===============",message
  60         sys.stdout.flush()
  61
  62     def run (self,argv,message, trash_err=False):
  63         if self.options.dry_run:
  64             print 'DRY_RUN:',
  65             print " ".join(argv)
  66             return 0
  67         else:
  68             if message: self.header(message)
  69             if not trash_err:
  70                 return subprocess.call(argv)
  71             else:
  72                 return subprocess.call(argv,stderr=file('/dev/null','w'))
  73
  74     def run_ssh (self, box, argv, message, trash_err=False):
  75         result=self.run (self.ssh(box) + argv, message, trash_err)
  76         if result!=0:
  77             print "WARNING: failed to run %s on %s"%(" ".join(argv),box)
  78         return result
  79
  80     def backquote (self, argv, trash_err=False):
  81         if not trash_err:
  82             return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
  83         else:
  84             return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
  85
  86     def backquote_ssh (self, box, argv, trash_err=False):
  87         # first probe the ssh link
  88         hostname=self.backquote ( self.ssh(box) + [ "hostname"], trash_err=True )
  89         if not hostname:
  90             print "%s unreachable"%self.root(box)
  91             return ''
  92         else:
  93             return self.backquote( ['ssh',self.root(box)] + argv, trash_err)
  94
  95     def reboot (self,box):
  96         command=['ssh',self.root(box),'shutdown','-r','now']
  97         self.run (command,"Rebooting %s"%box)
  98
  99     def handle_starting (self):
 100         box = self.fqdn (self.testmaster)
 101         filename="starting"
 102         if not self.options.probe:
 103             command=["rm","-rf",filename]
 104             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
 105         else:
 106             read_command = ["cat",filename]
 107             self.run_ssh(box,read_command,"++++++++++ Inspecting %s on %s"%(filename,box))
 108
 109     def handle_tracker_plcs (self):
 110         box = self.fqdn (self.testmaster)
 111         filename="tracker-plcs"
 112         if not self.options.probe:
 113             command=["rm","-rf",filename]
 114             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
 115         else:
 116             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
 117             read_command = ["cat",filename]
 118             trackers=self.backquote_ssh(box,read_command)
 119             for tracker in trackers.split('\n'):
 120                 if not tracker: continue
 121                 try:
 122                     tracker=tracker.strip()
 123                     (hostname,buildname,plcname)=tracker.split('@')
 124                     print self.margin_outline(plcname),tracker
 125                 except:
 126                     print self.margin(""),tracker
 127
 128     def handle_tracker_qemus (self):
 129         box = self.fqdn (self.testmaster)
 130         filename="tracker-qemus"
 131         if not self.options.probe:
 132             command=["rm","-rf",filename]
 133             self.run_ssh(box,command,"Cleaning up %s on %s"%(filename,box))
 134         else:
 135             self.header ("++++++++++ Inspecting %s on %s"%(filename,box))
 136             read_command = ["cat",filename]
 137             trackers=self.backquote_ssh(box,read_command)
 138             for tracker in trackers.split('\n'):
 139                 if not tracker: continue
 140                 try:
 141                     tracker=tracker.strip()
 142                     [hostname,buildname,nodename]=tracker.split('@')
 143                     nodename=nodename.split('.')[0]
 144                     print self.margin_outline(nodename),tracker
 145                 except:
 146                     print self.margin(""),tracker
 147
 148     def handle_build_box (self,box):
 149         if not self.options.probe:
 150             self.reboot(box)
 151         else:
 152             command=['uptime']
 153             uptime=self.backquote_ssh(box,command,True).strip()
 154
 155             command=['pgrep','build']
 156             if self.options.dry_run:
 157                 self.run_ssh(box,command,None)
 158             else:
 159                 pids=self.backquote_ssh(box,command,True)
 160                 if not pids:
 161                     self.header ('No build process on %s (%s)'%(box,uptime))
 162                 else:
 163                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 164                     self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
 165
 166     # this one is more accurate as it locates processes in the vservers as well
 167     # but it's so sloooowww
 168     def handle_build_box_deep (self,box):
 169         if not self.options.probe:
 170             self.reboot(box)
 171         else:
 172             command=['uptime']
 173             uptime=self.backquote_ssh(box,command,True).strip()
 174
 175             command=['vps','-e']
 176             if self.options.dry_run:
 177                 self.run_ssh(box,command,None)
 178             else:
 179                 # simulate grep vbuild
 180                 vps_lines=[ line for line in self.backquote_ssh(box,command,True).split("\n")
 181                             if line.find('vbuild') >= 0]
 182                 pids=[ line.split()[0] for line in vps_lines ]
 183                 if not pids:
 184                     self.header ('No build process on %s (%s)'%(box,uptime))
 185                 else:
 186                     command=['vps','-o','pid,command'] + pids
 187                     self.run_ssh(box,command,"Active build processes on %s (%s)"%(box,uptime),True)
 188
 189
 190     vplc_matcher = re.compile(".*(vplc[0-9]+$)")
 191     def vplcname (self, vservername):
 192         match = self.vplc_matcher.match(vservername)
 193         if match: return match.groups(0)
 194         else: return ""
 195
 196     margin_format="%-14s"
 197     def margin(self,string): return self.margin_format%string
 198     def outline (self, string): return '== %s =='%string
 199     def margin_outline (self, string): return self.margin(self.outline(string))
 200
 201     def handle_plc_box (self,box):
 202 # initial approach was to first scan vserver-stat, but it's not needed
 203         if not self.options.probe:
 204 #            # remove mark for all running servers to avoid resurrection
 205 #            if vserver_names:
 206 #                bash="; ".join( [ "rm -f /etc/vservers/%s/apps/init/mark"%vs for vs in vserver_names ] )
 207 #                stop_command=['bash','-c',"'" + bash + "'"]
 208 #                self.run_ssh(box,stop_command,"Removing mark on running vservers on %s"%box)
 209             # just trash all marks
 210             stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark']
 211             self.run_ssh(box,stop_command,"Removing all vserver marks on %s"%box)
 212             if not self.options.soft:
 213                 self.reboot(box)
 214             else:
 215                 self.run_ssh(box,['service','util-vserver','stop'],"Stopping all running vservers")
 216             return
 217         # even for rebooting we need to scan vserver-stat to stop the vservers properly
 218         vserver_names=[]
 219         command=['vserver-stat']
 220         if self.options.dry_run:
 221             self.run_ssh(box,command,"Active vservers on %s"%box)
 222         # try to find fullname (vserver_stat truncates to a ridiculously short name)
 223         self.header ("vserver map on %s"%box)
 224         # fetch the contexts for all vservers on that box
 225         map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
 226         context_map=self.backquote_ssh (box,map_command)
 227         # at this point we have a set of lines like
 228         # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
 229         ctx_dict={}
 230         for map_line in context_map.split("\n"):
 231             if not map_line: continue
 232             [path,xid] = map_line.split(':')
 233             ctx_dict[xid]=os.path.basename(os.path.dirname(path))
 234         # at this point ctx_id maps context id to vservername
 235
 236         vserver_stat = self.backquote_ssh (box,command)
 237         for vserver_line in vserver_stat.split("\n"):
 238             if not vserver_line: continue
 239             context=vserver_line.split()[0]
 240             if context=="CTX":
 241                 print self.margin(""),vserver_line
 242                 continue
 243             longname=ctx_dict[context]
 244             vserver_names.append(longname)
 245             print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
 246
 247     vnode_matcher = re.compile(".*(vnode[0-9]+)")
 248     def vnodename (self, ps_line):
 249         match = self.vnode_matcher.match(ps_line)
 250         if match: return match.groups(0)
 251         else: return ""
 252
 253     def handle_qemu_box (self,box):
 254         if not self.options.probe:
 255             if not self.options.soft:
 256                 self.reboot(box)
 257             else:
 258                 self.run_ssh(box,['pkill','qemu'],"Killing qemu instances")
 259         else:
 260             command=['lsmod']
 261             modules=self.backquote_ssh(box,command).split('\n')
 262             kqemu_msg='*NO kqemu/kmv_intel MODULE LOADED*'
 263             for module in modules:
 264                 if module.find('kqemu')==0:
 265                     kqemu_msg='kqemu module loaded'
 266                 # kvm might be loaded without vkm_intel (we dont have AMD)
 267                 elif module.find('kvm_intel')==0:
 268                     kqemu_msg='kvm_intel module loaded'
 269
 270             command=['pgrep','qemu']
 271             if self.options.dry_run:
 272                 self.run_ssh(box,command,None)
 273             else:
 274                 pids=self.backquote_ssh(box,command)
 275                 if not pids:
 276                     self.header ('No qemu process on %s (%s)'%(box,kqemu_msg))
 277                 else:
 278                     self.header ("Active qemu processes on %s (%s)"%(box,kqemu_msg))
 279                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 280                     ps_lines = self.backquote_ssh (box,command).split("\n")
 281                     for ps_line in ps_lines:
 282                         if not ps_line or ps_line.find('PID') >=0 : continue
 283                         print self.margin_outline(self.vnodename(ps_line)), ps_line
 284
 285     # the ouput of ps -o pid,command gives us <pid> bash <buildname>/run_log
 286     def testmaster_buildname (self, ps_line):
 287         chunks=ps_line.split()
 288         path=chunks[2]
 289         [buildname,command]=path.split('/')
 290         return buildname
 291
 292     def handle_testmaster_box (self, box):
 293         if not self.options.probe:
 294             pass
 295         else:
 296             command=['pgrep','run_log']
 297             if self.options.dry_run:
 298                 self.run_ssh(box,command,None)
 299             else:
 300                 pids=self.backquote_ssh(box,command)
 301                 if not pids:
 302                     self.header ('No run_log process on %s'%box)
 303                 else:
 304                     self.header ("Active run_log processes on %s"%(box))
 305                     command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 306                     ps_lines = self.backquote_ssh (box,command).split("\n")
 307                     for ps_line in ps_lines:
 308                         if not ps_line or ps_line.find('PID') >=0 : continue
 309                         print self.margin_outline(self.testmaster_buildname(ps_line)), ps_line
 310
 311
 312     def handle_box(self,box,type):
 313         if box in self.qemu_boxes:
 314             if type=="qemu": self.handle_qemu_box(self.fqdn(box))
 315         elif box in self.plc_boxes:
 316             if type=="plc":  self.handle_plc_box(self.fqdn(box))
 317         elif box in self.testmaster_boxes:
 318             if type=='testmaster': self.handle_testmaster_box(self.fqdn(box))
 319         elif type=="build":
 320             if self.options.deep:
 321                 self.handle_build_box_deep(self.fqdn(box))
 322             else:
 323                 self.handle_build_box(self.fqdn(box))
 324
 325     def handle_disk (self,box):
 326         box=self.fqdn(box)
 327         return self.run_ssh(box,["df","-h",],"Disk space on %s"%box)
 328
 329     def main (self):
 330         usage="""%prog [options] [hostname..(s)]
 331 Default is to act on test boxes only"""
 332         parser = OptionParser (usage=usage)
 333         parser.add_option ("-n","--dry-run",action="store_true",dest="dry_run",default=False,
 334                            help="Dry run")
 335         parser.add_option ("-r","--reboot", action="store_false",dest="probe",default=True,
 336                            help="Actually reset/reboot stuff instead of just probing it")
 337         parser.add_option ("-s","--soft",action="store_true",dest="soft",default=False,
 338                            help="Soft reset instead of hard reboot of the boxes")
 339         # no need for -p = probe, as this is the default
 340         parser.add_option ("-p","--plc", action="store_true",dest="plc_only",default=False,
 341                            help="Acts on the plc box only")
 342
 343         parser.add_option ("-e","--deep",action="store_true", dest="deep", default=False,
 344                            help="on build boxes, shows vbuild processes in vservers as well; signif. slower")
 345
 346         parser.add_option ("-a","--all",action="store_true",dest="all_boxes",default=False,
 347                            help="Acts on build and test boxes")
 348         parser.add_option ("-b","--build",action="store_true",dest="build_only",default=False,
 349                            help="Acts on build boxes only")
 350         parser.add_option ("-q","--qemu",action="store_true",dest="qemu_only",default=False,
 351                            help="Only acts on the qemu boxes")
 352         parser.add_option ("-t","--trackers",action="store_true",dest="trackers_only",default=False,
 353                            help="Only wipes trackers")
 354         parser.add_option ("-m","--master",action="store_true",dest="testmaster_only",default=False,
 355                            help="Display the testmaster status")
 356         parser.add_option ("-d","--disk",action="store_true",dest="show_disk",default=False,
 357                            help="Only inspects disk status")
 358         parser.add_option ("-c","--refresh-cache",action="store_true",dest="refresh_cache", default=False,
 359                            help="Refresh cached list of qemu boxes at testmaster - implies -q")
 360
 361         (self.options,args) = parser.parse_args()
 362
 363         # -c implies -q
 364         if self.options.refresh_cache:
 365             self.options.qemu_only=True
 366             self.refresh_cache()
 367
 368         # use given hostnames if provided
 369         if args:
 370             self.boxes=args
 371             # if hostnames are specified, let's stay on the safe side and don't reset trackers
 372             self.do_tracker_plcs = False
 373             self.do_tracker_qemus = False
 374         elif self.options.all_boxes:
 375             self.boxes=self.test_boxes + self.build_boxes + self.testmaster_boxes
 376             self.do_tracker_plcs = True
 377             self.do_tracker_qemus = True
 378         elif self.options.build_only:
 379             self.boxes=self.build_boxes
 380             self.do_tracker_plcs = False
 381             self.do_tracker_qemus = False
 382         elif self.options.qemu_only:
 383             self.boxes=self.qemu_boxes
 384             self.do_tracker_plcs = False
 385             self.do_tracker_qemus = True
 386         elif self.options.plc_only:
 387             self.boxes=self.plc_boxes
 388             self.do_tracker_plcs = True
 389             self.do_tracker_qemus = False
 390         elif self.options.testmaster_only:
 391             self.boxes=self.testmaster_boxes
 392             self.do_tracker_plcs = False
 393             self.do_tracker_qemus = False
 394         elif self.options.trackers_only:
 395             self.boxes = []
 396             self.do_tracker_plcs = True
 397             self.do_tracker_qemus = True
 398         # default
 399         else:
 400             self.boxes = self.test_boxes
 401             self.do_tracker_plcs = True
 402             self.do_tracker_qemus = True
 403
 404         if self.options.show_disk:
 405             for box in self.boxes: self.handle_disk(box)
 406             return
 407
 408         # PLCS
 409         if self.do_tracker_plcs:
 410             self.handle_tracker_plcs ()
 411             self.handle_starting ()
 412         for box in self.boxes:  self.handle_box (box,"plc")
 413         # QEMU
 414         if self.do_tracker_qemus:self.handle_tracker_qemus ()
 415         for box in self.boxes:  self.handle_box (box,"qemu")
 416         # ALL OTHERS
 417         for box in self.boxes:  self.handle_box (box,"build")
 418         # TESTMASTER
 419         for box in self.boxes:  self.handle_box (box,"testmaster")
 420
 421 if __name__ == "__main__":
 422     Infrastructure().main()