Optimize the 0.0.0.0 case, and get it working on 2.3 kernels.
[util-vserver-pl.git] / python / vserver.py
1 # Copyright 2005 Princeton University
2
3 #$Id: vserver.py,v 1.72 2007/08/02 16:01:59 dhozac Exp $
4
5 import errno
6 import fcntl
7 import os
8 import re
9 import pwd
10 import signal
11 import sys
12 import time
13 import traceback
14 import subprocess
15 import resource
16
17 import vserverimpl
18 import cpulimit, bwlimit
19
20 from vserverimpl import DLIMIT_INF
21 from vserverimpl import VC_LIM_KEEP
22 from vserverimpl import VLIMIT_NSOCK
23 from vserverimpl import VLIMIT_OPENFD
24 from vserverimpl import VLIMIT_ANON
25 from vserverimpl import VLIMIT_SHMEM
26
#
# Context flag bits, taken from the kernel header linux/vserver/legacy.h.
# Every flag is a distinct single bit.
#
FLAGS_LOCK = 1 << 0
FLAGS_SCHED = 1 << 1  # XXX - defined in util-vserver/src/chcontext.c
FLAGS_NPROC = 1 << 2
FLAGS_PRIVATE = 1 << 3
FLAGS_INIT = 1 << 4
FLAGS_HIDEINFO = 1 << 5
FLAGS_ULIMIT = 1 << 6
FLAGS_NAMESPACE = 1 << 7
38
# Maps a limit name (without any "RLIMIT_"/"VLIMIT_" prefix) to its numeric
# resource id.  Seeded with the vserver-specific limits from vserverimpl.
RLIMITS = { "NSOCK": VLIMIT_NSOCK,
            "OPENFD": VLIMIT_OPENFD,
            "ANON": VLIMIT_ANON,
            "SHMEM": VLIMIT_SHMEM}

# add in the platform supported rlimits
for entry in list(vars(resource)):
    if entry.startswith("RLIMIT_"):
        k = entry[len("RLIMIT_"):]
        if k not in RLIMITS:
            RLIMITS[k] = getattr(resource, entry)
        else:
            # A platform rlimit would shadow a vserver limit; keep the
            # vserver one and tell the operator.
            print("WARNING: duplicate RLIMITS key %s" % k)
52
class NoSuchVServer(Exception):
    """Raised when a vserver directory is missing or not accessible."""
54
55
class VServerConfig:
    """File-per-option configuration store for one vserver.

    Every option is a file below ``directory``; an option name may contain
    ``/`` to address a file in a subdirectory.  The value of an option is
    the file's content with trailing whitespace stripped.

    Once cache_it() has been called, reads are answered from the in-memory
    snapshot only and update()/unset() become no-ops.
    """

    def __init__(self, name, directory):
        """Bind to ``directory``.

        Raises NoSuchVServer if ``directory`` is not a directory that is
        readable, writable and searchable by the caller.
        """
        self.name = name
        self.dir = directory
        # None: not cached.  A dict (even an empty one): cached.
        self.cache = None
        if not (os.path.isdir(self.dir) and
                os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
            raise NoSuchVServer("%s does not exist" % self.dir)

    def get(self, option, default = None):
        """Return the value of ``option``.

        If the option is not set (or cannot be read), return ``default``
        when it is not None, otherwise raise KeyError.
        """
        try:
            # "is not None" rather than truthiness: an empty snapshot is
            # still a snapshot and must not fall through to the filesystem.
            if self.cache is not None:
                return self.cache[option]
            f = open(os.path.join(self.dir, option), "r")
            try:
                return f.read().rstrip()
            finally:
                f.close()
        except (KeyError, IOError, OSError):
            if default is not None:
                return default
            raise KeyError("Key %s is not set for %s" % (option, self.name))

    def update(self, option, value):
        """Write ``value`` to the file for ``option``, creating directories.

        A list value is written one element per line.  Does nothing once
        the configuration has been cached.  I/O errors propagate.
        """
        if self.cache is not None:
            return

        filename = os.path.join(self.dir, option)
        old_umask = os.umask(0o022)
        try:
            try:
                os.makedirs(os.path.dirname(filename), 0o755)
            except OSError:
                # Normally "already exists"; a genuine failure will show
                # up when the file is opened just below.
                pass
            f = open(filename, 'w')
            try:
                if isinstance(value, list):
                    f.write("%s\n" % "\n".join(value))
                else:
                    f.write("%s\n" % value)
            finally:
                f.close()
        finally:
            # Restore the caller's umask even when the write failed.
            os.umask(old_umask)

    def unset(self, option):
        """Remove the file for ``option`` and any directories left empty.

        Returns True if the file was removed, False if it could not be
        removed, and None (without touching anything) once the
        configuration has been cached.
        """
        if self.cache is not None:
            return

        filename = os.path.join(self.dir, option)
        try:
            os.unlink(filename)
        except OSError:
            return False
        try:
            os.removedirs(os.path.dirname(filename))
        except OSError:
            # Parent directory still has other entries; that is fine.
            pass
        return True

    def cache_it(self):
        """Snapshot every readable regular file below the config directory.

        Keys are paths relative to the config directory.  Symbolic links
        are skipped.
        """
        prefix = os.path.join(self.dir, '')
        snapshot = {}
        for dirname, _subdirs, fnames in os.walk(self.dir):
            for fname in fnames:
                full_name = os.path.join(dirname, fname)
                if os.path.islink(full_name):
                    continue
                if (os.path.isfile(full_name) and
                        os.access(full_name, os.R_OK)):
                    f = open(full_name, "r")
                    try:
                        snapshot[full_name[len(prefix):]] = f.read().rstrip()
                    finally:
                        f.close()
        self.cache = snapshot
130
131
def adjust_lim(goal, curr):
    """Raise a (soft, hard) limit pair so it is no lower than ``goal``.

    goal -- (hard, soft, min) triple; a component equal to VC_LIM_KEEP is
            ignored.
    curr -- current (soft, hard) pair.

    The ``min`` goal lifts both limits, the ``soft`` goal lifts only the
    soft limit and the ``hard`` goal lifts only the hard limit.  Limits are
    never lowered.  Returns the new (soft, hard) tuple.
    """
    # NOTE(review): plain ">" comparison; presumably callers never pass an
    # "infinity" sentinel that compares low -- confirm.
    hard_goal, soft_goal, min_goal = goal[0], goal[1], goal[2]
    soft, hard = curr[0], curr[1]

    if min_goal != VC_LIM_KEEP:
        soft = max(soft, min_goal)
        hard = max(hard, min_goal)
    if soft_goal != VC_LIM_KEEP:
        soft = max(soft, soft_goal)
    if hard_goal != VC_LIM_KEEP:
        hard = max(hard, hard_goal)
    return (soft, hard)
150
151
class VServer:
    """Handle on a single vserver guest.

    Ties together the guest's root directory
    (``vserverimpl.VSERVER_BASEDIR/<name>``), its configuration directory
    (``/etc/vservers/<name>``) and its context id (``self.ctx``), and
    forwards kernel operations to ``vserverimpl``.
    """

    # (script, argument template...) pairs run in order by start(); the
    # templates are %-formatted with {'runlevel': <int>}.
    INITSCRIPTS = [('/etc/rc.vinit', 'start'),
                   ('/etc/rc.d/rc', '%(runlevel)d')]

    def __init__(self, name, vm_id = None, vm_running = None, logfile=None):
        """Attach to the existing vserver ``name``.

        vm_id      -- context id; read from the 'context' config option
                      when None.
        vm_running -- cached running state; queried via is_running() when
                      None.
        logfile    -- path used by log(); logging is disabled when falsy.

        Raises NoSuchVServer if the root or config directory is missing or
        not accessible.
        """
        self.name = name
        self.dir = "%s/%s" % (vserverimpl.VSERVER_BASEDIR, name)
        if not (os.path.isdir(self.dir) and
                os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
            raise NoSuchVServer("no such vserver: " + name)
        self.config = VServerConfig(name, "/etc/vservers/%s" % name)
        self.remove_caps = ~vserverimpl.CAP_SAFE
        if vm_id is None:
            vm_id = int(self.config.get('context'))
        self.ctx = vm_id
        if vm_running is None:
            vm_running = self.is_running()
        self.vm_running = vm_running
        self.logfile = logfile

    # inspired from nodemanager's logger
    def log(self,msg):
        """Append a timestamped ``msg`` to self.logfile (best effort).

        If the log file cannot be written, the message is printed to
        stdout instead.  Does nothing when no logfile was configured.
        """
        if self.logfile:
            try:
                fd = os.open(self.logfile,
                             os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o600)
                try:
                    if not msg.endswith('\n'):
                        msg += '\n'
                    os.write(fd, '%s: %s' % (time.asctime(time.gmtime()), msg))
                finally:
                    # Do not leak the descriptor when the write fails.
                    os.close(fd)
            except Exception:
                print('%s: (%s failed to open) %s'%(time.asctime(time.gmtime()),self.logfile,msg))

    def set_rlimit(self, type, hard, soft, min):
        """Generic set resource limit function for vserver.

        type -- key of RLIMITS (e.g. "NSOCK").
        hard, soft, min -- new values; VC_LIM_KEEP leaves a value alone.

        Values other than VC_LIM_KEEP are written to the configuration.
        If the vserver is running and something changed, the limit is also
        applied to the context and the calling process's own rlimit is
        raised to match.  An OSError while applying is logged, not raised.

        Returns True if at least one value was written to the config.
        """
        update = False

        if hard != VC_LIM_KEEP:
            self.config.update('rlimits/%s.hard' % type.lower(), hard)
            update = True
        if soft != VC_LIM_KEEP:
            self.config.update('rlimits/%s.soft' % type.lower(), soft)
            update = True
        if min != VC_LIM_KEEP:
            self.config.update('rlimits/%s.min' % type.lower(), min)
            update = True

        if self.is_running() and update:
            resource_type = RLIMITS[type]
            try:
                vserverimpl.setrlimit(self.ctx, resource_type, hard, soft, min)
                lim = resource.getrlimit(resource_type)
                lim = adjust_lim((hard, soft, min), lim)
                resource.setrlimit(resource_type, lim)
            except OSError as e:
                self.log("Error: setrlimit(%d, %s, %d, %d, %d): %s"
                         % (self.ctx, type.lower(), hard, soft, min, e))

        return update

    def get_prefix_from_capabilities(self, capabilities, prefix):
        """Filter a comma-separated capability list by name prefix.

        Keeps the entries that start with the all-upper-case or the
        all-lower-case spelling of ``prefix`` and returns them re-joined
        with commas (an empty string when nothing matches).
        """
        wanted = (prefix.upper(), prefix.lower())
        return ",".join([c for c in capabilities.split(',')
                         if c.startswith(wanted)])

    def get_bcaps_from_capabilities(self, capabilities):
        """Return the "cap_"-prefixed entries of ``capabilities``."""
        return self.get_prefix_from_capabilities(capabilities, "cap_")

    def get_ccaps_from_capabilities(self, capabilities):
        """Return the "vxc_"-prefixed entries of ``capabilities``."""
        return self.get_prefix_from_capabilities(capabilities, "vxc_")

    def set_capabilities_config(self, capabilities):
        """Store the capabilities in the config and apply them to the context.

        Returns the result of vserverimpl.setbcaps() if it is positive,
        otherwise the result of vserverimpl.setccaps().
        """
        bcaps = self.get_bcaps_from_capabilities(capabilities)
        ccaps = self.get_ccaps_from_capabilities(capabilities)
        self.config.update('bcapabilities', bcaps)
        self.config.update('ccapabilities', ccaps)
        ret = vserverimpl.setbcaps(self.ctx, vserverimpl.text2bcaps(bcaps))
        if ret > 0:
            return ret
        return vserverimpl.setccaps(self.ctx, vserverimpl.text2ccaps(ccaps))

    def get_capabilities(self):
        """Return the context's current bcaps and ccaps as one comma list."""
        bcaps = vserverimpl.bcaps2text(vserverimpl.getbcaps(self.ctx))
        ccaps = vserverimpl.ccaps2text(vserverimpl.getccaps(self.ctx))
        if bcaps and ccaps:
            ccaps = "," + ccaps
        return (bcaps + ccaps)

    def get_capabilities_config(self):
        """Return the configured bcaps and ccaps as one comma list."""
        bcaps = self.config.get('bcapabilities', '')
        ccaps = self.config.get('ccapabilities', '')
        if bcaps and ccaps:
            ccaps = "," + ccaps
        return (bcaps + ccaps)

    def set_ipaddresses(self, addresses):
        """Replace the context's addresses with the comma-separated list."""
        vserverimpl.netremove(self.ctx, "all")
        for a in addresses.split(","):
            vserverimpl.netadd(self.ctx, a)

    def set_ipaddresses_config(self, addresses):
        """Write the addresses to interfaces/<n>/ip and apply them.

        Stale interfaces/<n>/ip entries following the new ones are removed.
        """
        addr_list = addresses.split(",")
        for i, a in enumerate(addr_list):
            self.config.update("interfaces/%d/ip" % i, a)
        # Drop consecutive leftovers from a previous, longer list.
        i = len(addr_list)
        while self.config.unset("interfaces/%d/ip" % i):
            i += 1
        self.set_ipaddresses(addresses)

    def get_ipaddresses_config(self):
        """Return the configured addresses as a comma-separated string.

        Reads interfaces/0/ip, interfaces/1/ip, ... and stops at the first
        missing or empty entry.
        """
        i = 0
        ret = []
        while True:
            r = self.config.get("interfaces/%d/ip" % i, '')
            if r == '':
                break
            ret.append(r)
            i += 1
        return ",".join(ret)

    def get_ipaddresses(self):
        """Not implemented; always returns None."""
        # No clean way to do this right now.
        return None

    def __do_chroot(self):
        """chroot() into the vserver's root and chdir to its "/"."""
        os.chroot(self.dir)
        os.chdir("/")

    def chroot_call(self, fn, *args):
        """Call ``fn(*args)`` inside the vserver's root and return its result.

        The original root and working directory are restored afterwards,
        also when ``fn`` raises.
        """
        cwd_fd = os.open(".", os.O_RDONLY)
        try:
            root_fd = os.open("/", os.O_RDONLY)
            try:
                self.__do_chroot()
                result = fn(*args)
            finally:
                # Step back out through the saved descriptors: to the old
                # root, re-root there, then return to the old cwd.
                os.fchdir(root_fd)
                os.chroot(".")
                os.fchdir(cwd_fd)
                os.close(root_fd)
        finally:
            os.close(cwd_fd)
        return result

    def set_disklimit(self, block_limit):
        """Set the disk limit; ``block_limit`` is in kB, 0 removes the limit.

        When the vserver is not running, init_disk_info() (or
        get_disklimit()) must have been called first so that
        self.disk_blocks and self.disk_inodes exist.  An OSError from
        vserverimpl is logged, not raised.  A non-zero limit is also
        written to the 'dlimits/0/space_total' config option.
        """
        # block_limit is in kB
        if block_limit == 0:
            try:
                vserverimpl.unsetdlimit(self.dir, self.ctx)
            except OSError:
                self.log("Unexpected error with unsetdlimit for context %d" % self.ctx)
            return

        if self.vm_running:
            block_usage = vserverimpl.DLIMIT_KEEP
            inode_usage = vserverimpl.DLIMIT_KEEP
        else:
            # init_disk_info() must have been called to get usage values
            block_usage = self.disk_blocks
            inode_usage = self.disk_inodes

        try:
            vserverimpl.setdlimit(self.dir,
                                  self.ctx,
                                  block_usage,
                                  block_limit,
                                  inode_usage,
                                  vserverimpl.DLIMIT_INF,  # inode limit
                                  2)   # %age reserved for root
        except OSError:
            self.log("Unexpected error with setdlimit for context %d" % self.ctx)

        self.config.update('dlimits/0/space_total', block_limit)

    def is_running(self):
        """Return vserverimpl.isrunning() for this context."""
        return vserverimpl.isrunning(self.ctx)

    def get_disklimit(self):
        """Return the block limit, or -1 when no disk limit is set.

        On success also refreshes self.disk_blocks and self.disk_inodes.
        An OSError other than ESRCH is re-raised.
        """
        try:
            (self.disk_blocks, block_limit, self.disk_inodes, inode_limit,
             reserved) = vserverimpl.getdlimit(self.dir, self.ctx)
        except OSError as ex:
            if ex.errno != errno.ESRCH:
                raise
            # get here if no vserver disk limit has been set for xid
            block_limit = -1

        return block_limit

    def set_sched_config(self, cpu_min, cpu_share):
        """Write the CPU scheduler parameters to the configuration.

        Stores ``cpu_min`` as sched/fill-rate and ``cpu_share`` as
        sched/fill-rate2, and removes sched/idle-time when ``cpu_share``
        is 0.  If the vserver is running, the parameters are also applied
        to the kernel via set_sched().
        """
        self.config.update('sched/fill-rate', cpu_min)
        self.config.update('sched/fill-rate2', cpu_share)
        if cpu_share == 0:
            self.config.unset('sched/idle-time')

        if self.is_running():
            self.set_sched(cpu_min, cpu_share)

    def set_sched(self, cpu_min, cpu_share):
        """ Update kernel CPU scheduling parameters for this context. """
        vserverimpl.setsched(self.ctx, cpu_min, cpu_share)

    def get_sched(self):
        """Not implemented; always returns (-1, False)."""
        # have no way of querying scheduler right now on a per vserver basis
        return (-1, False)

    def set_bwlimit(self, minrate = bwlimit.bwmin, maxrate = None,
                    exempt_min = None, exempt_max = None,
                    share = None, dev = "eth0"):
        """Enable bandwidth limiting on ``dev``, or disable it.

        Passing ``minrate=None`` turns limiting off; otherwise the values
        are handed to bwlimit.on().
        """
        if minrate is None:
            bwlimit.off(self.ctx, dev)
        else:
            bwlimit.on(self.ctx, dev, share,
                       minrate, maxrate, exempt_min, exempt_max)

    def get_bwlimit(self, dev = "eth0"):
        """Return bwlimit.get() for this context without its first element.

        A falsy result from bwlimit.get() is returned unchanged.
        """
        # NOTE(review): ``dev`` is accepted but not passed on to
        # bwlimit.get() -- confirm whether that is intended.
        result = bwlimit.get(self.ctx)
        # result of bwlimit.get is (ctx, share, minrate, maxrate)
        if result:
            result = result[1:]
        return result

    def open(self, filename, mode = "r", bufsize = -1):
        """Open ``filename`` relative to the vserver's root directory."""
        return self.chroot_call(open, filename, mode, bufsize)

    def __do_chcontext(self, state_file):
        """Move the calling process into this vserver's context.

        If ``state_file`` is given, the context id is written to it and it
        is closed first.  When vserverimpl.chcontext() returns a true
        value, set_resources(True) and vserverimpl.setup_done() are called.
        """
        if state_file:
            state_file.write("%u\n" % self.ctx)
            state_file.close()

        if vserverimpl.chcontext(self.ctx, vserverimpl.text2bcaps(self.get_capabilities_config())):
            self.set_resources(True)
            vserverimpl.setup_done(self.ctx)

    def __prep(self, runlevel):
        """ Perform all the crap that the vserver script does before
        actually executing the startup scripts. """

        # set the initial runlevel
        vserverimpl.setrunlevel(self.dir + "/var/run/utmp", runlevel)

        # mount /proc and /dev/pts
        self.__do_mount("none", self.dir, "/proc", "proc")
        # XXX - magic mount options
        self.__do_mount("none", self.dir, "/dev/pts", "devpts", 0, "gid=5,mode=0620")

    def __cleanvar(self):
        """
        Clean the /var/ directory so RH startup scripts can run.

        Deletes every file below /var/run and /var/lock/subsys except
        files named "utmp", and returns the list of deleted paths.  The
        paths are resolved against the current root, so start() calls this
        after chroot-ing.
        """
        RUNDIR = "/var/run"
        LOCKDIR = "/var/lock/subsys"

        keep = ["utmp"]
        garbage = []
        for topdir in [RUNDIR, LOCKDIR]:
            #os.walk() = (dirpath, dirnames, filenames)
            for root, _dirs, fnames in os.walk(topdir):
                for fname in fnames:
                    if fname not in keep:
                        garbage.append(root + "/" + fname)

        for path in garbage:
            os.unlink(path)
        return garbage

    def __do_mount(self, *mount_args):
        """Call vserverimpl.mount(); EBUSY is taken as "already mounted"."""
        try:
            vserverimpl.mount(*mount_args)
        except OSError as ex:
            if ex.errno == errno.EBUSY:
                # assume already mounted
                return
            # bare raise keeps the original traceback
            raise

    def enter(self):
        """Move the calling process into the vserver (root and context).

        The configuration is cached first because the config directory is
        no longer reachable after the chroot.
        """
        self.config.cache_it()
        self.__do_chroot()
        self.__do_chcontext(None)

    def start(self, runlevel = 3):
        """Boot the vserver in a forked child process.

        The parent marks the vserver as running and returns at once.  The
        child never returns: it sets up the guest, launches the
        INITSCRIPTS inside the context and then exits with status 0.
        Errors in the child are logged through log().
        """
        if (os.fork() != 0):
            # Parent should just return.
            self.vm_running = True
            return

        # child process
        try:
            # so we don't chcontext with priv'ed fds
            close_nonstandard_fds()

            # get a new session
            os.setsid()

            # open state file to record vserver info
            state_file = open("/var/run/vservers/%s" % self.name, "w")

            # use /dev/null for stdin, /var/log/boot.log for stdout/err
            fd = os.open("/dev/null", os.O_RDONLY)
            if fd != 0:
                os.dup2(fd, 0)
                os.close(fd)

            # perform pre-init cleanup
            self.__prep(runlevel)

            self.config.cache_it()
            self.__do_chroot()
            removed = self.__cleanvar()

            # opened after the chroot, so this is the guest's boot.log
            boot_log = open("/var/log/boot.log", "a", 0)
            if boot_log.fileno() != 1:
                os.dup2(boot_log.fileno(), 1)
            os.dup2(1, 2)

            boot_log.write("%s: removing %s\n" %
                           (time.asctime(time.gmtime()), removed))
            boot_log.write("%s: starting the virtual server %s\n" %
                           (time.asctime(time.gmtime()), self.name))

            # enter vserver context
            self.__do_chcontext(state_file)

            # execute each init script in turn
            # XXX - we don't support all scripts that vserver script does
            arg_subst = { 'runlevel': runlevel }
            for cmd in self.INITSCRIPTS:
                try:
                    cmd_args = [cmd[0]] + [x % arg_subst for x in cmd[1:]]
                    if os.path.isfile(cmd[0]):
                        boot_log.write("executing '%s'\n" % " ".join(cmd_args))
                        os.spawnvp(os.P_NOWAIT,cmd[0],cmd_args)
                except Exception:
                    boot_log.write(traceback.format_exc() + "\n")

        # we get here due to an exception in the top-level child process
        except Exception:
            self.log(traceback.format_exc())
        finally:
            # Whatever happened, the child must never fall back into the
            # caller's code.
            os._exit(0)

    def set_resources(self,setup=False):

        """ Called when vserver context is entered for first time,
        should be overridden by subclass. """

        pass

    def init_disk_info(self):
        """Populate self.disk_blocks / self.disk_inodes; return blocks * 1024.

        The values come from vserverimpl.getdlimit() when that works,
        otherwise from running /usr/sbin/vdu on the vserver's directory.
        Raises ValueError if vdu's output cannot be parsed.
        """
        try:
            dlimit = vserverimpl.getdlimit(self.dir, self.ctx)
            self.disk_blocks = dlimit[0]
            self.disk_inodes = dlimit[2]
            return self.disk_blocks * 1024
        except Exception:
            # Any failure here falls back to measuring with vdu below.
            pass

        # argv list (no shell), so self.dir is never shell-interpreted
        cmd = ["/usr/sbin/vdu", "--script", "--space", "--inodes",
               "--blocksize", "1024", "--xid", "%d" % self.ctx, self.dir]
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             close_fds=True)
        (out, err) = p.communicate()
        lines = out.splitlines()
        line = ''
        if lines:
            line = lines[0]
        if not line:
            sys.stderr.write(err)

        (space, inodes) = line.split()
        self.disk_inodes = int(inodes)
        self.disk_blocks = int(space)
        #(self.disk_inodes, self.disk_blocks) = vduimpl.vdu(self.dir)

        return self.disk_blocks * 1024

    def stop(self, signal = signal.SIGKILL):
        """Send ``signal`` via vserverimpl.killall() and mark as stopped."""
        vserverimpl.killall(self.ctx, signal)
        self.vm_running = False

    def setname(self, slice_id):
        '''Set vcVHI_CONTEXT field in kernel to slice_id'''
        vserverimpl.setname(self.ctx, slice_id)

    def getname(self):
        '''Get vcVHI_CONTEXT field in kernel'''
        return vserverimpl.getname(self.ctx)
556
557
def create(vm_name, static = False, ctor = VServer):
    """Create a new vserver with ``vuseradd`` and return a handle on it.

    vm_name -- name of the vserver to create.
    static  -- pass ``--static`` to vuseradd when true.
    ctor    -- callable invoked as ``ctor(vm_name, vm_id)`` to build the
               returned object.

    The context id is the uid of the passwd entry named ``vm_name``.
    Raises SystemError if vuseradd exits non-zero or is killed by a signal.
    """
    options = ['vuseradd']
    if static:
        options += ['--static']
    # With P_WAIT, spawnvp returns the exit code itself, or -signal when the
    # child was killed -- not a wait() status, so the os.W* macros do not
    # apply to it.
    ret = os.spawnvp(os.P_WAIT, 'vuseradd', options + [vm_name])
    if ret != 0:
        out = "system command ('%s') " % (options,)
        if ret > 0:
            out += "failed, rc = %d" % ret
        else:
            out += "killed by signal %d" % -ret
        raise SystemError(out)
    vm_id = pwd.getpwnam(vm_name)[2]

    return ctor(vm_name, vm_id)
574
575
def close_nonstandard_fds():
    """Close all open file descriptors other than 0, 1, and 2."""
    # Look the limit up by name; the numeric sysconf index is
    # platform-specific.
    max_fd = os.sysconf("SC_OPEN_MAX")
    # closerange ignores descriptors that are not open
    os.closerange(3, max_fd)
582