FUSE patch from Jeremy Stribling.
[util-vserver-pl.git] / python / vserver.py
1 # Copyright 2005 Princeton University
2
3 #$Id: vserver.py,v 1.72 2007/08/02 16:01:59 dhozac Exp $
4
5 import errno
6 import fcntl
7 import os
8 import re
9 import pwd
10 import signal
11 import sys
12 import time
13 import traceback
14 import subprocess
15 import resource
16
17 import vserverimpl
18 import cpulimit, bwlimit
19
20 from vserverimpl import DLIMIT_INF
21 from vserverimpl import VC_LIM_KEEP
22 from vserverimpl import VLIMIT_NSOCK
23 from vserverimpl import VLIMIT_OPENFD
24 from vserverimpl import VLIMIT_ANON
25 from vserverimpl import VLIMIT_SHMEM
26
27 #
28 # these are the flags taken from the kernel linux/vserver/legacy.h
29 #
# Each flag is a distinct power of two, written here as a bit position.
FLAGS_LOCK = 1 << 0
FLAGS_SCHED = 1 << 1  # XXX - defined in util-vserver/src/chcontext.c
FLAGS_NPROC = 1 << 2
FLAGS_PRIVATE = 1 << 3
FLAGS_INIT = 1 << 4
FLAGS_HIDEINFO = 1 << 5
FLAGS_ULIMIT = 1 << 6
FLAGS_NAMESPACE = 1 << 7
38
# Map limit names to the numeric identifiers exported by vserverimpl.
RLIMITS = {
    "NSOCK": VLIMIT_NSOCK,
    "OPENFD": VLIMIT_OPENFD,
    "ANON": VLIMIT_ANON,
    "SHMEM": VLIMIT_SHMEM,
}

# add in the platform supported rlimits
for entry in list(vars(resource)):
    if not entry.startswith("RLIMIT_"):
        continue
    k = entry[len("RLIMIT_"):]
    if k in RLIMITS:
        # a vserver-specific limit already claimed this name; keep it
        print("WARNING: duplicate RLIMITS key %s" % k)
    else:
        RLIMITS[k] = getattr(resource, entry)
52
class NoSuchVServer(Exception):
    """Raised when a vserver's directory is missing or not accessible."""
54
55
class VServerConfig:
    """File-per-option configuration store rooted at one directory.

    Every option is a file below ``directory`` (the option name may contain
    '/' to address sub-directories); the option's value is the file content
    with trailing whitespace stripped.

    After cache_it() has been called the object only serves values from the
    in-memory snapshot: get() no longer touches the filesystem and update()
    and unset() become no-ops.
    """

    def __init__(self, name, directory):
        """Bind the store to ``directory``.

        Raises NoSuchVServer if ``directory`` is not a directory or is not
        readable, writable and searchable by this process.
        """
        self.name = name
        self.dir = directory
        # None means "not cached"; a dict (possibly empty) means "cached".
        self.cache = None
        if not (os.path.isdir(self.dir) and
                os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
            raise NoSuchVServer("%s does not exist" % self.dir)

    def get(self, option, default = None):
        """Return the value of ``option``.

        If the option cannot be read, ``default`` is returned when it is not
        None; otherwise KeyError is raised.
        """
        try:
            if self.cache is not None:
                return self.cache[option]
            f = open(os.path.join(self.dir, option), "r")
            try:
                return f.read().rstrip()
            finally:
                f.close()
        except (KeyError, EnvironmentError):
            if default is not None:
                return default
            raise KeyError("Key %s is not set for %s" % (option, self.name))

    def update(self, option, value):
        """Write ``value`` to the file for ``option``.

        A list is written one element per line; anything else is written
        with '%s' formatting.  Missing parent directories are created.
        Does nothing once the configuration has been cached.  Errors from
        opening or writing the file propagate to the caller.
        """
        if self.cache is not None:
            return

        filename = os.path.join(self.dir, option)
        old_umask = os.umask(0o022)
        try:
            try:
                os.makedirs(os.path.dirname(filename), 0o755)
            except OSError:
                # typically the directory already exists; any real problem
                # is reported by open() just below
                pass
            f = open(filename, 'w')
            try:
                if isinstance(value, list):
                    f.write("%s\n" % "\n".join(value))
                else:
                    f.write("%s\n" % value)
            finally:
                f.close()
        finally:
            # always put the process umask back, even when writing failed
            os.umask(old_umask)

    def unset(self, option):
        """Delete the file for ``option``.

        Returns True if the file was removed and False if it could not be
        removed.  Parent directories left empty are pruned as well.  Once the
        configuration has been cached nothing is done and None is returned.
        """
        if self.cache is not None:
            return

        filename = os.path.join(self.dir, option)
        try:
            os.unlink(filename)
        except OSError:
            return False
        try:
            os.removedirs(os.path.dirname(filename))
        except OSError:
            # parent still has other entries (or cannot be removed)
            pass
        return True

    def cache_it(self):
        """Snapshot every readable regular file below the directory.

        Keys are paths relative to the configuration directory.  Symbolic
        links are ignored and os.walk does not descend into directories
        reached through a symbolic link.
        """
        self.cache = {}
        prefix = os.path.join(self.dir, '')
        for dirname, subdirs, fnames in os.walk(self.dir):
            for fname in fnames:
                full_name = os.path.join(dirname, fname)
                if os.path.islink(full_name):
                    continue
                if (os.path.isfile(full_name) and
                    os.access(full_name, os.R_OK)):
                    f = open(full_name, "r")
                    try:
                        self.cache[full_name[len(prefix):]] = f.read().rstrip()
                    finally:
                        f.close()
130
131
class VServer:
    """Handle on one vserver, addressed by name and context id.

    The guest's root filesystem is VSERVER_BASEDIR/<name>; its persistent
    settings are kept in a VServerConfig rooted at /etc/vservers/<name>.
    Most setters both record the value in that configuration and apply it to
    the kernel through vserverimpl.
    """

    # (script, argument template, ...) tuples run in order by start(); each
    # template is %-formatted with a dict holding 'runlevel'.
    INITSCRIPTS = [('/etc/rc.vinit', 'start'),
                   ('/etc/rc.d/rc', '%(runlevel)d')]

    def __init__(self, name, vm_id = None, vm_running = None, logfile=None):
        """Attach to the existing vserver ``name``.

        vm_id      -- context id; read from the 'context' config option
                      when None.
        vm_running -- cached running state; queried from the kernel when
                      None.
        logfile    -- path used by log(); logging is disabled when unset.

        Raises NoSuchVServer if the vserver's root directory or its
        configuration directory is missing or not accessible.
        """
        self.name = name
        self.dir = "%s/%s" % (vserverimpl.VSERVER_BASEDIR, name)
        if not (os.path.isdir(self.dir) and
                os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
            raise NoSuchVServer("no such vserver: " + name)
        self.config = VServerConfig(name, "/etc/vservers/%s" % name)
        self.remove_caps = ~vserverimpl.CAP_SAFE
        if vm_id is None:
            vm_id = int(self.config.get('context'))
        self.ctx = vm_id
        if vm_running is None:
            vm_running = self.is_running()
        self.vm_running = vm_running
        self.logfile = logfile

    # inspired from nodemanager's logger
    def log(self, msg):
        """Append a UTC-timestamped ``msg`` to the log file, if one is set.

        Never raises for logging problems: on any failure the message is
        printed to stdout instead.
        """
        if not self.logfile:
            return
        try:
            fd = os.open(self.logfile,
                         os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o600)
            try:
                if not msg.endswith('\n'):
                    msg += '\n'
                os.write(fd, '%s: %s' % (time.asctime(time.gmtime()), msg))
            finally:
                # do not leak the descriptor when the write fails
                os.close(fd)
        except Exception:
            print('%s: (%s failed to open) %s'
                  % (time.asctime(time.gmtime()), self.logfile, msg))

    def set_rlimit(self, type, hard, soft, min):
        """Generic set resource limit function for vserver.

        ``type`` is a key of RLIMITS.  Each of ``hard``, ``soft`` and ``min``
        that differs from VC_LIM_KEEP is written to the configuration under
        rlimits/<type>.{hard,soft,min}.  If anything was written and the
        context is running, the limits are also applied to the kernel; an
        OSError from that call is logged, not raised.

        Returns True if at least one value was written.
        """
        update = False
        limit_name = type.lower()

        for suffix, value in (('hard', hard), ('soft', soft), ('min', min)):
            if value != VC_LIM_KEEP:
                self.config.update('rlimits/%s.%s' % (limit_name, suffix),
                                   value)
                update = True

        if self.is_running() and update:
            resource_type = RLIMITS[type]
            try:
                vserverimpl.setrlimit(self.ctx, resource_type, hard, soft, min)
            except OSError as e:
                self.log("Error: setrlimit(%d, %s, %d, %d, %d): %s"
                         % (self.ctx, limit_name, hard, soft, min, e))

        return update

    def get_prefix_from_capabilities(self, capabilities, prefix):
        """Return the comma-separated entries of ``capabilities`` that start
        with ``prefix`` in all-upper or all-lower case."""
        wanted = (prefix.upper(), prefix.lower())
        return ",".join([c for c in capabilities.split(',')
                         if c.startswith(wanted)])

    def get_bcaps_from_capabilities(self, capabilities):
        """Return only the 'cap_' entries of ``capabilities``."""
        return self.get_prefix_from_capabilities(capabilities, "cap_")

    def get_ccaps_from_capabilities(self, capabilities):
        """Return only the 'vxc_' entries of ``capabilities``."""
        return self.get_prefix_from_capabilities(capabilities, "vxc_")

    def set_capabilities_config(self, capabilities):
        """Record ``capabilities`` in the configuration and apply them.

        Returns the result of vserverimpl.setbcaps if it is positive,
        otherwise the result of vserverimpl.setccaps.
        """
        bcaps = self.get_bcaps_from_capabilities(capabilities)
        ccaps = self.get_ccaps_from_capabilities(capabilities)
        self.config.update('bcapabilities', bcaps)
        self.config.update('ccapabilities', ccaps)
        ret = vserverimpl.setbcaps(self.ctx, vserverimpl.text2bcaps(bcaps))
        if ret > 0:
            return ret
        return vserverimpl.setccaps(self.ctx, vserverimpl.text2ccaps(ccaps))

    def get_capabilities(self):
        """Return the kernel's current bcaps and ccaps as one comma-separated
        string."""
        bcaps = vserverimpl.bcaps2text(vserverimpl.getbcaps(self.ctx))
        ccaps = vserverimpl.ccaps2text(vserverimpl.getccaps(self.ctx))
        if bcaps and ccaps:
            ccaps = "," + ccaps
        return bcaps + ccaps

    def get_capabilities_config(self):
        """Return the configured bcaps and ccaps as one comma-separated
        string ('' when neither is configured)."""
        bcaps = self.config.get('bcapabilities', '')
        ccaps = self.config.get('ccapabilities', '')
        if bcaps and ccaps:
            ccaps = "," + ccaps
        return bcaps + ccaps

    def set_ipaddresses(self, addresses):
        """Replace the context's addresses with the comma-separated list
        ``addresses`` (kernel only, configuration untouched)."""
        vserverimpl.netremove(self.ctx, "all")
        for a in addresses.split(","):
            vserverimpl.netadd(self.ctx, a)

    def set_ipaddresses_config(self, addresses):
        """Store ``addresses`` as interfaces/<n>/ip options, drop any
        higher-numbered leftovers, then apply them via set_ipaddresses()."""
        addr_list = addresses.split(",")
        for i, a in enumerate(addr_list):
            self.config.update("interfaces/%d/ip" % i, a)
        # remove stale entries left over from a previously longer list
        i = len(addr_list)
        while self.config.unset("interfaces/%d/ip" % i):
            i += 1
        self.set_ipaddresses(addresses)

    def get_ipaddresses_config(self):
        """Return the configured addresses as a comma-separated string,
        reading interfaces/0/ip, interfaces/1/ip, ... until one is missing
        or empty."""
        ret = []
        i = 0
        while True:
            r = self.config.get("interfaces/%d/ip" % i, '')
            if r == '':
                break
            ret.append(r)
            i += 1
        return ",".join(ret)

    def get_ipaddresses(self):
        """Not implemented; always returns None."""
        # No clean way to do this right now.
        return None

    def __do_chroot(self):
        """chroot into the vserver's root directory and chdir to its '/'."""
        os.chroot(self.dir)
        os.chdir("/")

    def chroot_call(self, fn, *args):
        """Return fn(*args) evaluated while chroot'ed into the vserver.

        The original root and working directory are restored afterwards,
        also when ``fn`` raises.
        """
        # keep descriptors on the old cwd and root so we can climb back out
        cwd_fd = os.open(".", os.O_RDONLY)
        try:
            root_fd = os.open("/", os.O_RDONLY)
            try:
                self.__do_chroot()
                result = fn(*args)
            finally:
                os.fchdir(root_fd)
                os.chroot(".")
                os.fchdir(cwd_fd)
                os.close(root_fd)
        finally:
            os.close(cwd_fd)
        return result

    def set_disklimit(self, block_limit):
        """Set the disk space limit for this context.

        ``block_limit`` is in kB; 0 removes the limit (the configuration is
        not touched in that case).  When the vserver is not marked running,
        init_disk_info() or get_disklimit() must have been called before so
        that the current usage figures are available.  OSError from the
        kernel calls is logged, not raised.
        """
        # block_limit is in kB
        if block_limit == 0:
            try:
                vserverimpl.unsetdlimit(self.dir, self.ctx)
            except OSError:
                self.log("Unexpected error with unsetdlimit for context %d" % self.ctx)
            return

        if self.vm_running:
            block_usage = vserverimpl.DLIMIT_KEEP
            inode_usage = vserverimpl.DLIMIT_KEEP
        else:
            # init_disk_info() must have been called to get usage values
            block_usage = self.disk_blocks
            inode_usage = self.disk_inodes

        try:
            vserverimpl.setdlimit(self.dir,
                                  self.ctx,
                                  block_usage,
                                  block_limit,
                                  inode_usage,
                                  vserverimpl.DLIMIT_INF,  # inode limit
                                  2)   # %age reserved for root
        except OSError:
            self.log("Unexpected error with setdlimit for context %d" % self.ctx)

        self.config.update('dlimits/0/space_total', block_limit)

    def is_running(self):
        """Return vserverimpl's view of whether the context is running."""
        return vserverimpl.isrunning(self.ctx)

    def get_disklimit(self):
        """Return the current block limit, or -1 if none has been set.

        On success the usage figures are stored in self.disk_blocks and
        self.disk_inodes.  OSError other than ESRCH is re-raised.
        """
        try:
            (self.disk_blocks, block_limit, self.disk_inodes, inode_limit,
             reserved) = vserverimpl.getdlimit(self.dir, self.ctx)
        except OSError as ex:
            if ex.errno != errno.ESRCH:
                raise
            # get here if no vserver disk limit has been set for xid
            block_limit = -1

        return block_limit

    def set_sched_config(self, cpu_min, cpu_share):
        """ Write CPU scheduler parameters to the vserver configuration
        (sched/fill-rate and sched/fill-rate2; sched/idle-time is removed
        when cpu_share is 0).  If the context is currently running the
        kernel parameters are updated as well, through set_sched(). """

        self.config.update('sched/fill-rate', cpu_min)
        self.config.update('sched/fill-rate2', cpu_share)
        if cpu_share == 0:
            self.config.unset('sched/idle-time')

        if self.is_running():
            self.set_sched(cpu_min, cpu_share)

    def set_sched(self, cpu_min, cpu_share):
        """ Update kernel CPU scheduling parameters for this context. """
        vserverimpl.setsched(self.ctx, cpu_min, cpu_share)

    def get_sched(self):
        """Not implemented; always returns (-1, False)."""
        # have no way of querying scheduler right now on a per vserver basis
        return (-1, False)

    def set_bwlimit(self, minrate = bwlimit.bwmin, maxrate = None,
                    exempt_min = None, exempt_max = None,
                    share = None, dev = "eth0"):
        """Turn bandwidth limiting on for this context on ``dev``, or off
        when ``minrate`` is None."""
        if minrate is None:
            bwlimit.off(self.ctx, dev)
        else:
            bwlimit.on(self.ctx, dev, share,
                       minrate, maxrate, exempt_min, exempt_max)

    def get_bwlimit(self, dev = "eth0"):
        """Return bwlimit.get()'s result for this context without its
        leading ctx element (the falsy result is returned unchanged).

        ``dev`` is accepted but not used.
        """
        result = bwlimit.get(self.ctx)
        # result of bwlimit.get is (ctx, share, minrate, maxrate)
        if result:
            result = result[1:]
        return result

    def open(self, filename, mode = "r", bufsize = -1):
        """Open ``filename`` relative to the vserver's root filesystem."""
        return self.chroot_call(open, filename, mode, bufsize)

    def __do_chcontext(self, state_file):
        """Enter the vserver's context.

        If ``state_file`` is given, the context id is written to it and the
        file is closed first.  When vserverimpl.chcontext returns a true
        value, set_resources() is called followed by vserverimpl.setup_done.
        """
        if state_file:
            state_file.write("%u\n" % self.ctx)
            state_file.close()

        if vserverimpl.chcontext(self.ctx, vserverimpl.text2bcaps(self.get_capabilities_config())):
            self.set_resources()
            vserverimpl.setup_done(self.ctx)

    def __prep(self, runlevel):

        """ Perform all the crap that the vserver script does before
        actually executing the startup scripts. """

        # remove /var/run and /var/lock/subsys files
        # but don't remove utmp from the top-level /var/run
        RUNDIR = "/var/run"
        LOCKDIR = "/var/lock/subsys"
        garbage = []
        at_top = True
        for dirpath, subdirs, files in os.walk(RUNDIR):
            if at_top:
                # utmp is only protected in RUNDIR itself, not in subdirs
                files = [f for f in files if f != 'utmp']
                at_top = False
            garbage += [dirpath + "/" + f for f in files]
        garbage += [p for p in [LOCKDIR + "/" + f
                                for f in os.listdir(LOCKDIR)]
                    if os.path.isfile(p)]
        # the actual removal is switched off; the list is only computed
        if False:
            for f in garbage:
                os.unlink(f)

        # set the initial runlevel
        vserverimpl.setrunlevel(RUNDIR + "/utmp", runlevel)

        # mount /proc and /dev/pts
        self.__do_mount("none", self.dir, "/proc", "proc")
        # XXX - magic mount options
        self.__do_mount("none", self.dir, "/dev/pts", "devpts", 0, "gid=5,mode=0620")

    def __do_mount(self, *mount_args):
        """Call vserverimpl.mount, treating EBUSY as 'already mounted'."""
        try:
            vserverimpl.mount(*mount_args)
        except OSError as ex:
            if ex.errno == errno.EBUSY:
                # assume already mounted
                return
            raise

    def enter(self):
        """Move the calling process into the vserver: snapshot the config,
        chroot into its filesystem and switch to its context."""
        self.config.cache_it()
        self.__do_chroot()
        self.__do_chcontext(None)

    def start(self, runlevel = 3):
        """Boot the vserver in a forked child process.

        The parent marks the vserver as running and returns immediately.
        The child enters the vserver, spawns each existing INITSCRIPTS entry
        without waiting for it, and always terminates with os._exit(0).
        """
        if os.fork() != 0:
            # Parent should just return.
            self.vm_running = True
            return

        # child process
        try:
            # so we don't chcontext with priv'ed fds
            close_nonstandard_fds()

            # get a new session
            os.setsid()

            # open state file to record vserver info
            state_file = open("/var/run/vservers/%s" % self.name, "w")

            # use /dev/null for stdin, /var/log/boot.log for stdout/err
            fd = os.open("/dev/null", os.O_RDONLY)
            if fd != 0:
                os.dup2(fd, 0)
                os.close(fd)

            # perform pre-init cleanup
            self.__prep(runlevel)

            self.config.cache_it()
            self.__do_chroot()
            boot_log = open("/var/log/boot.log", "a", 0)
            if boot_log.fileno() != 1:
                os.dup2(boot_log.fileno(), 1)
            os.dup2(1, 2)

            boot_log.write("%s: starting the virtual server %s\n" %
                           (time.asctime(time.gmtime()), self.name))
            # execute each init script in turn
            # XXX - we don't support all scripts that vserver script does
            self.__do_chcontext(state_file)
            arg_subst = { 'runlevel': runlevel }
            for cmd in self.INITSCRIPTS:
                try:
                    cmd_args = [cmd[0]] + [x % arg_subst for x in cmd[1:]]
                    if os.path.isfile(cmd[0]):
                        boot_log.write("executing '%s'\n" % " ".join(cmd_args))
                        os.spawnvp(os.P_NOWAIT, cmd[0], cmd_args)
                except Exception:
                    boot_log.write(traceback.format_exc() + "\n")

        # we get here due to an exception in the top-level child process
        except Exception:
            self.log(traceback.format_exc())
        finally:
            # the child must never return into the caller's code, whatever
            # kind of exception ended the block above
            os._exit(0)

    def set_resources(self):

        """ Called when vserver context is entered for first time,
        should be overridden by subclass. """

        pass

    def init_disk_info(self):
        """Determine current disk usage and return it as blocks * 1024.

        Stores the figures in self.disk_blocks and self.disk_inodes.  The
        values come from vserverimpl.getdlimit when that works, otherwise
        from running /usr/sbin/vdu over the vserver's directory.

        Raises ValueError if vdu's first output line is not exactly two
        fields.
        """
        try:
            dlimit = vserverimpl.getdlimit(self.dir, self.ctx)
            self.disk_blocks = dlimit[0]
            self.disk_inodes = dlimit[2]
            return self.disk_blocks * 1024
        except Exception:
            # any failure here: fall back to measuring with vdu
            pass

        # argument list instead of a shell string, so the directory name is
        # never interpreted by a shell
        cmd = ["/usr/sbin/vdu", "--script", "--space", "--inodes",
               "--blocksize", "1024", "--xid", "%d" % self.ctx, self.dir]
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             close_fds=True)
        # communicate() drains both pipes, so vdu cannot block on a full one
        out, err = p.communicate()
        lines = out.splitlines()
        if lines:
            line = lines[0]
        else:
            line = ""
            sys.stderr.write(err)

        fields = line.split()
        if len(fields) != 2:
            raise ValueError("unexpected vdu output for %s: %r"
                             % (self.dir, line))
        (space, inodes) = fields
        self.disk_inodes = int(inodes)
        self.disk_blocks = int(space)
        #(self.disk_inodes, self.disk_blocks) = vduimpl.vdu(self.dir)

        return self.disk_blocks * 1024

    def stop(self, signal = signal.SIGKILL):
        """Send ``signal`` to the context via vserverimpl.killall and mark
        the vserver as not running."""
        vserverimpl.killall(self.ctx, signal)
        self.vm_running = False

    def setname(self, slice_id):
        '''Set vcVHI_CONTEXT field in kernel to slice_id'''
        vserverimpl.setname(self.ctx, slice_id)

    def getname(self):
        '''Get vcVHI_CONTEXT field in kernel'''
        return vserverimpl.getname(self.ctx)
520
521
def create(vm_name, static = False, ctor = VServer):
    """Create a new vserver with the external 'vuseradd' command.

    vm_name -- name of the vserver (and of the account vuseradd creates).
    static  -- pass '--static' to vuseradd.
    ctor    -- callable invoked as ctor(vm_name, vm_id) to build the result.

    Returns whatever ``ctor`` returns; vm_id is the uid of the ``vm_name``
    account.  Raises SystemError if vuseradd exits non-zero or is killed by
    a signal.
    """
    options = ['vuseradd']
    if static:
        options += ['--static']
    # With P_WAIT, spawnvp returns the exit code itself, or the negated
    # signal number; it is not a wait() status word.
    ret = os.spawnvp(os.P_WAIT, 'vuseradd', options + [vm_name])
    if ret != 0:
        out = "system command ('%s') " % options
        if ret > 0:
            out += "failed, rc = %d" % ret
        else:
            out += "killed by signal %d" % -ret
        raise SystemError(out)
    vm_id = pwd.getpwnam(vm_name)[2]

    return ctor(vm_name, vm_id)
538
539
def close_nonstandard_fds():
    """Close all open file descriptors other than 0, 1, and 2."""
    # Ask for the limit by name: the numeric sysconf index differs between
    # platforms, the symbolic name does not.
    max_fd = os.sysconf("SC_OPEN_MAX")
    for fd in range(3, max_fd):
        try:
            os.close(fd)
        except OSError:
            pass  # most likely an fd that isn't open
546