Use cgroup CPU scheduler
[util-vserver-pl.git] / python / vserver.py
1 # Copyright 2005 Princeton University
2
3 #$Id: vserver.py,v 1.72 2007/08/02 16:01:59 dhozac Exp $
4
5 import errno
6 import fcntl
7 import os
8 import re
9 import pwd
10 import signal
11 import sys
12 import time
13 import traceback
14 import subprocess
15 import resource
16
17 import vserverimpl
18 import cpulimit, bwlimit
19
20 from vserverimpl import DLIMIT_INF
21 from vserverimpl import VC_LIM_KEEP
22 from vserverimpl import VLIMIT_NSOCK
23 from vserverimpl import VLIMIT_OPENFD
24 from vserverimpl import VLIMIT_ANON
25 from vserverimpl import VLIMIT_SHMEM
26
#
# Context flags, as found in the kernel's linux/vserver/legacy.h
#
FLAGS_LOCK = 1 << 0
FLAGS_SCHED = 1 << 1  # XXX - defined in util-vserver/src/chcontext.c
FLAGS_NPROC = 1 << 2
FLAGS_PRIVATE = 1 << 3
FLAGS_INIT = 1 << 4
FLAGS_HIDEINFO = 1 << 5
FLAGS_ULIMIT = 1 << 6
FLAGS_NAMESPACE = 1 << 7

# Resource-limit name -> numeric id; seeded with the vserver-specific limits.
RLIMITS = {
    "NSOCK": VLIMIT_NSOCK,
    "OPENFD": VLIMIT_OPENFD,
    "ANON": VLIMIT_ANON,
    "SHMEM": VLIMIT_SHMEM,
}

# Factor applied to a CPU share before it is stored as cgroup cpu.shares.
CPU_SHARE_MULT = 1024

# Add every RLIMIT_* constant the platform's resource module exposes, keyed
# by its name without the "RLIMIT_" prefix.  A name already present in
# RLIMITS is left untouched and reported.
for entry in resource.__dict__.keys():
    if not entry.startswith("RLIMIT_"):
        continue
    k = entry[len("RLIMIT_"):]
    if k in RLIMITS:
        sys.stdout.write("WARNING: duplicate RLIMITS key %s" % k + "\n")
    else:
        RLIMITS[k] = resource.__dict__[entry]
54
class NoSuchVServer(Exception):
    """Raised when a vserver's directory is missing or not accessible."""
56
57
class VServerConfig:
    """File-backed key/value store for one vserver's configuration.

    Every option is a file beneath the configuration directory (an option
    name may contain "/" to address a sub-directory); its value is the
    file's content with trailing whitespace stripped.  Once cache_it() has
    been called the object is a read-only in-memory snapshot: get() is
    answered from the snapshot and update()/unset() do nothing.
    """

    # Octal permission values, spelled through int(..., 8) so that this
    # module parses regardless of which octal literal syntax (0755 or
    # 0o755) the running interpreter accepts.
    _UMASK = int("022", 8)
    _DIR_MODE = int("755", 8)

    def __init__(self, name, directory):
        """Bind to configuration directory `directory` of vserver `name`.

        Raises NoSuchVServer unless `directory` is a directory that is
        readable, writable and searchable.
        """
        self.name = name
        self.dir = directory
        self.cache = None
        if not (os.path.isdir(self.dir) and
                os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
            raise NoSuchVServer("%s does not exist" % self.dir)

    def get(self, option, default = None):
        """Return the value of `option`.

        The value comes from the snapshot when one exists, otherwise from
        the option's file.  When the option cannot be read, `default` is
        returned if it is not None; otherwise KeyError is raised.
        """
        try:
            # "is not None": an empty snapshot is still a snapshot.
            if self.cache is not None:
                return self.cache[option]
            f = open(os.path.join(self.dir, option), "r")
            try:
                return f.read().rstrip()
            finally:
                f.close()
        except (KeyError, IOError, OSError):
            if default is not None:
                return default
            raise KeyError("Key %s is not set for %s" % (option, self.name))

    def update(self, option, value):
        """Write `value` to the file backing `option`.

        A list is written one element per line; anything else is written
        through "%s".  Missing parent directories are created.  Does
        nothing once a snapshot exists.  I/O errors from opening or
        writing the file propagate to the caller.
        """
        if self.cache is not None:
            return

        filename = os.path.join(self.dir, option)
        # Format first so that a bad value cannot leave a truncated file.
        if isinstance(value, list):
            text = "%s\n" % "\n".join(value)
        else:
            text = "%s\n" % value

        old_umask = os.umask(self._UMASK)
        try:
            try:
                os.makedirs(os.path.dirname(filename), self._DIR_MODE)
            except OSError:
                # Usually "already exists"; a genuine failure resurfaces
                # from open() just below.
                pass
            f = open(filename, 'w')
            try:
                f.write(text)
            finally:
                f.close()
        finally:
            # Restore the caller's umask even when the write failed.
            os.umask(old_umask)

    def unset(self, option):
        """Delete the file backing `option`.

        Returns True when the file was removed and False when it could not
        be; returns None (and does nothing) once a snapshot exists.  Parent
        directories left empty by the removal are pruned as well.
        """
        if self.cache is not None:
            return

        filename = os.path.join(self.dir, option)
        try:
            os.unlink(filename)
        except OSError:
            return False
        try:
            os.removedirs(os.path.dirname(filename))
        except OSError:
            # Parent still has entries (or cannot be removed): fine.
            pass
        return True

    def cache_it(self):
        """Load every readable regular file under the directory into memory.

        Keys are paths relative to the configuration directory.  Symbolic
        links are ignored.  After this call the object serves get() from
        the snapshot and ignores update()/unset().
        """
        snapshot = {}
        prefix = os.path.join(self.dir, '')
        for dirname, subdirs, fnames in os.walk(self.dir):
            for fname in fnames:
                full_name = os.path.join(dirname, fname)
                if os.path.islink(full_name):
                    continue
                if not (os.path.isfile(full_name) and
                        os.access(full_name, os.R_OK)):
                    continue
                f = open(full_name, "r")
                try:
                    # Strip only the leading directory prefix from the key.
                    snapshot[full_name[len(prefix):]] = f.read().rstrip()
                finally:
                    f.close()
        # Published only once complete, so a failure leaves the old state.
        self.cache = snapshot
132
133
class VServer:
    """Handle on one vserver: its root directory, configuration and context.

    The root directory is "<vserverimpl.VSERVER_BASEDIR>/<name>", the
    persistent configuration lives in "/etc/vservers/<name>" (accessed
    through a VServerConfig), and self.ctx is the context id passed to the
    vserverimpl calls.
    """

    INITSCRIPTS = [('/etc/rc.vinit', 'start'),
                   ('/etc/rc.d/rc', '%(runlevel)d')]

    # Permission bits for a log file created by log(); spelled through
    # int(..., 8) so that this module parses under either octal literal
    # syntax (0600 or 0o600).
    _LOG_MODE = int("600", 8)

    def __init__(self, name, vm_id = None, vm_running = None, logfile=None):
        """Attach to the existing vserver `name`.

        vm_id      -- context id; read from the 'context' config option
                      when None.
        vm_running -- cached running state; probed with is_running() when
                      None.
        logfile    -- path that log() appends to; logging is off when unset.

        Raises NoSuchVServer when the root or configuration directory is
        missing or inaccessible.
        """
        self.name = name
        self.dir = "%s/%s" % (vserverimpl.VSERVER_BASEDIR, name)
        if not (os.path.isdir(self.dir) and
                os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
            raise NoSuchVServer("no such vserver: " + name)
        self.config = VServerConfig(name, "/etc/vservers/%s" % name)
        self.remove_caps = ~vserverimpl.CAP_SAFE
        if vm_id is None:
            vm_id = int(self.config.get('context'))
        self.ctx = vm_id
        if vm_running is None:
            vm_running = self.is_running()
        self.vm_running = vm_running
        self.logfile = logfile

    # inspired from nodemanager's logger
    def log(self,msg):
        """Append a GMT-timestamped `msg` to self.logfile, if one is set.

        Never raises: when the log file cannot be written the message is
        printed on standard output instead.
        """
        if not self.logfile:
            return
        try:
            fd = os.open(self.logfile,
                         os.O_WRONLY | os.O_CREAT | os.O_APPEND,
                         self._LOG_MODE)
            try:
                if not msg.endswith('\n'):
                    msg += '\n'
                os.write(fd, '%s: %s' % (time.asctime(time.gmtime()), msg))
            finally:
                # Do not leak the descriptor when the write fails.
                os.close(fd)
        except Exception:
            # Logging is best effort and must not take the caller down.
            sys.stdout.write('%s: (%s failed to open) %s'
                             % (time.asctime(time.gmtime()), self.logfile, msg)
                             + '\n')

    def set_rlimit(self, type, hard, soft, min):
        """Generic set resource limit function for vserver

        Each of `hard`, `soft` and `min` that differs from VC_LIM_KEEP is
        stored under 'rlimits/<type>.<hard|soft|min>'.  If anything was
        stored and the vserver is running, the limits are also applied to
        the live context; an OSError from that step is logged, not raised.
        `type` must be a key of RLIMITS.  Returns True when at least one
        value was stored.
        """
        update = False
        key = type.lower()
        for suffix, value in (('hard', hard), ('soft', soft), ('min', min)):
            if value != VC_LIM_KEEP:
                self.config.update('rlimits/%s.%s' % (key, suffix), value)
                update = True

        if self.is_running() and update:
            resource_type = RLIMITS[type]
            try:
                vserverimpl.setrlimit(self.ctx, resource_type, hard, soft, min)
            except OSError:
                e = sys.exc_info()[1]
                self.log("Error: setrlimit(%d, %s, %d, %d, %d): %s"
                         % (self.ctx, key, hard, soft, min, e))

        return update

    def get_prefix_from_capabilities(self, capabilities, prefix):
        """Return the comma-separated entries of `capabilities` that start
        with `prefix` written either all upper-case or all lower-case."""
        upper = prefix.upper()
        lower = prefix.lower()
        return ",".join([c for c in capabilities.split(',')
                         if c.startswith(upper) or c.startswith(lower)])

    def get_bcaps_from_capabilities(self, capabilities):
        """Return the "cap_" entries of a capability list."""
        return self.get_prefix_from_capabilities(capabilities, "cap_")

    def get_ccaps_from_capabilities(self, capabilities):
        """Return the "vxc_" entries of a capability list."""
        return self.get_prefix_from_capabilities(capabilities, "vxc_")

    def set_capabilities_config(self, capabilities):
        """Store and apply the capabilities in the comma-separated list.

        CAP_NET_RAW is always appended to the "cap_" set.  Both sets are
        written to the configuration and then applied to the context.
        Returns the result of setbcaps when it is positive, otherwise the
        result of setccaps.
        """
        bcaps = self.get_bcaps_from_capabilities(capabilities)
        ccaps = self.get_ccaps_from_capabilities(capabilities)
        if len(bcaps) > 0:
            bcaps += ","
        bcaps += "CAP_NET_RAW"
        self.config.update('bcapabilities', bcaps)
        self.config.update('ccapabilities', ccaps)
        ret = vserverimpl.setbcaps(self.ctx, vserverimpl.text2bcaps(bcaps))
        if ret > 0:
            return ret
        return vserverimpl.setccaps(self.ctx, vserverimpl.text2ccaps(ccaps))

    def get_capabilities(self):
        """Return the context's current capabilities as one comma-separated
        string, as reported by vserverimpl."""
        bcaps = vserverimpl.bcaps2text(vserverimpl.getbcaps(self.ctx))
        ccaps = vserverimpl.ccaps2text(vserverimpl.getccaps(self.ctx))
        if bcaps and ccaps:
            ccaps = "," + ccaps
        return (bcaps + ccaps)

    def get_capabilities_config(self):
        """Return the configured capabilities ('bcapabilities' followed by
        'ccapabilities') as one comma-separated string."""
        bcaps = self.config.get('bcapabilities', '')
        ccaps = self.config.get('ccapabilities', '')
        if bcaps and ccaps:
            ccaps = "," + ccaps
        return (bcaps + ccaps)

    def set_ipaddresses(self, addresses):
        """Replace the context's addresses with the comma-separated list
        `addresses`: all existing ones are removed, then each is added."""
        vserverimpl.netremove(self.ctx, "all")
        for a in addresses.split(","):
            vserverimpl.netadd(self.ctx, a)

    def set_ipaddresses_config(self, addresses):
        """Currently a no-op.

        NOTE(review): the unconditional return below disables the rest of
        the body (writing 'interfaces/<i>/ip' options, dropping stale ones
        and applying the addresses).  Presumably deliberate -- confirm
        before removing either the return or the code after it.
        """
        return
        i = 0
        for a in addresses.split(","):
            self.config.update("interfaces/%d/ip" % i, a)
            i += 1
        while self.config.unset("interfaces/%d/ip" % i):
            i += 1
        self.set_ipaddresses(addresses)

    def get_ipaddresses_config(self):
        """Return the configured addresses as a comma-separated string.

        Reads 'interfaces/0/ip', 'interfaces/1/ip', ... and stops at the
        first index that is missing or empty.
        """
        ret = []
        i = 0
        while True:
            r = self.config.get("interfaces/%d/ip" % i, '')
            if r == '':
                break
            ret.append(r)
            i += 1
        return ",".join(ret)

    def get_ipaddresses(self):
        """Always returns None."""
        # No clean way to do this right now.
        return None

    def get_unshare_netns_config(self):
        """Return the 'spaces/net' option as an int, or 0 when it is unset
        or not a number."""
        try:
            unshare_netns = int(self.config.get('spaces/net'))
        except (KeyError, ValueError):
            unshare_netns = 0
        return unshare_netns

    def __do_chroot(self):
        """chroot into the vserver's root directory and chdir to "/"."""
        os.chroot(self.dir)
        os.chdir("/")

    def chroot_call(self, fn, *args):
        """Return fn(*args) evaluated while chrooted into the vserver.

        Descriptors on the current root and working directory are kept
        open across the call so both can be restored afterwards, whether
        or not `fn` raises.
        """
        cwd_fd = os.open(".", os.O_RDONLY)
        try:
            root_fd = os.open("/", os.O_RDONLY)
            try:
                self.__do_chroot()
                result = fn(*args)
            finally:
                # Step back out through the saved root descriptor, re-root
                # there, then return to the original working directory.
                os.fchdir(root_fd)
                os.chroot(".")
                os.fchdir(cwd_fd)
                os.close(root_fd)
        finally:
            os.close(cwd_fd)
        return result

    def set_disklimit(self, block_limit):
        """Set the disk block limit (in kB); 0 removes the limit.

        For a vserver that is not running, init_disk_info() (or
        get_disklimit()) must have been called first so that
        self.disk_blocks and self.disk_inodes are set.  OSError from
        vserverimpl is logged, not raised.
        """
        # block_limit is in kB
        if block_limit == 0:
            try:
                vserverimpl.unsetdlimit(self.dir, self.ctx)
            except OSError:
                self.log("Unexpected error with unsetdlimit for context %d" % self.ctx)
            return

        if self.vm_running:
            block_usage = vserverimpl.DLIMIT_KEEP
            inode_usage = vserverimpl.DLIMIT_KEEP
        else:
            # init_disk_info() must have been called to get usage values
            block_usage = self.disk_blocks
            inode_usage = self.disk_inodes

        try:
            vserverimpl.setdlimit(self.dir,
                                  self.ctx,
                                  block_usage,
                                  block_limit,
                                  inode_usage,
                                  vserverimpl.DLIMIT_INF,  # inode limit
                                  2)   # %age reserved for root
        except OSError:
            self.log("Unexpected error with setdlimit for context %d" % self.ctx)

        self.config.update('dlimits/0/space_total', block_limit)

    def is_running(self):
        """Return vserverimpl's view of whether the context is running."""
        return vserverimpl.isrunning(self.ctx)

    def get_disklimit(self):
        """Return the block limit, or -1 when no disk limit is set.

        On success self.disk_blocks and self.disk_inodes are refreshed as
        a side effect.  Any OSError other than ESRCH is re-raised.
        """
        try:
            (self.disk_blocks, block_limit, self.disk_inodes, inode_limit,
             reserved) = vserverimpl.getdlimit(self.dir, self.ctx)
        except OSError:
            if sys.exc_info()[1].errno != errno.ESRCH:
                raise
            # get here if no vserver disk limit has been set for xid
            block_limit = -1

        return block_limit

    def set_sched_config(self, cpu_min, cpu_share):
        """ Write current CPU scheduler parameters to the vserver
        configuration file. Currently, 'cpu_min' is not supported.

        The share is stored as cpu_share * CPU_SHARE_MULT under
        'cgroup/cpu.shares'; if the vserver is running the value is also
        pushed to the kernel through set_sched(). """
        self.config.update('cgroup/cpu.shares', cpu_share * CPU_SHARE_MULT)
        if self.is_running():
            self.set_sched(cpu_min, cpu_share)

    def set_sched(self, cpu_min, cpu_share):
        """ Update kernel CPU scheduling parameters for this context.
        Currently, 'cpu_min' is not supported.

        Writes cpu_share * CPU_SHARE_MULT to
        /dev/cgroup/<name>/cpu.shares.  Best effort: a failure is logged
        and not raised. """
        try:
            cgroup = open('/dev/cgroup/%s/cpu.shares' % self.name, 'w')
            try:
                cgroup.write('%s' % (cpu_share * CPU_SHARE_MULT))
            finally:
                cgroup.close()
        except Exception:
            self.log("Error: could not set cpu.shares for %s: %s"
                     % (self.name, sys.exc_info()[1]))

    def get_sched(self):
        """Return (-1, cpu_share) from the stored configuration.

        cpu_share is the 'cgroup/cpu.shares' option divided by
        CPU_SHARE_MULT, or False when the option is unset or not a number.
        """
        try:
            cpu_share = int(self.config.get('cgroup/cpu.shares')) // CPU_SHARE_MULT
        except (KeyError, ValueError):
            cpu_share = False
        return (-1, cpu_share)

    def set_bwlimit(self, minrate = bwlimit.bwmin, maxrate = None,
                    exempt_min = None, exempt_max = None,
                    share = None, dev = "eth0"):
        """Enable bandwidth limiting for this context on `dev`, or disable
        it when `minrate` is None."""
        if minrate is None:
            bwlimit.off(self.ctx, dev)
        else:
            bwlimit.on(self.ctx, dev, share,
                       minrate, maxrate, exempt_min, exempt_max)

    def get_bwlimit(self, dev = "eth0"):
        """Return bwlimit.get()'s result for this context without its
        leading element, or the falsy result unchanged.

        NOTE(review): `dev` is accepted but not passed on to bwlimit.get();
        confirm whether that is intended.
        """
        result = bwlimit.get(self.ctx)
        # result of bwlimit.get is (ctx, share, minrate, maxrate)
        if result:
            result = result[1:]
        return result

    def open(self, filename, mode = "r", bufsize = -1):
        """Open `filename`, resolved inside the vserver's root, with the
        builtin open() and return the file object."""
        return self.chroot_call(open, filename, mode, bufsize)

    def __do_chcontext(self, state_file):
        """Enter the vserver's context.

        When `state_file` is given, the context id is written to it and it
        is closed first.  If vserverimpl.chcontext() returns a true value,
        set_resources() is run and setup completion is signalled.
        """
        if state_file:
            state_file.write("%u\n" % self.ctx)
            state_file.close()

        if vserverimpl.chcontext(self.ctx,
                                 vserverimpl.text2bcaps(self.get_capabilities_config()),
                                 self.get_unshare_netns_config()):
            self.set_resources()
            vserverimpl.setup_done(self.ctx)

    def __prep(self, runlevel):
        """ Perform the preparation that the vserver script does before
        actually executing the startup scripts: record the runlevel in
        utmp and mount /proc and /dev/pts. """

        # collect /var/run and /var/lock/subsys files
        # but leave out utmp from the top-level /var/run
        RUNDIR = "/var/run"
        LOCKDIR = "/var/lock/subsys"
        garbage = []
        top_level = True
        for dir, subdirs, files in os.walk(RUNDIR):
            for f in files:
                if top_level and f == 'utmp':
                    continue
                garbage.append(dir + "/" + f)
            top_level = False
        garbage += [path for path in
                    [LOCKDIR + "/" + entry for entry in os.listdir(LOCKDIR)]
                    if os.path.isfile(path)]
        # NOTE(review): removal of the collected files is switched off;
        # the list is built but nothing is deleted.
        if False:
            for f in garbage:
                os.unlink(f)

        # set the initial runlevel
        vserverimpl.setrunlevel(RUNDIR + "/utmp", runlevel)

        # mount /proc and /dev/pts
        self.__do_mount("none", self.dir, "/proc", "proc")
        # XXX - magic mount options
        self.__do_mount("none", self.dir, "/dev/pts", "devpts", 0, "gid=5,mode=0620")

    def __do_mount(self, *mount_args):
        """Call vserverimpl.mount(*mount_args), treating EBUSY as "already
        mounted"; any other OSError propagates."""
        try:
            vserverimpl.mount(*mount_args)
        except OSError:
            if sys.exc_info()[1].errno == errno.EBUSY:
                # assume already mounted
                return
            # bare raise keeps the original traceback
            raise

    def enter(self):
        """Run "/usr/sbin/vserver <name> enter" and wait for it."""
        # NOTE(review): the name is interpolated into a shell command
        # unquoted; it must not contain shell metacharacters.
        subprocess.call("/usr/sbin/vserver %s enter" % self.name, shell=True)

    def start(self, runlevel = 3):
        """Start the vserver in a forked child and return immediately.

        The parent marks the vserver as running and returns.  The child
        runs "/usr/sbin/vserver <name> start", logs any exception, and
        always terminates with os._exit(0).  `runlevel` is not used.
        """
        if os.fork() != 0:
            # Parent should just return.
            self.vm_running = True
            return

        # child process
        try:
            try:
                subprocess.call("/usr/sbin/vserver %s start" % self.name,
                                shell=True)
            # we get here due to an exception in the top-level child process
            except Exception:
                self.log(traceback.format_exc())
        finally:
            # Whatever happened, the child must never return to the caller.
            os._exit(0)

    def set_resources(self):
        """ Called when vserver context is entered for first time,
        should be overridden by subclass. """
        pass

    def init_disk_info(self):
        """Refresh self.disk_blocks / self.disk_inodes; return usage in bytes.

        The values come from vserverimpl.getdlimit() when that succeeds;
        otherwise /usr/sbin/vdu is run and its first output line (space
        and inode counts) is parsed.  The return value is
        self.disk_blocks * 1024.  Raises ValueError when vdu's output
        cannot be parsed.
        """
        try:
            dlimit = vserverimpl.getdlimit(self.dir, self.ctx)
            self.disk_blocks = dlimit[0]
            self.disk_inodes = dlimit[2]
            return self.disk_blocks * 1024
        except Exception:
            # Fall back to measuring with vdu below.
            pass

        cmd = "/usr/sbin/vdu --script --space --inodes --blocksize 1024 --xid %d %s" % (self.ctx, self.dir)
        p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             close_fds=True)
        # communicate() closes stdin, drains both pipes and reaps the child.
        (out, err) = p.communicate()
        lines = out.splitlines()
        if lines:
            line = lines[0]
        else:
            line = ''
            sys.stderr.write(err)

        fields = line.split()
        if len(fields) != 2:
            raise ValueError("unexpected vdu output for context %d: %r"
                             % (self.ctx, line))
        (space, inodes) = fields
        self.disk_inodes = int(inodes)
        self.disk_blocks = int(space)
        #(self.disk_inodes, self.disk_blocks) = vduimpl.vdu(self.dir)

        return self.disk_blocks * 1024

    def stop(self, signal = signal.SIGKILL):
        """Mark the vserver as stopped and run "/usr/sbin/vserver <name>
        stop".  The `signal` argument is not used."""
        self.vm_running = False
        subprocess.call("/usr/sbin/vserver %s stop" % self.name, shell=True)

    def setname(self, slice_id):
        '''Set vcVHI_CONTEXT field in kernel to slice_id'''
        vserverimpl.setname(self.ctx, slice_id)

    def getname(self):
        '''Get vcVHI_CONTEXT field in kernel'''
        return vserverimpl.getname(self.ctx)
493
494
def create(vm_name, static = False, ctor = VServer):
    """Create a vserver with 'vuseradd' and return a handle on it.

    vm_name -- name of the vserver (and of the account vuseradd creates)
    static  -- pass '--static' to vuseradd
    ctor    -- callable invoked as ctor(vm_name, vm_id) to build the result

    The context id is the numeric uid of the `vm_name` account.  Raises
    SystemError when vuseradd exits with a non-zero code or is killed by a
    signal.
    """
    options = ['vuseradd']
    if static:
        options += ['--static']
    # With P_WAIT, spawnvp() returns the exit code itself, or the negated
    # signal number -- not a wait() status word, so the os.W* macros must
    # not be applied to it.
    ret = os.spawnvp(os.P_WAIT, 'vuseradd', options + [vm_name])
    if ret != 0:
        out = "system command ('%s') " % options
        if ret > 0:
            out += "failed, rc = %d" % ret
        else:
            out += "killed by signal %d" % -ret
        raise SystemError(out)
    vm_id = pwd.getpwnam(vm_name)[2]

    return ctor(vm_name, vm_id)
511
512
def close_nonstandard_fds():
    """Close all open file descriptors other than 0, 1, and 2."""
    try:
        # Enumerate what is actually open, so the work does not grow with
        # the descriptor limit.
        candidates = [int(entry) for entry in os.listdir("/proc/self/fd")]
    except (OSError, ValueError):
        # No usable /proc: try every descriptor up to the limit, looked up
        # by name rather than by a platform-specific index.
        candidates = range(3, os.sysconf("SC_OPEN_MAX"))
    for fd in candidates:
        if fd < 3:
            continue
        try:
            os.close(fd)
        except OSError:
            pass  # most likely an fd that isn't open
519