shorten cgroups-related code, and try as much sysfs related stuff before exiting
[lxc-userspace.git] / lxcsu
1 #!/usr/bin/python
2
3 import sys
4 import os
5 import setns
6
7 from argparse import ArgumentParser
8
# comma-separated capability list handed to capsh as '--drop=<list>' by main()
# (not used when lxcsu is invoked with -r/--root)
drop_capabilities='cap_sys_admin,cap_sys_boot,cap_sys_module'

# module-wide debug switch
# can set to True here, but also use the -d option
debug = False
13
14 #################### should go into a separate libvirtsystemd.py
15 # but we want to keep packaging simple for now
16
17 # reproducing libvirt's systemd-oriented escaping mechanism
18 # http://code.metager.de/source/xref/lib/virt/src/util/virsystemd.c
# (the original C code is quoted in comments right after each function below)
20
def virSystemdEscapeName (name):
    """Escape *name* following libvirt's virSystemdEscapeName().

    The rules, applied byte by byte as in the C original quoted below:
      * a '.' in first position is escaped;
      * '/' becomes '-';
      * '-', '\\' and every byte outside of [0-9a-zA-Z:_.] are escaped;
      * anything else is copied unchanged.
    An escaped byte is rendered as a backslash, an 'x' and exactly two
    lowercase hex digits (e.g. '-' gives '\\x2d', a tab gives '\\x09').

    name: the string to escape; unicode text is encoded as UTF-8 first
          so that escaping operates on bytes like the C code does.
    Returns the escaped name as an ASCII-only string.
    """
    VALID_CHARS = (
        "0123456789"
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        ":-_.\\"
    )

    def ESCAPE(byte):
        # always two hex digits, like the hextable lookups in the C macro
        return '\\x%02x' % byte

    # work on raw bytes: plain str on python2, UTF-8 encoded text otherwise
    raw = name if isinstance(name, bytes) else name.encode('utf-8')

    pieces = []
    for position, byte in enumerate(bytearray(raw)):
        c = chr(byte)
        if position == 0 and c == '.':
            # a leading dot gets escaped, see the 'if (*name == '.')' in C
            pieces.append(ESCAPE(byte))
        elif c == '/':
            pieces.append('-')
        elif c in '-\\' or c not in VALID_CHARS:
            pieces.append(ESCAPE(byte))
        else:
            pieces.append(c)
    return ''.join(pieces)
39
40 #35static void virSystemdEscapeName(virBufferPtr buf,
41 #36                                 const char *name)
42 #37{
43 #38    static const char hextable[16] = "0123456789abcdef";
44 #39
45 #40#define ESCAPE(c)                                                       \
46 #41    do {                                                                \
47 #42        virBufferAddChar(buf, '\\');                                    \
48 #43        virBufferAddChar(buf, 'x');                                     \
49 #44        virBufferAddChar(buf, hextable[(c >> 4) & 15]);                 \
50 #45        virBufferAddChar(buf, hextable[c & 15]);                        \
51 #46    } while (0)
52 #47
53 #48#define VALID_CHARS                             \
54 #49        "0123456789"                            \
55 #50        "abcdefghijklmnopqrstuvwxyz"            \
56 #51        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"            \
57 #52        ":-_.\\"
58 #53
59 #54    if (*name == '.') {
60 #55        ESCAPE(*name);
61 #56        name++;
62 #57    }
63 #58
64 #59    while (*name) {
65 #60        if (*name == '/')
66 #61            virBufferAddChar(buf, '-');
67 #62        else if (*name == '-' ||
68 #63                 *name == '\\' ||
69 #64                 !strchr(VALID_CHARS, *name))
70 #65            ESCAPE(*name);
71 #66        else
72 #67            virBufferAddChar(buf, *name);
73 #68        name++;
74 #69    }
75 #70
76 #71#undef ESCAPE
77 #72#undef VALID_CHARS
78 #73}
79
def virSystemdMakeScopeName (name, drivername, partition):
    """Build a scope unit name the way libvirt's virSystemdMakeScopeName() does.

    The result is
        <escaped partition>-<escaped drivername>\\x2d<escaped name>.scope
    where each part goes through virSystemdEscapeName().

    name:       the machine name (lxcsu passes the slice name)
    drivername: the libvirt driver name (lxcsu passes 'lxc')
    partition:  the partition name (lxcsu passes 'machine'); one leading '/'
                is ignored, as in the C original quoted below
    Returns the scope name as a string.
    """
    # mirror 'if (*partition == '/') partition++;' from the C code
    if partition.startswith('/'):
        partition = partition[1:]
    escaped = [ virSystemdEscapeName (part) for part in (partition, drivername, name) ]
    # the separator between driver and name is a literal, already-escaped '-'
    return "%s-%s\\x2d%s.scope" % tuple(escaped)
89
90 #76char *virSystemdMakeScopeName(const char *name,
91 #77                              const char *drivername,
92 #78                              const char *partition)
93 #79{
94 #80    virBuffer buf = VIR_BUFFER_INITIALIZER;
95 #81
96 #82    if (*partition == '/')
97 #83        partition++;
98 #84
99 #85    virSystemdEscapeName(&buf, partition);
100 #86    virBufferAddChar(&buf, '-');
101 #87    virSystemdEscapeName(&buf, drivername);
102 #88    virBufferAddLit(&buf, "\\x2d");
103 #89    virSystemdEscapeName(&buf, name);
104 #90    virBufferAddLit(&buf, ".scope");
105 #91
106 #92    if (virBufferError(&buf)) {
107 #93        virReportOOMError();
108 #94        return NULL;
109 #95    }
110 #96
111 #97    return virBufferContentAndReset(&buf);
112 #98}
113
114 ### our own additions
115 # heuristics to locate /sys/fs/cgroup stuff
116 import os.path
def find_first_dir (candidates):
    """Return the first entry of *candidates* that is an existing directory.

    candidates: a list of paths, tried in order.
    Raises Exception, with a message listing every candidate, when none of
    them is a directory.
    """
    for candidate in candidates:
        if os.path.isdir(candidate):
            return candidate
    # call form of raise: same exception as before, valid on python2 and python3
    raise Exception("Cannot find valid dir among\n"
                    + "\n".join([" ->"+c for c in candidates]))
121
def find_sysfs_scope (subsystem, slice_name):
    """Locate the cgroup directory of *slice_name* for the given *subsystem*.

    Several layouts are probed under /sys/fs/cgroup, in this order:
      1. <subsystem>/libvirt/lxc/<slice>
      2. <subsystem>/system/libvirtd.service/libvirt/lxc/<slice>
      3. <subsystem>/machine.slice/<scope name>, the scope name being
         computed by virSystemdMakeScopeName(); in this layout 'cpuacct'
         is looked up under 'cpu,cpuacct'.

    Returns the first of these that exists as a directory; the exception
    raised by find_first_dir() propagates when none does.
    """
    # only the machine.slice layout uses the combined controller name
    if subsystem == 'cpuacct':
        systemd_subsystem = 'cpu,cpuacct'
    else:
        systemd_subsystem = subsystem
    scope_name = virSystemdMakeScopeName(slice_name, 'lxc', 'machine')

    # for f16 and our locally brewed libvirt 1.0.4
    legacy = "/sys/fs/cgroup/%s/libvirt/lxc/%s" % (subsystem, slice_name)
    legacy_service = "/sys/fs/cgroup/%s/system/libvirtd.service/libvirt/lxc/%s" % (subsystem, slice_name)
    # f20 and libvirt 1.1.3
    systemd_scope = "/sys/fs/cgroup/%s/machine.slice/%s" % (systemd_subsystem, scope_name)

    return find_first_dir ([legacy, legacy_service, systemd_scope])
136
137 #################### end of libvirtsystemd.py
138
def getarch(f):
    """Return the architecture name of the ELF file *f*.

    Runs 'readelf -h' on the file and inspects the first output line whose
    field name (the text before the first ':') contains 'Class'.

    f: path of the file to inspect; it is pasted as-is into a shell command.
    Returns 'x86_64' for an ELF64 file and 'i686' for an ELF32 file.
    Raises Exception('Could not determine architecture') in every other
    case, including when readelf prints no 'Class' line at all (readelf
    missing, file unreadable, not an ELF file).
    """
    pipe = os.popen('readelf -h %s 2>&1'%f)
    try:
        output = pipe.readlines()
    finally:
        pipe.close()

    for line in output:
        fields = line.split(':')
        if 'Class' not in fields[0]:
            continue
        # only the first 'Class' line is considered
        elf_class = fields[1] if len(fields) > 1 else ''
        if 'ELF64' in elf_class:
            return 'x86_64'
        if 'ELF32' in elf_class:
            return 'i686'
        break
    raise Exception('Could not determine architecture')
150
def umount(fs_dir, opts=''):
    """Run '/bin/umount <opts> <fs_dir>' and tell whether the target was busy.

    fs_dir: the mount point to unmount.
    opts:   extra command-line options for umount (e.g. '-l'), as one string.
    Both are pasted as-is into a shell command.

    Returns False when umount's output reports a busy filesystem, True
    otherwise. Note that True does not mean the unmount succeeded: any
    failure other than 'busy' also yields True.
    """
    # umount's wording differs between util-linux versions
    busy_markers = ('device is busy', 'target is busy')
    pipe = os.popen('/bin/umount %s %s 2>&1'%(opts, fs_dir))
    try:
        output = pipe.read()
    finally:
        pipe.close()
    return not any(marker in output for marker in busy_markers)
154
def _join_cgroup (subsystem, slice_name):
    # Add the current process to the 'tasks' file of the slice's cgroup
    # for that subsystem; errors are left to the caller.
    # note: the file needs to be closed here (the with statement does it);
    # otherwise a rejected write only shows up in the object destructor,
    # where it cannot be caught
    with open( find_sysfs_scope (subsystem, slice_name)+"/tasks", 'w') as tasks:
        tasks.write(str(os.getpid()))

def main ():
    """Entry point: run a command (or a shell) inside the slice's container.

    In sequence:
      * parse the command line, with defaults optionally read from
        /etc/lxcsu_default;
      * ask virsh for the domain id of the slice, used below as the pid
        under /proc whose namespaces are entered;
      * collect the slice's sysctl settings from
        /etc/planetlab/vsys-attributes/<slice>;
      * add this process to the slice's cgroups (including freezer);
      * enter the uts and ipc namespaces and, unless disabled, pid, net, mnt;
      * apply the sysctl settings, then unmount the cgroup filesystems;
      * exit with status 1 if any cgroup step failed;
      * fork; the child execs the command through setarch and capsh, the
        parent waits and exits with the child's exit status.
    """
    global debug

    parser = ArgumentParser()
    parser.add_argument("-n", "--nonet",
                        action="store_true", dest="netns", default=False,
                        help="Don't enter network namespace")
    parser.add_argument("-m", "--nomnt",
                        action="store_true", dest="mntns", default=False,
                        help="Don't enter mount namespace")
    parser.add_argument("-p", "--nopid",
                        action="store_true", dest="pidns", default=False,
                        help="Don't enter pid namespace")
    parser.add_argument("-r", "--root",
                        action="store_true", dest="root", default=False,
                        help="Enter as root: be careful")
    parser.add_argument("-i","--internal",
                        action="store_true", dest="internal", default=False,
                        help="does *not* prepend '-- -c' to arguments - or invoke lxcsu-internal")
    parser.add_argument("-d","--debug",
                        action='store_true', dest='debug', default=False,
                        help="debug option")
    parser.add_argument("-s","--nosliceuid",
                        action='store_true', dest="nosliceuid", default=False,
                        help="do not change to slice uid inside of slice")
    parser.add_argument("-o","--noslicehome",
                        action='store_true', dest="noslicehome", default=False,
                        help="do not change to slice home directory inside of slice")

    # site-wide defaults: the file holds options, parsed before the
    # positional arguments get declared
    if os.path.exists("/etc/lxcsu_default"):
        with open("/etc/lxcsu_default","r") as defaults_file:
            default_argv = defaults_file.read().split()
        defaults = parser.parse_args(default_argv)
        parser.set_defaults(**vars(defaults))

    parser.add_argument ("slice_name")
    parser.add_argument ("command_to_run",nargs="*")

    args = parser.parse_args()
    slice_name=args.slice_name

    # unless we run the symlink 'lxcsu-internal', or we specify the -i option, prepend '--' '-c'
    if 'internal' in sys.argv[0]: args.internal=True

    if len(args.command_to_run)>0 and (args.command_to_run[0] == "/sbin/service"):
        # A quick hack to support nodemanager interfaces.py when restarting
        # networking in a slice.
        args.nosliceuid = True

    # plain lxcsu
    if not args.internal:
        # no command given: enter interactive shell
        if not args.command_to_run: args.command_to_run=['/bin/sh']
        args.command_to_run = [ '-c' ] + [" ".join(args.command_to_run)]

    # support for either setting debug at the top of this file, or on the command-line
    if args.debug:
        debug=True

    try:
        cmd = '/usr/bin/virsh --connect lxc:/// domid %s'%slice_name
        virsh = os.popen(cmd)
        try:
            pidnum = int(virsh.read().rstrip())
        finally:
            virsh.close()
    except (ValueError, EnvironmentError):
        # virsh did not print a number
        print("Domain %s not found"%slice_name)
        sys.exit(1)

    pid = '%s'%pidnum
    if debug: print("Found pidnum %s"%pidnum)

    # check that the process exists before anything under /proc/<pid> gets used
    if not os.path.isdir('/proc/%s'%pid):
        print("Domain %s not started"%slice_name)
        sys.exit(1)

    arch = getarch('/proc/%s/exe'%pid)
    if arch is None:
        arch = 'x86_64'

    # Set sysctls specific to slice
    # the settings are only collected here, they get applied once inside the namespaces
    sysctls = []
    sysctl_dir = '/etc/planetlab/vsys-attributes/%s'%slice_name
    if (os.access(sysctl_dir,os.F_OK)):
        prefix = 'vsys_sysctl.'
        for e in os.listdir(sysctl_dir):
            if (e.startswith(prefix)):
                sysctl_file = '/'.join([sysctl_dir,e])
                sysctl_name = e[len(prefix):]
                with open(sysctl_file) as value_file:
                    sysctl_val = value_file.read()
                sysctls.append((sysctl_file, sysctl_name, sysctl_val))

    # Enter cgroups
    # do not exit right away when something goes wrong
    # check as much as we can and only then exit
    cgroups_ok=True
    for subsystem in ['cpuset' ,'memory' ,'blkio', 'cpuacct']:
        try:
            _join_cgroup (subsystem, slice_name)
        except Exception as e:
            if debug: print(e)
            print("ERROR assigning resources for %s in subsystem %s - bailing out"%(slice_name,subsystem))
            cgroups_ok=False

    # If the slice is frozen, then we'll get an EBUSY when trying to write to the task
    # list for the freezer cgroup. Since the user couldn't do anything anyway, it's best
    # in this case to error out the shell. (an alternative would be to un-freeze it,
    # add the task, and re-freeze it)
    try:
        _join_cgroup ('freezer', slice_name)
    except Exception as e:
        if debug: print(e)
        print("Error adding task to freezer cgroup. Slice is probably frozen: %s" % slice_name)
        cgroups_ok=False

    setns.chcontext('/proc/%s/ns/uts'%pid)
    setns.chcontext('/proc/%s/ns/ipc'%pid)

    # beware that these 3 options are negative: when set, the namespace is *not* entered
    if (not args.pidns):
        setns.chcontext('/proc/%s/ns/pid'%pid)

    if (not args.netns):
        setns.chcontext('/proc/%s/ns/net'%pid)

    if (not args.mntns):
        setns.chcontext('/proc/%s/ns/mnt'%pid)

    if (not os.access('/proc/self',os.F_OK)):
        setns.proc_mount()

    # apply each setting with the first sysctl binary available in the image
    sysctl_binaries = ["/sbin/sysctl", "/usr/sbin/sysctl", "/bin/sysctl", "/usr/bin/sysctl"]
    for (sysctl_file, sysctl_name, sysctl_val) in sysctls:
        for fn in sysctl_binaries:
            if os.path.exists(fn):
                os.system('%s -w %s=%s  >/dev/null 2>&1'%(fn, sysctl_name,sysctl_val))
                break
        else:
            # for/else: we get here only when none of the locations exists
            print("Error: image does not have a sysctl binary")

    # cgroups is not yet LXC-safe, so we need to use the coarse grained access control
    # strategy of unmounting the filesystem

    for subsystem in ['cpuset','cpu,cpuacct','memory','devices','freezer','net_cls','blkio','perf_event','systemd']:
        # the result is ignored on purpose: a failure here implies a failure
        # in the call to umount /sys/fs/cgroup below
        umount('/sys/fs/cgroup/%s'%subsystem,'-l')

    if (not umount('/sys/fs/cgroup')):
        print("Error disabling cgroup access")
        cgroups_ok=False

    if not cgroups_ok:
        print('exiting')
        sys.exit(1)

    # capsh has a --user option starting with f14
    # so if only for f12 we need to fake this one
    #
    # capsh.c does essentially the following when invoked with --user:
    #           pwd = getpwnam(user); ...
    #           ngroups = MAX_GROUPS;
    #           status = getgrouplist(user, pwd->pw_gid, groups, &ngroups); ...
    #           status = setgroups(ngroups, groups); ...
    #           status = setgid(pwd->pw_gid); ...
    #           status = setuid(pwd->pw_uid); ...
    # however we cannot simulate that ourselves because if we did in this process then
    # capsh could not be allowed to mess with caps any more

    def getuid (slicename):
        # uid of that user as per the password database, None if unknown
        import pwd
        try:
            return pwd.getpwnam(slicename).pw_uid
        except KeyError:
            return None

    child_pid = os.fork()

    if (child_pid == 0):
        cap_arg = '--drop='+drop_capabilities

        if (not args.root):
            if (args.nosliceuid):
                # we still want to drop capabilities, but don't want to switch UIDs
                exec_args = [arch,'/usr/sbin/capsh',cap_arg,'--','--login',]+args.command_to_run
            else:
                uid = getuid (slice_name)
                # this also turns down a slice user with uid 0
                if not uid:
                    print("lxcsu could not spot %s in /etc/passwd - exiting"%slice_name)
                    sys.exit(1)
                exec_args = [arch,'/usr/sbin/capsh',cap_arg,'--uid=%s'%uid,'--','--login',]+args.command_to_run
# once we can drop f12, it would be nicer to instead go for
# exec_args = [arch,'/usr/sbin/capsh',cap_arg,'--user=%s'%slice_name,'--','--login',]+args.command_to_run
        else:
            # root mode: no capability gets dropped, no uid change
            exec_args = [arch,'/usr/sbin/capsh','--','--login']+args.command_to_run

        os.environ['SHELL'] = '/bin/sh'
        if os.path.exists('/etc/planetlab/lib/bind_public.so'):
            os.environ['LD_PRELOAD'] = '/etc/planetlab/lib/bind_public.so'
        if not args.noslicehome:
            os.environ['HOME'] = '/home/%s'%slice_name
            os.chdir("/home/%s"%(slice_name))
        if debug: print('lxcsu:execv: /usr/bin/setarch %s'%(exec_args,))
        # exec_args[0] is the architecture name; execv hands it to setarch as its argv[0]
        os.execv('/usr/bin/setarch',exec_args)
    else:
        # NOTE(review): proc_umount() runs even when proc_mount() was not
        # called above - confirm that this is intended
        setns.proc_umount()
        _,status = os.waitpid(child_pid,0)
        sys.exit(os.WEXITSTATUS(status))
367
# run main() only when this file is executed as a script, not when imported
if __name__ == '__main__':
        main()