support for libvirt-1.1 naming scheme
[lxc-userspace.git] / lxcsu
1 #!/usr/bin/python
2
3 import sys
4 import os
5 import setns
6
7 from argparse import ArgumentParser
8
# comma-separated capabilities handed to capsh's --drop option
drop_capabilities = ','.join((
    'cap_sys_admin',
    'cap_sys_boot',
    'cap_sys_module',
))

# debug output: flip this to True here, or pass the -d option at run time
debug = False
13
14 #################### should go into a separate libvirtsystemd.py
15 # but we want to keep packaging simple for now
16
17 # reproducing libvirt's systemd-oriented escaping mechanism
18 # http://code.metager.de/source/xref/lib/virt/src/util/virsystemd.c
19 # (the original C code is reproduced in comments after each function below)
20
def virSystemdEscapeName (name):
    """Escape *name* the way libvirt's virSystemdEscapeName() does.

    Mirrors the C implementation quoted in the comments that follow this
    function:

    * a '.' in first position is escaped;
    * '/' becomes '-';
    * '-', '\\' and any byte outside [0-9a-zA-Z:_.] are escaped;
    * everything else is copied verbatim.

    An escaped byte is written as a backslash, an 'x' and exactly two
    lowercase hex digits (e.g. '-' gives '\\x2d', a tab gives '\\x09').

    name -- the string to escape; text that is not already a byte string
            is encoded as UTF-8 first, so that escaping is done byte by
            byte like in the C code.

    Returns the escaped name as a native string ('' for an empty name).
    """
    VALID_CHARS = (
        "0123456789"
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        ":-_.\\"
    )

    def escape(byte):
        # always two hex digits, like the hextable lookups of the C macro
        return '\\x%02x' % byte

    raw = name if isinstance(name, bytes) else name.encode('utf-8')
    pieces = []
    # bytearray yields integers under both python2 and python3
    for index, byte in enumerate(bytearray(raw)):
        c = chr(byte)
        if index == 0 and c == '.':
            # only a leading dot is escaped, later dots are valid as-is
            pieces.append(escape(byte))
        elif c == '/':
            pieces.append('-')
        elif c in '-\\' or c not in VALID_CHARS:
            pieces.append(escape(byte))
        else:
            pieces.append(c)
    return ''.join(pieces)
39
40 #35static void virSystemdEscapeName(virBufferPtr buf,
41 #36                                 const char *name)
42 #37{
43 #38    static const char hextable[16] = "0123456789abcdef";
44 #39
45 #40#define ESCAPE(c)                                                       \
46 #41    do {                                                                \
47 #42        virBufferAddChar(buf, '\\');                                    \
48 #43        virBufferAddChar(buf, 'x');                                     \
49 #44        virBufferAddChar(buf, hextable[(c >> 4) & 15]);                 \
50 #45        virBufferAddChar(buf, hextable[c & 15]);                        \
51 #46    } while (0)
52 #47
53 #48#define VALID_CHARS                             \
54 #49        "0123456789"                            \
55 #50        "abcdefghijklmnopqrstuvwxyz"            \
56 #51        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"            \
57 #52        ":-_.\\"
58 #53
59 #54    if (*name == '.') {
60 #55        ESCAPE(*name);
61 #56        name++;
62 #57    }
63 #58
64 #59    while (*name) {
65 #60        if (*name == '/')
66 #61            virBufferAddChar(buf, '-');
67 #62        else if (*name == '-' ||
68 #63                 *name == '\\' ||
69 #64                 !strchr(VALID_CHARS, *name))
70 #65            ESCAPE(*name);
71 #66        else
72 #67            virBufferAddChar(buf, *name);
73 #68        name++;
74 #69    }
75 #70
76 #71#undef ESCAPE
77 #72#undef VALID_CHARS
78 #73}
79
def virSystemdMakeScopeName (name, drivername, partition):
    """Build the systemd scope name libvirt uses for a domain.

    Mirrors the C virSystemdMakeScopeName() quoted in the comments that
    follow this function: the result is

        <partition>-<drivername>\\x2d<name>.scope

    where each of the three parts goes through virSystemdEscapeName().

    name       -- domain (slice) name
    drivername -- libvirt driver name, e.g. 'lxc'
    partition  -- resource partition, e.g. 'machine'; as in the C code a
                  single leading '/' is dropped before escaping

    Returns the scope name as a string.
    """
    # same as the C code's "if (*partition == '/') partition++"
    if partition.startswith('/'):
        partition = partition[1:]
    return '%s-%s\\x2d%s.scope' % (
        virSystemdEscapeName(partition),
        virSystemdEscapeName(drivername),
        virSystemdEscapeName(name),
    )
89
90 #76char *virSystemdMakeScopeName(const char *name,
91 #77                              const char *drivername,
92 #78                              const char *partition)
93 #79{
94 #80    virBuffer buf = VIR_BUFFER_INITIALIZER;
95 #81
96 #82    if (*partition == '/')
97 #83        partition++;
98 #84
99 #85    virSystemdEscapeName(&buf, partition);
100 #86    virBufferAddChar(&buf, '-');
101 #87    virSystemdEscapeName(&buf, drivername);
102 #88    virBufferAddLit(&buf, "\\x2d");
103 #89    virSystemdEscapeName(&buf, name);
104 #90    virBufferAddLit(&buf, ".scope");
105 #91
106 #92    if (virBufferError(&buf)) {
107 #93        virReportOOMError();
108 #94        return NULL;
109 #95    }
110 #96
111 #97    return virBufferContentAndReset(&buf);
112 #98}
113
114 ### our own additions
115 import os.path
def find_first_dir (candidates):
    """Return the first entry of *candidates* that is an existing directory.

    candidates -- iterable of path strings, tried in order

    Raises Exception, with a message listing every candidate, when none of
    them is a directory.
    """
    # materialize once, so that a one-shot iterable can still be listed
    # in the error message after having been walked
    candidates = list(candidates)
    for candidate in candidates:
        if os.path.isdir(candidate):
            return candidate
    raise Exception("Cannot find valid dir among %s" % '\n'.join(candidates))
120
def find_sysfs_scope (subsystem, slice_name):
    """Locate the cgroup directory of *slice_name* for *subsystem*.

    Two layouts are probed, in this order, and the first one that exists
    as a directory is returned:

    * /sys/fs/cgroup/<subsystem>/libvirt/lxc/<slice_name>
      (f16 and our locally brewed libvirt 1.0.4)
    * /sys/fs/cgroup/<subsystem>/machine.slice/<scope name>
      (f20 and libvirt 1.1.3), where 'cpuacct' is looked up under
      'cpu,cpuacct'

    Whatever find_first_dir raises when neither exists is propagated.
    """
    # in the machine.slice layout, cpuacct lives in a joint hierarchy
    if subsystem == 'cpuacct':
        systemd_subsystem = 'cpu,cpuacct'
    else:
        systemd_subsystem = subsystem

    legacy_dir = "/sys/fs/cgroup/%s/libvirt/lxc/%s" % (subsystem, slice_name)
    scope_name = virSystemdMakeScopeName(slice_name, 'lxc', 'machine')
    systemd_dir = "/sys/fs/cgroup/%s/machine.slice/%s" % (systemd_subsystem, scope_name)

    return find_first_dir([legacy_dir, systemd_dir])
134
135 #################### end of libvirtsystemd.py
136
def getarch(f):
    """Return the architecture name of the ELF file *f*.

    Runs 'readelf -h' on the file and inspects the first header line whose
    field name contains 'Class'.

    f -- path of the file to inspect

    Returns 'x86_64' for an ELF64 file and 'i686' for an ELF32 file.

    Raises Exception('Could not determine architecture') in every other
    case: readelf cannot be run, its output has no Class line, or the
    class is neither ELF64 nor ELF32.
    """
    import subprocess

    try:
        # argument list rather than a shell string: the path is passed
        # verbatim, whatever characters it contains
        readelf = subprocess.Popen(['readelf', '-h', f],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   universal_newlines=True)
        output = readelf.communicate()[0]
    except OSError:
        raise Exception('Could not determine architecture')

    for line in output.splitlines():
        field, _, value = line.partition(':')
        if 'Class' not in field:
            continue
        if 'ELF64' in value:
            return 'x86_64'
        if 'ELF32' in value:
            return 'i686'
        # only the first Class line is considered
        break
    raise Exception('Could not determine architecture')
148
def umount(fs_dir, opts=''):
    """Run /bin/umount on *fs_dir* and tell whether it was busy.

    fs_dir -- mount point to unmount
    opts   -- extra command-line options for umount, e.g. '-l'

    Returns False when umount reports that the filesystem is busy, True
    otherwise. Note that True does not mean that something was actually
    unmounted: any other umount error also yields True.
    """
    # NOTE: the command goes through a shell, callers must pass trusted values
    pipe = os.popen('/bin/umount %s %s 2>&1' % (opts, fs_dir))
    try:
        output = pipe.read()
    finally:
        pipe.close()
    # the wording of the message depends on the umount version at hand
    busy_messages = ('device is busy', 'target is busy')
    return not any(message in output for message in busy_messages)
152
def main ():
    """Run a command (or an interactive shell) inside a running slice.

    Steps, in order:

    1. parse the command line; option defaults can be preset in
       /etc/lxcsu_default, written like command-line options;
    2. ask virsh for the id of the slice's lxc domain, which is then used
       as the pid whose /proc/<pid>/ns/* namespaces get entered;
    3. collect the slice's vsys_sysctl.* attributes;
    4. attach the current process to the slice's cgroups;
    5. enter the uts and ipc namespaces, and unless disabled by options
       the pid, net and mnt ones;
    6. apply the collected sysctls and unmount the cgroup filesystems;
    7. fork: the child execs the command through setarch and capsh, the
       parent waits for the child and exits with its status.

    The failures that are anticipated are reported on stdout and end the
    process with exit status 1. Takes no argument and does not return.
    """
    # imported before any namespace is entered, so that it is the module as
    # seen from the calling context that gets loaded
    import subprocess

    global debug

    parser = ArgumentParser()
    # beware: for -n, -m and -p the stored value is True when the
    # corresponding namespace must *not* be entered
    parser.add_argument("-n", "--nonet",
                        action="store_true", dest="netns", default=False,
                        help="Don't enter network namespace")
    parser.add_argument("-m", "--nomnt",
                        action="store_true", dest="mntns", default=False,
                        help="Don't enter mount namespace")
    parser.add_argument("-p", "--nopid",
                        action="store_true", dest="pidns", default=False,
                        help="Don't enter pid namespace")
    parser.add_argument("-r", "--root",
                        action="store_true", dest="root", default=False,
                        help="Enter as root: be careful")
    parser.add_argument("-i", "--internal",
                        action="store_true", dest="internal", default=False,
                        help="does *not* prepend '-- -c' to arguments - or invoke lxcsu-internal")
    parser.add_argument("-d", "--debug",
                        action='store_true', dest='debug', default=False,
                        help="debug option")
    parser.add_argument("-s", "--nosliceuid",
                        action='store_true', dest="nosliceuid", default=False,
                        help="do not change to slice uid inside of slice")
    parser.add_argument("-o", "--noslicehome",
                        action='store_true', dest="noslicehome", default=False,
                        help="do not change to slice home directory inside of slice")

    # the defaults file is parsed while only the options are declared,
    # and its outcome becomes the new set of defaults
    if os.path.exists("/etc/lxcsu_default"):
        with open("/etc/lxcsu_default", "r") as defaults_file:
            defaults = parser.parse_args(defaults_file.read().split())
        parser.set_defaults(**defaults.__dict__)

    parser.add_argument("slice_name")
    parser.add_argument("command_to_run", nargs="*")

    args = parser.parse_args()
    slice_name = args.slice_name

    # unless we run the symlink 'lxcsu-internal', or we specify the -i option, prepend '-c'
    if sys.argv[0].find('internal') >= 0:
        args.internal = True

    if args.command_to_run and args.command_to_run[0] == "/sbin/service":
        # A quick hack to support nodemanager interfaces.py when restarting
        # networking in a slice.
        args.nosliceuid = True

    # plain lxcsu
    if not args.internal:
        # no command given: enter interactive shell
        if not args.command_to_run:
            args.command_to_run = ['/bin/sh']
        args.command_to_run = ['-c', " ".join(args.command_to_run)]

    # support for either setting debug at the top of this file, or on the command-line
    if args.debug:
        debug = True

    # the slice name comes from the command line: hand it over to virsh as
    # a plain argument, never through a shell
    try:
        virsh = subprocess.Popen(
            ['/usr/bin/virsh', '--connect', 'lxc:///', 'domid', slice_name],
            stdout=subprocess.PIPE, universal_newlines=True)
        pidnum = int(virsh.communicate()[0].strip())
    except (OSError, ValueError):
        # virsh could not be run, or did not answer with a number
        print("Error finding slice %s" % slice_name)
        sys.exit(1)

    if pidnum <= 0:
        print("Not started: %s" % slice_name)
        sys.exit(1)

    pid = '%s' % pidnum
    if debug:
        print("Found pidnum %s" % pidnum)

    # the fallback only matters if getarch ever returns nothing
    arch = getarch('/proc/%s/exe' % pid) or 'x86_64'

    # Set sysctls specific to slice
    # they are read now, and applied once inside the slice's namespaces
    sysctls = []
    sysctl_dir = '/etc/planetlab/vsys-attributes/%s' % slice_name
    if os.access(sysctl_dir, os.F_OK):
        prefix = 'vsys_sysctl.'
        for entry in os.listdir(sysctl_dir):
            if not entry.startswith(prefix):
                continue
            with open('/'.join([sysctl_dir, entry])) as sysctl_file:
                # drop the trailing newline, it is not part of the value
                sysctl_val = sysctl_file.read().strip()
            sysctls.append((entry[len(prefix):], sysctl_val))

    def join_cgroup(subsystem):
        # the file is closed inside this function, so that a write rejected
        # by the kernel raises here and not later on in a destructor
        tasks_path = find_sysfs_scope(subsystem, slice_name) + "/tasks"
        with open(tasks_path, 'w') as tasks:
            tasks.write(str(os.getpid()))

    # Enter cgroups
    try:
        for subsystem in ['cpuset', 'memory', 'blkio']:
            join_cgroup(subsystem)
    except Exception as e:
        if debug:
            print(e)
        print("Error assigning resources: %s" % slice_name)
        sys.exit(1)

    try:
        join_cgroup('cpuacct')
    except Exception as e:
        if debug:
            print(e)
        print("Error assigning cpuacct: %s" % slice_name)
        sys.exit(1)

    # If the slice is frozen, then we'll get an EBUSY when trying to write to the task
    # list for the freezer cgroup. Since the user couldn't do anything anyway, it's best
    # in this case to error out the shell. (an alternative would be to un-freeze it,
    # add the task, and re-freeze it)
    try:
        join_cgroup('freezer')
    except Exception as e:
        if debug:
            print(e)
        print("Error adding task to freezer cgroup. Slice is probably frozen: %s" % slice_name)
        sys.exit(1)

    setns.chcontext('/proc/%s/ns/uts' % pid)
    setns.chcontext('/proc/%s/ns/ipc' % pid)

    if not args.pidns:
        setns.chcontext('/proc/%s/ns/pid' % pid)

    if not args.netns:
        setns.chcontext('/proc/%s/ns/net' % pid)

    if not args.mntns:
        setns.chcontext('/proc/%s/ns/mnt' % pid)

    # NOTE(review): proc_mounted is recorded but never read; the parent
    # calls setns.proc_umount() whether or not /proc was mounted here -
    # confirm that this is what setns expects
    proc_mounted = False
    if not os.access('/proc/self', os.F_OK):
        proc_mounted = True
        setns.proc_mount()

    sysctl_binaries = ["/sbin/sysctl", "/usr/sbin/sysctl", "/bin/sysctl", "/usr/bin/sysctl"]
    for (sysctl_name, sysctl_val) in sysctls:
        for fn in sysctl_binaries:
            if os.path.exists(fn):
                # best effort: the outcome of sysctl is not checked
                try:
                    with open(os.devnull, 'w') as devnull:
                        # name=value is one single argument, so values
                        # made of several words are passed unaltered
                        subprocess.call([fn, '-w', '%s=%s' % (sysctl_name, sysctl_val)],
                                        stdout=devnull, stderr=devnull)
                except (IOError, OSError) as e:
                    if debug:
                        print(e)
                break
        else:
            # reached only when none of the candidate paths exists
            print("Error: image does not have a sysctl binary")

    # cgroups is not yet LXC-safe, so we need to use the coarse grained access control
    # strategy of unmounting the filesystem
    for subsystem in ['cpuset', 'cpu,cpuacct', 'memory', 'devices', 'freezer',
                      'net_cls', 'blkio', 'perf_event', 'systemd']:
        # the outcome is deliberately ignored: a failure here implies a
        # failure of the umount of /sys/fs/cgroup right below
        umount('/sys/fs/cgroup/%s' % subsystem, '-l')

    if not umount('/sys/fs/cgroup'):
        print("Error disabling cgroup access")
        sys.exit(1)

    # capsh has a --user option starting with f14
    # so if only for f12 we need to fake this one
    #
    # capsh.c does essentially the following when invoked with --user:
    #           pwd = getpwnam(user); ...
    #           ngroups = MAX_GROUPS;
    #           status = getgrouplist(user, pwd->pw_gid, groups, &ngroups); ...
    #           status = setgroups(ngroups, groups); ...
    #           status = setgid(pwd->pw_gid); ...
    #           status = setuid(pwd->pw_uid); ...
    # however we cannot simulate that ourselves because if we did in this process then
    # capsh could not be allowed to mess with caps any more

    def getuid (slicename):
        # returns None when the account cannot be looked up
        try:
            import pwd
            return pwd.getpwnam(slicename).pw_uid
        except (ImportError, KeyError):
            return None

    # pending output must not be carried over into the child
    sys.stdout.flush()
    child_pid = os.fork()

    if child_pid == 0:
        cap_arg = '--drop=' + drop_capabilities

        if args.root:
            capsh_options = []
        elif args.nosliceuid:
            # we still want to drop capabilities, but don't want to switch UIDs
            capsh_options = [cap_arg]
        else:
            uid = getuid(slice_name)
            # beware that this test also turns down an account whose uid is 0
            if not uid:
                print("lxcsu could not spot %s in /etc/passwd - exiting" % slice_name)
                sys.exit(1)
            # once we can drop f12, it would be nicer to instead go for
            # '--user=%s'%slice_name
            capsh_options = [cap_arg, '--uid=%s' % uid]

        # exec_args[0] is what setarch sees as its argv[0]
        exec_args = [arch, '/usr/sbin/capsh'] + capsh_options + ['--', '--login'] + args.command_to_run

        os.environ['SHELL'] = '/bin/sh'
        if os.path.exists('/etc/planetlab/lib/bind_public.so'):
            os.environ['LD_PRELOAD'] = '/etc/planetlab/lib/bind_public.so'
        if not args.noslicehome:
            os.environ['HOME'] = '/home/%s' % slice_name
            os.chdir("/home/%s" % slice_name)
        if debug:
            print('lxcsu:execv: %s %s' % ('/usr/bin/setarch', exec_args))
        # execv does not flush python's buffers
        sys.stdout.flush()
        os.execv('/usr/bin/setarch', exec_args)
    else:
        setns.proc_umount()
        _, status = os.waitpid(child_pid, 0)
        if os.WIFSIGNALED(status):
            # same convention as shells: 128 + signal number
            sys.exit(128 + os.WTERMSIG(status))
        sys.exit(os.WEXITSTATUS(status))
366
# run main() only when executed as a script, not when imported
if __name__ == "__main__":
    main()