Sapan's changes to address cgroups location
[lxc-userspace.git] / lxcsu
1 #!/usr/bin/python
2
3 import sys
4 import os
5 import setns
6 import pdb
7
8 from argparse import ArgumentParser
9
# comma-separated capability list handed to `capsh --drop=` by main()
# whenever the slice is not entered as root (-r)
drop_capabilities='cap_sys_admin,cap_sys_boot,cap_sys_module'

# can set to True here, but also use the -d option
# (main() flips this module-level flag when -d/--debug is given)
debug = False
14
15 #################### should go into a separate libvirtsystemd.py
16 # but we want to keep packaging simple for now
17
18 # reproducing libvirt's systemd-oriented escaping mechanism
19 # http://code.metager.de/source/xref/lib/virt/src/util/virsystemd.c
# (the original C code is reproduced in comments right after each Python function)
21
def virSystemdEscapeName (name):
    """Escape `name` like libvirt's virSystemdEscapeName() (C code quoted below).

    Rules, in the order they are applied:
      * a '.' in first position is always escaped
      * '/' becomes '-'
      * '-', '\\' and anything outside [0-9a-zA-Z:_.] become '\\xHH',
        HH being exactly two lowercase hex digits
      * every other character is copied unchanged

    name   -- string to escape (may be empty)
    returns the escaped string
    """
    def ESCAPE(c):
        # always 2 hex digits, like the hextable lookups in the C macro;
        # hex() would yield a single digit for codes below 0x10
        return '\\x%02x' % ord(c)
    VALID_CHARS = \
        "0123456789" + \
        "abcdefghijklmnopqrstuvwxyz" + \
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + \
        ":-_.\\"
    pieces = []
    rest = name
    # the C code escapes a leading dot before entering its main loop
    if name[:1] == '.':
        pieces.append(ESCAPE('.'))
        rest = name[1:]
    for c in rest:
        if c == '/':
            pieces.append('-')
        elif c in '-\\' or c not in VALID_CHARS:
            pieces.append(ESCAPE(c))
        else:
            pieces.append(c)
    return ''.join(pieces)
40
41 #35static void virSystemdEscapeName(virBufferPtr buf,
42 #36                                 const char *name)
43 #37{
44 #38    static const char hextable[16] = "0123456789abcdef";
45 #39
46 #40#define ESCAPE(c)                                                       \
47 #41    do {                                                                \
48 #42        virBufferAddChar(buf, '\\');                                    \
49 #43        virBufferAddChar(buf, 'x');                                     \
50 #44        virBufferAddChar(buf, hextable[(c >> 4) & 15]);                 \
51 #45        virBufferAddChar(buf, hextable[c & 15]);                        \
52 #46    } while (0)
53 #47
54 #48#define VALID_CHARS                             \
55 #49        "0123456789"                            \
56 #50        "abcdefghijklmnopqrstuvwxyz"            \
57 #51        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"            \
58 #52        ":-_.\\"
59 #53
60 #54    if (*name == '.') {
61 #55        ESCAPE(*name);
62 #56        name++;
63 #57    }
64 #58
65 #59    while (*name) {
66 #60        if (*name == '/')
67 #61            virBufferAddChar(buf, '-');
68 #62        else if (*name == '-' ||
69 #63                 *name == '\\' ||
70 #64                 !strchr(VALID_CHARS, *name))
71 #65            ESCAPE(*name);
72 #66        else
73 #67            virBufferAddChar(buf, *name);
74 #68        name++;
75 #69    }
76 #70
77 #71#undef ESCAPE
78 #72#undef VALID_CHARS
79 #73}
80
def virSystemdMakeScopeName (name, drivername, partition):
    """Build a scope unit name like libvirt's virSystemdMakeScopeName() (C code quoted below).

    The result is
        <escaped partition>-<escaped drivername>\\x2d<escaped name>.scope

    name       -- machine name
    drivername -- driver name; this file passes 'lxc'
    partition  -- partition name; one leading '/' is ignored, as in the C code
    returns the scope name as a string
    """
    # the C code does `if (*partition == '/') partition++;` before escaping
    if partition[:1] == '/':
        partition = partition[1:]
    return '%s-%s\\x2d%s.scope' % (
        virSystemdEscapeName (partition),
        virSystemdEscapeName (drivername),
        virSystemdEscapeName (name),
        )
90
91 #76char *virSystemdMakeScopeName(const char *name,
92 #77                              const char *drivername,
93 #78                              const char *partition)
94 #79{
95 #80    virBuffer buf = VIR_BUFFER_INITIALIZER;
96 #81
97 #82    if (*partition == '/')
98 #83        partition++;
99 #84
100 #85    virSystemdEscapeName(&buf, partition);
101 #86    virBufferAddChar(&buf, '-');
102 #87    virSystemdEscapeName(&buf, drivername);
103 #88    virBufferAddLit(&buf, "\\x2d");
104 #89    virSystemdEscapeName(&buf, name);
105 #90    virBufferAddLit(&buf, ".scope");
106 #91
107 #92    if (virBufferError(&buf)) {
108 #93        virReportOOMError();
109 #94        return NULL;
110 #95    }
111 #96
112 #97    return virBufferContentAndReset(&buf);
113 #98}
114
115 ### our own additions
116 import os.path
def find_first_dir (candidates):
    """Return the first entry of `candidates` that is an existing directory.

    candidates -- iterable of paths, tried in order
    raises Exception listing all candidates (one per line) when none
    of them is a directory
    """
    # materialize once so that the error message can still list everything
    # even when a one-shot iterable was passed in
    candidates = list(candidates)
    for candidate in candidates:
        if os.path.isdir(candidate):
            return candidate
    raise Exception("Cannot find valid dir among %s"%'\n'.join(candidates))
121
def find_sysfs_scope (subsystem, slice_name):
    """Locate the cgroup directory of a slice for one cgroup subsystem.

    Two layouts are probed, in this order, and the first one that exists
    as a directory is returned (find_first_dir raises when neither does):
      1. /sys/fs/cgroup/<subsystem>/libvirt/lxc/<slice_name>
      2. /sys/fs/cgroup/<subsystem>/machine.slice/<scope name>
    In the second layout 'cpuacct' is looked up under 'cpu,cpuacct'.
    """
    if subsystem == 'cpuacct':
        systemd_subsystem = 'cpu,cpuacct'
    else:
        systemd_subsystem = subsystem

    # for f16 and our locally brewed libvirt 1.0.4
    legacy_dir = "/sys/fs/cgroup/%s/libvirt/lxc/%s"%(subsystem, slice_name)

    # f20 and libvirt 1.1.3
    scope_name = virSystemdMakeScopeName(slice_name,'lxc','machine')
    systemd_dir = "/sys/fs/cgroup/%s/machine.slice/%s"%(systemd_subsystem, scope_name)

    return find_first_dir ([legacy_dir, systemd_dir])
135
136 #################### end of libvirtsystemd.py
137
def getarch(f):
    """Guess the architecture name of ELF file `f` from `readelf -h` output.

    Only the first output line whose label (text before the first ':')
    contains 'Class' is looked at.

    returns 'x86_64' when that line reports ELF64, 'i686' for ELF32
    raises Exception('Could not determine architecture') in every other
    case, including when readelf produced no usable 'Class' line at all
    (missing file, not an ELF file, readelf not installed)
    """
    # stderr is merged in so that error output never reaches the terminal
    with os.popen('readelf -h %s 2>&1'%f) as pipe:
        output = pipe.readlines()

    elf_class = ''
    for line in output:
        fields = line.split(':')
        if 'Class' in fields[0]:
            # a label without any ':' after it carries no value
            if len(fields) > 1:
                elf_class = fields[1]
            break

    if 'ELF64' in elf_class:
        return 'x86_64'
    elif 'ELF32' in elf_class:
        return 'i686'
    else:
        raise Exception('Could not determine architecture')
149
def get_cgroup_subdirs_for_pid(pid):
    """Parse /proc/<pid>/cgroup into a dict.

    Each line is expected to hold three ':'-separated fields; the second
    one becomes the key and the third one the value.  Only the first two
    ':' are treated as separators, so a third field that itself contains
    ':' is kept whole.  Lines with fewer than three fields are reported
    on stdout and skipped.

    pid -- pid (string or int) whose cgroup file is read
    returns a dict {second field: third field}
    """
    cgroup_info_file = '/proc/%s/cgroup'%pid
    with open(cgroup_info_file) as cgroup_file:
        cgroup_lines = cgroup_file.read().splitlines()

    subdirs = {}
    for line in cgroup_lines:
        try:
            _, cgroup_name, subdir = line.split(':', 2)
        except ValueError as e:
            print("Error reading cgroup info: %s"%str(e))
            continue
        subdirs[cgroup_name] = subdir

    return subdirs
164         
165     
def umount(fs_dir, opts=''):
    """Run `/bin/umount <opts> <fs_dir>` and tell whether it was *not* busy.

    fs_dir -- mount point to unmount
    opts   -- extra command-line options, inserted before fs_dir
    returns False only when the command output (stdout and stderr merged)
    contains 'device is busy'; True in every other case
    """
    command = '/bin/umount %s %s 2>&1'%(opts, fs_dir)
    report = os.popen(command).read()
    busy = 'device is busy' in report
    return not busy
169
def main ():
    """Command-line entry point: run a command (or a shell) inside a slice.

    Steps, in order:
      1. parse options; /etc/lxcsu_default, when present, supplies the
         option defaults
      2. ask virsh for the domain id of the slice and use it as the pid
         of the container
      3. collect the vsys_sysctl.* settings of the slice
      4. add this process to the cgroups of that pid
      5. enter its namespaces (uts and ipc always; pid, net and mnt unless
         disabled with -p, -n, -m)
      6. apply the collected sysctls and unmount the cgroup filesystems
      7. fork; the child execs setarch + capsh + the command, the parent
         waits and exits with the child's exit status

    Exits with status 1 on any failure reported below.
    """
    # the -d option overrides the module-level default
    global debug

    parser = ArgumentParser()
    parser.add_argument("-n", "--nonet",
                        action="store_true", dest="netns", default=False,
                        help="Don't enter network namespace")
    parser.add_argument("-m", "--nomnt",
                        action="store_true", dest="mntns", default=False,
                        help="Don't enter mount namespace")
    parser.add_argument("-p", "--nopid",
                        action="store_true", dest="pidns", default=False,
                        help="Don't enter pid namespace")
    parser.add_argument("-r", "--root",
                        action="store_true", dest="root", default=False,
                        help="Enter as root: be careful")
    parser.add_argument("-i","--internal",
                        action="store_true", dest="internal", default=False,
                        help="does *not* prepend '-- -c' to arguments - or invoke lxcsu-internal")
    parser.add_argument("-d","--debug",
                        action='store_true', dest='debug', default=False,
                        help="debug option")
    parser.add_argument("-s","--nosliceuid",
                        action='store_true', dest="nosliceuid", default=False,
                        help="do not change to slice uid inside of slice")
    parser.add_argument("-o","--noslicehome",
                        action='store_true', dest="noslicehome", default=False,
                        help="do not change to slice home directory inside of slice")

    # site-wide defaults: the file holds options, parsed with the option-only
    # parser (the positional arguments are declared afterwards on purpose)
    if os.path.exists("/etc/lxcsu_default"):
        with open("/etc/lxcsu_default","r") as defaults_file:
            defaults = parser.parse_args(defaults_file.read().split())
        parser.set_defaults(**defaults.__dict__)

    parser.add_argument ("slice_name")
    parser.add_argument ("command_to_run",nargs="*")

    args = parser.parse_args()
    slice_name=args.slice_name

    # unless we run the symlink 'lxcsu-internal', or we specify the -i option, prepend '--' '-c'
    if 'internal' in sys.argv[0]: args.internal=True

    if args.command_to_run and args.command_to_run[0] == "/sbin/service":
        # A quick hack to support nodemanager interfaces.py when restarting
        # networking in a slice.
        args.nosliceuid = True

    # plain lxcsu
    if not args.internal:
        # no command given: enter interactive shell
        if not args.command_to_run: args.command_to_run=['/bin/sh']
        args.command_to_run = [ '-c' ] + [" ".join(args.command_to_run)]

    # support for either setting debug at the top of this file, or on the command-line
    if args.debug:
        debug=True

    # anything that is not an integer in virsh's output means we cannot proceed
    try:
        cmd = '/usr/bin/virsh --connect lxc:/// domid %s'%slice_name
        pidnum = int(os.popen(cmd).read().rstrip())
    except Exception:
        print("Error finding slice %s"%slice_name)
        sys.exit(1)

    pid = '%s'%pidnum
    if debug: print("Found pidnum %s"%pidnum)
    # the content is not used below; reading it fails right here when
    # there is no /proc entry for that pid
    with open('/proc/%s/cmdline'%pidnum) as cmdline_file:
        cmdline_file.read()
    arch = getarch('/proc/%s/exe'%pid)

    if arch is None:
        arch = 'x86_64'

    # Set sysctls specific to slice
    # they are read now and applied later, once inside the container
    sysctls = []
    sysctl_dir = '/etc/planetlab/vsys-attributes/%s'%slice_name
    if (os.access(sysctl_dir,0)):
        prefix = 'vsys_sysctl.'
        for e in os.listdir(sysctl_dir):
            if (e.startswith(prefix)):
                sysctl_file = '/'.join([sysctl_dir,e])
                sysctl_name = e[len(prefix):]
                # strip the value: a trailing newline would cut the shell
                # command built below in two and detach its redirections
                with open(sysctl_file) as value_file:
                    sysctl_val = value_file.read().strip()
                sysctls.append((sysctl_file, sysctl_name, sysctl_val))

    subdirs = get_cgroup_subdirs_for_pid(pid)
    sysfs_root = '/sys/fs/cgroup'

    # If the slice is frozen, then we'll get an EBUSY when trying to write to the task
    # list for the freezer cgroup. Since the user couldn't do anything anyway, it's best
    # in this case to error out the shell. (an alternative would be to un-freeze it,
    # add the task, and re-freeze it)
    # Enter cgroups
    for subsystem in ['cpuset','memory','blkio','cpuacct','cpuacct,cpu','freezer']:
        # subsystems that /proc/<pid>/cgroup does not list under this exact
        # name are silently skipped
        # NOTE(review): a listing under the reversed name ('cpu,cpuacct') is
        # therefore skipped too - confirm whether that is intended
        if subsystem not in subdirs:
            continue
        try:
            tasks_path_str = '/'.join([sysfs_root,subsystem,subdirs[subsystem],'tasks'])

            # There seems to be a bug in the cgroup schema: cpuacct,cpu can become cpu,cpuacct
            # We need to handle both
            subsystem_alt = ','.join(reversed(subsystem.split(',')))
            tasks_path_alt_str = '/'.join([sysfs_root, subsystem_alt, subdirs[subsystem], 'tasks'])

            try:
                f = open(tasks_path_str, 'w')
            except IOError:
                f = open(tasks_path_alt_str, 'w')

            f.write(str(os.getpid()))
            # only the freezer handle is closed explicitly, which is where a
            # failing write gets reported; the other handles are left to be
            # closed implicitly, as before
            # NOTE(review): confirm that ignoring flush errors for the other
            # subsystems is intended
            if (subsystem=='freezer'):
                f.close()

        except Exception as e:
            if debug: print(e)
            print("Error assigning cgroup %s (%s) for slice %s"%(subsystem,pid, slice_name))
            sys.exit(1)

    setns.chcontext('/proc/%s/ns/uts'%pid)
    setns.chcontext('/proc/%s/ns/ipc'%pid)

    if (not args.pidns):
        setns.chcontext('/proc/%s/ns/pid'%pid)

    if (not args.netns):
        setns.chcontext('/proc/%s/ns/net'%pid)

    if (not args.mntns):
        setns.chcontext('/proc/%s/ns/mnt'%pid)

    # NOTE(review): setns.proc_umount() is called in the parent further down
    # whether or not proc was mounted here - confirm against setns
    if (not os.access('/proc/self',0)):
        setns.proc_mount()

    for (sysctl_file, sysctl_name, sysctl_val) in sysctls:
        for fn in ["/sbin/sysctl", "/usr/sbin/sysctl", "/bin/sysctl", "/usr/bin/sysctl"]:
            if os.path.exists(fn):
                os.system('%s -w %s=%s  >/dev/null 2>&1'%(fn, sysctl_name,sysctl_val))
                break
        else:
            # for/else: reached only when none of the locations exists
            print("Error: image does not have a sysctl binary")

    # cgroups is not yet LXC-safe, so we need to use the coarse grained access control
    # strategy of unmounting the filesystem

    for subsystem in ['cpuset','cpu,cpuacct','memory','devices','freezer','net_cls','blkio','perf_event','systemd']:
        fs_path = '/sys/fs/cgroup/%s'%subsystem
        # the result is ignored on purpose: a failure here implies a failure
        # of the umount of /sys/fs/cgroup just below, which is checked
        umount(fs_path,'-l')

    if (not umount('/sys/fs/cgroup')):
        print("Error disabling cgroup access")
        sys.exit(1)

    # capsh has a --user option starting with f14
    # so if only for f12 we need to fake this one
    #
    # capsh.c does essentially the following when invoked with --user:
    #           pwd = getpwnam(user); ...
    #           ngroups = MAX_GROUPS;
    #           status = getgrouplist(user, pwd->pw_gid, groups, &ngroups); ...
    #           status = setgroups(ngroups, groups); ...
    #           status = setgid(pwd->pw_gid); ...
    #           status = setuid(pwd->pw_uid); ...
    # however we cannot simulate that ourselves because if we did in this process then
    # capsh could not be allowed to mess with caps any more

    def getuid (slicename):
        # uid of that user in the passwd database visible at this point,
        # or None when there is no such user
        import pwd
        try:
            return pwd.getpwnam(slicename).pw_uid
        except KeyError:
            return None

    child_pid = os.fork()

    if (child_pid == 0):
        cap_arg = '--drop='+drop_capabilities

        if (not args.root):
            if (args.nosliceuid):
                # we still want to drop capabilities, but don't want to switch UIDs
                exec_args = [arch,'/usr/sbin/capsh',cap_arg,'--','--login',]+args.command_to_run
            else:
                uid = getuid (slice_name)
                # note that this also refuses a slice user whose uid is 0
                if not uid:
                    print("lxcsu could not spot %s in /etc/passwd - exiting"%slice_name)
                    sys.exit(1)
                exec_args = [arch,'/usr/sbin/capsh',cap_arg,'--uid=%s'%uid,'--','--login',]+args.command_to_run
# once we can drop f12, it would be nicer to instead go for
# exec_args = [arch,'/usr/sbin/capsh',cap_arg,'--user=%s'%slice_name,'--','--login',]+args.command_to_run
        else:
            exec_args = [arch,'/usr/sbin/capsh','--','--login']+args.command_to_run

        os.environ['SHELL'] = '/bin/sh'
        if os.path.exists('/etc/planetlab/lib/bind_public.so'):
            os.environ['LD_PRELOAD'] = '/etc/planetlab/lib/bind_public.so'
        if not args.noslicehome:
            os.environ['HOME'] = '/home/%s'%slice_name
            os.chdir("/home/%s"%(slice_name))
        if debug: print('lxcsu:execv: /usr/bin/setarch %s'%(exec_args,))
        # exec_args[0] (the arch name) is argv[0] of setarch
        os.execv('/usr/bin/setarch',exec_args)
    else:
        setns.proc_umount()
        _,status = os.waitpid(child_pid,0)
        sys.exit(os.WEXITSTATUS(status))
397
# run main() only when this file is executed as a script
if __name__ == '__main__':
        main()