/* Copyright 2005 Princeton University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PRINCETON UNIVERSITY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H # include #endif #include #include #include #include #include #include #include #include #include #include #define _GNU_SOURCE #include #include "vserver.h" #include "planetlab.h" #ifndef VC_NXC_RAW_SOCKET # define VC_NXC_RAW_SOCKET 0x00000200ull #endif #ifndef VC_NXC_RAW_SEND # define VC_NXC_RAW_SEND 0x00000400ull #endif #ifndef VC_NXF_LBACK_ALLOW # define VC_NXF_LBACK_ALLOW 0x00000400ull #endif static int create_context(xid_t ctx, uint64_t bcaps) { struct vc_ctx_caps vc_caps; struct vc_net_flags vc_nf; struct vc_net_caps vc_ncaps; uint32_t unshare_mask; /* Create network context */ if (vc_net_create(ctx) == VC_NOCTX) { if (errno == EEXIST) goto tag; return -1; } /* Make the network context persistent */ vc_nf.mask = vc_nf.flagword = VC_NXF_PERSISTENT | VC_NXF_LBACK_ALLOW; if (vc_set_nflags(ctx, &vc_nf)) return -1; /* Give it raw sockets capabilities */ vc_ncaps.ncaps = vc_ncaps.cmask = VC_NXC_RAW_ICMP | VC_NXC_RAW_SOCKET; if (vc_set_ncaps(ctx, &vc_ncaps)) return -1; tag: /* Create tag context */ if (vc_tag_create(ctx) == VC_NOCTX) return -1; process: /* * Create context info - this sets the STATE_SETUP and STATE_INIT flags. */ if (vc_ctx_create(ctx, 0) == VC_NOCTX) return -1; /* Unshare the net namespace if the slice if requested in the local slice configuration */ unshare_mask = get_space_flag(ctx); if (unshare_mask != 0) { sys_unshare(unshare_mask); vc_set_namespace(ctx, unshare_mask); } /* Set capabilities - these don't take effect until SETUP flag is unset */ vc_caps.bcaps = bcaps; vc_caps.bmask = ~0ULL; /* currently unused */ vc_caps.ccaps = 0; /* don't want any of these */ vc_caps.cmask = ~0ULL; if (vc_set_ccaps(ctx, &vc_caps)) return -1; if (pl_setsched(ctx, 0, 1) < 0) { PERROR("pl_setsched(%u)", ctx); exit(1); } return 0; } int pl_setup_done(xid_t ctx) { struct vc_ctx_flags vc_flags; /* unset SETUP flag - this allows other processes to migrate */ /* set the PERSISTENT flag - so the context doesn't vanish */ /* Don't clear the STATE_INIT flag, as that would make us the init task. */ vc_flags.mask = VC_VXF_STATE_SETUP|VC_VXF_PERSISTENT; vc_flags.flagword = VC_VXF_PERSISTENT; if (vc_set_cflags(ctx, &vc_flags)) return -1; return 0; } #define RETRY_LIMIT 10 int pl_chcontext(xid_t ctx, uint64_t bcaps, const struct sliver_resources *slr) { int retry_count = 0; int net_migrated = 0; if (pl_set_ulimits(slr) != 0) return -1; for (;;) { struct vc_ctx_flags vc_flags; if (vc_get_cflags(ctx, &vc_flags)) { if (errno != ESRCH) return -1; /* context doesn't exist - create it */ if (create_context(ctx, bcaps)) { if (errno == EEXIST) /* another process beat us in a race */ goto migrate; if (errno == EBUSY) /* another process is creating - poll the SETUP flag */ continue; return -1; } /* created context and migrated to it i.e., we're done */ return 1; } /* check the SETUP flag */ if (vc_flags.flagword & VC_VXF_STATE_SETUP) { /* context is still being setup - wait a while then retry */ if (retry_count++ >= RETRY_LIMIT) { errno = EBUSY; return -1; } sleep(1); continue; } /* context has been setup */ migrate: if (net_migrated || !vc_net_migrate(ctx)) { uint32_t unshare_mask; /* Unshare the net namespace if the slice if requested in the local slice configuration */ unshare_mask = get_space_flag(ctx); if (unshare_mask != 0) { vc_enter_namespace(ctx, unshare_mask); } if (!vc_tag_migrate(ctx) && !vc_ctx_migrate(ctx, 0)) break; /* done */ net_migrated = 1; } /* context disappeared - retry */ } return 0; } /* it's okay for a syscall to fail because the context doesn't exist */ #define VC_SYSCALL(x) \ do \ { \ if (x) \ return errno == ESRCH ? 0 : -1; \ } \ while (0) int pl_setsched(xid_t ctx, uint32_t cpu_min, uint32_t cpu_share) { struct vc_set_sched vc_sched; struct vc_ctx_flags vc_flags; vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS | VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX | VC_VXSM_MSEC | VC_VXSM_FILL_RATE2 | VC_VXSM_INTERVAL2 | VC_VXSM_FORCE); vc_sched.fill_rate = cpu_min; /* percent reserved */ vc_sched.interval = 100; vc_sched.fill_rate2 = cpu_share; /* best-effort fair share of unreserved */ vc_sched.interval2 = 1000; /* milliseconds */ vc_sched.tokens = 100; /* initial allocation of tokens */ vc_sched.tokens_min = 50; /* need this many tokens to run */ vc_sched.tokens_max = 100; /* max accumulated number of tokens */ if (cpu_share) { if (cpu_share == (uint32_t)VC_LIM_KEEP) vc_sched.set_mask &= ~(VC_VXSM_FILL_RATE|VC_VXSM_FILL_RATE2); else vc_sched.set_mask |= VC_VXSM_IDLE_TIME; } VC_SYSCALL(vc_set_sched(ctx, &vc_sched)); vc_flags.mask = VC_VXF_SCHED_FLAGS; vc_flags.flagword = VC_VXF_SCHED_HARD; VC_SYSCALL(vc_set_cflags(ctx, &vc_flags)); return 0; } enum { TYPE_LONG = 1, TYPE_PERS = 2, }; struct pl_resources { char *name; unsigned type; union { unsigned long long *limit; unsigned long int *personality; }; }; #define WHITESPACE(buffer,index,len) \ while(isspace((int)buffer[index])) \ if (index < len) index++; else goto out; #define VSERVERCONF "/etc/vservers/" void pl_get_limits(const char *context, struct sliver_resources *slr) { FILE *fb; int cwd; size_t len = strlen(VSERVERCONF) + strlen(context) + NULLBYTE_SIZE; char *conf = (char *)malloc(len + strlen("rlimits/openfd.hard")); struct pl_resources *r; struct pl_resources sliver_list[] = { {"sched/fill-rate2", TYPE_LONG, &slr->vs_cpu}, {"rlimits/nproc.hard", TYPE_LONG, &slr->vs_nproc.hard}, {"rlimits/nproc.soft", TYPE_LONG, &slr->vs_nproc.soft}, {"rlimits/nproc.min", TYPE_LONG, &slr->vs_nproc.min}, {"rlimits/rss.hard", TYPE_LONG, &slr->vs_rss.hard}, {"rlimits/rss.soft", TYPE_LONG, &slr->vs_rss.soft}, {"rlimits/rss.min", TYPE_LONG, &slr->vs_rss.min}, {"rlimits/as.hard", TYPE_LONG, &slr->vs_as.hard}, {"rlimits/as.soft", TYPE_LONG, &slr->vs_as.soft}, {"rlimits/as.min", TYPE_LONG, &slr->vs_as.min}, {"rlimits/openfd.hard", TYPE_LONG, &slr->vs_openfd.hard}, {"rlimits/openfd.soft", TYPE_LONG, &slr->vs_openfd.soft}, {"rlimits/openfd.min", TYPE_LONG, &slr->vs_openfd.min}, {"personality", TYPE_PERS, &slr->personality}, {0,0} }; sprintf(conf, "%s%s", VSERVERCONF, context); slr->vs_rss.hard = VC_LIM_KEEP; slr->vs_rss.soft = VC_LIM_KEEP; slr->vs_rss.min = VC_LIM_KEEP; slr->vs_as.hard = VC_LIM_KEEP; slr->vs_as.soft = VC_LIM_KEEP; slr->vs_as.min = VC_LIM_KEEP; slr->vs_nproc.hard = VC_LIM_KEEP; slr->vs_nproc.soft = VC_LIM_KEEP; slr->vs_nproc.min = VC_LIM_KEEP; slr->vs_openfd.hard = VC_LIM_KEEP; slr->vs_openfd.soft = VC_LIM_KEEP; slr->vs_openfd.min = VC_LIM_KEEP; slr->personality = 0; cwd = open(".", O_RDONLY); if (cwd == -1) { perror("cannot get a handle on ."); goto out; } if (chdir(conf) == -1) { fprintf(stderr, "cannot chdir to "); perror(conf); goto out_fd; } for (r = &sliver_list[0]; r->name; r++) { char buf[1000]; fb = fopen(r->name, "r"); if (fb == NULL) continue; if (fgets(buf, sizeof(buf), fb) != NULL) { len=strlen(buf); /* remove trailing newline */ if (buf[len-1] == '\n') { buf[len-1]='\0'; len --; } if ( (r->type == TYPE_LONG) && isdigit(*buf)) { *r->limit = atoi(buf); } else if ( (r->type == TYPE_PERS) && isalpha(*buf)) { unsigned long int res; res = vc_str2personalitytype(buf,len); if (res != VC_BAD_PERSONALITY) { *r->personality = res; } } } fclose(fb); } fchdir(cwd); out_fd: close(cwd); out: free(conf); } int adjust_lim(const struct vc_rlimit *vcr, struct rlimit *lim) { int adjusted = 0; if (vcr->min != VC_LIM_KEEP) { if (vcr->min > lim->rlim_cur) { lim->rlim_cur = vcr->min; adjusted = 1; } if (vcr->min > lim->rlim_max) { lim->rlim_max = vcr->min; adjusted = 1; } } if (vcr->soft != VC_LIM_KEEP) { switch (vcr->min != VC_LIM_KEEP) { case 1: if (vcr->soft < vcr->min) break; case 0: lim->rlim_cur = vcr->soft; adjusted = 1; } } if (vcr->hard != VC_LIM_KEEP) { switch (vcr->min != VC_LIM_KEEP) { case 1: if (vcr->hard < vcr->min) break; case 0: lim->rlim_max = vcr->hard; adjusted = 1; } } return adjusted; } static inline void set_one_ulimit(int resource, const struct vc_rlimit *limit) { struct rlimit lim; getrlimit(resource, &lim); adjust_lim(limit, &lim); setrlimit(resource, &lim); } static inline int set_personality(unsigned long int personality_arg) { if (personality_arg == 0) return 0; if (personality(personality_arg) < 0) { return -1; } return 0; } int pl_set_ulimits(const struct sliver_resources *slr) { if (!slr) return 0; set_one_ulimit(RLIMIT_RSS, &slr->vs_rss); set_one_ulimit(RLIMIT_AS, &slr->vs_as); set_one_ulimit(RLIMIT_NPROC, &slr->vs_nproc); set_one_ulimit(RLIMIT_NOFILE, &slr->vs_openfd); return set_personality(slr->personality); }