upgrade to fedora-2.6.12-1.1398.FC4 + vserver 2.0.rc7
[linux-2.6.git] / kernel / fork.c
index a8fc224..365bcd2 100644
@@ -29,6 +29,7 @@
 #include <linux/mman.h>
 #include <linux/fs.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
@@ -39,6 +40,7 @@
 #include <linux/audit.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
+#include <linux/acct.h>
 #include <linux/vs_network.h>
 #include <linux/vs_limit.h>
 #include <linux/vs_memory.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-/* The idle threads do not count..
- * Protected by write_lock_irq(&tasklist_lock)
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
  */
-int nr_threads;
-
-int max_threads;
 unsigned long total_forks;     /* Handle normal Linux uptimes. */
+int nr_threads;                /* The idle threads do not count.. */
+
+int max_threads;               /* tunable limit on nr_threads */
 
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
+ __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
 EXPORT_SYMBOL(tasklist_lock);
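
The hunk above replaces the open-coded "rwlock_t tasklist_lock ... = RW_LOCK_UNLOCKED" initializer with the DEFINE_RWLOCK() macro, so the lock is declared and initialized in one place. A minimal userspace sketch of the same macro pattern, using POSIX rwlocks rather than the kernel's lock types (illustration only, not the kernel definition):

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's DEFINE_RWLOCK(): the macro expands to
 * a definition plus its static initializer, so callers never repeat the
 * initializer value or get it wrong. */
#define DEFINE_RWLOCK(name) pthread_rwlock_t name = PTHREAD_RWLOCK_INITIALIZER

DEFINE_RWLOCK(tasklist_lock);	/* declared and initialized in one token */

int main(void)
{
	pthread_rwlock_rdlock(&tasklist_lock);
	printf("read-side critical section\n");
	pthread_rwlock_unlock(&tasklist_lock);
	return 0;
}
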
 
@@ -81,6 +83,24 @@ int nr_processes(void)
 static kmem_cache_t *task_struct_cachep;
 #endif
 
+/* SLAB cache for signal_struct structures (tsk->signal) */
+kmem_cache_t *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+kmem_cache_t *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+kmem_cache_t *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+kmem_cache_t *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+kmem_cache_t *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static kmem_cache_t *mm_cachep;
+
 void free_task(struct task_struct *tsk)
 {
        free_thread_info(tsk->thread_info);
@@ -133,6 +153,8 @@ void __init fork_init(unsigned long mempages)
 
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+       init_task.signal->rlim[RLIMIT_SIGPENDING] =
+               init_task.signal->rlim[RLIMIT_NPROC];
 }
 
 static struct task_struct *dup_task_struct(struct task_struct *orig)
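
The two lines added to fork_init() seed RLIMIT_SIGPENDING from RLIMIT_NPROC with a whole-struct assignment, which copies rlim_cur and rlim_max in one statement. A small userspace sketch of that idiom (the 1024 value is an arbitrary stand-in for max_threads/2):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	/* struct rlimit is a plain aggregate, so assignment copies both the
	 * soft (rlim_cur) and hard (rlim_max) limits at once. */
	struct rlimit nproc = { .rlim_cur = 1024, .rlim_max = 1024 };
	struct rlimit sigpending = nproc;	/* mirrors the fork_init() line */

	printf("cur=%llu max=%llu\n",
	       (unsigned long long)sigpending.rlim_cur,
	       (unsigned long long)sigpending.rlim_max);
	return 0;
}
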
@@ -178,8 +200,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
        mm->mmap_cache = NULL;
        mm->free_area_cache = oldmm->mmap_base;
        mm->map_count = 0;
-       mm->rss = 0;
-       mm->anon_rss = 0;
+       __set_mm_counter(mm, rss, 0);
+       __set_mm_counter(mm, anon_rss, 0);
        cpus_clear(mm->cpu_vm_mask);
        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
@@ -223,6 +245,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
       
                        /* insert tmp into the share list, just after mpnt */
                        spin_lock(&file->f_mapping->i_mmap_lock);
+                       tmp->vm_truncate_count = mpnt->vm_truncate_count;
                        flush_dcache_mmap_lock(file->f_mapping);
                        vma_prio_tree_add(tmp, mpnt);
                        flush_dcache_mmap_unlock(file->f_mapping);
@@ -284,7 +307,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 #define mm_free_pgd(mm)
 #endif /* CONFIG_MMU */
 
-spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
 #define allocate_mm()  (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
 #define free_mm(mm)    (kmem_cache_free(mm_cachep, (mm)))
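
dup_mmap() above and copy_mm() below stop touching mm->rss directly and go through __set_mm_counter()/get_mm_counter() instead, so the counter representation can change (for example, to a config-dependent atomic type) without editing every call site. A hypothetical, simplified model of such accessors; the real kernel macros differ:

#include <stdio.h>

/* Simplified mm counters behind accessor macros; the token-pasted _##member
 * keeps direct field access out of callers. */
struct mm_lite {
	unsigned long _rss;
	unsigned long _anon_rss;
};

#define __set_mm_counter(mm, member, value) ((mm)->_##member = (value))
#define get_mm_counter(mm, member)          ((mm)->_##member)

int main(void)
{
	struct mm_lite mm;

	__set_mm_counter(&mm, rss, 0);		/* as in dup_mmap() */
	__set_mm_counter(&mm, anon_rss, 0);
	printf("rss=%lu anon_rss=%lu\n",
	       get_mm_counter(&mm, rss), get_mm_counter(&mm, anon_rss));
	return 0;
}
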
@@ -477,6 +500,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
        if (retval)
                goto free_pt;
 
+       mm->hiwater_rss = get_mm_counter(mm,rss);
+       mm->hiwater_vm = mm->total_vm;
+
 good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
@@ -492,6 +518,7 @@ fail_nocontext:
         * If init_new_context() failed, we cannot use mmput() to free the mm
         * because it calls destroy_context()
         */
+       clr_vx_info(&mm->mm_vx_info);
        mm_free_pgd(mm);
        free_mm(mm);
        return retval;
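
The two lines added before good_mm: seed the new high-watermark fields from the counters that dup_mmap() just populated, so the child's recorded peaks start at its current usage rather than at zero. A sketch of the watermark idiom, with made-up page counts:

#include <stdio.h>

struct usage {
	unsigned long rss;		/* current resident pages */
	unsigned long hiwater_rss;	/* peak ever observed */
};

/* Ratchet: raises the peak, never lowers it. */
static void update_hiwater(struct usage *u)
{
	if (u->rss > u->hiwater_rss)
		u->hiwater_rss = u->rss;
}

int main(void)
{
	struct usage u = { .rss = 40 };

	u.hiwater_rss = u.rss;	/* fork-time seeding, as in copy_mm() */
	u.rss = 75;
	update_hiwater(&u);
	u.rss = 30;		/* shrinking leaves the watermark alone */
	update_hiwater(&u);
	printf("peak rss = %lu pages\n", u.hiwater_rss);
	return 0;
}
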
@@ -558,7 +585,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 {
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
-       int open_files, nfds, size, i, error = 0;
+       int open_files, size, i, error = 0, expand;
 
        /*
         * A background process may not have any files ...
@@ -593,36 +620,32 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
        newf->open_fds      = &newf->open_fds_init;
        newf->fd            = &newf->fd_array[0];
 
-       /* We don't yet have the oldf readlock, but even if the old
-           fdset gets grown now, we'll only copy up to "size" fds */
-       size = oldf->max_fdset;
-       if (size > __FD_SETSIZE) {
-               newf->max_fdset = 0;
-               spin_lock(&newf->file_lock);
-               error = expand_fdset(newf, size-1);
-               spin_unlock(&newf->file_lock);
-               if (error)
-                       goto out_release;
-       }
        spin_lock(&oldf->file_lock);
 
-       open_files = count_open_files(oldf, size);
+       open_files = count_open_files(oldf, oldf->max_fdset);
+       expand = 0;
 
        /*
-        * Check whether we need to allocate a larger fd array.
-        * Note: we're not a clone task, so the open count won't
-        * change.
+        * Check whether we need to allocate a larger fd array or fd set.
+        * Note: we're not a clone task, so the open count won't change.
         */
-       nfds = NR_OPEN_DEFAULT;
-       if (open_files > nfds) {
-               spin_unlock(&oldf->file_lock);
+       if (open_files > newf->max_fdset) {
+               newf->max_fdset = 0;
+               expand = 1;
+       }
+       if (open_files > newf->max_fds) {
                newf->max_fds = 0;
+               expand = 1;
+       }
+
+       /* if the old fdset gets grown now, we'll only copy up to "size" fds */
+       if (expand) {
+               spin_unlock(&oldf->file_lock);
                spin_lock(&newf->file_lock);
-               error = expand_fd_array(newf, open_files-1);
+               error = expand_files(newf, open_files-1);
                spin_unlock(&newf->file_lock);
-               if (error)
+               if (error < 0)
                        goto out_release;
-               nfds = newf->max_fds;
                spin_lock(&oldf->file_lock);
        }
 
@@ -636,6 +659,8 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
                struct file *f = *old_fds++;
                if (f) {
                        get_file(f);
+                       /* FIXME sum it first for check and performance */
+                       vx_openfd_inc(open_files - i);
                } else {
                        /*
                         * The fd may be claimed in the fd bitmap but not yet
@@ -671,6 +696,7 @@ out:
 out_release:
        free_fdset (newf->close_on_exec, newf->max_fdset);
        free_fdset (newf->open_fds, newf->max_fdset);
+       free_fd_array(newf->fd, newf->max_fds);
        kmem_cache_free(files_cachep, newf);
        goto out;
 }
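
The copy_files() rework above folds the two old grow paths (expand_fdset() and expand_fd_array()) into one: each shortfall zeroes the corresponding max_* field and sets a single expand flag, and one expand_files() call then rebuilds whatever was zeroed. A simplified userspace sketch of just the sizing decision; the struct and helper are made up for illustration:

#include <stdio.h>

struct files_lite {
	int max_fds;	/* slots in the struct file * array */
	int max_fdset;	/* bits in the open-fd bitmaps */
};

/* Returns nonzero when either structure is too small; zeroing the stale
 * size forces the (hypothetical) expand step to reallocate it. */
static int needs_expand(struct files_lite *newf, int open_files)
{
	int expand = 0;

	if (open_files > newf->max_fdset) {
		newf->max_fdset = 0;
		expand = 1;
	}
	if (open_files > newf->max_fds) {
		newf->max_fds = 0;
		expand = 1;
	}
	return expand;
}

int main(void)
{
	struct files_lite newf = { .max_fds = 32, .max_fdset = 1024 };

	/* 200 open files: the fd array is too small, the bitmap is fine. */
	printf("expand needed: %d\n", needs_expand(&newf, 200));
	return 0;
}
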
@@ -725,6 +751,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
 {
        struct signal_struct *sig;
+       int ret;
 
        if (clone_flags & CLONE_THREAD) {
                atomic_inc(&current->signal->count);
@@ -735,31 +762,61 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;
+
+       ret = copy_thread_group_keys(tsk);
+       if (ret < 0) {
+               kmem_cache_free(signal_cachep, sig);
+               return ret;
+       }
+
        atomic_set(&sig->count, 1);
        atomic_set(&sig->live, 1);
-       sig->group_exit = 0;
+       init_waitqueue_head(&sig->wait_chldexit);
+       sig->flags = 0;
        sig->group_exit_code = 0;
        sig->group_exit_task = NULL;
        sig->group_stop_count = 0;
-       sig->stop_state = 0;
        sig->curr_target = NULL;
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);
 
+       sig->it_real_value = sig->it_real_incr = 0;
+       sig->real_timer.function = it_real_fn;
+       sig->real_timer.data = (unsigned long) tsk;
+       init_timer(&sig->real_timer);
+
+       sig->it_virt_expires = cputime_zero;
+       sig->it_virt_incr = cputime_zero;
+       sig->it_prof_expires = cputime_zero;
+       sig->it_prof_incr = cputime_zero;
+
        sig->tty = current->signal->tty;
        sig->pgrp = process_group(current);
        sig->session = current->signal->session;
        sig->leader = 0;        /* session leadership doesn't inherit */
        sig->tty_old_pgrp = 0;
 
-       sig->utime = sig->stime = sig->cutime = sig->cstime = 0;
+       sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+       sig->sched_time = 0;
+       INIT_LIST_HEAD(&sig->cpu_timers[0]);
+       INIT_LIST_HEAD(&sig->cpu_timers[1]);
+       INIT_LIST_HEAD(&sig->cpu_timers[2]);
 
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);
 
+       if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+               /*
+                * New sole thread in the process gets an expiry time
+                * of the whole CPU time limit.
+                */
+               tsk->it_prof_expires =
+                       secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+       }
+
        return 0;
 }
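
The copy_signal() changes above move the per-process timer state to the opaque cputime_t convention: values are produced and compared only through helpers such as cputime_zero, cputime_eq() and secs_to_cputime(), never with bare integer operators. A userspace model of that convention and of the RLIMIT_CPU arming at the end of the function; the HZ=100 scaling is an assumption for illustration:

#include <stdio.h>

typedef unsigned long long cputime_t;

#define cputime_zero		((cputime_t)0)
#define cputime_eq(a, b)	((a) == (b))
#define secs_to_cputime(s)	((cputime_t)(s) * 100)	/* pretend HZ=100 */
#define RLIM_INFINITY_MODEL	(~0UL)

int main(void)
{
	cputime_t it_prof_expires = cputime_zero;
	unsigned long rlim_cpu_cur = 60;	/* RLIMIT_CPU soft limit, seconds */

	/* Mirrors the tail of copy_signal(): a finite CPU limit arms the
	 * new thread group's profiling expiry at the whole limit. */
	if (rlim_cpu_cur != RLIM_INFINITY_MODEL)
		it_prof_expires = secs_to_cputime(rlim_cpu_cur);

	printf("armed: %d\n", !cputime_eq(it_prof_expires, cputime_zero));
	return 0;
}
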
 
@@ -800,6 +857,7 @@ static task_t *copy_process(unsigned long clone_flags,
        int retval;
        struct task_struct *p = NULL;
        struct vx_info *vxi;
+       struct nx_info *nxi;
 
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);
@@ -842,7 +900,7 @@ static task_t *copy_process(unsigned long clone_flags,
                        goto bad_fork_free;
        }
        if (p->mm && vx_flags(VXF_FORK_RSS, 0)) {
-               if (!vx_rsspages_avail(p->mm, p->mm->rss))
+               if (!vx_rsspages_avail(p->mm, get_mm_counter(p->mm, rss)))
                        goto bad_fork_cleanup_vm;
        }
 
@@ -859,7 +917,23 @@
                        goto bad_fork_free;
        }
        if (p->mm && vx_flags(VXF_FORK_RSS, 0)) {
-               if (!vx_rsspages_avail(p->mm, p->mm->rss))
+               if (!vx_rsspages_avail(p->mm, get_mm_counter(p->mm, rss)))
+                       goto bad_fork_cleanup_vm;
+       }
+
+       init_vx_info(&p->vx_info, current->vx_info);
+       p->nx_info = NULL;
+       set_nx_info(&p->nx_info, current->nx_info);
+
+       /* check vserver memory */
+       if (p->mm && !(clone_flags & CLONE_VM)) {
+               if (vx_vmpages_avail(p->mm, p->mm->total_vm))
+                       vx_pages_add(p->mm->mm_vx_info, RLIMIT_AS, p->mm->total_vm);
+               else
+                       goto bad_fork_free;
+       }
+       if (p->mm && vx_flags(VXF_FORK_RSS, 0)) {
+               if (!vx_rsspages_avail(p->mm, get_mm_counter(p->mm, rss)))
                        goto bad_fork_cleanup_vm;
        }
 
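The added vserver block charges the child's address-space pages to the context before the fork commits, and refuses the fork when the context limit would be exceeded; the rss check only gates, it does not charge. The vx_* helpers come from the vserver patch, so the following is only a simplified model of the charge-or-fail pattern:

#include <stdio.h>

struct ctx_limit {
	unsigned long used;	/* pages already charged to the context */
	unsigned long max;	/* context-wide ceiling (RLIMIT_AS here) */
};

/* Charge on success, fail without side effects otherwise; the caller
 * would take the bad_fork_free error path on failure. */
static int charge_pages(struct ctx_limit *l, unsigned long pages)
{
	if (l->used + pages > l->max)
		return -1;
	l->used += pages;
	return 0;
}

int main(void)
{
	struct ctx_limit as = { .used = 900, .max = 1000 };

	printf("fork #1: %s\n", charge_pages(&as, 80) ? "refused" : "charged");
	printf("fork #2: %s\n", charge_pages(&as, 80) ? "refused" : "charged");
	return 0;
}
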
@@ -904,7 +1008,6 @@ static task_t *copy_process(unsigned long clone_flags,
 
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
-       init_waitqueue_head(&p->wait_chldexit);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);
        spin_lock_init(&p->proc_lock);
@@ -912,12 +1015,22 @@ static task_t *copy_process(unsigned long clone_flags,
        clear_tsk_thread_flag(p, TIF_SIGPENDING);
        init_sigpending(&p->pending);
 
-       p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
-       p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
-       init_timer(&p->real_timer);
-       p->real_timer.data = (unsigned long) p;
+       p->utime = cputime_zero;
+       p->stime = cputime_zero;
+       p->sched_time = 0;
+       p->rchar = 0;           /* I/O counter: bytes read */
+       p->wchar = 0;           /* I/O counter: bytes written */
+       p->syscr = 0;           /* I/O counter: read syscalls */
+       p->syscw = 0;           /* I/O counter: write syscalls */
+       acct_clear_integrals(p);
+
+       p->it_virt_expires = cputime_zero;
+       p->it_prof_expires = cputime_zero;
+       p->it_sched_expires = 0;
+       INIT_LIST_HEAD(&p->cpu_timers[0]);
+       INIT_LIST_HEAD(&p->cpu_timers[1]);
+       INIT_LIST_HEAD(&p->cpu_timers[2]);
 
-       p->utime = p->stime = 0;
        p->lock_depth = -1;             /* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
        p->security = NULL;
@@ -1032,7 +1145,7 @@ static task_t *copy_process(unsigned long clone_flags,
                 * do not create this new thread - the whole thread
                 * group is supposed to exit anyway.
                 */
-               if (current->signal->group_exit) {
+               if (current->signal->flags & SIGNAL_GROUP_EXIT) {
                        spin_unlock(&current->sighand->siglock);
                        write_unlock_irq(&tasklist_lock);
                        retval = -EAGAIN;
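
The hunk above folds the old signal->group_exit boolean into the new signal->flags word, so the exit-in-progress test becomes a bitmask check. A sketch of that pattern; the flag value here is illustrative, not the kernel's:

#include <stdio.h>

#define SIGNAL_GROUP_EXIT 0x00000004	/* hypothetical bit value */

struct signal_lite {
	unsigned int flags;	/* replaces several standalone booleans */
};

int main(void)
{
	struct signal_lite sig = { 0 };

	sig.flags |= SIGNAL_GROUP_EXIT;		/* group exit in progress */
	if (sig.flags & SIGNAL_GROUP_EXIT)
		printf("refuse new thread: -EAGAIN\n");
	return 0;
}
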
@@ -1050,6 +1163,21 @@ static task_t *copy_process(unsigned long clone_flags,
                        set_tsk_thread_flag(p, TIF_SIGPENDING);
                }
 
+               if (!cputime_eq(current->signal->it_virt_expires,
+                               cputime_zero) ||
+                   !cputime_eq(current->signal->it_prof_expires,
+                               cputime_zero) ||
+                   current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
+                   !list_empty(&current->signal->cpu_timers[0]) ||
+                   !list_empty(&current->signal->cpu_timers[1]) ||
+                   !list_empty(&current->signal->cpu_timers[2])) {
+                       /*
+                        * Have child wake up on its first tick to check
+                        * for process CPU timers.
+                        */
+                       p->it_prof_expires = jiffies_to_cputime(1);
+               }
+
                spin_unlock(&current->sighand->siglock);
        }
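
The block just added does not try to split armed process-wide CPU timers between parent and child; it simply gives the child a one-tick profiling expiry, so the child's first timer interrupt re-runs the shared-timer bookkeeping. A model of that decision, with jiffies_to_cputime(1) represented as the value 1 and RLIM_INFINITY as ~0UL:

#include <stdio.h>

typedef unsigned long cputime_t;

struct shared_timers {
	cputime_t it_virt_expires;
	cputime_t it_prof_expires;
	int cpu_timer_lists_nonempty;	/* any of cpu_timers[0..2] non-empty */
	unsigned long rlim_cpu_cur;	/* ~0UL models RLIM_INFINITY */
};

/* Nonzero result means "check process CPU timers on the first tick". */
static cputime_t child_prof_expiry(const struct shared_timers *s)
{
	if (s->it_virt_expires || s->it_prof_expires ||
	    s->rlim_cpu_cur != ~0UL || s->cpu_timer_lists_nonempty)
		return 1;	/* jiffies_to_cputime(1) */
	return 0;
}

int main(void)
{
	struct shared_timers s = { 0, 0, 0, ~0UL };

	printf("nothing armed  -> expiry %lu\n", child_prof_expiry(&s));
	s.rlim_cpu_cur = 300;	/* RLIMIT_CPU set */
	printf("RLIMIT_CPU set -> expiry %lu\n", child_prof_expiry(&s));
	return 0;
}
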
 
@@ -1057,6 +1185,8 @@ static task_t *copy_process(unsigned long clone_flags,
        if (unlikely(p->ptrace & PT_PTRACED))
                __ptrace_link(p, current->parent);
 
+       cpuset_fork(p);
+
        attach_pid(p, PIDTYPE_PID, p->pid);
        attach_pid(p, PIDTYPE_TGID, p->tgid);
        if (thread_group_leader(p)) {
@@ -1068,12 +1198,19 @@ static task_t *copy_process(unsigned long clone_flags,
 
        p->ioprio = current->ioprio;
        nr_threads++;
+       total_forks++;
+
        /* p is copy of current */
        vxi = p->vx_info;
        if (vxi) {
+               claim_vx_info(vxi, p);
                atomic_inc(&vxi->cvirt.nr_threads);
+               atomic_inc(&vxi->cvirt.total_forks);
                vx_nproc_inc(p);
        }
+       nxi = p->nx_info;
+       if (nxi)
+               claim_nx_info(nxi, p);
        write_unlock_irq(&tasklist_lock);
        retval = 0;
 
@@ -1210,7 +1347,6 @@ long do_fork(unsigned long clone_flags,
                        wake_up_new_task(p, clone_flags);
                else
                        p->state = TASK_STOPPED;
-               ++total_forks;
 
                if (unlikely (trace)) {
                        current->ptrace_message = pid;
@@ -1229,24 +1365,6 @@ long do_fork(unsigned long clone_flags,
        return pid;
 }
 
-/* SLAB cache for signal_struct structures (tsk->signal) */
-kmem_cache_t *signal_cachep;
-
-/* SLAB cache for sighand_struct structures (tsk->sighand) */
-kmem_cache_t *sighand_cachep;
-
-/* SLAB cache for files_struct structures (tsk->files) */
-kmem_cache_t *files_cachep;
-
-/* SLAB cache for fs_struct structures (tsk->fs) */
-kmem_cache_t *fs_cachep;
-
-/* SLAB cache for vm_area_struct structures */
-kmem_cache_t *vm_area_cachep;
-
-/* SLAB cache for mm_struct structures (tsk->mm) */
-kmem_cache_t *mm_cachep;
-
 void __init proc_caches_init(void)
 {
        sighand_cachep = kmem_cache_create("sighand_cache",