X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fsys.c;h=8012768627c5303ef59f9229ae5490e6e6e95947;hb=9464c7cf61b9433057924c36e6e02f303a00e768;hp=8d53d8f7cfadaf150f1fef466487f3a9db5fc61e;hpb=41689045f6a3cbe0550e1d34e9cc20d2e8c432ba;p=linux-2.6.git

diff --git a/kernel/sys.c b/kernel/sys.c
index 8d53d8f7c..801276862 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1991, 1992 Linus Torvalds
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -13,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -58,12 +60,6 @@
 #ifndef GET_FPEXC_CTL
 # define GET_FPEXC_CTL(a,b)	(-EINVAL)
 #endif
-#ifndef GET_ENDIAN
-# define GET_ENDIAN(a,b)	(-EINVAL)
-#endif
-#ifndef SET_ENDIAN
-# define SET_ENDIAN(a,b)	(-EINVAL)
-#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -139,15 +135,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
 		unsigned long val, void *v)
 {
 	int ret = NOTIFY_DONE;
-	struct notifier_block *nb, *next_nb;
+	struct notifier_block *nb;
 
 	nb = rcu_dereference(*nl);
 	while (nb) {
-		next_nb = rcu_dereference(nb->next);
 		ret = nb->notifier_call(nb, val, v);
 		if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
 			break;
-		nb = next_nb;
+		nb = rcu_dereference(nb->next);
 	}
 	return ret;
 }
@@ -596,7 +591,7 @@ void emergency_restart(void)
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
 
-static void kernel_restart_prepare(char *cmd)
+void kernel_restart_prepare(char *cmd)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
@@ -630,7 +625,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
  * Move into place and start executing a preloaded standalone
 * executable. If nothing was preloaded return an error.
 */
-static void kernel_kexec(void)
+void kernel_kexec(void)
 {
 #ifdef CONFIG_KEXEC
 	struct kimage *image;
@@ -644,6 +639,7 @@ static void kernel_kexec(void)
 	machine_kexec(image);
 #endif
 }
+EXPORT_SYMBOL_GPL(kernel_kexec);
 
 void kernel_shutdown_prepare(enum system_states state)
 {
@@ -1396,6 +1392,7 @@ asmlinkage long sys_setsid(void)
 	pid_t session;
 	int err = -EPERM;
 
+	mutex_lock(&tty_mutex);
 	write_lock_irq(&tasklist_lock);
 
 	/* Fail if I am already a session leader */
@@ -1415,15 +1412,12 @@ asmlinkage long sys_setsid(void)
 
 	group_leader->signal->leader = 1;
 	__set_special_pids(session, session);
-
-	spin_lock(&group_leader->sighand->siglock);
 	group_leader->signal->tty = NULL;
 	group_leader->signal->tty_old_pgrp = 0;
-	spin_unlock(&group_leader->sighand->siglock);
-
 	err = process_group(group_leader);
 out:
 	write_unlock_irq(&tasklist_lock);
+	mutex_unlock(&tty_mutex);
 	return err;
 }
 
@@ -1889,20 +1883,23 @@ out:
  * fields when reaping, so a sample either gets all the additions of a
  * given child after it's reaped, or none so this sample is before reaping.
  *
- * Locking:
- * We need to take the siglock for CHILDEREN, SELF and BOTH
- * for the cases current multithreaded, non-current single threaded
- * non-current multithreaded. Thread traversal is now safe with
- * the siglock held.
- * Strictly speaking, we donot need to take the siglock if we are current and
- * single threaded, as no one else can take our signal_struct away, no one
- * else can reap the children to update signal->c* counters, and no one else
- * can race with the signal-> fields. If we do not take any lock, the
- * signal-> fields could be read out of order while another thread was just
- * exiting. So we should place a read memory barrier when we avoid the lock.
- * On the writer side, write memory barrier is implied in __exit_signal
- * as __exit_signal releases the siglock spinlock after updating the signal->
- * fields. But we don't do this yet to keep things simple.
+ * tasklist_lock locking optimisation:
+ * If we are current and single threaded, we do not need to take the tasklist
+ * lock or the siglock. No one else can take our signal_struct away,
+ * no one else can reap the children to update signal->c* counters, and
+ * no one else can race with the signal-> fields.
+ * If we do not take the tasklist_lock, the signal-> fields could be read
+ * out of order while another thread was just exiting. So we place a
+ * read memory barrier when we avoid the lock. On the writer side,
+ * write memory barrier is implied in __exit_signal as __exit_signal releases
+ * the siglock spinlock after updating the signal-> fields.
+ *
+ * We don't really need the siglock when we access the non c* fields
+ * of the signal_struct (for RUSAGE_SELF) even in multithreaded
+ * case, since we take the tasklist lock for read and the non c* signal->
+ * fields are updated only in __exit_signal, which is called with
+ * tasklist_lock taken for write, hence these two threads cannot execute
+ * concurrently.
  *
  */
 
@@ -1911,25 +1908,35 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	struct task_struct *t;
 	unsigned long flags;
 	cputime_t utime, stime;
+	int need_lock = 0;
 
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
 
-	rcu_read_lock();
-	if (!lock_task_sighand(p, &flags)) {
-		rcu_read_unlock();
-		return;
-	}
+	if (p != current || !thread_group_empty(p))
+		need_lock = 1;
+
+	if (need_lock) {
+		read_lock(&tasklist_lock);
+		if (unlikely(!p->signal)) {
+			read_unlock(&tasklist_lock);
+			return;
+		}
+	} else
+		/* See locking comments above */
+		smp_rmb();
 
 	switch (who) {
 		case RUSAGE_BOTH:
 		case RUSAGE_CHILDREN:
+			spin_lock_irqsave(&p->sighand->siglock, flags);
 			utime = p->signal->cutime;
 			stime = p->signal->cstime;
 			r->ru_nvcsw = p->signal->cnvcsw;
 			r->ru_nivcsw = p->signal->cnivcsw;
 			r->ru_minflt = p->signal->cmin_flt;
 			r->ru_majflt = p->signal->cmaj_flt;
+			spin_unlock_irqrestore(&p->sighand->siglock, flags);
 
 			if (who == RUSAGE_CHILDREN)
 				break;
@@ -1957,9 +1964,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			BUG();
 	}
 
-	unlock_task_sighand(p, &flags);
-	rcu_read_unlock();
-
+	if (need_lock)
+		read_unlock(&tasklist_lock);
 	cputime_to_timeval(utime, &r->ru_utime);
 	cputime_to_timeval(stime, &r->ru_stime);
 }
@@ -2074,13 +2080,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 				return -EFAULT;
 			return 0;
 		}
-		case PR_GET_ENDIAN:
-			error = GET_ENDIAN(current, arg2);
-			break;
-		case PR_SET_ENDIAN:
-			error = SET_ENDIAN(current, arg2);
-			break;
-
 		default:
 			error = -EINVAL;
 			break;
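
Illustrative note (not part of the patch): the k_getrusage() hunks above replace the lock_task_sighand()/RCU scheme with a conditional read_lock(&tasklist_lock) and, on the lockless current-and-single-threaded path, an smp_rmb() that pairs with the write barrier implied by the spin_unlock() in __exit_signal(), as the rewritten comment explains. The stand-alone userspace sketch below shows that reader/writer barrier pairing using C11 acquire/release in place of the kernel primitives; struct sample_stats, the "published" flag and the field names are invented for the example and do not correspond to kernel code.

/*
 * Userspace analogue of the barrier pairing described in the comment above:
 * the writer updates plain fields and then publishes with release ordering
 * (the kernel gets this from the spin_unlock() in __exit_signal()); the
 * reader that skips the lock orders its reads with an acquire (standing in
 * for the smp_rmb() added on the lockless path).
 *
 * Build: cc -std=c11 -pthread barrier_sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct sample_stats {
	unsigned long min_flt;
	unsigned long maj_flt;
};

static struct sample_stats stats;
static atomic_int published;		/* zero-initialised: not yet published */

static void *writer(void *unused)
{
	(void)unused;
	/* Update the plain fields first... */
	stats.min_flt = 1;
	stats.maj_flt = 2;
	/* ...then publish with release semantics. */
	atomic_store_explicit(&published, 1, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	if (pthread_create(&tid, NULL, writer, NULL) != 0)
		return 1;

	/* Lockless reader path: wait for publication with acquire ordering. */
	while (!atomic_load_explicit(&published, memory_order_acquire))
		;	/* spin */

	/* Reads of stats.* cannot be reordered before the acquire above. */
	printf("min_flt=%lu maj_flt=%lu\n", stats.min_flt, stats.maj_flt);

	pthread_join(tid, NULL);
	return 0;
}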