--- /dev/null
+/* include/linux/ckrm_mem.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ * (C) Shailabh Nagar, IBM Corp. 2003
+ * (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ *
+ * Memory control functions of the CKRM kernel API
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ * Created.
+ */
+
+#ifndef _LINUX_CKRM_MEM_H
+#define _LINUX_CKRM_MEM_H
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#include <linux/list.h>
+#include <linux/ckrm_rc.h>
+
+typedef struct ckrm_mem_res {
+ unsigned long reclaim_flags;
+ unsigned long flags;
+ struct ckrm_core_class *core; // the core i am part of...
+ struct ckrm_core_class *parent; // parent of the core i am part of....
+ struct ckrm_shares shares;
+ struct list_head mcls_list; // list of all 1-level classes
+ struct list_head shrink_list; // list of classes need to be shrunk
+ atomic_t nr_users; // # of references to this class/data structure
+ atomic_t pg_total; // # of pages used by this class
+ int pg_guar; // # of pages this class is guaranteed
+ int pg_limit; // max # of pages this class can get
+ int pg_borrowed; // # of pages this class borrowed from its parent
+ int pg_lent; // # of pages this class lent to its children
+ int pg_unused; // # of pages left to this class (after giving the
+ // guarantees to children); need to borrow from the parent if
+ // more than this is needed
+ int nr_active[MAX_NR_ZONES];
+ int nr_inactive[MAX_NR_ZONES];
+ int shrink_count;
+ unsigned long last_shrink;
+ int over_limit_failures;
+ int hier; // hierarchy level, root = 0
+} ckrm_mem_res_t;
+
+extern atomic_t ckrm_mem_real_count;
+extern unsigned int ckrm_tot_lru_pages;
+extern struct list_head ckrm_shrink_list;
+extern spinlock_t ckrm_mem_lock;
+extern struct ckrm_res_ctlr mem_rcbs;
+
+#define page_class(page) ((ckrm_mem_res_t*)((page)->memclass))
+
+// used to fill reclaim_flags, used only when memory is low in the system
+#define CLS_CLEAR (0) // class under its guarantee
+#define CLS_OVER_GUAR (1 << 0) // class is over its guarantee
+#define CLS_PARENT_OVER (1 << 1) // parent is over 120% mark over limit
+#define CLS_OVER_75 (1 << 2) // class over 75% mark bet guar(0) & limit(100)
+#define CLS_OVER_100 (1 << 3) // class over its limit
+#define CLS_OVER_110 (1 << 4) // class over 110% mark over limit
+#define CLS_FLAGS_ALL ( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_75 | \
+ CLS_OVER_100 | CLS_OVER_110 )
+#define CLS_SHRINK_BIT (31) // used to both lock and set the bit
+#define CLS_SHRINK (1 << CLS_SHRINK_BIT) // shrink the given class
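+// For illustration (hypothetical numbers, following set_usage_flags() in
+// ckrm_mem.c): with pg_guar = 100 pages and pg_limit = 200, the range between
+// the guarantee (0%) and the limit (100%) is 100 pages, so CLS_OVER_GUAR is
+// set above 100 pages of usage, CLS_OVER_75 above 175, CLS_OVER_100 above 200
+// and CLS_OVER_110 above 210. CLS_PARENT_OVER is set on the children of a
+// class whose total usage crosses the 120% mark while a sizeable part of it
+// has been lent to children.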
+
+// used in flags. set when a class is more than 90% of its maxlimit
+#define MEM_NEAR_LIMIT 1
+
+extern void ckrm_set_aggressive(ckrm_mem_res_t *);
+extern unsigned int ckrm_setup_reclamation(void);
+extern void ckrm_teardown_reclamation(void);
+extern void ckrm_get_reclaim_bits(unsigned int *, unsigned int *);
+extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *);
+extern void ckrm_mem_evaluate_mm(struct mm_struct *);
+extern void ckrm_mem_evaluate_page_byadd(struct page *, struct mm_struct *);
+extern void ckrm_near_limit(ckrm_mem_res_t *);
+#define ckrm_get_reclaim_flags(cls) ((cls)->reclaim_flags)
+
+#else
+
+#define ckrm_init_mm_to_current(a) do {} while (0)
+#define ckrm_mem_evaluate_mm(a) do {} while (0)
+#define ckrm_mem_evaluate_page_byadd(a,b) do {} while (0)
+#define page_class(page) (NULL)
+#define ckrm_get_reclaim_flags(a) (0)
+#define ckrm_setup_reclamation() (0)
+#define ckrm_teardown_reclamation() do {} while (0)
+#define ckrm_get_reclaim_bits(a, b) do { *(a) = 0; *(b)= 0; } while (0)
+#define ckrm_init_mm_to_task(a,b) do {} while (0)
+
+#endif // CONFIG_CKRM_RES_MEM
+
+#endif //_LINUX_CKRM_MEM_H
+
--- /dev/null
+/* include/linux/ckrm_mem_inline.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ * (C) Shailabh Nagar, IBM Corp. 2003
+ * (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ *
+ * Memory control functions of the CKRM kernel API
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ * Created.
+ */
+
+
+#ifndef _LINUX_CKRM_MEM_INLINE_H_
+#define _LINUX_CKRM_MEM_INLINE_H_
+
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/ckrm_mem.h>
+
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#define GET_MEM_CLASS(tsk) \
+ ckrm_get_res_class(tsk->taskclass, mem_rcbs.resid, ckrm_mem_res_t)
+
+#define ckrm_set_shrink(cls) \
+ set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+#define ckrm_test_set_shrink(cls) \
+ test_and_set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+#define ckrm_clear_shrink(cls) \
+ clear_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+
+#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list)
+
+/*
+ * Currently, the class of an address is assigned to the class with max
+ * available guarantee. Simply replace this function for other policies.
+ */
+static inline int
+ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b)
+{
+ if (a == NULL)
+ return -(b != NULL);
+ if (b == NULL)
+ return 0;
+ return (a->pg_unused - b->pg_unused);
+}
+
+static inline void
+mem_class_get(ckrm_mem_res_t *cls)
+{
+ if (cls)
+ atomic_inc(&((cls)->nr_users));
+}
+
+static inline void
+mem_class_put(ckrm_mem_res_t *cls)
+{
+ if (cls && atomic_dec_and_test(&(cls->nr_users)) ) {
+ printk("freeing memclass %p of <core:%s>\n", cls, cls->core->name);
+ //kfree(cls);
+ }
+}
+
+static inline int
+incr_use_count(ckrm_mem_res_t *cls, int borrow)
+{
+ int over_limit;
+
+ atomic_inc(&cls->pg_total);
+ over_limit = (atomic_read(&cls->pg_total) > ((9 * cls->pg_limit) / 10));
+
+ if (borrow)
+ cls->pg_lent++;
+ if ((cls->pg_guar != CKRM_SHARE_DONTCARE) &&
+ (atomic_read(&cls->pg_total) > cls->pg_unused)) {
+ ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+ mem_rcbs.resid, ckrm_mem_res_t);
+ if (parcls) {
+ over_limit |= incr_use_count(parcls, 1);
+ cls->pg_borrowed++;
+ return over_limit;
+ }
+ }
+ atomic_inc(&ckrm_mem_real_count);
+ return over_limit;
+}
+
+static inline void
+decr_use_count(ckrm_mem_res_t *cls, int borrowed)
+{
+ atomic_dec(&cls->pg_total);
+ if (borrowed)
+ cls->pg_lent--;
+ if (cls->pg_borrowed > 0) {
+ ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+ mem_rcbs.resid, ckrm_mem_res_t);
+ if (parcls) {
+ decr_use_count(parcls, 1);
+ cls->pg_borrowed--;
+ return;
+ }
+ }
+ atomic_dec(&ckrm_mem_real_count);
+}
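+
+/*
+ * Accounting sketch (assuming a simple two-level hierarchy): charging a page
+ * to a leaf class that is already past its unused guarantee increments
+ * pg_total and pg_borrowed on the leaf and pg_total/pg_lent on its parent,
+ * walking up until a class absorbs the page without borrowing; only that
+ * topmost class bumps ckrm_mem_real_count. decr_use_count() unwinds the
+ * same chain when the page is uncharged.
+ */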
+
+static inline void
+ckrm_set_page_class(struct page *page, ckrm_mem_res_t *cls)
+{
+ if (mem_rcbs.resid != -1 && cls != NULL) {
+ if (unlikely(page->memclass)) {
+ mem_class_put(page->memclass);
+ }
+ page->memclass = cls;
+ mem_class_get(cls);
+ } else {
+ page->memclass = NULL;
+ }
+}
+
+static inline void
+ckrm_set_pages_class(struct page *pages, int numpages, ckrm_mem_res_t *cls)
+{
+ int i;
+ for (i = 0; i < numpages; pages++, i++) {
+ ckrm_set_page_class(pages, cls);
+ }
+}
+
+static inline void
+ckrm_clear_page_class(struct page *page)
+{
+ if (page->memclass != NULL) {
+ mem_class_put(page->memclass);
+ page->memclass = NULL;
+ }
+}
+
+static inline void
+ckrm_clear_pages_class(struct page *pages, int numpages)
+{
+ int i;
+ for (i = 0; i < numpages; pages++, i++) {
+ ckrm_clear_page_class(pages);
+ }
+}
+
+static inline void
+ckrm_change_page_class(struct page *page, ckrm_mem_res_t *cls)
+{
+ ckrm_clear_page_class(page);
+ ckrm_set_page_class(page, cls);
+}
+
+static inline void
+ckrm_change_pages_class(struct page *pages, int numpages,
+ ckrm_mem_res_t *cls)
+{
+ int i;
+ for (i = 0; i < numpages; pages++, i++) {
+ ckrm_change_page_class(pages, cls);
+ }
+}
+
+static inline void
+ckrm_mem_inc_active(struct page *page)
+{
+ ckrm_mem_res_t *cls = page_class(page);
+ BUG_ON(cls == NULL);
+ cls->nr_active[page_zonenum(page)]++;
+ if (incr_use_count(cls, 0)) {
+ ckrm_near_limit(cls);
+ }
+}
+
+static inline void
+ckrm_mem_dec_active(struct page *page)
+{
+ ckrm_mem_res_t *cls = page_class(page);
+ BUG_ON(cls == NULL);
+ cls->nr_active[page_zonenum(page)]--;
+ decr_use_count(cls, 0);
+}
+
+static inline void
+ckrm_mem_inc_inactive(struct page *page)
+{
+ ckrm_mem_res_t *cls = page_class(page);
+ BUG_ON(cls == NULL);
+ cls->nr_inactive[page_zonenum(page)]++;
+ if (incr_use_count(cls, 0) &&
+ ((cls->flags & MEM_NEAR_LIMIT) != MEM_NEAR_LIMIT)) {
+ ckrm_near_limit(cls);
+ }
+}
+
+static inline void
+ckrm_mem_dec_inactive(struct page *page)
+{
+ ckrm_mem_res_t *cls = page_class(page);
+ BUG_ON(cls == NULL);
+ cls->nr_inactive[page_zonenum(page)]--;
+ decr_use_count(cls, 0);
+}
+
+static inline int
+ckrm_kick_page(struct page *page, unsigned int bits)
+{
+ if (page_class(page) == NULL) {
+ return bits;
+ } else {
+ return (page_class(page)->reclaim_flags & bits);
+ }
+}
+
+static inline int
+ckrm_class_limit_ok(ckrm_mem_res_t *cls)
+{
+ if ((mem_rcbs.resid == -1) || !cls) {
+ return 1;
+ }
+ return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10);
+}
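+
+/*
+ * Note the slack between the threshold used here and the one in
+ * incr_use_count(): incr_use_count() reports "over limit" (which feeds
+ * ckrm_near_limit() and the shrink list) once usage exceeds 90% of pg_limit,
+ * while ckrm_class_limit_ok() still reports the class as OK up to 110% of
+ * pg_limit, giving per-class reclaim a window to catch up before the class
+ * is treated as exceeding its limit.
+ */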
+
+#else // !CONFIG_CKRM_RES_MEM
+
+#define ckrm_set_page_class(a,b) do{}while(0)
+#define ckrm_set_pages_class(a,b,c) do{}while(0)
+#define ckrm_clear_page_class(a) do{}while(0)
+#define ckrm_clear_pages_class(a,b) do{}while(0)
+#define ckrm_change_page_class(a,b) do{}while(0)
+#define ckrm_change_pages_class(a,b,c) do{}while(0)
+#define ckrm_mem_inc_active(a) do{}while(0)
+#define ckrm_mem_dec_active(a) do{}while(0)
+#define ckrm_mem_inc_inactive(a) do{}while(0)
+#define ckrm_mem_dec_inactive(a) do{}while(0)
+#define ckrm_shrink_list_empty() (1)
+#define ckrm_kick_page(a,b) (0)
+#define ckrm_class_limit_ok(a) (1)
+
+#endif // CONFIG_CKRM_RES_MEM
+
+#endif // _LINUX_CKRM_MEM_INLINE_H_
--- /dev/null
+/* ckrm_mem.c - Memory Resource Manager for CKRM
+ *
+ * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ * Provides a Memory Resource controller for CKRM
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Code Description: TBD
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/pagevec.h>
+
+#include <linux/ckrm_mem_inline.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+#define MEM_NAME "mem"
+
+#define CKRM_MEM_MAX_HIERARCHY 2 // allows a hierarchy depth of at most 2, i.e. levels 0, 1 and 2
+
+/* all 1-level memory_share_class are chained together */
+static LIST_HEAD(ckrm_memclass_list);
+LIST_HEAD(ckrm_shrink_list);
+EXPORT_SYMBOL(ckrm_shrink_list);
+spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
+EXPORT_SYMBOL(ckrm_mem_lock);
+unsigned int ckrm_tot_lru_pages; // total # of pages in the system
+ // currently doesn't handle memory add/remove
+EXPORT_SYMBOL(ckrm_tot_lru_pages);
+
+static ckrm_mem_res_t *ckrm_mem_root_class;
+atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL(ckrm_mem_real_count);
+
+/* Initialize rescls values
+ * May be called on each rcfs unmount or as part of error recovery
+ * to make share values sane.
+ * Does not traverse hierarchy reinitializing children.
+ */
+
+static void
+set_ckrm_tot_pages(void)
+{
+ struct zone *zone;
+ int tot_lru_pages = 0;
+
+ for_each_zone(zone) {
+ tot_lru_pages += zone->nr_active;
+ tot_lru_pages += zone->nr_inactive;
+ tot_lru_pages += zone->free_pages;
+ }
+ ckrm_tot_lru_pages = tot_lru_pages;
+}
+
+static void
+mem_res_initcls_one(void *my_res)
+{
+ ckrm_mem_res_t *res = my_res;
+
+ memset(res, 0, sizeof(ckrm_mem_res_t));
+
+ res->shares.my_guarantee = CKRM_SHARE_DONTCARE;
+ res->shares.my_limit = CKRM_SHARE_DONTCARE;
+ res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+ res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+ res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+ res->shares.cur_max_limit = 0;
+
+ res->pg_guar = CKRM_SHARE_DONTCARE;
+ res->pg_limit = CKRM_SHARE_DONTCARE;
+ res->pg_unused = CKRM_SHARE_DONTCARE;
+}
+
+static void *
+mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
+{
+ ckrm_mem_res_t *res, *parres;
+
+ if (mem_rcbs.resid == -1) {
+ return NULL;
+ }
+
+ parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
+ if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
+ // allow classes only up to a depth of CKRM_MEM_MAX_HIERARCHY
+ return NULL;
+ }
+
+ if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
+ printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
+ return NULL;
+ }
+
+ if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
+ printk(KERN_ERR "MEM_RC: creating child class without root class\n");
+ return NULL;
+ }
+
+ res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);
+
+ if (res) {
+ mem_res_initcls_one(res);
+ res->core = core;
+ res->parent = parent;
+ spin_lock(&ckrm_mem_lock);
+ list_add(&res->mcls_list, &ckrm_memclass_list);
+ spin_unlock(&ckrm_mem_lock);
+ if (parent == NULL) {
+ // I am part of the root class. So, set the max to
+ // number of pages available
+ res->pg_guar = ckrm_tot_lru_pages;
+ res->pg_unused = ckrm_tot_lru_pages;
+ res->pg_limit = ckrm_tot_lru_pages;
+ res->hier = 0;
+ ckrm_mem_root_class = res;
+ } else {
+ res->hier = parres->hier + 1;
+ }
+ mem_class_get(res);
+ }
+ else
+ printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
+ return res;
+}
+
+/*
+ * It is the caller's responsibility to make sure that the parent only
+ * has children that are to be accounted, i.e., if a new child is added
+ * this function should be called after it has been added, and if a
+ * child is deleted this should be called after the child is removed.
+ */
+static void
+child_maxlimit_changed_local(ckrm_mem_res_t *parres)
+{
+ int maxlimit = 0;
+ ckrm_mem_res_t *childres;
+ ckrm_core_class_t *child = NULL;
+
+ // run thru parent's children and get the new max_limit of the parent
+ ckrm_lock_hier(parres->core);
+ while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+ childres = ckrm_get_res_class(child, mem_rcbs.resid,
+ ckrm_mem_res_t);
+ if (maxlimit < childres->shares.my_limit) {
+ maxlimit = childres->shares.my_limit;
+ }
+ }
+ ckrm_unlock_hier(parres->core);
+ parres->shares.cur_max_limit = maxlimit;
+}
+
+static void
+mem_res_free(void *my_res)
+{
+ ckrm_mem_res_t *res = my_res;
+ ckrm_mem_res_t *parres;
+
+ if (!res)
+ return;
+
+ parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+
+ // return child's limit/guarantee to parent node
+ if (parres) {
+ child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
+ child_maxlimit_changed_local(parres);
+ }
+ res->shares.my_guarantee = 0;
+ res->shares.my_limit = 0;
+ spin_lock(&ckrm_mem_lock);
+ list_del(&res->mcls_list);
+ spin_unlock(&ckrm_mem_lock);
+ mem_class_put(res);
+
+ return;
+}
+
+/*
+ * Recalculate the guarantee and limit in # of pages... and propagate the
+ * same to children.
+ * Caller is responsible for protecting res and for the integrity of parres
+ */
+static void
+recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
+{
+ ckrm_core_class_t *child = NULL;
+ ckrm_mem_res_t *childres;
+ int resid = mem_rcbs.resid;
+ struct ckrm_shares *self = &res->shares;
+
+ if (parres) {
+ struct ckrm_shares *par = &parres->shares;
+
+ // calculate pg_guar and pg_limit
+ //
+ if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
+ self->my_guarantee == CKRM_SHARE_DONTCARE) {
+ res->pg_guar = CKRM_SHARE_DONTCARE;
+ } else if (par->total_guarantee) {
+ u64 temp = (u64) self->my_guarantee * parres->pg_guar;
+ do_div(temp, par->total_guarantee);
+ res->pg_guar = (int) temp;
+ } else {
+ res->pg_guar = 0;
+ }
+
+ if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
+ self->my_limit == CKRM_SHARE_DONTCARE) {
+ res->pg_limit = CKRM_SHARE_DONTCARE;
+ } else if (par->max_limit) {
+ u64 temp = (u64) self->my_limit * parres->pg_limit;
+ do_div(temp, par->max_limit);
+ res->pg_limit = (int) temp;
+ } else {
+ res->pg_limit = 0;
+ }
+ }
+
+ // Calculate unused units
+ if (res->pg_guar == CKRM_SHARE_DONTCARE) {
+ res->pg_unused = CKRM_SHARE_DONTCARE;
+ } else if (self->total_guarantee) {
+ u64 temp = (u64) self->unused_guarantee * res->pg_guar;
+ do_div(temp, self->total_guarantee);
+ res->pg_unused = (int) temp;
+ } else {
+ res->pg_unused = 0;
+ }
+
+ // propagate to children
+ ckrm_lock_hier(res->core);
+ while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
+ childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
+ recalc_and_propagate(childres, res);
+ }
+ ckrm_unlock_hier(res->core);
+ return;
+}
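+
+/*
+ * Worked example (hypothetical numbers): if the parent ends up with
+ * pg_guar = 1000 pages and total_guarantee = 100 shares, a child whose
+ * my_guarantee is 25 gets pg_guar = 25 * 1000 / 100 = 250 pages. pg_limit is
+ * scaled the same way from my_limit against the parent's max_limit, and
+ * pg_unused is the fraction unused_guarantee / total_guarantee of the
+ * class's own pg_guar.
+ */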
+
+static int
+mem_set_share_values(void *my_res, struct ckrm_shares *shares)
+{
+ ckrm_mem_res_t *res = my_res;
+ ckrm_mem_res_t *parres;
+ int rc = EINVAL;
+
+ if (!res)
+ return -EINVAL;
+
+ parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+
+ rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);
+
+ if ((rc == 0) && (parres != NULL)) {
+ child_maxlimit_changed_local(parres);
+ recalc_and_propagate(parres, NULL);
+ }
+ return rc;
+}
+
+static int
+mem_get_share_values(void *my_res, struct ckrm_shares *shares)
+{
+ ckrm_mem_res_t *res = my_res;
+
+ if (!res)
+ return -EINVAL;
+ *shares = res->shares;
+ return 0;
+}
+
+static int
+mem_get_stats(void *my_res, struct seq_file *sfile)
+{
+ ckrm_mem_res_t *res = my_res;
+
+ if (!res)
+ return -EINVAL;
+
+#if 0
+ seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
+ "lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
+ res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
+ res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
+#endif
+
+
+ seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
+ seq_printf(sfile, "Number of pages used(including pages lent to children):"
+ " %d\n", atomic_read(&res->pg_total));
+ seq_printf(sfile, "Number of pages guaranteed: %d\n",
+ res->pg_guar);
+ seq_printf(sfile, "Maximum limit of pages: %d\n",
+ res->pg_limit);
+ seq_printf(sfile, "Total number of pages available"
+ "(after serving guarantees to children): %d\n",
+ res->pg_unused);
+ seq_printf(sfile, "Number of pages lent to children: %d\n",
+ res->pg_lent);
+ seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
+ res->pg_borrowed);
+ seq_printf(sfile, "----------- Memory Resource stats end -----------\n");
+
+ return 0;
+}
+
+static void
+mem_change_resclass(void *tsk, void *old, void *new)
+{
+ struct mm_struct *mm;
+ struct task_struct *task = tsk, *t1;
+ struct ckrm_mem_res *prev_mmcls;
+
+ if (!task->mm || (new == old) || (old == (void *) -1))
+ return;
+
+ mm = task->active_mm;
+ spin_lock(&mm->peertask_lock);
+ prev_mmcls = mm->memclass;
+
+ if (new == NULL) {
+ list_del_init(&task->mm_peers);
+ } else {
+ int found = 0;
+ list_for_each_entry(t1, &mm->tasklist, mm_peers) {
+ if (t1 == task) {
+ found++;
+ break;
+ }
+ }
+ if (!found) {
+ list_del_init(&task->mm_peers);
+ list_add_tail(&task->mm_peers, &mm->tasklist);
+ }
+ }
+
+ ckrm_mem_evaluate_mm(mm);
+ spin_unlock(&mm->peertask_lock);
+ return;
+}
+
+// config file is available only at the root level,
+// so assuming my_res to be the system level class
+static int
+mem_set_config(void *my_res, const char *cfgstr)
+{
+ ckrm_mem_res_t *res = my_res;
+
+ printk(KERN_INFO "%s class of %s is called with config<%s>\n",
+ MEM_NAME, res->core->name, cfgstr);
+ return 0;
+}
+
+static int
+mem_show_config(void *my_res, struct seq_file *sfile)
+{
+ struct zone *zone;
+ ckrm_mem_res_t *res = my_res;
+ int active = 0, inactive = 0, fr = 0;
+
+ if (!res)
+ return -EINVAL;
+
+ for_each_zone(zone) {
+ active += zone->nr_active;
+ inactive += zone->nr_inactive;
+ fr += zone->free_pages;
+ }
+ seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
+ MEM_NAME, ckrm_tot_lru_pages,active,inactive,fr);
+
+
+ return 0;
+}
+
+static int
+mem_reset_stats(void *my_res)
+{
+ ckrm_mem_res_t *res = my_res;
+ printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
+ return 0;
+}
+
+struct ckrm_res_ctlr mem_rcbs = {
+ .res_name = MEM_NAME,
+ .res_hdepth = CKRM_MEM_MAX_HIERARCHY,
+ .resid = -1,
+ .res_alloc = mem_res_alloc,
+ .res_free = mem_res_free,
+ .set_share_values = mem_set_share_values,
+ .get_share_values = mem_get_share_values,
+ .get_stats = mem_get_stats,
+ .change_resclass = mem_change_resclass,
+ .show_config = mem_show_config,
+ .set_config = mem_set_config,
+ .reset_stats = mem_reset_stats,
+};
+
+EXPORT_SYMBOL(mem_rcbs);
+
+int __init
+init_ckrm_mem_res(void)
+{
+ struct ckrm_classtype *clstype;
+ int resid = mem_rcbs.resid;
+
+ set_ckrm_tot_pages();
+ clstype = ckrm_find_classtype_by_name("taskclass");
+ if (clstype == NULL) {
+ printk(KERN_INFO " Unknown ckrm classtype<taskclass>");
+ return -ENOENT;
+ }
+
+ if (resid == -1) {
+ resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
+ if (resid != -1) {
+ mem_rcbs.classtype = clstype;
+ }
+ }
+ return ((resid < 0) ? resid : 0);
+}
+
+void __exit
+exit_ckrm_mem_res(void)
+{
+ ckrm_unregister_res_ctlr(&mem_rcbs);
+ mem_rcbs.resid = -1;
+}
+
+module_init(init_ckrm_mem_res)
+module_exit(exit_ckrm_mem_res)
+
+static void
+set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
+{
+ ckrm_mem_res_t *childres;
+ ckrm_core_class_t *child = NULL;
+
+ parres->reclaim_flags |= flag;
+ ckrm_lock_hier(parres->core);
+ while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+ childres = ckrm_get_res_class(child, mem_rcbs.resid,
+ ckrm_mem_res_t);
+ set_flags_of_children(childres, flag);
+ }
+ ckrm_unlock_hier(parres->core);
+ return;
+}
+
+// FIXME: this function needs more attention
+static unsigned int
+set_usage_flags(ckrm_mem_res_t *res)
+{
+ int tot_usage, cls_usage, range, guar;
+
+ if (res->pg_limit == CKRM_SHARE_DONTCARE) {
+ // No limit is set for the class, so don't bother with it
+ res->reclaim_flags = 0;
+ return res->reclaim_flags;
+ }
+
+ tot_usage = atomic_read(&res->pg_total);
+ cls_usage = tot_usage - res->pg_lent;
+ guar = (res->pg_guar > 0) ? res->pg_guar : 0;
+ range = res->pg_limit - guar;
+
+ if ((tot_usage > (guar + ((120 * range) / 100))) &&
+ (res->pg_lent > (guar + ((25 * range) / 100)))) {
+ set_flags_of_children(res, CLS_PARENT_OVER);
+ }
+
+ if (cls_usage > (guar + ((110 * range) / 100))) {
+ res->reclaim_flags |= CLS_OVER_110;
+ } else if (cls_usage > (guar + range)) {
+ res->reclaim_flags |= CLS_OVER_100;
+ } else if (cls_usage > (guar + ((3 * range) / 4))) {
+ res->reclaim_flags |= CLS_OVER_75;
+ } else if (cls_usage > guar) {
+ res->reclaim_flags |= CLS_OVER_GUAR;
+ } else {
+ res->reclaim_flags = 0;
+ }
+ return res->reclaim_flags;
+}
+
+/*
+ * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
+ * ckrm_get_reclaim_bits() and ckrm_kick_page(), along with the CLS_*
+ * macros, define how pages are reclaimed.
+ * Keeping this logic behind these interfaces eliminates the need to
+ * change the reclamation code in the VM if we want to change the logic.
+ */
+unsigned int
+ckrm_setup_reclamation(void)
+{
+ ckrm_mem_res_t *res;
+ unsigned int ret = 0;
+
+ spin_lock(&ckrm_mem_lock);
+ set_ckrm_tot_pages();
+ ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
+ ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
+ ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
+ recalc_and_propagate(ckrm_mem_root_class, NULL);
+ list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+ ret |= set_usage_flags(res);
+ }
+ spin_unlock(&ckrm_mem_lock);
+ return ret;
+}
+
+void
+ckrm_teardown_reclamation(void)
+{
+ ckrm_mem_res_t *res;
+ spin_lock(&ckrm_mem_lock);
+ list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+ res->reclaim_flags = 0;
+ }
+ spin_unlock(&ckrm_mem_lock);
+}
+
+void
+ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
+{
+ int i, j, mask = 0;
+
+ if (*extract == 0 || *flags == 0) {
+ return;
+ }
+ if (*flags & CLS_SHRINK) {
+ *extract = CLS_SHRINK;
+ *flags = 0;
+ return;
+ }
+
+
+ i = fls(*flags);
+ for (j = i-1; j > 0; j--) {
+ mask = (mask<<1) | 1;
+ }
+ *extract = (CLS_FLAGS_ALL & ~mask);
+ *flags &= ~*extract;
+ return;
+}
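+
+/*
+ * Example of the extraction above (hypothetical input): with
+ * *flags = CLS_OVER_GUAR | CLS_OVER_75 (0x05), fls() returns 3 and the mask
+ * becomes 0x03, so *extract = CLS_FLAGS_ALL & ~0x03 =
+ * CLS_OVER_75 | CLS_OVER_100 | CLS_OVER_110 -- a mask covering the most
+ * severe flag currently set and everything above it -- and those bits are
+ * then cleared from *flags, leaving CLS_OVER_GUAR for a later pass.
+ */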
+
+void
+ckrm_near_limit(ckrm_mem_res_t *cls)
+{
+ struct zone *zone;
+ unsigned long now = jiffies;
+
+ if (!cls || ((cls->flags & MEM_NEAR_LIMIT) == MEM_NEAR_LIMIT)) {
+ return;
+ }
+ if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last ?
+ cls->last_shrink = now;
+ cls->shrink_count = 0;
+ }
+ cls->shrink_count++;
+ if (cls->shrink_count > 10) {
+ return;
+ }
+ spin_lock(&ckrm_mem_lock);
+ list_add(&cls->shrink_list, &ckrm_shrink_list);
+ spin_unlock(&ckrm_mem_lock);
+ cls->flags |= MEM_NEAR_LIMIT;
+ for_each_zone(zone) {
+ wakeup_kswapd(zone);
+ break; // only once is enough
+ }
+}
+
+static int
+ckrm_mem_evaluate_page_anon(struct page* page)
+{
+ ckrm_mem_res_t* pgcls = page_class(page);
+ ckrm_mem_res_t* maxshareclass = NULL;
+ struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+ struct vm_area_struct *vma;
+ struct mm_struct* mm;
+
+ spin_lock(&anon_vma->lock);
+ BUG_ON(list_empty(&anon_vma->head));
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ mm = vma->vm_mm;
+ if (!maxshareclass ||
+ ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
+ maxshareclass = mm->memclass;
+ }
+ }
+ spin_unlock(&anon_vma->lock);
+
+ if (maxshareclass && (pgcls != maxshareclass)) {
+ ckrm_change_page_class(page, maxshareclass);
+ return 1;
+ }
+ return 0;
+}
+
+static int
+ckrm_mem_evaluate_page_file(struct page* page)
+{
+ ckrm_mem_res_t* pgcls = page_class(page);
+ ckrm_mem_res_t* maxshareclass = NULL;
+ struct address_space *mapping = page->mapping;
+ struct vm_area_struct *vma = NULL;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct prio_tree_iter iter;
+ struct mm_struct* mm;
+
+ if (!mapping)
+ return 0;
+
+ if (!spin_trylock(&mapping->i_mmap_lock))
+ return 0;
+
+ while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
+ &iter, pgoff, pgoff)) != NULL) {
+ mm = vma->vm_mm;
+ if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0)
+ maxshareclass = mm->memclass;
+ }
+ spin_unlock(&mapping->i_mmap_lock);
+
+ if (maxshareclass && pgcls != maxshareclass) {
+ ckrm_change_page_class(page, maxshareclass);
+ return 1;
+ }
+ return 0;
+}
+
+static int
+ckrm_mem_evaluate_page(struct page* page)
+{
+ int changed = 0;
+
+ if (page->mapping) {
+ if (PageAnon(page))
+ changed = ckrm_mem_evaluate_page_anon(page);
+ else
+ changed = ckrm_mem_evaluate_page_file(page);
+ }
+ return changed;
+}
+
+static inline int
+class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
+ pmd_t* pmdir, unsigned long address, unsigned long end)
+{
+ pte_t *pte, *orig_pte;
+ unsigned long pmd_end;
+
+ if (pmd_none(*pmdir))
+ return 0;
+ BUG_ON(pmd_bad(*pmdir));
+
+ orig_pte = pte = pte_offset_map(pmdir, address);
+ pmd_end = (address+PMD_SIZE)&PMD_MASK;
+ if (end>pmd_end)
+ end = pmd_end;
+
+ do {
+ if (pte_present(*pte)) {
+ ckrm_mem_evaluate_page(pte_page(*pte));
+ }
+ address += PAGE_SIZE;
+ pte++;
+ } while(address && (address<end));
+ pte_unmap(orig_pte); /* pair with the pte_offset_map() above */
+ return 0;
+}
+
+static inline int
+class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
+ pgd_t* pgdir, unsigned long address, unsigned long end)
+{
+ pmd_t* pmd;
+ unsigned long pgd_end;
+
+ if (pgd_none(*pgdir))
+ return 0;
+ BUG_ON(pgd_bad(*pgdir));
+
+ pmd = pmd_offset(pgdir,address);
+ pgd_end = (address+PGDIR_SIZE)&PGDIR_MASK;
+
+ if (pgd_end && (end>pgd_end))
+ end = pgd_end;
+
+ do {
+ class_migrate_pmd(mm,vma,pmd,address,end);
+ address = (address+PMD_SIZE)&PMD_MASK;
+ pmd++;
+ } while (address && (address<end));
+ return 0;
+}
+
+static inline int
+class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
+{
+ pgd_t* pgdir;
+ unsigned long address, end;
+
+ address = vma->vm_start;
+ end = vma->vm_end;
+
+ pgdir = pgd_offset(vma->vm_mm, address);
+ do {
+ class_migrate_pgd(mm,vma,pgdir,address,end);
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ pgdir++;
+ } while(address && (address<end));
+ return 0;
+}
+
+/* this function is called with mm->peertask_lock held */
+void
+ckrm_mem_evaluate_mm(struct mm_struct* mm)
+{
+ struct task_struct *task;
+ struct ckrm_mem_res *maxshareclass = NULL;
+ struct vm_area_struct *vma;
+
+ if (list_empty(&mm->tasklist)) {
+ /* We leave mm->memclass untouched since an mm with no task
+ * associated with it will either be freed soon or get attached
+ * to another task later.
+ */
+ return;
+ }
+
+ list_for_each_entry(task, &mm->tasklist, mm_peers) {
+ ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
+ if (!cls)
+ continue;
+ if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,cls)<0 )
+ maxshareclass = cls;
+ }
+
+ if (mm->memclass != (void *)maxshareclass) {
+ mem_class_get(maxshareclass);
+ if (mm->memclass)
+ mem_class_put(mm->memclass);
+ mm->memclass = maxshareclass;
+
+ /* Go through all VMA to migrate pages */
+ down_read(&mm->mmap_sem);
+ vma = mm->mmap;
+ while(vma) {
+ class_migrate_vma(mm, vma);
+ vma = vma->vm_next;
+ }
+ up_read(&mm->mmap_sem);
+ }
+ return;
+}
+
+void
+ckrm_mem_evaluate_page_byadd(struct page* page, struct mm_struct* mm)
+{
+ ckrm_mem_res_t *pgcls = page_class(page);
+ ckrm_mem_res_t *chgcls = mm->memclass ? mm->memclass : GET_MEM_CLASS(current);
+
+ if (!chgcls || pgcls == chgcls)
+ return;
+
+ if (!page->mapcount) {
+ ckrm_change_page_class(page, chgcls);
+ return;
+ }
+ if (ckrm_mem_share_compare(pgcls, chgcls) < 0) {
+ ckrm_change_page_class(page, chgcls);
+ return;
+ }
+ return;
+}
+
+void
+ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
+{
+ spin_lock(&mm->peertask_lock);
+ if (!list_empty(&task->mm_peers)) {
+ printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
+ list_del_init(&task->mm_peers);
+ }
+ list_add_tail(&task->mm_peers, &mm->tasklist);
+ if (mm->memclass != GET_MEM_CLASS(task))
+ ckrm_mem_evaluate_mm(mm);
+ spin_unlock(&mm->peertask_lock);
+ return;
+}
+
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * linux/kernel/exit.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/tty.h>
+#include <linux/namespace.h>
+#include <linux/security.h>
+#include <linux/acct.h>
+#include <linux/file.h>
+#include <linux/binfmts.h>
+#include <linux/ptrace.h>
+#include <linux/profile.h>
+#include <linux/mount.h>
+#include <linux/proc_fs.h>
+#include <linux/mempolicy.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_tsk.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+extern void sem_exit (void);
+extern struct task_struct *child_reaper;
+
+int getrusage(struct task_struct *, int, struct rusage __user *);
+
+static void __unhash_process(struct task_struct *p)
+{
+ nr_threads--;
+ detach_pid(p, PIDTYPE_PID);
+ detach_pid(p, PIDTYPE_TGID);
+ if (thread_group_leader(p)) {
+ detach_pid(p, PIDTYPE_PGID);
+ detach_pid(p, PIDTYPE_SID);
+ if (p->pid)
+ __get_cpu_var(process_counts)--;
+ }
+
+ REMOVE_LINKS(p);
+}
+
+void release_task(struct task_struct * p)
+{
+ int zap_leader;
+ task_t *leader;
+ struct dentry *proc_dentry;
+
+repeat:
+ BUG_ON(p->state < TASK_ZOMBIE);
+
+ atomic_dec(&p->user->processes);
+ spin_lock(&p->proc_lock);
+ proc_dentry = proc_pid_unhash(p);
+ write_lock_irq(&tasklist_lock);
+ if (unlikely(p->ptrace))
+ __ptrace_unlink(p);
+ BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+ __exit_signal(p);
+ __exit_sighand(p);
+ __unhash_process(p);
+
+ /*
+ * If we are the last non-leader member of the thread
+ * group, and the leader is zombie, then notify the
+ * group leader's parent process. (if it wants notification.)
+ */
+ zap_leader = 0;
+ leader = p->group_leader;
+ if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) {
+ BUG_ON(leader->exit_signal == -1);
+ do_notify_parent(leader, leader->exit_signal);
+ /*
+ * If we were the last child thread and the leader has
+ * exited already, and the leader's parent ignores SIGCHLD,
+ * then we are the one who should release the leader.
+ *
+ * do_notify_parent() will have marked it self-reaping in
+ * that case.
+ */
+ zap_leader = (leader->exit_signal == -1);
+ }
+
+ p->parent->cutime += p->utime + p->cutime;
+ p->parent->cstime += p->stime + p->cstime;
+ p->parent->cmin_flt += p->min_flt + p->cmin_flt;
+ p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
+ p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
+ p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
+ sched_exit(p);
+ write_unlock_irq(&tasklist_lock);
+ spin_unlock(&p->proc_lock);
+ proc_pid_flush(proc_dentry);
+ release_thread(p);
+ put_task_struct(p);
+
+ p = leader;
+ if (unlikely(zap_leader))
+ goto repeat;
+}
+
+/* we are using it only for SMP init */
+
+void unhash_process(struct task_struct *p)
+{
+ struct dentry *proc_dentry;
+
+ spin_lock(&p->proc_lock);
+ proc_dentry = proc_pid_unhash(p);
+ write_lock_irq(&tasklist_lock);
+ __unhash_process(p);
+ write_unlock_irq(&tasklist_lock);
+ spin_unlock(&p->proc_lock);
+ proc_pid_flush(proc_dentry);
+}
+
+/*
+ * This checks not only the pgrp, but falls back on the pid if no
+ * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
+ * without this...
+ */
+int session_of_pgrp(int pgrp)
+{
+ struct task_struct *p;
+ struct list_head *l;
+ struct pid *pid;
+ int sid = -1;
+
+ read_lock(&tasklist_lock);
+ for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
+ if (p->signal->session > 0) {
+ sid = p->signal->session;
+ goto out;
+ }
+ p = find_task_by_pid(pgrp);
+ if (p)
+ sid = p->signal->session;
+out:
+ read_unlock(&tasklist_lock);
+
+ return sid;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52. Orphaned process groups are not to be affected
+ * by terminal-generated stop signals. Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
+{
+ struct task_struct *p;
+ struct list_head *l;
+ struct pid *pid;
+ int ret = 1;
+
+ for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+ if (p == ignored_task
+ || p->state >= TASK_ZOMBIE
+ || p->real_parent->pid == 1)
+ continue;
+ if (process_group(p->real_parent) != pgrp
+ && p->real_parent->signal->session == p->signal->session) {
+ ret = 0;
+ break;
+ }
+ }
+ return ret; /* (sighing) "Often!" */
+}
+
+int is_orphaned_pgrp(int pgrp)
+{
+ int retval;
+
+ read_lock(&tasklist_lock);
+ retval = will_become_orphaned_pgrp(pgrp, NULL);
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
+static inline int has_stopped_jobs(int pgrp)
+{
+ int retval = 0;
+ struct task_struct *p;
+ struct list_head *l;
+ struct pid *pid;
+
+ for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+ if (p->state != TASK_STOPPED)
+ continue;
+
+ /* If p is stopped by a debugger on a signal that won't
+ stop it, then don't count p as stopped. This isn't
+ perfect but it's a good approximation. */
+ if (unlikely (p->ptrace)
+ && p->exit_code != SIGSTOP
+ && p->exit_code != SIGTSTP
+ && p->exit_code != SIGTTOU
+ && p->exit_code != SIGTTIN)
+ continue;
+
+ retval = 1;
+ break;
+ }
+ return retval;
+}
+
+/**
+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to init so that
+ * it is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_init() gives the caller full capabilities.
+ */
+void reparent_to_init(void)
+{
+ write_lock_irq(&tasklist_lock);
+
+ ptrace_unlink(current);
+ /* Reparent to init */
+ REMOVE_LINKS(current);
+ current->parent = child_reaper;
+ current->real_parent = child_reaper;
+ SET_LINKS(current);
+
+ /* Set the exit signal to SIGCHLD so we signal init on exit */
+ current->exit_signal = SIGCHLD;
+
+ if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+ set_user_nice(current, 0);
+ /* cpus_allowed? */
+ /* rt_priority? */
+ /* signals? */
+ security_task_reparent_to_init(current);
+ memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
+ atomic_inc(&(INIT_USER->__count));
+ switch_uid(INIT_USER);
+
+ write_unlock_irq(&tasklist_lock);
+}
+
+void __set_special_pids(pid_t session, pid_t pgrp)
+{
+ struct task_struct *curr = current;
+
+ if (curr->signal->session != session) {
+ detach_pid(curr, PIDTYPE_SID);
+ curr->signal->session = session;
+ attach_pid(curr, PIDTYPE_SID, session);
+ }
+ if (process_group(curr) != pgrp) {
+ detach_pid(curr, PIDTYPE_PGID);
+ curr->signal->pgrp = pgrp;
+ attach_pid(curr, PIDTYPE_PGID, pgrp);
+ }
+}
+
+void set_special_pids(pid_t session, pid_t pgrp)
+{
+ write_lock_irq(&tasklist_lock);
+ __set_special_pids(session, pgrp);
+ write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Let kernel threads use this to say that they
+ * allow a certain signal (since daemonize() will
+ * have disabled all of them by default).
+ */
+int allow_signal(int sig)
+{
+ if (sig < 1 || sig > _NSIG)
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigdelset(&current->blocked, sig);
+ if (!current->mm) {
+ /* Kernel threads handle their own signals.
+ Let the signal code know it'll be handled, so
+ that they don't get converted to SIGKILL or
+ just silently dropped */
+ current->sighand->action[(sig)-1].sa.sa_handler = (void *)2;
+ }
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+}
+
+EXPORT_SYMBOL(allow_signal);
+
+int disallow_signal(int sig)
+{
+ if (sig < 1 || sig > _NSIG)
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigaddset(&current->blocked, sig);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+}
+
+EXPORT_SYMBOL(disallow_signal);
+
+/*
+ * Put all the gunge required to become a kernel thread without
+ * attached user resources in one place where it belongs.
+ */
+
+void daemonize(const char *name, ...)
+{
+ va_list args;
+ struct fs_struct *fs;
+ sigset_t blocked;
+
+ va_start(args, name);
+ vsnprintf(current->comm, sizeof(current->comm), name, args);
+ va_end(args);
+
+ /*
+ * If we were started as result of loading a module, close all of the
+ * user space pages. We don't need them, and if we didn't close them
+ * they would be locked into memory.
+ */
+ exit_mm(current);
+
+ set_special_pids(1, 1);
+ current->signal->tty = NULL;
+
+ /* Block and flush all signals */
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ /* Become as one with the init task */
+
+ exit_fs(current); /* current->fs->count--; */
+ fs = init_task.fs;
+ current->fs = fs;
+ atomic_inc(&fs->count);
+ exit_files(current);
+ current->files = init_task.files;
+ atomic_inc(&current->files->count);
+
+ reparent_to_init();
+}
+
+EXPORT_SYMBOL(daemonize);
+
+static inline void close_files(struct files_struct * files)
+{
+ int i, j;
+
+ j = 0;
+ for (;;) {
+ unsigned long set;
+ i = j * __NFDBITS;
+ if (i >= files->max_fdset || i >= files->max_fds)
+ break;
+ set = files->open_fds->fds_bits[j++];
+ while (set) {
+ if (set & 1) {
+ struct file * file = xchg(&files->fd[i], NULL);
+ if (file)
+ filp_close(file, files);
+ }
+ i++;
+ set >>= 1;
+ }
+ }
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+ struct files_struct *files;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ atomic_inc(&files->count);
+ task_unlock(task);
+
+ return files;
+}
+
+void fastcall put_files_struct(struct files_struct *files)
+{
+ if (atomic_dec_and_test(&files->count)) {
+ close_files(files);
+ /*
+ * Free the fd and fdset arrays if we expanded them.
+ */
+ if (files->fd != &files->fd_array[0])
+ free_fd_array(files->fd, files->max_fds);
+ if (files->max_fdset > __FD_SETSIZE) {
+ free_fdset(files->open_fds, files->max_fdset);
+ free_fdset(files->close_on_exec, files->max_fdset);
+ }
+ kmem_cache_free(files_cachep, files);
+ }
+}
+
+EXPORT_SYMBOL(put_files_struct);
+
+static inline void __exit_files(struct task_struct *tsk)
+{
+ struct files_struct * files = tsk->files;
+
+ if (files) {
+ task_lock(tsk);
+ tsk->files = NULL;
+ task_unlock(tsk);
+ put_files_struct(files);
+ }
+}
+
+void exit_files(struct task_struct *tsk)
+{
+ __exit_files(tsk);
+}
+
+static inline void __put_fs_struct(struct fs_struct *fs)
+{
+ /* No need to hold fs->lock if we are killing it */
+ if (atomic_dec_and_test(&fs->count)) {
+ dput(fs->root);
+ mntput(fs->rootmnt);
+ dput(fs->pwd);
+ mntput(fs->pwdmnt);
+ if (fs->altroot) {
+ dput(fs->altroot);
+ mntput(fs->altrootmnt);
+ }
+ kmem_cache_free(fs_cachep, fs);
+ }
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+ __put_fs_struct(fs);
+}
+
+static inline void __exit_fs(struct task_struct *tsk)
+{
+ struct fs_struct * fs = tsk->fs;
+
+ if (fs) {
+ task_lock(tsk);
+ tsk->fs = NULL;
+ task_unlock(tsk);
+ __put_fs_struct(fs);
+ }
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+ __exit_fs(tsk);
+}
+
+EXPORT_SYMBOL_GPL(exit_fs);
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+static inline void __exit_mm(struct task_struct * tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+
+ mm_release(tsk, mm);
+ if (!mm)
+ return;
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold mmap_sem around checking core_waiters
+ * and clearing tsk->mm. The core-inducing thread
+ * will increment core_waiters for each thread in the
+ * group with ->mm != NULL.
+ */
+ down_read(&mm->mmap_sem);
+ if (mm->core_waiters) {
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ if (!--mm->core_waiters)
+ complete(mm->core_startup_done);
+ up_write(&mm->mmap_sem);
+
+ wait_for_completion(&mm->core_done);
+ down_read(&mm->mmap_sem);
+ }
+ atomic_inc(&mm->mm_count);
+ if (mm != tsk->active_mm) BUG();
+ /* more a memory barrier than a real lock */
+ task_lock(tsk);
+ tsk->mm = NULL;
+ up_read(&mm->mmap_sem);
+ enter_lazy_tlb(mm, current);
+ task_unlock(tsk);
+ mmput(mm);
+}
+
+void exit_mm(struct task_struct *tsk)
+{
+ __exit_mm(tsk);
+}
+
+EXPORT_SYMBOL(exit_mm);
+
+static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+{
+ /*
+ * Make sure we're not reparenting to ourselves and that
+ * the parent is not a zombie.
+ */
+ if (p == reaper || reaper->state >= TASK_ZOMBIE)
+ p->real_parent = child_reaper;
+ else
+ p->real_parent = reaper;
+ if (p->parent == p->real_parent)
+ BUG();
+}
+
+static inline void reparent_thread(task_t *p, task_t *father, int traced)
+{
+ /* We don't want people slaying init. */
+ if (p->exit_signal != -1)
+ p->exit_signal = SIGCHLD;
+ p->self_exec_id++;
+
+ if (p->pdeath_signal)
+ /* We already hold the tasklist_lock here. */
+ group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+
+ /* Move the child from its dying parent to the new one. */
+ if (unlikely(traced)) {
+ /* Preserve ptrace links if someone else is tracing this child. */
+ list_del_init(&p->ptrace_list);
+ if (p->parent != p->real_parent)
+ list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
+ } else {
+ /* If this child is being traced, then we're the one tracing it
+ * anyway, so let go of it.
+ */
+ p->ptrace = 0;
+ list_del_init(&p->sibling);
+ p->parent = p->real_parent;
+ list_add_tail(&p->sibling, &p->parent->children);
+
+ /* If we'd notified the old parent about this child's death,
+ * also notify the new parent.
+ */
+ if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+ thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+ }
+
+ /*
+ * process group orphan check
+ * Case ii: Our child is in a different pgrp
+ * than we are, and it was the only connection
+ * outside, so the child pgrp is now orphaned.
+ */
+ if ((process_group(p) != process_group(father)) &&
+ (p->signal->session == father->signal->session)) {
+ int pgrp = process_group(p);
+
+ if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+ __kill_pg_info(SIGHUP, (void *)1, pgrp);
+ __kill_pg_info(SIGCONT, (void *)1, pgrp);
+ }
+ }
+}
+
+/*
+ * When we die, we re-parent all our children.
+ * Try to give them to another thread in our thread
+ * group, and if no such member exists, give it to
+ * the global child reaper process (ie "init")
+ */
+static inline void forget_original_parent(struct task_struct * father)
+{
+ struct task_struct *p, *reaper = father;
+ struct list_head *_p, *_n;
+
+ reaper = father->group_leader;
+ if (reaper == father)
+ reaper = child_reaper;
+
+ /*
+ * There are only two places where our children can be:
+ *
+ * - in our child list
+ * - in our ptraced child list
+ *
+ * Search them and reparent children.
+ */
+ list_for_each_safe(_p, _n, &father->children) {
+ p = list_entry(_p,struct task_struct,sibling);
+ if (father == p->real_parent) {
+ choose_new_parent(p, reaper, child_reaper);
+ reparent_thread(p, father, 0);
+ } else {
+ ptrace_unlink (p);
+ if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+ thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+ }
+ }
+ list_for_each_safe(_p, _n, &father->ptrace_children) {
+ p = list_entry(_p,struct task_struct,ptrace_list);
+ choose_new_parent(p, reaper, child_reaper);
+ reparent_thread(p, father, 1);
+ }
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ */
+static void exit_notify(struct task_struct *tsk)
+{
+ int state;
+ struct task_struct *t;
+
+ ckrm_cb_exit(tsk);
+
+ if (signal_pending(tsk) && !tsk->signal->group_exit
+ && !thread_group_empty(tsk)) {
+ /*
+ * This occurs when there was a race between our exit
+ * syscall and a group signal choosing us as the one to
+ * wake up. It could be that we are the only thread
+ * alerted to check for pending signals, but another thread
+ * should be woken now to take the signal since we will not.
+ * Now we'll wake all the threads in the group just to make
+ * sure someone gets all the pending signals.
+ */
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&tsk->sighand->siglock);
+ for (t = next_thread(tsk); t != tsk; t = next_thread(t))
+ if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
+ recalc_sigpending_tsk(t);
+ if (signal_pending(t))
+ signal_wake_up(t, 0);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+ }
+
+ write_lock_irq(&tasklist_lock);
+
+ /*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+
+ forget_original_parent(tsk);
+ BUG_ON(!list_empty(&tsk->children));
+
+ /*
+ * Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ *
+ * Case i: Our father is in a different pgrp than we are
+ * and we were the only connection outside, so our pgrp
+ * is about to become orphaned.
+ */
+
+ t = tsk->real_parent;
+
+ if ((process_group(t) != process_group(tsk)) &&
+ (t->signal->session == tsk->signal->session) &&
+ will_become_orphaned_pgrp(process_group(tsk), tsk) &&
+ has_stopped_jobs(process_group(tsk))) {
+ __kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
+ __kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+ }
+
+ /* Let father know we died
+ *
+ * Thread signals are configurable, but you aren't going to use
+ * that to send signals to arbitrary processes.
+ * That stops right now.
+ *
+ * If the parent exec id doesn't match the exec id we saved
+ * when we started then we know the parent has changed security
+ * domain.
+ *
+ * If our self_exec id doesn't match our parent_exec_id then
+ * we have changed execution domain as these two values started
+ * the same after a fork.
+ *
+ */
+
+ if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
+ ( tsk->parent_exec_id != t->self_exec_id ||
+ tsk->self_exec_id != tsk->parent_exec_id)
+ && !capable(CAP_KILL))
+ tsk->exit_signal = SIGCHLD;
+
+
+ /* If something other than our normal parent is ptracing us, then
+ * send it a SIGCHLD instead of honoring exit_signal. exit_signal
+ * only has special meaning to our real parent.
+ */
+ if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
+ int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
+ do_notify_parent(tsk, signal);
+ } else if (tsk->ptrace) {
+ do_notify_parent(tsk, SIGCHLD);
+ }
+
+ state = TASK_ZOMBIE;
+ if (tsk->exit_signal == -1 && tsk->ptrace == 0)
+ state = TASK_DEAD;
+ tsk->state = state;
+ tsk->flags |= PF_DEAD;
+
+ /*
+ * Clear these here so that update_process_times() won't try to deliver
+ * itimer, profile or rlimit signals to this task while it is in late exit.
+ */
+ tsk->it_virt_value = 0;
+ tsk->it_prof_value = 0;
+ tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY;
+
+ /*
+ * In the preemption case it must be impossible for the task
+ * to get runnable again, so use "_raw_" unlock to keep
+ * preempt_count elevated until we schedule().
+ *
+ * To avoid deadlock on SMP, interrupts must be unmasked. If we
+ * don't, subsequently called functions (e.g, wait_task_inactive()
+ * via release_task()) will spin, with interrupt flags
+ * unwittingly blocked, until the other task sleeps. That task
+ * may itself be waiting for smp_call_function() to answer and
+ * complete, and with interrupts blocked that will never happen.
+ */
+ _raw_write_unlock(&tasklist_lock);
+ local_irq_enable();
+
+ /* If the process is dead, release it - nobody will wait for it */
+ if (state == TASK_DEAD)
+ release_task(tsk);
+
+}
+
+asmlinkage NORET_TYPE void do_exit(long code)
+{
+ struct task_struct *tsk = current;
+
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
+ if (unlikely(tsk->pid == 1))
+ panic("Attempted to kill init!");
+ if (tsk->io_context)
+ exit_io_context();
+ tsk->flags |= PF_EXITING;
+ del_timer_sync(&tsk->real_timer);
+
+ if (unlikely(in_atomic()))
+ printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+ current->comm, current->pid,
+ preempt_count());
+
+ profile_exit_task(tsk);
+
+ if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+ current->ptrace_message = code;
+ ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
+ }
+
+ acct_process(code);
+ __exit_mm(tsk);
+
+ exit_sem(tsk);
+ __exit_files(tsk);
+ __exit_fs(tsk);
+ exit_namespace(tsk);
+ exit_thread();
+#ifdef CONFIG_NUMA
+ mpol_free(tsk->mempolicy);
+#endif
+
+ if (tsk->signal->leader)
+ disassociate_ctty(1);
+
+ module_put(tsk->thread_info->exec_domain->module);
+ if (tsk->binfmt)
+ module_put(tsk->binfmt->module);
+
+ tsk->exit_code = code;
+#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+ numtasks_put_ref(tsk->taskclass);
+#endif
+ exit_notify(tsk);
+ schedule();
+ BUG();
+ /* Avoid "noreturn function does return". */
+ for (;;) ;
+}
+
+NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ do_exit(code);
+}
+
+EXPORT_SYMBOL(complete_and_exit);
+
+asmlinkage long sys_exit(int error_code)
+{
+ do_exit((error_code&0xff)<<8);
+}
+
+task_t fastcall *next_thread(task_t *p)
+{
+ struct pid_link *link = p->pids + PIDTYPE_TGID;
+ struct list_head *tmp, *head = &link->pidptr->task_list;
+
+#ifdef CONFIG_SMP
+ if (!p->sighand)
+ BUG();
+ if (!spin_is_locked(&p->sighand->siglock) &&
+ !rwlock_is_locked(&tasklist_lock))
+ BUG();
+#endif
+ tmp = link->pid_chain.next;
+ if (tmp == head)
+ tmp = head->next;
+
+ return pid_task(tmp, PIDTYPE_TGID);
+}
+
+EXPORT_SYMBOL(next_thread);
+
+/*
+ * Take down every thread in the group. This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ */
+NORET_TYPE void
+do_group_exit(int exit_code)
+{
+ BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+ if (current->signal->group_exit)
+ exit_code = current->signal->group_exit_code;
+ else if (!thread_group_empty(current)) {
+ struct signal_struct *const sig = current->signal;
+ struct sighand_struct *const sighand = current->sighand;
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&sighand->siglock);
+ if (sig->group_exit)
+ /* Another thread got here before we took the lock. */
+ exit_code = sig->group_exit_code;
+ else {
+ sig->group_exit = 1;
+ sig->group_exit_code = exit_code;
+ zap_other_threads(current);
+ }
+ spin_unlock_irq(&sighand->siglock);
+ read_unlock(&tasklist_lock);
+ }
+
+ do_exit(exit_code);
+ /* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+asmlinkage void sys_exit_group(int error_code)
+{
+ do_group_exit((error_code & 0xff) << 8);
+}
+
+static int eligible_child(pid_t pid, int options, task_t *p)
+{
+ if (pid > 0) {
+ if (p->pid != pid)
+ return 0;
+ } else if (!pid) {
+ if (process_group(p) != process_group(current))
+ return 0;
+ } else if (pid != -1) {
+ if (process_group(p) != -pid)
+ return 0;
+ }
+
+ /*
+ * Do not consider detached threads that are
+ * not ptraced:
+ */
+ if (p->exit_signal == -1 && !p->ptrace)
+ return 0;
+
+ /* Wait for all children (clone and not) if __WALL is set;
+ * otherwise, wait for clone children *only* if __WCLONE is
+ * set; otherwise, wait for non-clone children *only*. (Note:
+ * A "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD.) */
+ if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+ && !(options & __WALL))
+ return 0;
+ /*
+ * Do not consider thread group leaders that are
+ * in a non-empty thread group:
+ */
+ if (current->tgid != p->tgid && delay_group_leader(p))
+ return 2;
+
+ if (security_task_wait(p))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct rusage __user *ru)
+{
+ unsigned long state;
+ int retval;
+
+ /*
+ * Try to move the task's state to DEAD
+ * only one thread is allowed to do this:
+ */
+ state = xchg(&p->state, TASK_DEAD);
+ if (state != TASK_ZOMBIE) {
+ BUG_ON(state != TASK_DEAD);
+ return 0;
+ }
+ if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
+ /*
+ * This can only happen in a race with a ptraced thread
+ * dying on another processor.
+ */
+ return 0;
+
+ /*
+ * Now we are sure this task is interesting, and no other
+ * thread can reap it because we set its state to TASK_DEAD.
+ */
+ read_unlock(&tasklist_lock);
+
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!retval && stat_addr) {
+ if (p->signal->group_exit)
+ retval = put_user(p->signal->group_exit_code, stat_addr);
+ else
+ retval = put_user(p->exit_code, stat_addr);
+ }
+ if (retval) {
+ p->state = TASK_ZOMBIE;
+ return retval;
+ }
+ retval = p->pid;
+ if (p->real_parent != p->parent) {
+ write_lock_irq(&tasklist_lock);
+ /* Double-check with lock held. */
+ if (p->real_parent != p->parent) {
+ __ptrace_unlink(p);
+ p->state = TASK_ZOMBIE;
+ /* If this is a detached thread, this is where it goes away. */
+ if (p->exit_signal == -1) {
+ /* release_task takes the lock itself. */
+ write_unlock_irq(&tasklist_lock);
+ release_task (p);
+ }
+ else {
+ do_notify_parent(p, p->exit_signal);
+ write_unlock_irq(&tasklist_lock);
+ }
+ p = NULL;
+ }
+ else
+ write_unlock_irq(&tasklist_lock);
+ }
+ if (p != NULL)
+ release_task(p);
+ BUG_ON(!retval);
+ return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_stopped(task_t *p, int delayed_group_leader,
+ unsigned int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ int retval, exit_code;
+
+ if (!p->exit_code)
+ return 0;
+ if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
+ p->signal && p->signal->group_stop_count > 0)
+ /*
+ * A group stop is in progress and this is the group leader.
+ * We won't report until all threads have stopped.
+ */
+ return 0;
+
+ /*
+ * Now we are pretty sure this task is interesting.
+ * Make sure it doesn't get reaped out from under us while we
+ * give up the lock and then examine it below. We don't want to
+ * keep holding onto the tasklist_lock while we call getrusage and
+ * possibly take page faults for user memory.
+ */
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+ write_lock_irq(&tasklist_lock);
+
+ /*
+ * This uses xchg to be atomic with the thread resuming and setting
+ * it. It must also be done with the write lock held to prevent a
+ * race with the TASK_ZOMBIE case.
+ */
+ exit_code = xchg(&p->exit_code, 0);
+ if (unlikely(p->state > TASK_STOPPED)) {
+ /*
+ * The task resumed and then died. Let the next iteration
+ * catch it in TASK_ZOMBIE. Note that exit_code might
+ * already be zero here if it resumed and did _exit(0).
+ * The task itself is dead and won't touch exit_code again;
+ * other processors in this function are locked out.
+ */
+ p->exit_code = exit_code;
+ exit_code = 0;
+ }
+ if (unlikely(exit_code == 0)) {
+ /*
+ * Another thread in this function got to it first, or it
+ * resumed, or it resumed and then died.
+ */
+ write_unlock_irq(&tasklist_lock);
+ put_task_struct(p);
+ read_lock(&tasklist_lock);
+ return 0;
+ }
+
+ /* move to end of parent's list to avoid starvation */
+ remove_parent(p);
+ add_parent(p, p->parent);
+
+ write_unlock_irq(&tasklist_lock);
+
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!retval && stat_addr)
+ retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+ if (!retval)
+ retval = p->pid;
+ put_task_struct(p);
+
+ BUG_ON(!retval);
+ return retval;
+}
+
+asmlinkage long sys_wait4(pid_t pid,unsigned int __user *stat_addr, int options, struct rusage __user *ru)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct task_struct *tsk;
+ int flag, retval;
+
+ if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+
+ add_wait_queue(&current->wait_chldexit,&wait);
+repeat:
+ flag = 0;
+ current->state = TASK_INTERRUPTIBLE;
+ read_lock(&tasklist_lock);
+ tsk = current;
+ do {
+ struct task_struct *p;
+ struct list_head *_p;
+ int ret;
+
+ list_for_each(_p,&tsk->children) {
+ p = list_entry(_p,struct task_struct,sibling);
+
+ ret = eligible_child(pid, options, p);
+ if (!ret)
+ continue;
+ flag = 1;
+
+ switch (p->state) {
+ case TASK_STOPPED:
+ if (!(options & WUNTRACED) &&
+ !(p->ptrace & PT_PTRACED))
+ continue;
+ retval = wait_task_stopped(p, ret == 2,
+ stat_addr, ru);
+ if (retval != 0) /* He released the lock. */
+ goto end_wait4;
+ break;
+ case TASK_ZOMBIE:
+ /*
+ * Eligible but we cannot release it yet:
+ */
+ if (ret == 2)
+ continue;
+ retval = wait_task_zombie(p, stat_addr, ru);
+ if (retval != 0) /* He released the lock. */
+ goto end_wait4;
+ break;
+ }
+ }
+ if (!flag) {
+ list_for_each (_p,&tsk->ptrace_children) {
+ p = list_entry(_p,struct task_struct,ptrace_list);
+ if (!eligible_child(pid, options, p))
+ continue;
+ flag = 1;
+ break;
+ }
+ }
+ if (options & __WNOTHREAD)
+ break;
+ tsk = next_thread(tsk);
+ if (tsk->signal != current->signal)
+ BUG();
+ } while (tsk != current);
+ read_unlock(&tasklist_lock);
+ if (flag) {
+ retval = 0;
+ if (options & WNOHANG)
+ goto end_wait4;
+ retval = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto end_wait4;
+ schedule();
+ goto repeat;
+ }
+ retval = -ECHILD;
+end_wait4:
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&current->wait_chldexit,&wait);
+ return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+asmlinkage long sys_waitpid(pid_t pid, unsigned __user *stat_addr, int options)
+{
+ return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif