Initial revision
author     Marc Fiuczynski <mef@cs.princeton.edu>
           Wed, 8 Sep 2004 19:40:51 +0000 (19:40 +0000)
committer  Marc Fiuczynski <mef@cs.princeton.edu>
           Wed, 8 Sep 2004 19:40:51 +0000 (19:40 +0000)
include/linux/ckrm_mem.h [new file with mode: 0644]
include/linux/ckrm_mem_inline.h [new file with mode: 0644]
kernel/ckrm/ckrm_mem.c [new file with mode: 0644]
kernel/exit.c.orig [new file with mode: 0644]

diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h
new file mode 100644 (file)
index 0000000..52dc949
--- /dev/null
@@ -0,0 +1,106 @@
+/* include/linux/ckrm_mem.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ *           (C) Shailabh Nagar, IBM Corp. 2003
+ *           (C) Chandra Seetharaman, IBM Corp. 2004
+ * 
+ * 
+ * Memory control functions of the CKRM kernel API 
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ */
+
+#ifndef _LINUX_CKRM_MEM_H
+#define _LINUX_CKRM_MEM_H
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#include <linux/list.h>
+#include <linux/ckrm_rc.h>
+
+typedef struct ckrm_mem_res {
+       unsigned long reclaim_flags; 
+       unsigned long flags; 
+       struct ckrm_core_class *core; // the core I am part of
+       struct ckrm_core_class *parent; // parent of the core I am part of
+       struct ckrm_shares shares;
+       struct list_head mcls_list; // list of all 1-level classes
+       struct list_head shrink_list; // list of classes need to be shrunk
+       atomic_t nr_users; // # of references to this class/data structure
+       atomic_t pg_total;  // # of pages used by this class
+       int pg_guar; // # of pages this class is guaranteed
+       int pg_limit; // max # of pages this class can get
+       int pg_borrowed; // # of pages this class borrowed from its parent
+       int pg_lent; // # of pages this class lent to its children
+       int pg_unused; // # of pages left to this class (after giving the
+                               // guarantees to children); need to borrow from the
+                               // parent if more than this is needed
+       int nr_active[MAX_NR_ZONES];
+       int nr_inactive[MAX_NR_ZONES];
+       int shrink_count;
+       unsigned long last_shrink;
+       int over_limit_failures;
+       int hier; // hierarchy level, root = 0
+} ckrm_mem_res_t;
+
+extern atomic_t ckrm_mem_real_count;
+extern unsigned int ckrm_tot_lru_pages;
+extern struct list_head ckrm_shrink_list;
+extern spinlock_t ckrm_mem_lock;
+extern struct ckrm_res_ctlr mem_rcbs;
+
+#define page_class(page)       ((ckrm_mem_res_t*)((page)->memclass))
+
+// used to fill reclaim_flags, used only when memory is low in the system
+#define CLS_CLEAR              (0)      // class under its guarantee
+#define CLS_OVER_GUAR  (1 << 0) // class is over its guarantee
+#define CLS_PARENT_OVER        (1 << 1) // parent is over the 120% mark between guarantee and limit
+#define CLS_OVER_75            (1 << 2) // class is over the 75% mark between guarantee (0%) and limit (100%)
+#define CLS_OVER_100   (1 << 3) // class is over its limit
+#define CLS_OVER_110   (1 << 4) // class is over the 110% mark between guarantee and limit
+#define CLS_FLAGS_ALL  ( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_75 | \
+                                       CLS_OVER_100 | CLS_OVER_110 )
+#define CLS_SHRINK_BIT (31)      // used to both lock and set the bit
+#define CLS_SHRINK             (1 << CLS_SHRINK_BIT) // shrink the given class
+
+// used in flags; set when a class is at more than 90% of its max limit
+#define MEM_NEAR_LIMIT 1
+
+extern void ckrm_set_aggressive(ckrm_mem_res_t *);
+extern unsigned int ckrm_setup_reclamation(void);
+extern void ckrm_teardown_reclamation(void);
+extern void ckrm_get_reclaim_bits(unsigned int *, unsigned int *);
+extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *);
+extern void ckrm_mem_evaluate_mm(struct mm_struct *);
+extern void ckrm_mem_evaluate_page_byadd(struct page *, struct mm_struct *);
+extern void ckrm_near_limit(ckrm_mem_res_t *);
+#define ckrm_get_reclaim_flags(cls)    ((cls)->reclaim_flags)
+
+#else
+
+#define ckrm_init_mm_to_current(a)                     do {} while (0)
+#define ckrm_mem_evaluate_mm(a)                                do {} while (0)
+#define ckrm_mem_evaluate_page_byadd(a,b)      do {} while (0)
+#define page_class(page)                                       (NULL)
+#define ckrm_get_reclaim_flags(a)                      (0)
+#define ckrm_setup_reclamation()                       (0)
+#define ckrm_teardown_reclamation()                    do {} while (0)
+#define ckrm_get_reclaim_bits(a, b)                    do { *(a) = 0; *(b)= 0; } while (0)
+#define ckrm_init_mm_to_task(a,b)                      do {} while (0)
+
+#endif // CONFIG_CKRM_RES_MEM
+
+#endif //_LINUX_CKRM_MEM_H
+
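
The reclaim_flags scheme above drives how the VM picks classes to reclaim from: each CLS_* bit records how far a class (or its parent) is over its guarantee or limit, and the reclaim path is handed only the most severe of the currently set bits. The standalone C sketch below is purely illustrative and not part of the patch; the flag values are copied from the header, fls() is emulated with a loop, and the CLS_SHRINK and zero-flags special cases are left out. It mirrors the extraction step done by ckrm_get_reclaim_bits() in kernel/ckrm/ckrm_mem.c.

#include <stdio.h>

#define CLS_OVER_GUAR   (1 << 0)
#define CLS_PARENT_OVER (1 << 1)
#define CLS_OVER_75     (1 << 2)
#define CLS_OVER_100    (1 << 3)
#define CLS_OVER_110    (1 << 4)
#define CLS_FLAGS_ALL   (CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_75 | \
                         CLS_OVER_100 | CLS_OVER_110)

/* position of the highest set bit, 1-based; stands in for the kernel's fls() */
static int fls_demo(unsigned int x)
{
        int pos = 0;

        while (x) {
                pos++;
                x >>= 1;
        }
        return pos;
}

/* mirror of the selection logic: keep only flags at or above the highest set bit */
static unsigned int extract_most_severe(unsigned int *flags)
{
        unsigned int mask = 0, extract;
        int i = fls_demo(*flags), j;

        for (j = i - 1; j > 0; j--)
                mask = (mask << 1) | 1;
        extract = CLS_FLAGS_ALL & ~mask;
        *flags &= ~extract;
        return extract;
}

int main(void)
{
        unsigned int flags = CLS_OVER_GUAR | CLS_OVER_75 | CLS_OVER_100;
        unsigned int extract = extract_most_severe(&flags);

        /* prints: extract=0x18 remaining=0x5 */
        printf("extract=0x%x remaining=0x%x\n", extract, flags);
        return 0;
}
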
diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h
new file mode 100644 (file)
index 0000000..0eb4e49
--- /dev/null
@@ -0,0 +1,256 @@
+/* include/linux/ckrm_mem_inline.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ *           (C) Shailabh Nagar, IBM Corp. 2003
+ *           (C) Chandra Seetharaman, IBM Corp. 2004
+ * 
+ * 
+ * Memory control functions of the CKRM kernel API 
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ */
+
+
+#ifndef _LINUX_CKRM_MEM_INLINE_H_
+#define _LINUX_CKRM_MEM_INLINE_H_
+
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/ckrm_mem.h>
+
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#define GET_MEM_CLASS(tsk) \
+       ckrm_get_res_class(tsk->taskclass, mem_rcbs.resid, ckrm_mem_res_t)
+
+#define ckrm_set_shrink(cls) \
+       set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+#define ckrm_test_set_shrink(cls) \
+       test_and_set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+#define ckrm_clear_shrink(cls) \
+       clear_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+
+#define ckrm_shrink_list_empty()       list_empty(&ckrm_shrink_list)
+
+/*
+ * Currently, the class of an address is assigned to the class with max
+ * available guarantee. Simply replace this function for other policies.
+ */
+static inline int
+ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b)
+{
+       if (a == NULL) 
+               return -(b != NULL) ;
+       if (b == NULL)
+               return 0;
+       return (a->pg_unused - b->pg_unused);
+}
+
+static inline void
+mem_class_get(ckrm_mem_res_t *cls)
+{
+       if (cls)
+               atomic_inc(&((cls)->nr_users));
+}
+
+static inline void
+mem_class_put(ckrm_mem_res_t *cls)
+{
+       if (cls && atomic_dec_and_test(&(cls->nr_users)) ) {
+               printk("freeing memclass %p of <core:%s>\n", cls, cls->core->name);
+               //kfree(cls);
+       }       
+}
+
+static inline int
+incr_use_count(ckrm_mem_res_t *cls, int borrow)
+{
+       int over_limit;
+
+       atomic_inc(&cls->pg_total);
+       over_limit = (atomic_read(&cls->pg_total) > ((9 * cls->pg_limit) / 10));
+
+       if (borrow) 
+               cls->pg_lent++;
+       if ((cls->pg_guar != CKRM_SHARE_DONTCARE) &&
+                               (atomic_read(&cls->pg_total) > cls->pg_unused)) {
+               ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+                               mem_rcbs.resid, ckrm_mem_res_t);
+               if (parcls) {
+                       over_limit |= incr_use_count(parcls, 1);
+                       cls->pg_borrowed++;
+                       return over_limit;
+               }
+       }
+       atomic_inc(&ckrm_mem_real_count);
+       return over_limit;
+}
+
+static inline void
+decr_use_count(ckrm_mem_res_t *cls, int borrowed)
+{
+       atomic_dec(&cls->pg_total);
+       if (borrowed)
+               cls->pg_lent--;
+       if (cls->pg_borrowed > 0) {
+               ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+                               mem_rcbs.resid, ckrm_mem_res_t);
+               if (parcls) {
+                       decr_use_count(parcls, 1);
+                       cls->pg_borrowed--;
+                       return;
+               }
+       }
+       atomic_dec(&ckrm_mem_real_count);
+}
+
+static inline void
+ckrm_set_page_class(struct page *page, ckrm_mem_res_t *cls)
+{
+       if (mem_rcbs.resid != -1 && cls != NULL) {
+               if (unlikely(page->memclass)) {
+                       mem_class_put(page->memclass);
+               }
+               page->memclass = cls;
+               mem_class_get(cls);
+       } else {
+               page->memclass = NULL;
+       }
+}
+
+static inline void
+ckrm_set_pages_class(struct page *pages, int numpages, ckrm_mem_res_t *cls)
+{
+       int i;
+       for (i = 0; i < numpages; pages++, i++) {
+               ckrm_set_page_class(pages, cls);
+       }
+}
+
+static inline void
+ckrm_clear_page_class(struct page *page)
+{
+       if (page->memclass != NULL) {
+               mem_class_put(page->memclass);
+               page->memclass = NULL;
+       }
+}
+
+static inline void
+ckrm_clear_pages_class(struct page *pages, int numpages)
+{
+       int i;
+       for (i = 0; i < numpages; pages++, i++) {
+               ckrm_clear_page_class(pages);
+       }
+}
+
+static inline void
+ckrm_change_page_class(struct page *page, ckrm_mem_res_t *cls)
+{
+       ckrm_clear_page_class(page);
+       ckrm_set_page_class(page, cls);
+}
+
+static inline void
+ckrm_change_pages_class(struct page *pages, int numpages, 
+                                       ckrm_mem_res_t *cls)
+{
+       int i;
+       for (i = 0; i < numpages; pages++, i++) {
+               ckrm_change_page_class(pages, cls);
+       }
+}
+
+static inline void
+ckrm_mem_inc_active(struct page *page)
+{
+       ckrm_mem_res_t *cls = page_class(page);
+       BUG_ON(cls == NULL);
+       cls->nr_active[page_zonenum(page)]++;
+       if (incr_use_count(cls, 0)) {
+               ckrm_near_limit(cls);
+       }
+}
+
+static inline void
+ckrm_mem_dec_active(struct page *page)
+{
+       ckrm_mem_res_t *cls = page_class(page);
+       BUG_ON(cls == NULL);
+       cls->nr_active[page_zonenum(page)]--;
+       decr_use_count(cls, 0);
+}
+
+static inline void
+ckrm_mem_inc_inactive(struct page *page)
+{
+       ckrm_mem_res_t *cls = page_class(page);
+       BUG_ON(cls == NULL);
+       cls->nr_inactive[page_zonenum(page)]++;
+       if (incr_use_count(cls, 0) &&
+                       ((cls->flags & MEM_NEAR_LIMIT) != MEM_NEAR_LIMIT)) {
+               ckrm_near_limit(cls);
+       }
+}
+
+static inline void
+ckrm_mem_dec_inactive(struct page *page)
+{
+       ckrm_mem_res_t *cls = page_class(page);
+       BUG_ON(cls == NULL);
+       cls->nr_inactive[page_zonenum(page)]--;
+       decr_use_count(cls, 0);
+}
+
+static inline int
+ckrm_kick_page(struct page *page, unsigned int bits)
+{
+       if (page_class(page) == NULL) {
+               return bits;
+       } else {
+               return (page_class(page)->reclaim_flags & bits);
+       }
+}
+
+static inline int 
+ckrm_class_limit_ok(ckrm_mem_res_t *cls)
+{
+       if ((mem_rcbs.resid == -1) || !cls) {
+               return 1;
+       }
+       return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10);
+}
+
+#else // !CONFIG_CKRM_RES_MEM
+
+#define ckrm_set_page_class(a,b)               do{}while(0)
+#define ckrm_set_pages_class(a,b,c)            do{}while(0)
+#define ckrm_clear_page_class(a)               do{}while(0)
+#define ckrm_clear_pages_class(a,b)            do{}while(0)
+#define ckrm_change_page_class(a,b)            do{}while(0)
+#define ckrm_change_pages_class(a,b,c) do{}while(0)
+#define ckrm_mem_inc_active(a)                 do{}while(0)
+#define ckrm_mem_dec_active(a)                 do{}while(0)
+#define ckrm_mem_inc_inactive(a)               do{}while(0)
+#define ckrm_mem_dec_inactive(a)               do{}while(0)
+#define ckrm_shrink_list_empty()               (1)
+#define ckrm_kick_page(a,b)                            (0)
+#define ckrm_class_limit_ok(a)                 (1)
+
+#endif // CONFIG_CKRM_RES_MEM
+
+#endif // _LINUX_CKRM_MEM_INLINE_H_
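
incr_use_count() and decr_use_count() above implement a simple borrow/lend scheme: a class is charged for every page it gains, and once its usage exceeds pg_unused (the guarantee it has kept for itself), each further page is also charged to its parent, showing up as pg_borrowed in the child and pg_lent in the parent. The standalone model below is only a sketch of that accounting and not part of the patch; the CKRM_SHARE_DONTCARE guard and the ckrm_mem_real_count bookkeeping are omitted, and the struct keeps only the fields the accounting touches.

#include <stdio.h>

struct mem_cls {
        struct mem_cls *parent;
        int pg_total;           /* pages used, including pages lent to children */
        int pg_unused;          /* pages usable before borrowing from the parent */
        int pg_limit;           /* hard limit; 90% of it is the "near limit" mark */
        int pg_lent;            /* pages lent to children */
        int pg_borrowed;        /* pages borrowed from the parent */
};

/* returns nonzero once usage crosses 90% of the limit, like incr_use_count() */
static int charge_page(struct mem_cls *cls, int borrow)
{
        int near_limit;

        cls->pg_total++;
        near_limit = cls->pg_total > (9 * cls->pg_limit) / 10;
        if (borrow)
                cls->pg_lent++;
        if (cls->pg_total > cls->pg_unused && cls->parent) {
                near_limit |= charge_page(cls->parent, 1);
                cls->pg_borrowed++;
        }
        return near_limit;
}

int main(void)
{
        struct mem_cls root  = { .pg_unused = 100, .pg_limit = 200 };
        struct mem_cls child = { .parent = &root, .pg_unused = 10, .pg_limit = 40 };
        int i;

        for (i = 0; i < 15; i++)
                charge_page(&child, 0);

        /* 15 pages charged to the child; 5 of them pushed up to the root */
        printf("child: total=%d borrowed=%d  root: total=%d lent=%d\n",
               child.pg_total, child.pg_borrowed, root.pg_total, root.pg_lent);
        return 0;
}
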
diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c
new file mode 100644 (file)
index 0000000..667ac9c
--- /dev/null
@@ -0,0 +1,814 @@
+/* ckrm_mem.c - Memory Resource Manager for CKRM
+ *
+ * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ * Provides a Memory Resource controller for CKRM
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Code Description: TBD
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/pagevec.h>
+
+#include <linux/ckrm_mem_inline.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+#define MEM_NAME "mem"
+
+#define CKRM_MEM_MAX_HIERARCHY 2 // allows a hierarchy depth of at most 2 (levels 0, 1 and 2)
+
+/* all 1-level memory_share_class are chained together */
+static LIST_HEAD(ckrm_memclass_list);
+LIST_HEAD(ckrm_shrink_list);
+EXPORT_SYMBOL(ckrm_shrink_list);
+spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
+EXPORT_SYMBOL(ckrm_mem_lock);
+unsigned int ckrm_tot_lru_pages; // total # of pages in the system
+                                                        // currently doesn't handle memory add/remove
+EXPORT_SYMBOL(ckrm_tot_lru_pages);
+
+static ckrm_mem_res_t *ckrm_mem_root_class;
+atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL(ckrm_mem_real_count);
+
+/* Initialize rescls values
+ * May be called on each rcfs unmount or as part of error recovery
+ * to make share values sane.
+ * Does not traverse hierarchy reinitializing children.
+ */
+
+static void
+set_ckrm_tot_pages(void)
+{
+       struct zone *zone;
+       int tot_lru_pages = 0;
+
+       for_each_zone(zone) {
+               tot_lru_pages += zone->nr_active;
+               tot_lru_pages += zone->nr_inactive;
+               tot_lru_pages += zone->free_pages;
+       }
+       ckrm_tot_lru_pages = tot_lru_pages;
+}
+
+static void
+mem_res_initcls_one(void *my_res)
+{
+       ckrm_mem_res_t *res = my_res;
+
+       memset(res, 0, sizeof(ckrm_mem_res_t));
+
+       res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
+       res->shares.my_limit         = CKRM_SHARE_DONTCARE;
+       res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+       res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       res->shares.cur_max_limit    = 0;
+
+       res->pg_guar = CKRM_SHARE_DONTCARE;
+       res->pg_limit = CKRM_SHARE_DONTCARE;
+       res->pg_unused = CKRM_SHARE_DONTCARE;
+}
+
+static void *
+mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
+{
+       ckrm_mem_res_t *res, *parres;
+
+       if (mem_rcbs.resid == -1) {
+               return NULL;
+       }
+
+       parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
+       if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
+               // allow only up to CKRM_MEM_MAX_HIERARCHY levels
+               return NULL;
+       }
+
+       if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
+               printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
+               return NULL;
+       }
+               
+       if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
+               printk(KERN_ERR "MEM_RC: creating child class without root class\n");
+               return NULL;
+       }
+               
+       res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);
+       
+       if (res) {
+               mem_res_initcls_one(res);
+               res->core = core;
+               res->parent = parent;
+               spin_lock(&ckrm_mem_lock);
+               list_add(&res->mcls_list, &ckrm_memclass_list);
+               spin_unlock(&ckrm_mem_lock);
+               if (parent == NULL) {
+                       // I am part of the root class, so set the max to the
+                       // number of pages available
+                       res->pg_guar = ckrm_tot_lru_pages;
+                       res->pg_unused = ckrm_tot_lru_pages;
+                       res->pg_limit = ckrm_tot_lru_pages;
+                       res->hier = 0;
+                       ckrm_mem_root_class = res;
+               } else {
+                       res->hier = parres->hier + 1;
+               }
+               mem_class_get(res);
+       }
+       else
+               printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
+       return res;
+}
+
+/*
+ * It is the caller's responsibility to make sure that the parent only
+ * has children that are to be accounted, i.e., if a new child is added,
+ * this function should be called after it has been added, and if a
+ * child is deleted, it should be called after the child is removed.
+ */
+static void
+child_maxlimit_changed_local(ckrm_mem_res_t *parres)
+{
+       int maxlimit = 0;
+       ckrm_mem_res_t *childres;
+       ckrm_core_class_t *child = NULL;
+
+       // run through the parent's children and compute the parent's new max_limit
+       ckrm_lock_hier(parres->core);
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, mem_rcbs.resid,
+                               ckrm_mem_res_t);
+               if (maxlimit < childres->shares.my_limit) {
+                       maxlimit = childres->shares.my_limit;
+               }
+       }
+       ckrm_unlock_hier(parres->core);
+       parres->shares.cur_max_limit = maxlimit;
+}
+
+static void
+mem_res_free(void *my_res)
+{
+       ckrm_mem_res_t *res = my_res;
+       ckrm_mem_res_t *parres;
+
+       if (!res) 
+               return;
+
+       parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+
+       // return child's limit/guarantee to parent node
+       if (parres) {
+               child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
+               child_maxlimit_changed_local(parres);
+       }
+       res->shares.my_guarantee = 0;
+       res->shares.my_limit = 0;
+       spin_lock(&ckrm_mem_lock);
+       list_del(&res->mcls_list);
+       spin_unlock(&ckrm_mem_lock);
+       mem_class_put(res);
+
+       return;
+}
+
+/*
+ * Recalculate the guarantee and limit in # of pages... and propagate the
+ * same to children.
+ * Caller is responsible for protecting res and for the integrity of parres
+ */
+static void
+recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
+{
+       ckrm_core_class_t *child = NULL;
+       ckrm_mem_res_t *childres;
+       int resid = mem_rcbs.resid;
+       struct ckrm_shares *self = &res->shares;
+
+       if (parres) {
+               struct ckrm_shares *par = &parres->shares;
+
+               // calculate pg_guar and pg_limit
+               //
+               if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
+                               self->my_guarantee == CKRM_SHARE_DONTCARE) {
+                       res->pg_guar = CKRM_SHARE_DONTCARE;
+               } else if (par->total_guarantee) {
+                       u64 temp = (u64) self->my_guarantee * parres->pg_guar;
+                       do_div(temp, par->total_guarantee);
+                       res->pg_guar = (int) temp;
+               } else {
+                       res->pg_guar = 0;
+               }
+
+               if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
+                               self->my_limit == CKRM_SHARE_DONTCARE) {
+                       res->pg_limit = CKRM_SHARE_DONTCARE;
+               } else if (par->max_limit) {
+                       u64 temp = (u64) self->my_limit * parres->pg_limit;
+                       do_div(temp, par->max_limit);
+                       res->pg_limit = (int) temp;
+               } else {
+                       res->pg_limit = 0;
+               }
+       }
+
+       // Calculate unused units
+       if (res->pg_guar == CKRM_SHARE_DONTCARE) {
+               res->pg_unused = CKRM_SHARE_DONTCARE;
+       } else if (self->total_guarantee) {
+               u64 temp = (u64) self->unused_guarantee * res->pg_guar;
+               do_div(temp, self->total_guarantee);
+               res->pg_unused = (int) temp;
+       } else {
+               res->pg_unused = 0;
+       }
+
+       // propagate to children
+       ckrm_lock_hier(res->core);
+       while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
+               recalc_and_propagate(childres, res);
+       }
+       ckrm_unlock_hier(res->core);
+       return;
+}
+
+static int
+mem_set_share_values(void *my_res, struct ckrm_shares *shares)
+{
+       ckrm_mem_res_t *res = my_res;
+       ckrm_mem_res_t *parres;
+       int rc = EINVAL;
+
+       if (!res) 
+               return -EINVAL;
+
+       parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+
+       rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);
+
+       if ((rc == 0) && (parres != NULL)) {
+               child_maxlimit_changed_local(parres);
+               recalc_and_propagate(parres, NULL);
+       }
+       return rc;
+}
+
+static int
+mem_get_share_values(void *my_res, struct ckrm_shares *shares)
+{
+       ckrm_mem_res_t *res = my_res;
+
+       if (!res) 
+               return -EINVAL;
+       *shares = res->shares;
+       return 0;
+}
+
+static int  
+mem_get_stats(void *my_res, struct seq_file *sfile)
+{
+       ckrm_mem_res_t *res = my_res;
+
+       if (!res) 
+               return -EINVAL;
+
+#if 0
+       seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
+                       "lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
+                       res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
+                       res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
+#endif
+
+
+       seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
+       seq_printf(sfile, "Number of pages used(including pages lent to children):"
+                       " %d\n", atomic_read(&res->pg_total));
+       seq_printf(sfile, "Number of pages guaranteed: %d\n",
+                       res->pg_guar);
+       seq_printf(sfile, "Maximum limit of pages: %d\n",
+                       res->pg_limit);
+       seq_printf(sfile, "Total number of pages available"
+                       "(after serving guarantees to children): %d\n",
+                       res->pg_unused);
+       seq_printf(sfile, "Number of pages lent to children: %d\n",
+                       res->pg_lent);
+       seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
+                       res->pg_borrowed);
+       seq_printf(sfile, "----------- Memory Resource stats end -----------\n");
+
+       return 0;
+}
+
+static void
+mem_change_resclass(void *tsk, void *old, void *new)
+{
+       struct mm_struct *mm;
+       struct task_struct *task = tsk, *t1;
+       struct ckrm_mem_res *prev_mmcls;
+       
+       if (!task->mm || (new == old) || (old == (void *) -1))
+               return;
+
+       mm = task->active_mm;
+       spin_lock(&mm->peertask_lock);
+       prev_mmcls = mm->memclass;
+               
+       if (new == NULL) {
+               list_del_init(&task->mm_peers);
+       } else {
+               int found = 0;
+               list_for_each_entry(t1, &mm->tasklist, mm_peers) {
+                       if (t1 == task) {
+                               found++;
+                               break;
+                       }
+               }
+               if (!found) {
+                       list_del_init(&task->mm_peers);
+                       list_add_tail(&task->mm_peers, &mm->tasklist);
+               }
+       }
+
+       ckrm_mem_evaluate_mm(mm);
+       spin_unlock(&mm->peertask_lock);
+       return;
+}
+
+// config file is available only at the root level,
+// so assuming my_res to be the system level class
+static int
+mem_set_config(void *my_res, const char *cfgstr)
+{
+       ckrm_mem_res_t *res = my_res;
+
+       printk(KERN_INFO "%s class of %s is called with config<%s>\n",
+                       MEM_NAME, res->core->name, cfgstr);
+       return 0;
+}
+
+static int 
+mem_show_config(void *my_res, struct seq_file *sfile)
+{
+       struct zone *zone;
+       ckrm_mem_res_t *res = my_res;
+       int active = 0, inactive = 0, fr = 0;
+
+       if (!res)
+               return -EINVAL;
+
+       for_each_zone(zone) {
+               active += zone->nr_active;
+               inactive += zone->nr_inactive;
+               fr += zone->free_pages;
+       }
+       seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
+                       MEM_NAME, ckrm_tot_lru_pages,active,inactive,fr);
+
+
+       return 0;
+}
+
+static int
+mem_reset_stats(void *my_res)
+{
+       ckrm_mem_res_t *res = my_res;
+       printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
+       return 0;
+}
+
+struct ckrm_res_ctlr mem_rcbs = {
+       .res_name          = MEM_NAME,
+       .res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
+       .resid             = -1,
+       .res_alloc         = mem_res_alloc,
+       .res_free          = mem_res_free,
+       .set_share_values  = mem_set_share_values,
+       .get_share_values  = mem_get_share_values,
+       .get_stats         = mem_get_stats,
+       .change_resclass   = mem_change_resclass,
+       .show_config       = mem_show_config,
+       .set_config        = mem_set_config,
+       .reset_stats       = mem_reset_stats,
+};
+
+EXPORT_SYMBOL(mem_rcbs);
+
+int __init
+init_ckrm_mem_res(void)
+{
+       struct ckrm_classtype *clstype;
+       int resid = mem_rcbs.resid;
+
+       set_ckrm_tot_pages();
+       clstype = ckrm_find_classtype_by_name("taskclass");
+       if (clstype == NULL) {
+               printk(KERN_INFO " Unknown ckrm classtype<taskclass>");
+               return -ENOENT;
+       }
+
+       if (resid == -1) {
+               resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
+               if (resid != -1) {
+                       mem_rcbs.classtype = clstype;
+               }
+       }
+       return ((resid < 0) ? resid : 0);
+}      
+
+void __exit
+exit_ckrm_mem_res(void)
+{
+       ckrm_unregister_res_ctlr(&mem_rcbs);
+       mem_rcbs.resid = -1;
+}
+
+module_init(init_ckrm_mem_res)
+module_exit(exit_ckrm_mem_res)
+
+static void
+set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
+{
+       ckrm_mem_res_t *childres;
+       ckrm_core_class_t *child = NULL;
+
+       parres->reclaim_flags |= flag;
+       ckrm_lock_hier(parres->core);
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, mem_rcbs.resid,
+                               ckrm_mem_res_t);
+               set_flags_of_children(childres, flag);
+       }
+       ckrm_unlock_hier(parres->core);
+       return;
+}
+
+// FIXME: this function needs more attention
+static unsigned int
+set_usage_flags(ckrm_mem_res_t *res)
+{
+       int tot_usage, cls_usage, range, guar;
+
+       if (res->pg_limit == CKRM_SHARE_DONTCARE) {
+                       // no limit is set for this class; don't bother with it
+                       res->reclaim_flags = 0;
+                       return res->reclaim_flags;
+       }
+
+       tot_usage = atomic_read(&res->pg_total);
+       cls_usage = tot_usage - res->pg_lent;
+       guar = (res->pg_guar > 0) ? res->pg_guar : 0;
+       range = res->pg_limit - guar;
+
+       if ((tot_usage > (guar + ((120 * range) / 100))) &&
+                               (res->pg_lent > (guar + ((25 * range) / 100)))) {
+               set_flags_of_children(res, CLS_PARENT_OVER);
+       }
+
+       if (cls_usage > (guar + ((110 * range) / 100))) {
+               res->reclaim_flags |= CLS_OVER_110;
+       } else if (cls_usage > (guar + range)) {
+               res->reclaim_flags |= CLS_OVER_100;
+       } else if (cls_usage > (guar + ((3 * range) / 4))) {
+               res->reclaim_flags |= CLS_OVER_75;
+       } else if (cls_usage > guar) {
+               res->reclaim_flags |= CLS_OVER_GUAR;
+       } else {
+               res->reclaim_flags = 0;
+       }
+       return res->reclaim_flags;
+}
+
+/*
+ * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
+ * ckrm_get_reclaim_bits() and the macro ckrm_kick_page() along with the 
+ * macros CLS_* define how the pages are reclaimed.
+ * Keeping this logic behind these interfaces eliminates the need to
+ * change the reclamation code in the VM when we want to change the logic.
+ */
+unsigned int
+ckrm_setup_reclamation(void)
+{
+       ckrm_mem_res_t *res;
+       unsigned int ret = 0;
+
+       spin_lock(&ckrm_mem_lock);
+       set_ckrm_tot_pages();
+       ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
+       ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
+       ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
+       recalc_and_propagate(ckrm_mem_root_class, NULL);
+       list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+               ret |= set_usage_flags(res);
+       }
+       spin_unlock(&ckrm_mem_lock);
+       return ret;
+}
+
+void
+ckrm_teardown_reclamation(void)
+{
+       ckrm_mem_res_t *res;
+       spin_lock(&ckrm_mem_lock);
+       list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+               res->reclaim_flags = 0;
+       }
+       spin_unlock(&ckrm_mem_lock);
+}
+
+void
+ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
+{
+       int i, j, mask = 0;
+
+       if (*extract == 0 || *flags == 0) {
+               return;
+       }
+       if (*flags & CLS_SHRINK) {
+               *extract = CLS_SHRINK;
+               *flags = 0;
+               return;
+       }
+                       
+
+       i = fls(*flags);
+       for (j = i-1; j > 0; j--) {
+               mask = (mask<<1) | 1;
+       }
+       *extract = (CLS_FLAGS_ALL & ~mask);
+       *flags &= ~*extract;
+       return;
+}
+
+void
+ckrm_near_limit(ckrm_mem_res_t *cls)
+{
+       struct zone *zone;
+       unsigned long now = jiffies;
+
+       if (!cls || ((cls->flags & MEM_NEAR_LIMIT) == MEM_NEAR_LIMIT)) {
+               return;
+       }
+       if ((cls->last_shrink + (10 * HZ)) < now) { // more than 10 seconds since the last shrink?
+               cls->last_shrink = now;
+               cls->shrink_count = 0;
+       }
+       cls->shrink_count++;
+       if (cls->shrink_count > 10) {
+               return;
+       }
+       spin_lock(&ckrm_mem_lock);
+       list_add(&cls->shrink_list, &ckrm_shrink_list);
+       spin_unlock(&ckrm_mem_lock);
+       cls->flags |= MEM_NEAR_LIMIT;
+       for_each_zone(zone) {
+               wakeup_kswapd(zone);
+               break; // only once is enough
+       }
+}
+
+static int
+ckrm_mem_evaluate_page_anon(struct page* page)
+{
+       ckrm_mem_res_t* pgcls = page_class(page);
+       ckrm_mem_res_t* maxshareclass = NULL;
+       struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+       struct vm_area_struct *vma;
+       struct mm_struct* mm;
+
+       spin_lock(&anon_vma->lock);
+       BUG_ON(list_empty(&anon_vma->head));
+       list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+               mm = vma->vm_mm;
+               if (!maxshareclass ||
+                               ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
+                       maxshareclass = mm->memclass;
+               }
+       }
+       spin_unlock(&anon_vma->lock);
+
+       if (maxshareclass && (pgcls != maxshareclass)) {
+               ckrm_change_page_class(page, maxshareclass);
+               return 1;
+       }
+       return 0;
+}
+
+static int
+ckrm_mem_evaluate_page_file(struct page* page) 
+{
+       ckrm_mem_res_t* pgcls = page_class(page);
+       ckrm_mem_res_t* maxshareclass = NULL;
+       struct address_space *mapping = page->mapping;
+       struct vm_area_struct *vma = NULL;
+       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+       struct prio_tree_iter iter;
+       struct mm_struct* mm;
+
+       if (!mapping)
+               return 0;
+
+       if (!spin_trylock(&mapping->i_mmap_lock))
+               return 0;
+
+       while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
+                                       &iter, pgoff, pgoff)) != NULL) {
+               mm = vma->vm_mm;
+               if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0)
+                       maxshareclass = mm->memclass;
+       }
+       spin_unlock(&mapping->i_mmap_lock);
+
+       if (maxshareclass && pgcls != maxshareclass) {
+               ckrm_change_page_class(page, maxshareclass);
+               return 1;
+       }
+       return 0;
+}
+
+static int
+ckrm_mem_evaluate_page(struct page* page) 
+{
+       int changed = 0;
+
+       if (page->mapping) {
+               if (PageAnon(page))
+                       changed = ckrm_mem_evaluate_page_anon(page);
+               else
+                       changed = ckrm_mem_evaluate_page_file(page);
+       }
+       return changed;
+}
+
+static inline int
+class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
+               pmd_t* pmdir, unsigned long address, unsigned long end)
+{
+       pte_t* pte;
+       unsigned long pmd_end;
+       
+       if (pmd_none(*pmdir))
+               return 0;
+       BUG_ON(pmd_bad(*pmdir));
+       
+       pte = pte_offset_map(pmdir,address);
+       pmd_end = (address+PMD_SIZE)&PMD_MASK;
+       if (end>pmd_end)
+               end = pmd_end;
+       
+       do {
+               if (pte_present(*pte)) {
+                       ckrm_mem_evaluate_page(pte_page(*pte));
+               }
+               address += PAGE_SIZE;
+               pte++;
+       } while(address && (address<end));
+       return 0;
+}
+
+static inline int
+class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
+               pgd_t* pgdir, unsigned long address, unsigned long end)
+{
+       pmd_t* pmd;
+       unsigned long pgd_end;
+       
+       if (pgd_none(*pgdir))
+               return 0;
+       BUG_ON(pgd_bad(*pgdir));
+       
+       pmd = pmd_offset(pgdir,address);
+       pgd_end = (address+PGDIR_SIZE)&PGDIR_MASK;
+       
+       if (pgd_end && (end>pgd_end))
+               end = pgd_end;
+       
+       do {
+               class_migrate_pmd(mm,vma,pmd,address,end);
+               address =  (address+PMD_SIZE)&PMD_MASK;
+               pmd++;
+       } while (address && (address<end));
+       return 0;
+}
+
+static inline int
+class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
+{
+       pgd_t* pgdir;
+       unsigned long address, end;
+       
+       address = vma->vm_start;
+       end = vma->vm_end;
+       
+       pgdir = pgd_offset(vma->vm_mm, address);
+       do {
+               class_migrate_pgd(mm,vma,pgdir,address,end);
+               address = (address + PGDIR_SIZE) & PGDIR_MASK;
+               pgdir++;
+       } while(address && (address<end));
+       return 0;
+}
+
+/* this function is called with mm->peertask_lock held */
+void
+ckrm_mem_evaluate_mm(struct mm_struct* mm)
+{
+       struct task_struct *task;
+       struct ckrm_mem_res *maxshareclass = NULL;
+       struct vm_area_struct *vma;
+       
+       if (list_empty(&mm->tasklist)) {
+               /* We leave mm->memclass untouched since we expect that an mm
+                * with no associated task will either be deleted soon or be
+                * attached to another task later.
+                */
+               return; 
+       }
+
+       list_for_each_entry(task, &mm->tasklist, mm_peers) {
+               ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
+               if (!cls)
+                       continue;
+               if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,cls)<0 ) 
+                       maxshareclass = cls;
+       }
+
+       if (mm->memclass != (void *)maxshareclass) {
+               mem_class_get(maxshareclass);
+               if (mm->memclass)
+                       mem_class_put(mm->memclass);
+               mm->memclass = maxshareclass;
+               
+               /* Go through all VMA to migrate pages */
+               down_read(&mm->mmap_sem);
+               vma = mm->mmap;
+               while(vma) {
+                       class_migrate_vma(mm, vma);
+                       vma = vma->vm_next;
+               }
+               up_read(&mm->mmap_sem);
+       }
+       return;
+}
+
+void
+ckrm_mem_evaluate_page_byadd(struct page* page, struct mm_struct* mm)
+{
+       ckrm_mem_res_t *pgcls = page_class(page);
+       ckrm_mem_res_t *chgcls = mm->memclass ? mm->memclass : GET_MEM_CLASS(current);
+
+       if (!chgcls || pgcls == chgcls)
+               return;
+
+       if (!page->mapcount) {
+               ckrm_change_page_class(page, chgcls);
+               return;
+       }
+       if (ckrm_mem_share_compare(pgcls, chgcls) < 0) {
+               ckrm_change_page_class(page, chgcls);
+               return;
+       }
+       return;
+}
+
+void
+ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
+{
+       spin_lock(&mm->peertask_lock);
+       if (!list_empty(&task->mm_peers)) {
+               printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
+               list_del_init(&task->mm_peers);
+       }
+       list_add_tail(&task->mm_peers, &mm->tasklist);
+       if (mm->memclass != GET_MEM_CLASS(task))
+               ckrm_mem_evaluate_mm(mm);
+       spin_unlock(&mm->peertask_lock);
+       return;
+}
+
+MODULE_LICENSE("GPL");
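
recalc_and_propagate() converts the abstract share values into page counts: a child's pg_guar and pg_limit are its my_guarantee/my_limit fraction of the parent's pages (scaled by the parent's total_guarantee and max_limit denominators), and pg_unused is the unused_guarantee fraction of the child's own pg_guar. The small userspace sketch below illustrates that arithmetic with hypothetical numbers; it is not part of the patch, the CKRM_SHARE_DONTCARE cases are omitted, and do_div() is replaced by plain 64-bit division.

#include <stdio.h>
#include <stdint.h>

struct shares_demo {
        int my_guarantee;       /* this class's guarantee, out of the parent's total_guarantee */
        int my_limit;           /* this class's limit, out of the parent's max_limit */
        int total_guarantee;    /* denominator used by this class's children */
        int max_limit;          /* denominator used by this class's children */
        int unused_guarantee;   /* guarantee not yet handed out to children */
};

/* share/denominator of the parent's pages, using 64-bit math like the kernel code */
static int share_to_pages(int share, int parent_pages, int denominator)
{
        if (denominator == 0)
                return 0;
        return (int)(((uint64_t)share * parent_pages) / denominator);
}

int main(void)
{
        /* hypothetical numbers: the parent controls 100000 LRU pages */
        int parent_pg_guar = 100000, parent_pg_limit = 100000;
        struct shares_demo parent = { .total_guarantee = 100, .max_limit = 100 };
        struct shares_demo child  = { .my_guarantee = 25, .my_limit = 50,
                                      .total_guarantee = 100, .max_limit = 100,
                                      .unused_guarantee = 60 };

        int pg_guar   = share_to_pages(child.my_guarantee, parent_pg_guar,
                                       parent.total_guarantee);      /* 25000 */
        int pg_limit  = share_to_pages(child.my_limit, parent_pg_limit,
                                       parent.max_limit);            /* 50000 */
        int pg_unused = share_to_pages(child.unused_guarantee, pg_guar,
                                       child.total_guarantee);       /* 15000 */

        printf("pg_guar=%d pg_limit=%d pg_unused=%d\n",
               pg_guar, pg_limit, pg_unused);
        return 0;
}
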
diff --git a/kernel/exit.c.orig b/kernel/exit.c.orig
new file mode 100644 (file)
index 0000000..f53583e
--- /dev/null
@@ -0,0 +1,1192 @@
+/*
+ *  linux/kernel/exit.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/tty.h>
+#include <linux/namespace.h>
+#include <linux/security.h>
+#include <linux/acct.h>
+#include <linux/file.h>
+#include <linux/binfmts.h>
+#include <linux/ptrace.h>
+#include <linux/profile.h>
+#include <linux/mount.h>
+#include <linux/proc_fs.h>
+#include <linux/mempolicy.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_tsk.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+extern void sem_exit (void);
+extern struct task_struct *child_reaper;
+
+int getrusage(struct task_struct *, int, struct rusage __user *);
+
+static void __unhash_process(struct task_struct *p)
+{
+       nr_threads--;
+       detach_pid(p, PIDTYPE_PID);
+       detach_pid(p, PIDTYPE_TGID);
+       if (thread_group_leader(p)) {
+               detach_pid(p, PIDTYPE_PGID);
+               detach_pid(p, PIDTYPE_SID);
+               if (p->pid)
+                       __get_cpu_var(process_counts)--;
+       }
+
+       REMOVE_LINKS(p);
+}
+
+void release_task(struct task_struct * p)
+{
+       int zap_leader;
+       task_t *leader;
+       struct dentry *proc_dentry;
+
+repeat: 
+       BUG_ON(p->state < TASK_ZOMBIE);
+       atomic_dec(&p->user->processes);
+       spin_lock(&p->proc_lock);
+       proc_dentry = proc_pid_unhash(p);
+       write_lock_irq(&tasklist_lock);
+       if (unlikely(p->ptrace))
+               __ptrace_unlink(p);
+       BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+       __exit_signal(p);
+       __exit_sighand(p);
+       __unhash_process(p);
+
+       /*
+        * If we are the last non-leader member of the thread
+        * group, and the leader is zombie, then notify the
+        * group leader's parent process. (if it wants notification.)
+        */
+       zap_leader = 0;
+       leader = p->group_leader;
+       if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) {
+               BUG_ON(leader->exit_signal == -1);
+               do_notify_parent(leader, leader->exit_signal);
+               /*
+                * If we were the last child thread and the leader has
+                * exited already, and the leader's parent ignores SIGCHLD,
+                * then we are the one who should release the leader.
+                *
+                * do_notify_parent() will have marked it self-reaping in
+                * that case.
+                */
+               zap_leader = (leader->exit_signal == -1);
+       }
+
+       p->parent->cutime += p->utime + p->cutime;
+       p->parent->cstime += p->stime + p->cstime;
+       p->parent->cmin_flt += p->min_flt + p->cmin_flt;
+       p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
+       p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
+       p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
+       sched_exit(p);
+       write_unlock_irq(&tasklist_lock);
+       spin_unlock(&p->proc_lock);
+       proc_pid_flush(proc_dentry);
+       release_thread(p);
+       put_task_struct(p);
+
+       p = leader;
+       if (unlikely(zap_leader))
+               goto repeat;
+}
+
+/* we are using it only for SMP init */
+
+void unhash_process(struct task_struct *p)
+{
+       struct dentry *proc_dentry;
+
+       spin_lock(&p->proc_lock);
+       proc_dentry = proc_pid_unhash(p);
+       write_lock_irq(&tasklist_lock);
+       __unhash_process(p);
+       write_unlock_irq(&tasklist_lock);
+       spin_unlock(&p->proc_lock);
+       proc_pid_flush(proc_dentry);
+}
+
+/*
+ * This checks not only the pgrp, but falls back on the pid if no
+ * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
+ * without this...
+ */
+int session_of_pgrp(int pgrp)
+{
+       struct task_struct *p;
+       struct list_head *l;
+       struct pid *pid;
+       int sid = -1;
+
+       read_lock(&tasklist_lock);
+       for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
+               if (p->signal->session > 0) {
+                       sid = p->signal->session;
+                       goto out;
+               }
+       p = find_task_by_pid(pgrp);
+       if (p)
+               sid = p->signal->session;
+out:
+       read_unlock(&tasklist_lock);
+       
+       return sid;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52.  Orphaned process groups are not to be affected
+ * by terminal-generated stop signals.  Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
+{
+       struct task_struct *p;
+       struct list_head *l;
+       struct pid *pid;
+       int ret = 1;
+
+       for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+               if (p == ignored_task
+                               || p->state >= TASK_ZOMBIE 
+                               || p->real_parent->pid == 1)
+                       continue;
+               if (process_group(p->real_parent) != pgrp
+                           && p->real_parent->signal->session == p->signal->session) {
+                       ret = 0;
+                       break;
+               }
+       }
+       return ret;     /* (sighing) "Often!" */
+}
+
+int is_orphaned_pgrp(int pgrp)
+{
+       int retval;
+
+       read_lock(&tasklist_lock);
+       retval = will_become_orphaned_pgrp(pgrp, NULL);
+       read_unlock(&tasklist_lock);
+
+       return retval;
+}
+
+static inline int has_stopped_jobs(int pgrp)
+{
+       int retval = 0;
+       struct task_struct *p;
+       struct list_head *l;
+       struct pid *pid;
+
+       for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+               if (p->state != TASK_STOPPED)
+                       continue;
+
+               /* If p is stopped by a debugger on a signal that won't
+                  stop it, then don't count p as stopped.  This isn't
+                  perfect but it's a good approximation.  */
+               if (unlikely (p->ptrace)
+                   && p->exit_code != SIGSTOP
+                   && p->exit_code != SIGTSTP
+                   && p->exit_code != SIGTTOU
+                   && p->exit_code != SIGTTIN)
+                       continue;
+
+               retval = 1;
+               break;
+       }
+       return retval;
+}
+
+/**
+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to init so that
+ * it is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_init() gives the caller full capabilities.
+ */
+void reparent_to_init(void)
+{
+       write_lock_irq(&tasklist_lock);
+
+       ptrace_unlink(current);
+       /* Reparent to init */
+       REMOVE_LINKS(current);
+       current->parent = child_reaper;
+       current->real_parent = child_reaper;
+       SET_LINKS(current);
+
+       /* Set the exit signal to SIGCHLD so we signal init on exit */
+       current->exit_signal = SIGCHLD;
+
+       if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+               set_user_nice(current, 0);
+       /* cpus_allowed? */
+       /* rt_priority? */
+       /* signals? */
+       security_task_reparent_to_init(current);
+       memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
+       atomic_inc(&(INIT_USER->__count));
+       switch_uid(INIT_USER);
+
+       write_unlock_irq(&tasklist_lock);
+}
+
+void __set_special_pids(pid_t session, pid_t pgrp)
+{
+       struct task_struct *curr = current;
+
+       if (curr->signal->session != session) {
+               detach_pid(curr, PIDTYPE_SID);
+               curr->signal->session = session;
+               attach_pid(curr, PIDTYPE_SID, session);
+       }
+       if (process_group(curr) != pgrp) {
+               detach_pid(curr, PIDTYPE_PGID);
+               curr->signal->pgrp = pgrp;
+               attach_pid(curr, PIDTYPE_PGID, pgrp);
+       }
+}
+
+void set_special_pids(pid_t session, pid_t pgrp)
+{
+       write_lock_irq(&tasklist_lock);
+       __set_special_pids(session, pgrp);
+       write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Let kernel threads use this to say that they
+ * allow a certain signal (since daemonize() will
+ * have disabled all of them by default).
+ */
+int allow_signal(int sig)
+{
+       if (sig < 1 || sig > _NSIG)
+               return -EINVAL;
+
+       spin_lock_irq(&current->sighand->siglock);
+       sigdelset(&current->blocked, sig);
+       if (!current->mm) {
+               /* Kernel threads handle their own signals.
+                  Let the signal code know it'll be handled, so
+                  that they don't get converted to SIGKILL or
+                  just silently dropped */
+               current->sighand->action[(sig)-1].sa.sa_handler = (void *)2;
+       }
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+       return 0;
+}
+
+EXPORT_SYMBOL(allow_signal);
+
+int disallow_signal(int sig)
+{
+       if (sig < 1 || sig > _NSIG)
+               return -EINVAL;
+
+       spin_lock_irq(&current->sighand->siglock);
+       sigaddset(&current->blocked, sig);
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+       return 0;
+}
+
+EXPORT_SYMBOL(disallow_signal);
+
+/*
+ *     Put all the gunge required to become a kernel thread without
+ *     attached user resources in one place where it belongs.
+ */
+
+void daemonize(const char *name, ...)
+{
+       va_list args;
+       struct fs_struct *fs;
+       sigset_t blocked;
+
+       va_start(args, name);
+       vsnprintf(current->comm, sizeof(current->comm), name, args);
+       va_end(args);
+
+       /*
+        * If we were started as result of loading a module, close all of the
+        * user space pages.  We don't need them, and if we didn't close them
+        * they would be locked into memory.
+        */
+       exit_mm(current);
+
+       set_special_pids(1, 1);
+       current->signal->tty = NULL;
+
+       /* Block and flush all signals */
+       sigfillset(&blocked);
+       sigprocmask(SIG_BLOCK, &blocked, NULL);
+       flush_signals(current);
+
+       /* Become as one with the init task */
+
+       exit_fs(current);       /* current->fs->count--; */
+       fs = init_task.fs;
+       current->fs = fs;
+       atomic_inc(&fs->count);
+       exit_files(current);
+       current->files = init_task.files;
+       atomic_inc(&current->files->count);
+
+       reparent_to_init();
+}
+
+EXPORT_SYMBOL(daemonize);
+
+static inline void close_files(struct files_struct * files)
+{
+       int i, j;
+
+       j = 0;
+       for (;;) {
+               unsigned long set;
+               i = j * __NFDBITS;
+               if (i >= files->max_fdset || i >= files->max_fds)
+                       break;
+               set = files->open_fds->fds_bits[j++];
+               while (set) {
+                       if (set & 1) {
+                               struct file * file = xchg(&files->fd[i], NULL);
+                               if (file)
+                                       filp_close(file, files);
+                       }
+                       i++;
+                       set >>= 1;
+               }
+       }
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+       struct files_struct *files;
+
+       task_lock(task);
+       files = task->files;
+       if (files)
+               atomic_inc(&files->count);
+       task_unlock(task);
+
+       return files;
+}
+
+void fastcall put_files_struct(struct files_struct *files)
+{
+       if (atomic_dec_and_test(&files->count)) {
+               close_files(files);
+               /*
+                * Free the fd and fdset arrays if we expanded them.
+                */
+               if (files->fd != &files->fd_array[0])
+                       free_fd_array(files->fd, files->max_fds);
+               if (files->max_fdset > __FD_SETSIZE) {
+                       free_fdset(files->open_fds, files->max_fdset);
+                       free_fdset(files->close_on_exec, files->max_fdset);
+               }
+               kmem_cache_free(files_cachep, files);
+       }
+}
+
+EXPORT_SYMBOL(put_files_struct);
+
+static inline void __exit_files(struct task_struct *tsk)
+{
+       struct files_struct * files = tsk->files;
+
+       if (files) {
+               task_lock(tsk);
+               tsk->files = NULL;
+               task_unlock(tsk);
+               put_files_struct(files);
+       }
+}
+
+void exit_files(struct task_struct *tsk)
+{
+       __exit_files(tsk);
+}
+
+static inline void __put_fs_struct(struct fs_struct *fs)
+{
+       /* No need to hold fs->lock if we are killing it */
+       if (atomic_dec_and_test(&fs->count)) {
+               dput(fs->root);
+               mntput(fs->rootmnt);
+               dput(fs->pwd);
+               mntput(fs->pwdmnt);
+               if (fs->altroot) {
+                       dput(fs->altroot);
+                       mntput(fs->altrootmnt);
+               }
+               kmem_cache_free(fs_cachep, fs);
+       }
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+       __put_fs_struct(fs);
+}
+
+static inline void __exit_fs(struct task_struct *tsk)
+{
+       struct fs_struct * fs = tsk->fs;
+
+       if (fs) {
+               task_lock(tsk);
+               tsk->fs = NULL;
+               task_unlock(tsk);
+               __put_fs_struct(fs);
+       }
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+       __exit_fs(tsk);
+}
+
+EXPORT_SYMBOL_GPL(exit_fs);
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+static inline void __exit_mm(struct task_struct * tsk)
+{
+       struct mm_struct *mm = tsk->mm;
+
+       mm_release(tsk, mm);
+       if (!mm)
+               return;
+       /*
+        * Serialize with any possible pending coredump.
+        * We must hold mmap_sem around checking core_waiters
+        * and clearing tsk->mm.  The core-inducing thread
+        * will increment core_waiters for each thread in the
+        * group with ->mm != NULL.
+        */
+       down_read(&mm->mmap_sem);
+       if (mm->core_waiters) {
+               up_read(&mm->mmap_sem);
+               down_write(&mm->mmap_sem);
+               if (!--mm->core_waiters)
+                       complete(mm->core_startup_done);
+               up_write(&mm->mmap_sem);
+
+               wait_for_completion(&mm->core_done);
+               down_read(&mm->mmap_sem);
+       }
+       atomic_inc(&mm->mm_count);
+       if (mm != tsk->active_mm) BUG();
+       /* more a memory barrier than a real lock */
+       task_lock(tsk);
+       tsk->mm = NULL;
+       up_read(&mm->mmap_sem);
+       enter_lazy_tlb(mm, current);
+       task_unlock(tsk);
+       mmput(mm);
+}
+
+void exit_mm(struct task_struct *tsk)
+{
+       __exit_mm(tsk);
+}
+
+EXPORT_SYMBOL(exit_mm);
+
+static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+{
+       /*
+        * Make sure we're not reparenting to ourselves and that
+        * the parent is not a zombie.
+        */
+       if (p == reaper || reaper->state >= TASK_ZOMBIE)
+               p->real_parent = child_reaper;
+       else
+               p->real_parent = reaper;
+       if (p->parent == p->real_parent)
+               BUG();
+}
+
+static inline void reparent_thread(task_t *p, task_t *father, int traced)
+{
+       /* We don't want people slaying init.  */
+       if (p->exit_signal != -1)
+               p->exit_signal = SIGCHLD;
+       p->self_exec_id++;
+
+       if (p->pdeath_signal)
+               /* We already hold the tasklist_lock here.  */
+               group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+
+       /* Move the child from its dying parent to the new one.  */
+       if (unlikely(traced)) {
+               /* Preserve ptrace links if someone else is tracing this child.  */
+               list_del_init(&p->ptrace_list);
+               if (p->parent != p->real_parent)
+                       list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
+       } else {
+               /* If this child is being traced, then we're the one tracing it
+                * anyway, so let go of it.
+                */
+               p->ptrace = 0;
+               list_del_init(&p->sibling);
+               p->parent = p->real_parent;
+               list_add_tail(&p->sibling, &p->parent->children);
+
+               /* If we'd notified the old parent about this child's death,
+                * also notify the new parent.
+                */
+               if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+                   thread_group_empty(p))
+                       do_notify_parent(p, p->exit_signal);
+       }
+
+       /*
+        * process group orphan check
+        * Case ii: Our child is in a different pgrp
+        * than we are, and it was the only connection
+        * outside, so the child pgrp is now orphaned.
+        */
+       if ((process_group(p) != process_group(father)) &&
+           (p->signal->session == father->signal->session)) {
+               int pgrp = process_group(p);
+
+               if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+                       __kill_pg_info(SIGHUP, (void *)1, pgrp);
+                       __kill_pg_info(SIGCONT, (void *)1, pgrp);
+               }
+       }
+}
+
+/*
+ * When we die, we re-parent all our children.
+ * Try to give them to another thread in our thread
+ * group, and if no such member exists, give them to
+ * the global child reaper process (i.e. "init").
+ */
+static inline void forget_original_parent(struct task_struct * father)
+{
+       struct task_struct *p, *reaper = father;
+       struct list_head *_p, *_n;
+
+       reaper = father->group_leader;
+       if (reaper == father)
+               reaper = child_reaper;
+
+       /*
+        * There are only two places where our children can be:
+        *
+        * - in our child list
+        * - in our ptraced child list
+        *
+        * Search them and reparent children.
+        */
+       list_for_each_safe(_p, _n, &father->children) {
+               p = list_entry(_p,struct task_struct,sibling);
+               if (father == p->real_parent) {
+                       choose_new_parent(p, reaper, child_reaper);
+                       reparent_thread(p, father, 0);
+               } else {
+                       ptrace_unlink (p);
+                       if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+                           thread_group_empty(p))
+                               do_notify_parent(p, p->exit_signal);
+               }
+       }
+       list_for_each_safe(_p, _n, &father->ptrace_children) {
+               p = list_entry(_p,struct task_struct,ptrace_list);
+               choose_new_parent(p, reaper, child_reaper);
+               reparent_thread(p, father, 1);
+       }
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us.
+ */
+static void exit_notify(struct task_struct *tsk)
+{
+       int state;
+       struct task_struct *t;
+
+       ckrm_cb_exit(tsk);
+
+       if (signal_pending(tsk) && !tsk->signal->group_exit
+           && !thread_group_empty(tsk)) {
+               /*
+                * This occurs when there was a race between our exit
+                * syscall and a group signal choosing us as the one to
+                * wake up.  It could be that we are the only thread
+                * alerted to check for pending signals, but another thread
+                * should be woken now to take the signal since we will not.
+                * Now we'll wake all the threads in the group just to make
+                * sure someone gets all the pending signals.
+                */
+               read_lock(&tasklist_lock);
+               spin_lock_irq(&tsk->sighand->siglock);
+               for (t = next_thread(tsk); t != tsk; t = next_thread(t))
+                       if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
+                               recalc_sigpending_tsk(t);
+                               if (signal_pending(t))
+                                       signal_wake_up(t, 0);
+                       }
+               spin_unlock_irq(&tsk->sighand->siglock);
+               read_unlock(&tasklist_lock);
+       }
+
+       write_lock_irq(&tasklist_lock);
+
+       /*
+        * This does two things:
+        *
+        * A.  Make init inherit all the child processes
+        * B.  Check to see if any process groups have become orphaned
+        *      as a result of our exiting, and if they have any stopped
+        *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+        */
+
+       forget_original_parent(tsk);
+       BUG_ON(!list_empty(&tsk->children));
+
+       /*
+        * Check to see if any process groups have become orphaned
+        * as a result of our exiting, and if they have any stopped
+        * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+        *
+        * Case i: Our father is in a different pgrp than we are
+        * and we were the only connection outside, so our pgrp
+        * is about to become orphaned.
+        */
+
+       t = tsk->real_parent;
+
+       if ((process_group(t) != process_group(tsk)) &&
+           (t->signal->session == tsk->signal->session) &&
+           will_become_orphaned_pgrp(process_group(tsk), tsk) &&
+           has_stopped_jobs(process_group(tsk))) {
+               __kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
+               __kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+       }
+
+       /* Let father know we died 
+        *
+        * Thread signals are configurable, but you aren't going to use
+        * that to send signals to arbitrary processes.
+        * That stops right now.
+        *
+        * If the parent exec id doesn't match the exec id we saved
+        * when we started then we know the parent has changed security
+        * domain.
+        *
+        * If our self_exec id doesn't match our parent_exec_id then
+        * we have changed execution domain as these two values started
+        * the same after a fork.
+        *      
+        */
+       
+       if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
+           ( tsk->parent_exec_id != t->self_exec_id  ||
+             tsk->self_exec_id != tsk->parent_exec_id)
+           && !capable(CAP_KILL))
+               tsk->exit_signal = SIGCHLD;
+
+       /* If something other than our normal parent is ptracing us, then
+        * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+        * only has special meaning to our real parent.
+        */
+       if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
+               int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
+               do_notify_parent(tsk, signal);
+       } else if (tsk->ptrace) {
+               do_notify_parent(tsk, SIGCHLD);
+       }
+
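+       /*
+        * A detached (exit_signal == -1), untraced task has nobody to
+        * reap it via wait4(), so it goes straight to TASK_DEAD and is
+        * released below instead of lingering as a zombie.
+        */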
+       state = TASK_ZOMBIE;
+       if (tsk->exit_signal == -1 && tsk->ptrace == 0)
+               state = TASK_DEAD;
+       tsk->state = state;
+       tsk->flags |= PF_DEAD;
+
+       /*
+        * Clear these here so that update_process_times() won't try to deliver
+        * itimer, profile or rlimit signals to this task while it is in late exit.
+        */
+       tsk->it_virt_value = 0;
+       tsk->it_prof_value = 0;
+       tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY;
+
+       /*
+        * In the preemption case it must be impossible for the task
+        * to get runnable again, so use "_raw_" unlock to keep
+        * preempt_count elevated until we schedule().
+        *
+        * To avoid deadlock on SMP, interrupts must be unmasked.  If we
+        * don't, subsequently called functions (e.g, wait_task_inactive()
+        * via release_task()) will spin, with interrupt flags
+        * unwittingly blocked, until the other task sleeps.  That task
+        * may itself be waiting for smp_call_function() to answer and
+        * complete, and with interrupts blocked that will never happen.
+        */
+       _raw_write_unlock(&tasklist_lock);
+       local_irq_enable();
+
+       /* If the process is dead, release it - nobody will wait for it */
+       if (state == TASK_DEAD)
+               release_task(tsk);
+
+}
+
+asmlinkage NORET_TYPE void do_exit(long code)
+{
+       struct task_struct *tsk = current;
+
+       if (unlikely(in_interrupt()))
+               panic("Aiee, killing interrupt handler!");
+       if (unlikely(!tsk->pid))
+               panic("Attempted to kill the idle task!");
+       if (unlikely(tsk->pid == 1))
+               panic("Attempted to kill init!");
+       if (tsk->io_context)
+               exit_io_context();
+       tsk->flags |= PF_EXITING;
+       del_timer_sync(&tsk->real_timer);
+
+       if (unlikely(in_atomic()))
+               printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+                               current->comm, current->pid,
+                               preempt_count());
+
+       profile_exit_task(tsk);
+       if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+               current->ptrace_message = code;
+               ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
+       }
+
+       acct_process(code);
+       __exit_mm(tsk);
+
+       exit_sem(tsk);
+       __exit_files(tsk);
+       __exit_fs(tsk);
+       exit_namespace(tsk);
+       exit_thread();
+#ifdef CONFIG_NUMA
+       mpol_free(tsk->mempolicy);
+#endif
+
+       if (tsk->signal->leader)
+               disassociate_ctty(1);
+
+       module_put(tsk->thread_info->exec_domain->module);
+       if (tsk->binfmt)
+               module_put(tsk->binfmt->module);
+
+       tsk->exit_code = code;
+#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+       numtasks_put_ref(tsk->taskclass);
+#endif
+       exit_notify(tsk);
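+       /* exit_notify() set our state to ZOMBIE or DEAD above, so this
+        * final schedule() never returns. */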
+       schedule();
+       BUG();
+       /* Avoid "noreturn function does return".  */
+       for (;;) ;
+}
+
+NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+{
+       if (comp)
+               complete(comp);
+       
+       do_exit(code);
+}
+
+EXPORT_SYMBOL(complete_and_exit);
+
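+/*
+ * Note on the encoding below: the low byte of the user-supplied exit
+ * code lands in bits 8-15 of the status word later reported through
+ * wait4(), so e.g. exit(1) shows up as 0x0100 and userspace recovers
+ * the 1 with WEXITSTATUS().
+ */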
+asmlinkage long sys_exit(int error_code)
+{
+       do_exit((error_code & 0xff) << 8);
+}
+
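+/*
+ * Return the thread that follows p in its thread group, wrapping
+ * around past the list head; on SMP, sanity-check that the caller
+ * holds either p's siglock or the tasklist_lock.
+ */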
+task_t fastcall *next_thread(task_t *p)
+{
+       struct pid_link *link = p->pids + PIDTYPE_TGID;
+       struct list_head *tmp, *head = &link->pidptr->task_list;
+
+#ifdef CONFIG_SMP
+       BUG_ON(!p->sighand);
+       BUG_ON(!spin_is_locked(&p->sighand->siglock) &&
+              !rwlock_is_locked(&tasklist_lock));
+#endif
+       tmp = link->pid_chain.next;
+       if (tmp == head)
+               tmp = head->next;
+
+       return pid_task(tmp, PIDTYPE_TGID);
+}
+
+EXPORT_SYMBOL(next_thread);
+
+/*
+ * Take down every thread in the group.  This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ */
+NORET_TYPE void
+do_group_exit(int exit_code)
+{
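+       /* (0x80 in the low byte of a status word is the core-dump flag.) */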
+       BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+       if (current->signal->group_exit)
+               exit_code = current->signal->group_exit_code;
+       else if (!thread_group_empty(current)) {
+               struct signal_struct *const sig = current->signal;
+               struct sighand_struct *const sighand = current->sighand;
+               read_lock(&tasklist_lock);
+               spin_lock_irq(&sighand->siglock);
+               if (sig->group_exit)
+                       /* Another thread got here before we took the lock.  */
+                       exit_code = sig->group_exit_code;
+               else {
+                       sig->group_exit = 1;
+                       sig->group_exit_code = exit_code;
+                       zap_other_threads(current);
+               }
+               spin_unlock_irq(&sighand->siglock);
+               read_unlock(&tasklist_lock);
+       }
+
+       do_exit(exit_code);
+       /* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+asmlinkage void sys_exit_group(int error_code)
+{
+       do_group_exit((error_code & 0xff) << 8);
+}
+
+static int eligible_child(pid_t pid, int options, task_t *p)
+{
+       if (pid > 0) {
+               if (p->pid != pid)
+                       return 0;
+       } else if (!pid) {
+               if (process_group(p) != process_group(current))
+                       return 0;
+       } else if (pid != -1) {
+               if (process_group(p) != -pid)
+                       return 0;
+       }
+
+       /*
+        * Do not consider detached threads that are
+        * not ptraced:
+        */
+       if (p->exit_signal == -1 && !p->ptrace)
+               return 0;
+
+       /* Wait for all children (clone and not) if __WALL is set;
+        * otherwise, wait for clone children *only* if __WCLONE is
+        * set; otherwise, wait for non-clone children *only*.  (Note:
+        * A "clone" child here is one that reports to its parent
+        * using a signal other than SIGCHLD.) */
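+       /*
+        * In other words: by default the XOR below rejects clone
+        * children, with __WCLONE it rejects non-clone children
+        * instead, and __WALL accepts both.
+        */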
+       if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+           && !(options & __WALL))
+               return 0;
+       /*
+        * Do not consider thread group leaders that are
+        * in a non-empty thread group:
+        */
+       if (current->tgid != p->tgid && delay_group_leader(p))
+               return 2;
+
+       if (security_task_wait(p))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_ZOMBIE.  We hold
+ * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
+ * the lock and this task is uninteresting.  If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct rusage __user *ru)
+{
+       unsigned long state;
+       int retval;
+
+       /*
+        * Try to move the task's state to DEAD
+        * only one thread is allowed to do this:
+        */
+       state = xchg(&p->state, TASK_DEAD);
+       if (state != TASK_ZOMBIE) {
+               BUG_ON(state != TASK_DEAD);
+               return 0;
+       }
+       if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
+               /*
+                * This can only happen in a race with a ptraced thread
+                * dying on another processor.
+                */
+               return 0;
+
+       /*
+        * Now we are sure this task is interesting, and no other
+        * thread can reap it because we set its state to TASK_DEAD.
+        */
+       read_unlock(&tasklist_lock);
+
+       retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+       if (!retval && stat_addr) {
+               if (p->signal->group_exit)
+                       retval = put_user(p->signal->group_exit_code, stat_addr);
+               else
+                       retval = put_user(p->exit_code, stat_addr);
+       }
+       if (retval) {
+               p->state = TASK_ZOMBIE;
+               return retval;
+       }
+       retval = p->pid;
+       if (p->real_parent != p->parent) {
+               write_lock_irq(&tasklist_lock);
+               /* Double-check with lock held.  */
+               if (p->real_parent != p->parent) {
+                       __ptrace_unlink(p);
+                       p->state = TASK_ZOMBIE;
+                       /* If this is a detached thread, this is where it goes away.  */
+                       if (p->exit_signal == -1) {
+                               /* release_task takes the lock itself.  */
+                               write_unlock_irq(&tasklist_lock);
+                               release_task (p);
+                       }
+                       else {
+                               do_notify_parent(p, p->exit_signal);
+                               write_unlock_irq(&tasklist_lock);
+                       }
+                       p = NULL;
+               }
+               else
+                       write_unlock_irq(&tasklist_lock);
+       }
+       if (p != NULL)
+               release_task(p);
+       BUG_ON(!retval);
+       return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
+ * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
+ * the lock and this task is uninteresting.  If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_stopped(task_t *p, int delayed_group_leader,
+                            unsigned int __user *stat_addr,
+                            struct rusage __user *ru)
+{
+       int retval, exit_code;
+
+       if (!p->exit_code)
+               return 0;
+       if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
+           p->signal && p->signal->group_stop_count > 0)
+               /*
+                * A group stop is in progress and this is the group leader.
+                * We won't report until all threads have stopped.
+                */
+               return 0;
+
+       /*
+        * Now we are pretty sure this task is interesting.
+        * Make sure it doesn't get reaped out from under us while we
+        * give up the lock and then examine it below.  We don't want to
+        * keep holding onto the tasklist_lock while we call getrusage and
+        * possibly take page faults for user memory.
+        */
+       get_task_struct(p);
+       read_unlock(&tasklist_lock);
+       write_lock_irq(&tasklist_lock);
+
+       /*
+        * This uses xchg to be atomic with the thread resuming and setting
+        * it.  It must also be done with the write lock held to prevent a
+        * race with the TASK_ZOMBIE case.
+        */
+       exit_code = xchg(&p->exit_code, 0);
+       if (unlikely(p->state > TASK_STOPPED)) {
+               /*
+                * The task resumed and then died.  Let the next iteration
+                * catch it in TASK_ZOMBIE.  Note that exit_code might
+                * already be zero here if it resumed and did _exit(0).
+                * The task itself is dead and won't touch exit_code again;
+                * other processors in this function are locked out.
+                */
+               p->exit_code = exit_code;
+               exit_code = 0;
+       }
+       if (unlikely(exit_code == 0)) {
+               /*
+                * Another thread in this function got to it first, or it
+                * resumed, or it resumed and then died.
+                */
+               write_unlock_irq(&tasklist_lock);
+               put_task_struct(p);
+               read_lock(&tasklist_lock);
+               return 0;
+       }
+
+       /* move to end of parent's list to avoid starvation */
+       remove_parent(p);
+       add_parent(p, p->parent);
+
+       write_unlock_irq(&tasklist_lock);
+
+       retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
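+       /*
+        * Status encoding for a stopped child: 0x7f in the low byte
+        * (WIFSTOPPED), with the stopping signal in bits 8-15 for
+        * WSTOPSIG().
+        */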
+       if (!retval && stat_addr)
+               retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+       if (!retval)
+               retval = p->pid;
+       put_task_struct(p);
+
+       BUG_ON(!retval);
+       return retval;
+}
+
+asmlinkage long sys_wait4(pid_t pid,unsigned int __user *stat_addr, int options, struct rusage __user *ru)
+{
+       DECLARE_WAITQUEUE(wait, current);
+       struct task_struct *tsk;
+       int flag, retval;
+
+       if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
+               return -EINVAL;
+
+       add_wait_queue(&current->wait_chldexit,&wait);
+repeat:
+       flag = 0;
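+       /*
+        * Go TASK_INTERRUPTIBLE before scanning the children, so that
+        * a wakeup from do_notify_parent() between the scan and the
+        * schedule() below is not lost.
+        */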
+       current->state = TASK_INTERRUPTIBLE;
+       read_lock(&tasklist_lock);
+       tsk = current;
+       do {
+               struct task_struct *p;
+               struct list_head *_p;
+               int ret;
+
+               list_for_each(_p,&tsk->children) {
+                       p = list_entry(_p,struct task_struct,sibling);
+
+                       ret = eligible_child(pid, options, p);
+                       if (!ret)
+                               continue;
+                       flag = 1;
+
+                       switch (p->state) {
+                       case TASK_STOPPED:
+                               if (!(options & WUNTRACED) &&
+                                   !(p->ptrace & PT_PTRACED))
+                                       continue;
+                               retval = wait_task_stopped(p, ret == 2,
+                                                          stat_addr, ru);
+                               if (retval != 0) /* He released the lock.  */
+                                       goto end_wait4;
+                               break;
+                       case TASK_ZOMBIE:
+                               /*
+                                * Eligible but we cannot release it yet:
+                                */
+                               if (ret == 2)
+                                       continue;
+                               retval = wait_task_zombie(p, stat_addr, ru);
+                               if (retval != 0) /* He released the lock.  */
+                                       goto end_wait4;
+                               break;
+                       }
+               }
+               if (!flag) {
+                       list_for_each (_p,&tsk->ptrace_children) {
+                               p = list_entry(_p,struct task_struct,ptrace_list);
+                               if (!eligible_child(pid, options, p))
+                                       continue;
+                               flag = 1;
+                               break;
+                       }
+               }
+               if (options & __WNOTHREAD)
+                       break;
+               tsk = next_thread(tsk);
+               BUG_ON(tsk->signal != current->signal);
+       } while (tsk != current);
+       read_unlock(&tasklist_lock);
+       if (flag) {
+               retval = 0;
+               if (options & WNOHANG)
+                       goto end_wait4;
+               retval = -ERESTARTSYS;
+               if (signal_pending(current))
+                       goto end_wait4;
+               schedule();
+               goto repeat;
+       }
+       retval = -ECHILD;
+end_wait4:
+       current->state = TASK_RUNNING;
+       remove_wait_queue(&current->wait_chldexit,&wait);
+       return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+asmlinkage long sys_waitpid(pid_t pid, unsigned __user *stat_addr, int options)
+{
+       return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif