From a7f82aceea1d136c4b94c862f44c3ab1a50bb4e2 Mon Sep 17 00:00:00 2001
From: Planet-Lab Support <support@planet-lab.org>
Date: Wed, 8 Sep 2004 19:40:52 +0000
Subject: [PATCH] This commit was manufactured by cvs2svn to create branch
 'ckrm'.

---
 include/linux/ckrm_mem.h        |  106 +++
 include/linux/ckrm_mem_inline.h |  256 +++++++
 kernel/ckrm/ckrm_mem.c          |  814 +++++++++++++++++++++
 kernel/exit.c.orig              | 1192 +++++++++++++++++++++++++++++++
 4 files changed, 2368 insertions(+)
 create mode 100644 include/linux/ckrm_mem.h
 create mode 100644 include/linux/ckrm_mem_inline.h
 create mode 100644 kernel/ckrm/ckrm_mem.c
 create mode 100644 kernel/exit.c.orig

diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h
new file mode 100644
index 000000000..52dc949ec
--- /dev/null
+++ b/include/linux/ckrm_mem.h
@@ -0,0 +1,106 @@
+/* include/linux/ckrm_mem.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ *           (C) Shailabh Nagar, IBM Corp. 2003
+ *           (C) Chandra Seetharaman, IBM Corp. 2004
+ * 
+ * 
+ * Memory control functions of the CKRM kernel API 
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ */
+
+#ifndef _LINUX_CKRM_MEM_H
+#define _LINUX_CKRM_MEM_H
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#include <linux/list.h>
+#include <linux/ckrm_rc.h>
+
+typedef struct ckrm_mem_res {
+	unsigned long reclaim_flags; 
+	unsigned long flags; 
+	struct ckrm_core_class *core; // the core i am part of...
+	struct ckrm_core_class *parent; // parent of the core i am part of....
+	struct ckrm_shares shares;
+	struct list_head mcls_list; // list of all 1-level classes
+	struct list_head shrink_list; // list of classes need to be shrunk
+	atomic_t nr_users; // # of references to this class/data structure
+	atomic_t pg_total;  // # of pages used by this class
+	int pg_guar; // # of pages this class is guaranteed
+	int pg_limit; // max # of pages this class can get
+	int pg_borrowed; // # of pages this class borrowed from its parent
+	int pg_lent; // # of pages this class lent to its children
+	int pg_unused; // # of pages left to this class (after giving the
+				// guarantees to children. need to borrow from parent if
+				// more than this is needed.
+	int nr_active[MAX_NR_ZONES];
+	int nr_inactive[MAX_NR_ZONES];
+	int shrink_count;
+	unsigned long last_shrink;
+	int over_limit_failures;
+	int hier; // hiearchy, root = 0
+} ckrm_mem_res_t;
+
+extern atomic_t ckrm_mem_real_count;
+extern unsigned int ckrm_tot_lru_pages;
+extern struct list_head ckrm_shrink_list;
+extern spinlock_t ckrm_mem_lock;
+extern struct ckrm_res_ctlr mem_rcbs;
+
+#define page_class(page)	((ckrm_mem_res_t*)((page)->memclass))
+
+// used to fill reclaim_flags, used only when memory is low in the system
+#define CLS_CLEAR		(0)      // class under its guarantee
+#define CLS_OVER_GUAR	(1 << 0) // class is over its guarantee
+#define CLS_PARENT_OVER	(1 << 1) // parent is over 120% mark over limit
+#define CLS_OVER_75		(1 << 2) // class over 75% mark bet guar(0) & limit(100)
+#define CLS_OVER_100	(1 << 3) // class over its limit
+#define CLS_OVER_110	(1 << 4) // class over 110% mark over limit
+#define CLS_FLAGS_ALL	( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_75 | \
+					CLS_OVER_100 | CLS_OVER_110 )
+#define CLS_SHRINK_BIT	(31)	  // used to both lock and set the bit
+#define CLS_SHRINK		(1 << CLS_SHRINK_BIT) // shrink the given class
+
+// used in flags. set when a class is more than 90% of its maxlimit
+#define MEM_NEAR_LIMIT 1
+
+extern void ckrm_set_aggressive(ckrm_mem_res_t *);
+extern unsigned int ckrm_setup_reclamation(void);
+extern void ckrm_teardown_reclamation(void);
+extern void ckrm_get_reclaim_bits(unsigned int *, unsigned int *);
+extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *);
+extern void ckrm_mem_evaluate_mm(struct mm_struct *);
+extern void ckrm_mem_evaluate_page_byadd(struct page *, struct mm_struct *);
+extern void ckrm_near_limit(ckrm_mem_res_t *);
+#define ckrm_get_reclaim_flags(cls)	((cls)->reclaim_flags)
+
+#else
+
+#define ckrm_init_mm_to_current(a)			do {} while (0)
+#define ckrm_mem_evaluate_mm(a)				do {} while (0)
+#define ckrm_mem_evaluate_page_byadd(a,b)	do {} while (0)
+#define page_class(page)					(NULL)
+#define ckrm_get_reclaim_flags(a)			(0)
+#define ckrm_setup_reclamation()			(0)
+#define ckrm_teardown_reclamation()			do {} while (0)
+#define ckrm_get_reclaim_bits(a, b)			do { *(a) = 0; *(b)= 0; } while (0)
+#define ckrm_init_mm_to_task(a,b)			do {} while (0)
+
+#endif // CONFIG_CKRM_RES_MEM
+
+#endif //_LINUX_CKRM_MEM_H
+
diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h
new file mode 100644
index 000000000..0eb4e49c0
--- /dev/null
+++ b/include/linux/ckrm_mem_inline.h
@@ -0,0 +1,256 @@
+/* include/linux/ckrm_mem_inline.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ *           (C) Shailabh Nagar, IBM Corp. 2003
+ *           (C) Chandra Seetharaman, IBM Corp. 2004
+ * 
+ * 
+ * Memory control functions of the CKRM kernel API 
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ */
+
+
+#ifndef _LINUX_CKRM_MEM_INLINE_H_
+#define _LINUX_CKRM_MEM_INLINE_H_
+
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/ckrm_mem.h>
+
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#define GET_MEM_CLASS(tsk) \
+	ckrm_get_res_class(tsk->taskclass, mem_rcbs.resid, ckrm_mem_res_t)
+
+#define ckrm_set_shrink(cls) \
+	set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+#define ckrm_test_set_shrink(cls) \
+	test_and_set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+#define ckrm_clear_shrink(cls) \
+	clear_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags)
+
+#define ckrm_shrink_list_empty()	list_empty(&ckrm_shrink_list)
+
+/*
+ * Currently, the class of an address is assigned to the class with max
+ * available guarantee. Simply replace this function for other policies.
+ */
+static inline int
+ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b)
+{
+	if (a == NULL) 
+		return -(b != NULL) ;
+	if (b == NULL)
+		return 0;
+	return (a->pg_unused - b->pg_unused);
+}
+
+static inline void
+mem_class_get(ckrm_mem_res_t *cls)
+{
+	if (cls)
+		atomic_inc(&((cls)->nr_users));
+}
+
+static inline void
+mem_class_put(ckrm_mem_res_t *cls)
+{
+	if (cls && atomic_dec_and_test(&(cls->nr_users)) ) {
+		printk("freeing memclass %p of <core:%s>\n", cls, cls->core->name);
+		//kfree(cls);
+	}	
+}
+
+static inline int
+incr_use_count(ckrm_mem_res_t *cls, int borrow)
+{
+	int over_limit;
+
+	atomic_inc(&cls->pg_total);
+	over_limit = (atomic_read(&cls->pg_total) > ((9 * cls->pg_limit) / 10));
+
+	if (borrow) 
+		cls->pg_lent++;
+	if ((cls->pg_guar != CKRM_SHARE_DONTCARE) &&
+				(atomic_read(&cls->pg_total) > cls->pg_unused)) {
+		ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+				mem_rcbs.resid, ckrm_mem_res_t);
+		if (parcls) {
+			over_limit |= incr_use_count(parcls, 1);
+			cls->pg_borrowed++;
+			return over_limit;
+		}
+	}
+	atomic_inc(&ckrm_mem_real_count);
+	return over_limit;
+}
+
+static inline void
+decr_use_count(ckrm_mem_res_t *cls, int borrowed)
+{
+	atomic_dec(&cls->pg_total);
+	if (borrowed)
+		cls->pg_lent--;
+	if (cls->pg_borrowed > 0) {
+		ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+				mem_rcbs.resid, ckrm_mem_res_t);
+		if (parcls) {
+			decr_use_count(parcls, 1);
+			cls->pg_borrowed--;
+			return;
+		}
+	}
+	atomic_dec(&ckrm_mem_real_count);
+}
+
+static inline void
+ckrm_set_page_class(struct page *page, ckrm_mem_res_t *cls)
+{
+	if (mem_rcbs.resid != -1 && cls != NULL) {
+		if (unlikely(page->memclass)) {
+			mem_class_put(page->memclass);
+		}
+		page->memclass = cls;
+		mem_class_get(cls);
+	} else {
+		page->memclass = NULL;
+	}
+}
+
+static inline void
+ckrm_set_pages_class(struct page *pages, int numpages, ckrm_mem_res_t *cls)
+{
+	int i;
+	for (i = 0; i < numpages; pages++, i++) {
+		ckrm_set_page_class(pages, cls);
+	}
+}
+
+static inline void
+ckrm_clear_page_class(struct page *page)
+{
+	if (page->memclass != NULL) {
+		mem_class_put(page->memclass);
+		page->memclass = NULL;
+	}
+}
+
+static inline void
+ckrm_clear_pages_class(struct page *pages, int numpages)
+{
+	int i;
+	for (i = 0; i < numpages; pages++, i++) {
+		ckrm_clear_page_class(pages);
+	}
+}
+
+static inline void
+ckrm_change_page_class(struct page *page, ckrm_mem_res_t *cls)
+{
+	ckrm_clear_page_class(page);
+	ckrm_set_page_class(page, cls);
+}
+
+static inline void
+ckrm_change_pages_class(struct page *pages, int numpages, 
+					ckrm_mem_res_t *cls)
+{
+	int i;
+	for (i = 0; i < numpages; pages++, i++) {
+		ckrm_change_page_class(pages, cls);
+	}
+}
+
+static inline void
+ckrm_mem_inc_active(struct page *page)
+{
+	ckrm_mem_res_t *cls = page_class(page);
+	BUG_ON(cls == NULL);
+	cls->nr_active[page_zonenum(page)]++;
+	if (incr_use_count(cls, 0)) {
+		ckrm_near_limit(cls);
+	}
+}
+
+static inline void
+ckrm_mem_dec_active(struct page *page)
+{
+	ckrm_mem_res_t *cls = page_class(page);
+	BUG_ON(cls == NULL);
+	cls->nr_active[page_zonenum(page)]--;
+	decr_use_count(cls, 0);
+}
+
+static inline void
+ckrm_mem_inc_inactive(struct page *page)
+{
+	ckrm_mem_res_t *cls = page_class(page);
+	BUG_ON(cls == NULL);
+	cls->nr_inactive[page_zonenum(page)]++;
+	if (incr_use_count(cls, 0) &&
+			((cls->flags & MEM_NEAR_LIMIT) != MEM_NEAR_LIMIT)) {
+		ckrm_near_limit(cls);
+	}
+}
+
+static inline void
+ckrm_mem_dec_inactive(struct page *page)
+{
+	ckrm_mem_res_t *cls = page_class(page);
+	BUG_ON(cls == NULL);
+	cls->nr_inactive[page_zonenum(page)]--;
+	decr_use_count(cls, 0);
+}
+
+static inline int
+ckrm_kick_page(struct page *page, unsigned int bits)
+{
+	if (page_class(page) == NULL) {
+		return bits;
+	} else {
+		return (page_class(page)->reclaim_flags & bits);
+	}
+}
+
+static inline int 
+ckrm_class_limit_ok(ckrm_mem_res_t *cls)
+{
+	if ((mem_rcbs.resid == -1) || !cls) {
+		return 1;
+	}
+	return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10);
+}
+
+#else // !CONFIG_CKRM_RES_MEM
+
+#define ckrm_set_page_class(a,b)		do{}while(0)
+#define ckrm_set_pages_class(a,b,c)		do{}while(0)
+#define ckrm_clear_page_class(a)		do{}while(0)
+#define ckrm_clear_pages_class(a,b)		do{}while(0)
+#define ckrm_change_page_class(a,b)		do{}while(0)
+#define ckrm_change_pages_class(a,b,c)	do{}while(0)
+#define ckrm_mem_inc_active(a)			do{}while(0)
+#define ckrm_mem_dec_active(a)			do{}while(0)
+#define ckrm_mem_inc_inactive(a)		do{}while(0)
+#define ckrm_mem_dec_inactive(a)		do{}while(0)
+#define ckrm_shrink_list_empty()		(1)
+#define ckrm_kick_page(a,b)				(0)
+#define ckrm_class_limit_ok(a)			(1)
+
+#endif // CONFIG_CKRM_RES_MEM
+
+#endif // _LINUX_CKRM_MEM_INLINE_H_
diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c
new file mode 100644
index 000000000..667ac9c67
--- /dev/null
+++ b/kernel/ckrm/ckrm_mem.c
@@ -0,0 +1,814 @@
+/* ckrm_mem.c - Memory Resource Manager for CKRM
+ *
+ * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ * Provides a Memory Resource controller for CKRM
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Code Description: TBD
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/pagevec.h>
+
+#include <linux/ckrm_mem_inline.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+#define MEM_NAME "mem"
+
+#define CKRM_MEM_MAX_HIERARCHY 2 // allows only upto 2 levels - 0, 1 & 2
+
+/* all 1-level memory_share_class are chained together */
+static LIST_HEAD(ckrm_memclass_list);
+LIST_HEAD(ckrm_shrink_list);
+EXPORT_SYMBOL(ckrm_shrink_list);
+spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
+EXPORT_SYMBOL(ckrm_mem_lock);
+unsigned int ckrm_tot_lru_pages; // total # of pages in the system
+							 // currently doesn't handle memory add/remove
+EXPORT_SYMBOL(ckrm_tot_lru_pages);
+
+static ckrm_mem_res_t *ckrm_mem_root_class;
+atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL(ckrm_mem_real_count);
+
+/* Initialize rescls values
+ * May be called on each rcfs unmount or as part of error recovery
+ * to make share values sane.
+ * Does not traverse hierarchy reinitializing children.
+ */
+
+static void
+set_ckrm_tot_pages(void)
+{
+	struct zone *zone;
+	int tot_lru_pages = 0;
+
+	for_each_zone(zone) {
+		tot_lru_pages += zone->nr_active;
+		tot_lru_pages += zone->nr_inactive;
+		tot_lru_pages += zone->free_pages;
+	}
+	ckrm_tot_lru_pages = tot_lru_pages;
+}
+
+static void
+mem_res_initcls_one(void *my_res)
+{
+	ckrm_mem_res_t *res = my_res;
+
+	memset(res, 0, sizeof(ckrm_mem_res_t));
+
+	res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
+	res->shares.my_limit         = CKRM_SHARE_DONTCARE;
+	res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+	res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+	res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+	res->shares.cur_max_limit    = 0;
+
+	res->pg_guar = CKRM_SHARE_DONTCARE;
+	res->pg_limit = CKRM_SHARE_DONTCARE;
+	res->pg_unused = CKRM_SHARE_DONTCARE;
+}
+
+static void *
+mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
+{
+	ckrm_mem_res_t *res, *parres;
+
+	if (mem_rcbs.resid == -1) {
+		return NULL;
+	}
+
+	parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
+	if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
+		// allows only upto CKRM_MEM_MAX_HIERARCHY
+		return NULL;
+	}
+
+	if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
+		printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
+		return NULL;
+	}
+		
+	if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
+		printk(KERN_ERR "MEM_RC: creating child class without root class\n");
+		return NULL;
+	}
+		
+	res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);
+	
+	if (res) {
+		mem_res_initcls_one(res);
+		res->core = core;
+		res->parent = parent;
+		spin_lock(&ckrm_mem_lock);
+		list_add(&res->mcls_list, &ckrm_memclass_list);
+		spin_unlock(&ckrm_mem_lock);
+		if (parent == NULL) {
+			// I am part of the root class. So, set the max to 
+			// number of pages available
+			res->pg_guar = ckrm_tot_lru_pages;
+			res->pg_unused = ckrm_tot_lru_pages;
+			res->pg_limit = ckrm_tot_lru_pages;
+			res->hier = 0;
+			ckrm_mem_root_class = res;
+		} else {
+			res->hier = parres->hier + 1;
+		}
+		mem_class_get(res);
+	}
+	else
+		printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
+	return res;
+}
+
+/*
+ * It is the caller's responsibility to make sure that the parent only
+ * has chilren that are to be accounted. i.e if a new child is added
+ * this function should be called after it has been added, and if a
+ * child is deleted this should be called after the child is removed.
+ */
+static void
+child_maxlimit_changed_local(ckrm_mem_res_t *parres)
+{
+	int maxlimit = 0;
+	ckrm_mem_res_t *childres;
+	ckrm_core_class_t *child = NULL;
+
+	// run thru parent's children and get the new max_limit of the parent
+	ckrm_lock_hier(parres->core);
+	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+		childres = ckrm_get_res_class(child, mem_rcbs.resid,
+				ckrm_mem_res_t);
+		if (maxlimit < childres->shares.my_limit) {
+			maxlimit = childres->shares.my_limit;
+		}
+	}
+	ckrm_unlock_hier(parres->core);
+	parres->shares.cur_max_limit = maxlimit;
+}
+
+static void
+mem_res_free(void *my_res)
+{
+	ckrm_mem_res_t *res = my_res;
+	ckrm_mem_res_t *parres;
+
+	if (!res) 
+		return;
+
+	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+
+	// return child's limit/guarantee to parent node
+	if (parres) {
+		child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
+		child_maxlimit_changed_local(parres);
+	}
+	res->shares.my_guarantee = 0;
+	res->shares.my_limit = 0;
+	spin_lock(&ckrm_mem_lock);
+	list_del(&res->mcls_list);
+	spin_unlock(&ckrm_mem_lock);
+	mem_class_put(res);
+
+	return;
+}
+
+/*
+ * Recalculate the guarantee and limit in # of pages... and propagate the
+ * same to children.
+ * Caller is responsible for protecting res and for the integrity of parres
+ */
+static void
+recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
+{
+	ckrm_core_class_t *child = NULL;
+	ckrm_mem_res_t *childres;
+	int resid = mem_rcbs.resid;
+	struct ckrm_shares *self = &res->shares;
+
+	if (parres) {
+		struct ckrm_shares *par = &parres->shares;
+
+		// calculate pg_guar and pg_limit
+		//
+		if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
+				self->my_guarantee == CKRM_SHARE_DONTCARE) {
+			res->pg_guar = CKRM_SHARE_DONTCARE;
+		} else if (par->total_guarantee) {
+			u64 temp = (u64) self->my_guarantee * parres->pg_guar;
+			do_div(temp, par->total_guarantee);
+			res->pg_guar = (int) temp;
+		} else {
+			res->pg_guar = 0;
+		}
+
+		if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
+				self->my_limit == CKRM_SHARE_DONTCARE) {
+			res->pg_limit = CKRM_SHARE_DONTCARE;
+		} else if (par->max_limit) {
+			u64 temp = (u64) self->my_limit * parres->pg_limit;
+			do_div(temp, par->max_limit);
+			res->pg_limit = (int) temp;
+		} else {
+			res->pg_limit = 0;
+		}
+	}
+
+	// Calculate unused units
+	if (res->pg_guar == CKRM_SHARE_DONTCARE) {
+		res->pg_unused = CKRM_SHARE_DONTCARE;
+	} else if (self->total_guarantee) {
+		u64 temp = (u64) self->unused_guarantee * res->pg_guar;
+		do_div(temp, self->total_guarantee);
+		res->pg_unused = (int) temp;
+	} else {
+		res->pg_unused = 0;
+	}
+
+	// propagate to children
+	ckrm_lock_hier(res->core);
+	while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
+		childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
+		recalc_and_propagate(childres, res);
+	}
+	ckrm_unlock_hier(res->core);
+	return;
+}
+
+static int
+mem_set_share_values(void *my_res, struct ckrm_shares *shares)
+{
+	ckrm_mem_res_t *res = my_res;
+	ckrm_mem_res_t *parres;
+	int rc = EINVAL;
+
+	if (!res) 
+		return -EINVAL;
+
+	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+
+	rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);
+
+	if ((rc == 0) && (parres != NULL)) {
+		child_maxlimit_changed_local(parres);
+		recalc_and_propagate(parres, NULL);
+	}
+	return rc;
+}
+
+static int
+mem_get_share_values(void *my_res, struct ckrm_shares *shares)
+{
+	ckrm_mem_res_t *res = my_res;
+
+	if (!res) 
+		return -EINVAL;
+	*shares = res->shares;
+	return 0;
+}
+
+static int  
+mem_get_stats(void *my_res, struct seq_file *sfile)
+{
+	ckrm_mem_res_t *res = my_res;
+
+	if (!res) 
+		return -EINVAL;
+
+#if 0
+	seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
+			"lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
+			res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
+			res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
+#endif
+
+
+	seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
+	seq_printf(sfile, "Number of pages used(including pages lent to children):"
+			" %d\n", atomic_read(&res->pg_total));
+	seq_printf(sfile, "Number of pages guaranteed: %d\n",
+			res->pg_guar);
+	seq_printf(sfile, "Maximum limit of pages: %d\n",
+			res->pg_limit);
+	seq_printf(sfile, "Total number of pages available"
+			"(after serving guarantees to children): %d\n",
+			res->pg_unused);
+	seq_printf(sfile, "Number of pages lent to children: %d\n",
+			res->pg_lent);
+	seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
+			res->pg_borrowed);
+	seq_printf(sfile, "----------- Memory Resource stats end -----------\n");
+
+	return 0;
+}
+
+static void
+mem_change_resclass(void *tsk, void *old, void *new)
+{
+	struct mm_struct *mm;
+	struct task_struct *task = tsk, *t1;
+	struct ckrm_mem_res *prev_mmcls;
+	
+	if (!task->mm || (new == old) || (old == (void *) -1))
+		return;
+
+	mm = task->active_mm;
+	spin_lock(&mm->peertask_lock);
+	prev_mmcls = mm->memclass;
+		
+	if (new == NULL) {
+		list_del_init(&task->mm_peers);
+	} else {
+		int found = 0;
+		list_for_each_entry(t1, &mm->tasklist, mm_peers) {
+			if (t1 == task) {
+				found++;
+				break;
+			}
+		}
+		if (!found) {
+			list_del_init(&task->mm_peers);
+			list_add_tail(&task->mm_peers, &mm->tasklist);
+		}
+	}
+
+	ckrm_mem_evaluate_mm(mm);
+	spin_unlock(&mm->peertask_lock);
+	return;
+}
+
+// config file is available only at the root level,
+// so assuming my_res to be the system level class
+static int
+mem_set_config(void *my_res, const char *cfgstr)
+{
+	ckrm_mem_res_t *res = my_res;
+
+	printk(KERN_INFO "%s class of %s is called with config<%s>\n",
+			MEM_NAME, res->core->name, cfgstr);
+	return 0;
+}
+
+static int 
+mem_show_config(void *my_res, struct seq_file *sfile)
+{
+	struct zone *zone;
+	ckrm_mem_res_t *res = my_res;
+	int active = 0, inactive = 0, fr = 0;
+
+	if (!res)
+		return -EINVAL;
+
+	for_each_zone(zone) {
+		active += zone->nr_active;
+		inactive += zone->nr_inactive;
+		fr += zone->free_pages;
+	}
+	seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
+			MEM_NAME, ckrm_tot_lru_pages,active,inactive,fr);
+
+
+	return 0;
+}
+
+static int
+mem_reset_stats(void *my_res)
+{
+	ckrm_mem_res_t *res = my_res;
+	printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
+	return 0;
+}
+
+struct ckrm_res_ctlr mem_rcbs = {
+	.res_name          = MEM_NAME,
+	.res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
+	.resid             = -1,
+	.res_alloc         = mem_res_alloc,
+	.res_free          = mem_res_free,
+	.set_share_values  = mem_set_share_values,
+	.get_share_values  = mem_get_share_values,
+	.get_stats         = mem_get_stats,
+	.change_resclass   = mem_change_resclass,
+	.show_config       = mem_show_config,
+	.set_config        = mem_set_config,
+	.reset_stats       = mem_reset_stats,
+};
+
+EXPORT_SYMBOL(mem_rcbs);
+
+int __init
+init_ckrm_mem_res(void)
+{
+	struct ckrm_classtype *clstype;
+	int resid = mem_rcbs.resid;
+
+	set_ckrm_tot_pages();
+	clstype = ckrm_find_classtype_by_name("taskclass");
+	if (clstype == NULL) {
+		printk(KERN_INFO " Unknown ckrm classtype<taskclass>");
+		return -ENOENT;
+	}
+
+	if (resid == -1) {
+		resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
+		if (resid != -1) {
+			mem_rcbs.classtype = clstype;
+		}
+	}
+	return ((resid < 0) ? resid : 0);
+}	
+
+void __exit
+exit_ckrm_mem_res(void)
+{
+	ckrm_unregister_res_ctlr(&mem_rcbs);
+	mem_rcbs.resid = -1;
+}
+
+module_init(init_ckrm_mem_res)
+module_exit(exit_ckrm_mem_res)
+
+static void
+set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
+{
+	ckrm_mem_res_t *childres;
+	ckrm_core_class_t *child = NULL;
+
+	parres->reclaim_flags |= flag;
+	ckrm_lock_hier(parres->core);
+	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+		childres = ckrm_get_res_class(child, mem_rcbs.resid,
+				ckrm_mem_res_t);
+		set_flags_of_children(childres, flag);
+	}
+	ckrm_unlock_hier(parres->core);
+	return;
+}
+
+// FIXME: more attention is needed to this function
+static unsigned int
+set_usage_flags(ckrm_mem_res_t *res)
+{
+	int tot_usage, cls_usage, range, guar;
+
+	if (res->pg_limit == CKRM_SHARE_DONTCARE) {
+			// No limit is set for the class. don't bother it
+			res->reclaim_flags = 0;
+			return res->reclaim_flags;
+	}
+
+	tot_usage = atomic_read(&res->pg_total);
+	cls_usage = tot_usage - res->pg_lent;
+	guar = (res->pg_guar > 0) ? res->pg_guar : 0;
+	range = res->pg_limit - guar;
+
+	if ((tot_usage > (guar + ((120 * range) / 100))) &&
+				(res->pg_lent > (guar + ((25 * range) / 100)))) {
+		set_flags_of_children(res, CLS_PARENT_OVER);
+	}
+
+	if (cls_usage > (guar + ((110 * range) / 100))) {
+		res->reclaim_flags |= CLS_OVER_110;
+	} else if (cls_usage > (guar + range)) {
+		res->reclaim_flags |= CLS_OVER_100;
+	} else if (cls_usage > (guar + ((3 * range) / 4))) {
+		res->reclaim_flags |= CLS_OVER_75;
+	} else if (cls_usage > guar) {
+		res->reclaim_flags |= CLS_OVER_GUAR;
+	} else {
+		res->reclaim_flags = 0;
+	}
+	return res->reclaim_flags;
+}
+
+/*
+ * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
+ * ckrm_get_reclaim_bits() and the macro ckrm_kick_page() along with the 
+ * macros CLS_* define how the pages are reclaimed.
+ * Keeping this logic thru these interface eliminate the necessity to
+ * change the reclaimation code in VM if we want to change the logic.
+ */
+unsigned int
+ckrm_setup_reclamation(void)
+{
+	ckrm_mem_res_t *res;
+	unsigned int ret = 0;
+
+	spin_lock(&ckrm_mem_lock);
+	set_ckrm_tot_pages();
+	ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
+	ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
+	ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
+	recalc_and_propagate(ckrm_mem_root_class, NULL);
+	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+		ret |= set_usage_flags(res);
+	}
+	spin_unlock(&ckrm_mem_lock);
+	return ret;
+}
+
+void
+ckrm_teardown_reclamation(void)
+{
+	ckrm_mem_res_t *res;
+	spin_lock(&ckrm_mem_lock);
+	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+		res->reclaim_flags = 0;
+	}
+	spin_unlock(&ckrm_mem_lock);
+}
+
+void
+ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
+{
+	int i, j, mask = 0;
+
+	if (*extract == 0 || *flags == 0) {
+		return;
+	}
+	if (*flags & CLS_SHRINK) {
+		*extract = CLS_SHRINK;
+		*flags = 0;
+		return;
+	}
+			
+
+	i = fls(*flags);
+	for (j = i-1; j > 0; j--) {
+		mask = (mask<<1) | 1;
+	}
+	*extract = (CLS_FLAGS_ALL & ~mask);
+	*flags &= ~*extract;
+	return;
+}
+
+void
+ckrm_near_limit(ckrm_mem_res_t *cls)
+{
+	struct zone *zone;
+	unsigned long now = jiffies;
+
+	if (!cls || ((cls->flags & MEM_NEAR_LIMIT) == MEM_NEAR_LIMIT)) {
+		return;
+	}
+	if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last ?
+		cls->last_shrink = now;
+		cls->shrink_count = 0;
+	}
+	cls->shrink_count++;
+	if (cls->shrink_count > 10) {
+		return;
+	}
+	spin_lock(&ckrm_mem_lock);
+	list_add(&cls->shrink_list, &ckrm_shrink_list);
+	spin_unlock(&ckrm_mem_lock);
+	cls->flags |= MEM_NEAR_LIMIT;
+	for_each_zone(zone) {
+		wakeup_kswapd(zone);
+		break; // only once is enough
+	}
+}
+
+static int
+ckrm_mem_evaluate_page_anon(struct page* page)
+{
+	ckrm_mem_res_t* pgcls = page_class(page);
+	ckrm_mem_res_t* maxshareclass = NULL;
+	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+	struct vm_area_struct *vma;
+	struct mm_struct* mm;
+
+	spin_lock(&anon_vma->lock);
+	BUG_ON(list_empty(&anon_vma->head));
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		mm = vma->vm_mm;
+		if (!maxshareclass ||
+				ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
+			maxshareclass = mm->memclass;
+		}
+	}
+	spin_unlock(&anon_vma->lock);
+
+	if (maxshareclass && (pgcls != maxshareclass)) {
+		ckrm_change_page_class(page, maxshareclass);
+		return 1;
+	}
+	return 0;
+}
+
+static int
+ckrm_mem_evaluate_page_file(struct page* page) 
+{
+	ckrm_mem_res_t* pgcls = page_class(page);
+	ckrm_mem_res_t* maxshareclass = NULL;
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma = NULL;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct prio_tree_iter iter;
+	struct mm_struct* mm;
+
+	if (!mapping)
+		return 0;
+
+	if (!spin_trylock(&mapping->i_mmap_lock))
+		return 0;
+
+	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
+					&iter, pgoff, pgoff)) != NULL) {
+		mm = vma->vm_mm;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0)
+			maxshareclass = mm->memclass;
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+
+	if (maxshareclass && pgcls != maxshareclass) {
+		ckrm_change_page_class(page, maxshareclass);
+		return 1;
+	}
+	return 0;
+}
+
+static int
+ckrm_mem_evaluate_page(struct page* page) 
+{
+	int changed = 0;
+
+	if (page->mapping) {
+		if (PageAnon(page))
+			changed = ckrm_mem_evaluate_page_anon(page);
+		else
+			changed = ckrm_mem_evaluate_page_file(page);
+	}
+	return changed;
+}
+
+static inline int
+class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
+		pmd_t* pmdir, unsigned long address, unsigned long end)
+{
+	pte_t* pte;
+	unsigned long pmd_end;
+	
+	if (pmd_none(*pmdir))
+		return 0;
+	BUG_ON(pmd_bad(*pmdir));
+	
+	pte = pte_offset_map(pmdir,address);
+	pmd_end = (address+PMD_SIZE)&PMD_MASK;
+	if (end>pmd_end)
+		end = pmd_end;
+	
+	do {
+		if (pte_present(*pte)) {
+			ckrm_mem_evaluate_page(pte_page(*pte));
+		}
+		address += PAGE_SIZE;
+		pte++;
+	} while(address && (address<end));
+	return 0;
+}
+
+static inline int
+class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
+		pgd_t* pgdir, unsigned long address, unsigned long end)
+{
+	pmd_t* pmd;
+	unsigned long pgd_end;
+	
+	if (pgd_none(*pgdir))
+		return 0;
+	BUG_ON(pgd_bad(*pgdir));
+	
+	pmd = pmd_offset(pgdir,address);
+	pgd_end = (address+PGDIR_SIZE)&PGDIR_MASK;
+	
+	if (pgd_end && (end>pgd_end))
+		end = pgd_end;
+	
+	do {
+		class_migrate_pmd(mm,vma,pmd,address,end);
+		address =  (address+PMD_SIZE)&PMD_MASK;
+		pmd++;
+	} while (address && (address<end));
+	return 0;
+}
+
+static inline int
+class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
+{
+	pgd_t* pgdir;
+	unsigned long address, end;
+	
+	address = vma->vm_start;
+	end = vma->vm_end;
+	
+	pgdir = pgd_offset(vma->vm_mm, address);
+	do {
+		class_migrate_pgd(mm,vma,pgdir,address,end);
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		pgdir++;
+	} while(address && (address<end));
+	return 0;
+}
+
+/* this function is called with mm->peertask_lock hold */
+void
+ckrm_mem_evaluate_mm(struct mm_struct* mm)
+{
+	struct task_struct *task;
+	struct ckrm_mem_res *maxshareclass = NULL;
+	struct vm_area_struct *vma;
+	
+	if (list_empty(&mm->tasklist)) {
+		/* We leave the mm->memclass untouched since we believe that one
+		 * mm with no task associated will be deleted soon or attach
+		 * with another task later.
+		 */
+		return; 
+	}
+
+	list_for_each_entry(task, &mm->tasklist, mm_peers) {
+		ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
+		if (!cls)
+			continue;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,cls)<0 ) 
+			maxshareclass = cls;
+	}
+
+	if (mm->memclass != (void *)maxshareclass) {
+		mem_class_get(maxshareclass);
+		if (mm->memclass)
+			mem_class_put(mm->memclass);
+		mm->memclass = maxshareclass;
+		
+		/* Go through all VMA to migrate pages */
+		down_read(&mm->mmap_sem);
+		vma = mm->mmap;
+		while(vma) {
+			class_migrate_vma(mm, vma);
+			vma = vma->vm_next;
+		}
+		up_read(&mm->mmap_sem);
+	}
+	return;
+}
+
+void
+ckrm_mem_evaluate_page_byadd(struct page* page, struct mm_struct* mm)
+{
+	ckrm_mem_res_t *pgcls = page_class(page);
+	ckrm_mem_res_t *chgcls = mm->memclass ? mm->memclass : GET_MEM_CLASS(current);
+
+	if (!chgcls || pgcls == chgcls)
+		return;
+
+	if (!page->mapcount) {
+		ckrm_change_page_class(page, chgcls);
+		return;
+	}
+	if (ckrm_mem_share_compare(pgcls, chgcls) < 0) {
+		ckrm_change_page_class(page, chgcls);
+		return;
+	}
+	return;
+}
+
+void
+ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
+{
+	spin_lock(&mm->peertask_lock);
+	if (!list_empty(&task->mm_peers)) {
+		printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
+		list_del_init(&task->mm_peers);
+	}
+	list_add_tail(&task->mm_peers, &mm->tasklist);
+	if (mm->memclass != GET_MEM_CLASS(task))
+		ckrm_mem_evaluate_mm(mm);
+	spin_unlock(&mm->peertask_lock);
+	return;
+}
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/exit.c.orig b/kernel/exit.c.orig
new file mode 100644
index 000000000..f53583e2b
--- /dev/null
+++ b/kernel/exit.c.orig
@@ -0,0 +1,1192 @@
+/*
+ *  linux/kernel/exit.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/tty.h>
+#include <linux/namespace.h>
+#include <linux/security.h>
+#include <linux/acct.h>
+#include <linux/file.h>
+#include <linux/binfmts.h>
+#include <linux/ptrace.h>
+#include <linux/profile.h>
+#include <linux/mount.h>
+#include <linux/proc_fs.h>
+#include <linux/mempolicy.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_tsk.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+extern void sem_exit (void);
+extern struct task_struct *child_reaper;
+
+int getrusage(struct task_struct *, int, struct rusage __user *);
+
+static void __unhash_process(struct task_struct *p)
+{
+	nr_threads--;
+	detach_pid(p, PIDTYPE_PID);
+	detach_pid(p, PIDTYPE_TGID);
+	if (thread_group_leader(p)) {
+		detach_pid(p, PIDTYPE_PGID);
+		detach_pid(p, PIDTYPE_SID);
+		if (p->pid)
+			__get_cpu_var(process_counts)--;
+	}
+
+	REMOVE_LINKS(p);
+}
+
+void release_task(struct task_struct * p)
+{
+	int zap_leader;
+	task_t *leader;
+	struct dentry *proc_dentry;
+
+repeat: 
+	BUG_ON(p->state < TASK_ZOMBIE);
+ 
+	atomic_dec(&p->user->processes);
+	spin_lock(&p->proc_lock);
+	proc_dentry = proc_pid_unhash(p);
+	write_lock_irq(&tasklist_lock);
+	if (unlikely(p->ptrace))
+		__ptrace_unlink(p);
+	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+	__exit_signal(p);
+	__exit_sighand(p);
+	__unhash_process(p);
+
+	/*
+	 * If we are the last non-leader member of the thread
+	 * group, and the leader is zombie, then notify the
+	 * group leader's parent process. (if it wants notification.)
+	 */
+	zap_leader = 0;
+	leader = p->group_leader;
+	if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) {
+		BUG_ON(leader->exit_signal == -1);
+		do_notify_parent(leader, leader->exit_signal);
+		/*
+		 * If we were the last child thread and the leader has
+		 * exited already, and the leader's parent ignores SIGCHLD,
+		 * then we are the one who should release the leader.
+		 *
+		 * do_notify_parent() will have marked it self-reaping in
+		 * that case.
+		 */
+		zap_leader = (leader->exit_signal == -1);
+	}
+
+	p->parent->cutime += p->utime + p->cutime;
+	p->parent->cstime += p->stime + p->cstime;
+	p->parent->cmin_flt += p->min_flt + p->cmin_flt;
+	p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
+	p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
+	p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
+	sched_exit(p);
+	write_unlock_irq(&tasklist_lock);
+	spin_unlock(&p->proc_lock);
+	proc_pid_flush(proc_dentry);
+	release_thread(p);
+	put_task_struct(p);
+
+	p = leader;
+	if (unlikely(zap_leader))
+		goto repeat;
+}
+
+/* we are using it only for SMP init */
+
+void unhash_process(struct task_struct *p)
+{
+	struct dentry *proc_dentry;
+
+	spin_lock(&p->proc_lock);
+	proc_dentry = proc_pid_unhash(p);
+	write_lock_irq(&tasklist_lock);
+	__unhash_process(p);
+	write_unlock_irq(&tasklist_lock);
+	spin_unlock(&p->proc_lock);
+	proc_pid_flush(proc_dentry);
+}
+
+/*
+ * This checks not only the pgrp, but falls back on the pid if no
+ * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
+ * without this...
+ */
+int session_of_pgrp(int pgrp)
+{
+	struct task_struct *p;
+	struct list_head *l;
+	struct pid *pid;
+	int sid = -1;
+
+	read_lock(&tasklist_lock);
+	for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
+		if (p->signal->session > 0) {
+			sid = p->signal->session;
+			goto out;
+		}
+	p = find_task_by_pid(pgrp);
+	if (p)
+		sid = p->signal->session;
+out:
+	read_unlock(&tasklist_lock);
+	
+	return sid;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52.  Orphaned process groups are not to be affected
+ * by terminal-generated stop signals.  Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
+{
+	struct task_struct *p;
+	struct list_head *l;
+	struct pid *pid;
+	int ret = 1;
+
+	for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+		if (p == ignored_task
+				|| p->state >= TASK_ZOMBIE 
+				|| p->real_parent->pid == 1)
+			continue;
+		if (process_group(p->real_parent) != pgrp
+			    && p->real_parent->signal->session == p->signal->session) {
+			ret = 0;
+			break;
+		}
+	}
+	return ret;	/* (sighing) "Often!" */
+}
+
+int is_orphaned_pgrp(int pgrp)
+{
+	int retval;
+
+	read_lock(&tasklist_lock);
+	retval = will_become_orphaned_pgrp(pgrp, NULL);
+	read_unlock(&tasklist_lock);
+
+	return retval;
+}
+
+static inline int has_stopped_jobs(int pgrp)
+{
+	int retval = 0;
+	struct task_struct *p;
+	struct list_head *l;
+	struct pid *pid;
+
+	for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+		if (p->state != TASK_STOPPED)
+			continue;
+
+		/* If p is stopped by a debugger on a signal that won't
+		   stop it, then don't count p as stopped.  This isn't
+		   perfect but it's a good approximation.  */
+		if (unlikely (p->ptrace)
+		    && p->exit_code != SIGSTOP
+		    && p->exit_code != SIGTSTP
+		    && p->exit_code != SIGTTOU
+		    && p->exit_code != SIGTTIN)
+			continue;
+
+		retval = 1;
+		break;
+	}
+	return retval;
+}
+
+/**
+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to init so that
+ * it is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_init() gives the caller full capabilities.
+ */
+void reparent_to_init(void)
+{
+	write_lock_irq(&tasklist_lock);
+
+	ptrace_unlink(current);
+	/* Reparent to init */
+	REMOVE_LINKS(current);
+	current->parent = child_reaper;
+	current->real_parent = child_reaper;
+	SET_LINKS(current);
+
+	/* Set the exit signal to SIGCHLD so we signal init on exit */
+	current->exit_signal = SIGCHLD;
+
+	if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+		set_user_nice(current, 0);
+	/* cpus_allowed? */
+	/* rt_priority? */
+	/* signals? */
+	security_task_reparent_to_init(current);
+	memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
+	atomic_inc(&(INIT_USER->__count));
+	switch_uid(INIT_USER);
+
+	write_unlock_irq(&tasklist_lock);
+}
+
+void __set_special_pids(pid_t session, pid_t pgrp)
+{
+	struct task_struct *curr = current;
+
+	if (curr->signal->session != session) {
+		detach_pid(curr, PIDTYPE_SID);
+		curr->signal->session = session;
+		attach_pid(curr, PIDTYPE_SID, session);
+	}
+	if (process_group(curr) != pgrp) {
+		detach_pid(curr, PIDTYPE_PGID);
+		curr->signal->pgrp = pgrp;
+		attach_pid(curr, PIDTYPE_PGID, pgrp);
+	}
+}
+
+void set_special_pids(pid_t session, pid_t pgrp)
+{
+	write_lock_irq(&tasklist_lock);
+	__set_special_pids(session, pgrp);
+	write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Let kernel threads use this to say that they
+ * allow a certain signal (since daemonize() will
+ * have disabled all of them by default).
+ */
+int allow_signal(int sig)
+{
+	if (sig < 1 || sig > _NSIG)
+		return -EINVAL;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigdelset(&current->blocked, sig);
+	if (!current->mm) {
+		/* Kernel threads handle their own signals.
+		   Let the signal code know it'll be handled, so
+		   that they don't get converted to SIGKILL or
+		   just silently dropped */
+		current->sighand->action[(sig)-1].sa.sa_handler = (void *)2;
+	}
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	return 0;
+}
+
+EXPORT_SYMBOL(allow_signal);
+
+int disallow_signal(int sig)
+{
+	if (sig < 1 || sig > _NSIG)
+		return -EINVAL;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigaddset(&current->blocked, sig);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	return 0;
+}
+
+EXPORT_SYMBOL(disallow_signal);
+
+/*
+ *	Put all the gunge required to become a kernel thread without
+ *	attached user resources in one place where it belongs.
+ */
+
+void daemonize(const char *name, ...)
+{
+	va_list args;
+	struct fs_struct *fs;
+	sigset_t blocked;
+
+	va_start(args, name);
+	vsnprintf(current->comm, sizeof(current->comm), name, args);
+	va_end(args);
+
+	/*
+	 * If we were started as result of loading a module, close all of the
+	 * user space pages.  We don't need them, and if we didn't close them
+	 * they would be locked into memory.
+	 */
+	exit_mm(current);
+
+	set_special_pids(1, 1);
+	current->signal->tty = NULL;
+
+	/* Block and flush all signals */
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(current);
+
+	/* Become as one with the init task */
+
+	exit_fs(current);	/* current->fs->count--; */
+	fs = init_task.fs;
+	current->fs = fs;
+	atomic_inc(&fs->count);
+ 	exit_files(current);
+	current->files = init_task.files;
+	atomic_inc(&current->files->count);
+
+	reparent_to_init();
+}
+
+EXPORT_SYMBOL(daemonize);
+
+static inline void close_files(struct files_struct * files)
+{
+	int i, j;
+
+	j = 0;
+	for (;;) {
+		unsigned long set;
+		i = j * __NFDBITS;
+		if (i >= files->max_fdset || i >= files->max_fds)
+			break;
+		set = files->open_fds->fds_bits[j++];
+		while (set) {
+			if (set & 1) {
+				struct file * file = xchg(&files->fd[i], NULL);
+				if (file)
+					filp_close(file, files);
+			}
+			i++;
+			set >>= 1;
+		}
+	}
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+	struct files_struct *files;
+
+	task_lock(task);
+	files = task->files;
+	if (files)
+		atomic_inc(&files->count);
+	task_unlock(task);
+
+	return files;
+}
+
+void fastcall put_files_struct(struct files_struct *files)
+{
+	if (atomic_dec_and_test(&files->count)) {
+		close_files(files);
+		/*
+		 * Free the fd and fdset arrays if we expanded them.
+		 */
+		if (files->fd != &files->fd_array[0])
+			free_fd_array(files->fd, files->max_fds);
+		if (files->max_fdset > __FD_SETSIZE) {
+			free_fdset(files->open_fds, files->max_fdset);
+			free_fdset(files->close_on_exec, files->max_fdset);
+		}
+		kmem_cache_free(files_cachep, files);
+	}
+}
+
+EXPORT_SYMBOL(put_files_struct);
+
+static inline void __exit_files(struct task_struct *tsk)
+{
+	struct files_struct * files = tsk->files;
+
+	if (files) {
+		task_lock(tsk);
+		tsk->files = NULL;
+		task_unlock(tsk);
+		put_files_struct(files);
+	}
+}
+
+void exit_files(struct task_struct *tsk)
+{
+	__exit_files(tsk);
+}
+
+static inline void __put_fs_struct(struct fs_struct *fs)
+{
+	/* No need to hold fs->lock if we are killing it */
+	if (atomic_dec_and_test(&fs->count)) {
+		dput(fs->root);
+		mntput(fs->rootmnt);
+		dput(fs->pwd);
+		mntput(fs->pwdmnt);
+		if (fs->altroot) {
+			dput(fs->altroot);
+			mntput(fs->altrootmnt);
+		}
+		kmem_cache_free(fs_cachep, fs);
+	}
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+	__put_fs_struct(fs);
+}
+
+static inline void __exit_fs(struct task_struct *tsk)
+{
+	struct fs_struct * fs = tsk->fs;
+
+	if (fs) {
+		task_lock(tsk);
+		tsk->fs = NULL;
+		task_unlock(tsk);
+		__put_fs_struct(fs);
+	}
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+	__exit_fs(tsk);
+}
+
+EXPORT_SYMBOL_GPL(exit_fs);
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+static inline void __exit_mm(struct task_struct * tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+
+	mm_release(tsk, mm);
+	if (!mm)
+		return;
+	/*
+	 * Serialize with any possible pending coredump.
+	 * We must hold mmap_sem around checking core_waiters
+	 * and clearing tsk->mm.  The core-inducing thread
+	 * will increment core_waiters for each thread in the
+	 * group with ->mm != NULL.
+	 */
+	down_read(&mm->mmap_sem);
+	if (mm->core_waiters) {
+		up_read(&mm->mmap_sem);
+		down_write(&mm->mmap_sem);
+		if (!--mm->core_waiters)
+			complete(mm->core_startup_done);
+		up_write(&mm->mmap_sem);
+
+		wait_for_completion(&mm->core_done);
+		down_read(&mm->mmap_sem);
+	}
+	atomic_inc(&mm->mm_count);
+	if (mm != tsk->active_mm) BUG();
+	/* more a memory barrier than a real lock */
+	task_lock(tsk);
+	tsk->mm = NULL;
+	up_read(&mm->mmap_sem);
+	enter_lazy_tlb(mm, current);
+	task_unlock(tsk);
+	mmput(mm);
+}
+
+void exit_mm(struct task_struct *tsk)
+{
+	__exit_mm(tsk);
+}
+
+EXPORT_SYMBOL(exit_mm);
+
+static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+{
+	/*
+	 * Make sure we're not reparenting to ourselves and that
+	 * the parent is not a zombie.
+	 */
+	if (p == reaper || reaper->state >= TASK_ZOMBIE)
+		p->real_parent = child_reaper;
+	else
+		p->real_parent = reaper;
+	if (p->parent == p->real_parent)
+		BUG();
+}
+
+static inline void reparent_thread(task_t *p, task_t *father, int traced)
+{
+	/* We don't want people slaying init.  */
+	if (p->exit_signal != -1)
+		p->exit_signal = SIGCHLD;
+	p->self_exec_id++;
+
+	if (p->pdeath_signal)
+		/* We already hold the tasklist_lock here.  */
+		group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+
+	/* Move the child from its dying parent to the new one.  */
+	if (unlikely(traced)) {
+		/* Preserve ptrace links if someone else is tracing this child.  */
+		list_del_init(&p->ptrace_list);
+		if (p->parent != p->real_parent)
+			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
+	} else {
+		/* If this child is being traced, then we're the one tracing it
+		 * anyway, so let go of it.
+		 */
+		p->ptrace = 0;
+		list_del_init(&p->sibling);
+		p->parent = p->real_parent;
+		list_add_tail(&p->sibling, &p->parent->children);
+
+		/* If we'd notified the old parent about this child's death,
+		 * also notify the new parent.
+		 */
+		if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+		    thread_group_empty(p))
+			do_notify_parent(p, p->exit_signal);
+	}
+
+	/*
+	 * process group orphan check
+	 * Case ii: Our child is in a different pgrp
+	 * than we are, and it was the only connection
+	 * outside, so the child pgrp is now orphaned.
+	 */
+	if ((process_group(p) != process_group(father)) &&
+	    (p->signal->session == father->signal->session)) {
+		int pgrp = process_group(p);
+
+		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+			__kill_pg_info(SIGHUP, (void *)1, pgrp);
+			__kill_pg_info(SIGCONT, (void *)1, pgrp);
+		}
+	}
+}
+
+/*
+ * When we die, we re-parent all our children.
+ * Try to give them to another thread in our thread
+ * group, and if no such member exists, give it to
+ * the global child reaper process (ie "init")
+ */
+static inline void forget_original_parent(struct task_struct * father)
+{
+	struct task_struct *p, *reaper = father;
+	struct list_head *_p, *_n;
+
+	reaper = father->group_leader;
+	if (reaper == father)
+		reaper = child_reaper;
+
+	/*
+	 * There are only two places where our children can be:
+	 *
+	 * - in our child list
+	 * - in our ptraced child list
+	 *
+	 * Search them and reparent children.
+	 */
+	list_for_each_safe(_p, _n, &father->children) {
+		p = list_entry(_p,struct task_struct,sibling);
+		if (father == p->real_parent) {
+			choose_new_parent(p, reaper, child_reaper);
+			reparent_thread(p, father, 0);
+		} else {
+			ptrace_unlink (p);
+			if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+			    thread_group_empty(p))
+				do_notify_parent(p, p->exit_signal);
+		}
+	}
+	list_for_each_safe(_p, _n, &father->ptrace_children) {
+		p = list_entry(_p,struct task_struct,ptrace_list);
+		choose_new_parent(p, reaper, child_reaper);
+		reparent_thread(p, father, 1);
+	}
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ */
+static void exit_notify(struct task_struct *tsk)
+{
+	int state;
+	struct task_struct *t;
+
+	ckrm_cb_exit(tsk);
+
+	if (signal_pending(tsk) && !tsk->signal->group_exit
+	    && !thread_group_empty(tsk)) {
+		/*
+		 * This occurs when there was a race between our exit
+		 * syscall and a group signal choosing us as the one to
+		 * wake up.  It could be that we are the only thread
+		 * alerted to check for pending signals, but another thread
+		 * should be woken now to take the signal since we will not.
+		 * Now we'll wake all the threads in the group just to make
+		 * sure someone gets all the pending signals.
+		 */
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		for (t = next_thread(tsk); t != tsk; t = next_thread(t))
+			if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
+				recalc_sigpending_tsk(t);
+				if (signal_pending(t))
+					signal_wake_up(t, 0);
+			}
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+	}
+
+	write_lock_irq(&tasklist_lock);
+
+	/*
+	 * This does two things:
+	 *
+  	 * A.  Make init inherit all the child processes
+	 * B.  Check to see if any process groups have become orphaned
+	 *	as a result of our exiting, and if they have any stopped
+	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+	 */
+
+	forget_original_parent(tsk);
+	BUG_ON(!list_empty(&tsk->children));
+
+	/*
+	 * Check to see if any process groups have become orphaned
+	 * as a result of our exiting, and if they have any stopped
+	 * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+	 *
+	 * Case i: Our father is in a different pgrp than we are
+	 * and we were the only connection outside, so our pgrp
+	 * is about to become orphaned.
+	 */
+	 
+	t = tsk->real_parent;
+	
+	if ((process_group(t) != process_group(tsk)) &&
+	    (t->signal->session == tsk->signal->session) &&
+	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
+	    has_stopped_jobs(process_group(tsk))) {
+		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
+		__kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+	}
+
+	/* Let father know we died 
+	 *
+	 * Thread signals are configurable, but you aren't going to use
+	 * that to send signals to arbitary processes. 
+	 * That stops right now.
+	 *
+	 * If the parent exec id doesn't match the exec id we saved
+	 * when we started then we know the parent has changed security
+	 * domain.
+	 *
+	 * If our self_exec id doesn't match our parent_exec_id then
+	 * we have changed execution domain as these two values started
+	 * the same after a fork.
+	 *	
+	 */
+	
+	if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
+	    ( tsk->parent_exec_id != t->self_exec_id  ||
+	      tsk->self_exec_id != tsk->parent_exec_id)
+	    && !capable(CAP_KILL))
+		tsk->exit_signal = SIGCHLD;
+
+
+	/* If something other than our normal parent is ptracing us, then
+	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+	 * only has special meaning to our real parent.
+	 */
+	if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
+		int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
+		do_notify_parent(tsk, signal);
+	} else if (tsk->ptrace) {
+		do_notify_parent(tsk, SIGCHLD);
+	}
+
+	state = TASK_ZOMBIE;
+	if (tsk->exit_signal == -1 && tsk->ptrace == 0)
+		state = TASK_DEAD;
+	tsk->state = state;
+	tsk->flags |= PF_DEAD;
+
+	/*
+	 * Clear these here so that update_process_times() won't try to deliver
+	 * itimer, profile or rlimit signals to this task while it is in late exit.
+	 */
+	tsk->it_virt_value = 0;
+	tsk->it_prof_value = 0;
+	tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY;
+
+	/*
+	 * In the preemption case it must be impossible for the task
+	 * to get runnable again, so use "_raw_" unlock to keep
+	 * preempt_count elevated until we schedule().
+	 *
+	 * To avoid deadlock on SMP, interrupts must be unmasked.  If we
+	 * don't, subsequently called functions (e.g, wait_task_inactive()
+	 * via release_task()) will spin, with interrupt flags
+	 * unwittingly blocked, until the other task sleeps.  That task
+	 * may itself be waiting for smp_call_function() to answer and
+	 * complete, and with interrupts blocked that will never happen.
+	 */
+	_raw_write_unlock(&tasklist_lock);
+	local_irq_enable();
+
+	/* If the process is dead, release it - nobody will wait for it */
+	if (state == TASK_DEAD)
+		release_task(tsk);
+
+}
+
+asmlinkage NORET_TYPE void do_exit(long code)
+{
+	struct task_struct *tsk = current;
+
+	if (unlikely(in_interrupt()))
+		panic("Aiee, killing interrupt handler!");
+	if (unlikely(!tsk->pid))
+		panic("Attempted to kill the idle task!");
+	if (unlikely(tsk->pid == 1))
+		panic("Attempted to kill init!");
+	if (tsk->io_context)
+		exit_io_context();
+	tsk->flags |= PF_EXITING;
+	del_timer_sync(&tsk->real_timer);
+
+	if (unlikely(in_atomic()))
+		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+				current->comm, current->pid,
+				preempt_count());
+
+	profile_exit_task(tsk);
+ 
+	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+		current->ptrace_message = code;
+		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
+	}
+
+	acct_process(code);
+	__exit_mm(tsk);
+
+	exit_sem(tsk);
+	__exit_files(tsk);
+	__exit_fs(tsk);
+	exit_namespace(tsk);
+	exit_thread();
+#ifdef CONFIG_NUMA
+	mpol_free(tsk->mempolicy);
+#endif
+
+	if (tsk->signal->leader)
+		disassociate_ctty(1);
+
+	module_put(tsk->thread_info->exec_domain->module);
+	if (tsk->binfmt)
+		module_put(tsk->binfmt->module);
+
+	tsk->exit_code = code;
+#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+	numtasks_put_ref(tsk->taskclass);
+#endif
+	exit_notify(tsk);
+	schedule();
+	BUG();
+	/* Avoid "noreturn function does return".  */
+	for (;;) ;
+}
+
+NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+{
+	if (comp)
+		complete(comp);
+	
+	do_exit(code);
+}
+
+EXPORT_SYMBOL(complete_and_exit);
+
+asmlinkage long sys_exit(int error_code)
+{
+	do_exit((error_code&0xff)<<8);
+}
+
+task_t fastcall *next_thread(task_t *p)
+{
+	struct pid_link *link = p->pids + PIDTYPE_TGID;
+	struct list_head *tmp, *head = &link->pidptr->task_list;
+
+#ifdef CONFIG_SMP
+	if (!p->sighand)
+		BUG();
+	if (!spin_is_locked(&p->sighand->siglock) &&
+				!rwlock_is_locked(&tasklist_lock))
+		BUG();
+#endif
+	tmp = link->pid_chain.next;
+	if (tmp == head)
+		tmp = head->next;
+
+	return pid_task(tmp, PIDTYPE_TGID);
+}
+
+EXPORT_SYMBOL(next_thread);
+
+/*
+ * Take down every thread in the group.  This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ */
+NORET_TYPE void
+do_group_exit(int exit_code)
+{
+	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+	if (current->signal->group_exit)
+		exit_code = current->signal->group_exit_code;
+	else if (!thread_group_empty(current)) {
+		struct signal_struct *const sig = current->signal;
+		struct sighand_struct *const sighand = current->sighand;
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&sighand->siglock);
+		if (sig->group_exit)
+			/* Another thread got here before we took the lock.  */
+			exit_code = sig->group_exit_code;
+		else {
+			sig->group_exit = 1;
+			sig->group_exit_code = exit_code;
+			zap_other_threads(current);
+		}
+		spin_unlock_irq(&sighand->siglock);
+		read_unlock(&tasklist_lock);
+	}
+
+	do_exit(exit_code);
+	/* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+asmlinkage void sys_exit_group(int error_code)
+{
+	do_group_exit((error_code & 0xff) << 8);
+}
+
+static int eligible_child(pid_t pid, int options, task_t *p)
+{
+	if (pid > 0) {
+		if (p->pid != pid)
+			return 0;
+	} else if (!pid) {
+		if (process_group(p) != process_group(current))
+			return 0;
+	} else if (pid != -1) {
+		if (process_group(p) != -pid)
+			return 0;
+	}
+
+	/*
+	 * Do not consider detached threads that are
+	 * not ptraced:
+	 */
+	if (p->exit_signal == -1 && !p->ptrace)
+		return 0;
+
+	/* Wait for all children (clone and not) if __WALL is set;
+	 * otherwise, wait for clone children *only* if __WCLONE is
+	 * set; otherwise, wait for non-clone children *only*.  (Note:
+	 * A "clone" child here is one that reports to its parent
+	 * using a signal other than SIGCHLD.) */
+	if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+	    && !(options & __WALL))
+		return 0;
+	/*
+	 * Do not consider thread group leaders that are
+	 * in a non-empty thread group:
+	 */
+	if (current->tgid != p->tgid && delay_group_leader(p))
+		return 2;
+
+	if (security_task_wait(p))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_ZOMBIE.  We hold
+ * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
+ * the lock and this task is uninteresting.  If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct rusage __user *ru)
+{
+	unsigned long state;
+	int retval;
+
+	/*
+	 * Try to move the task's state to DEAD
+	 * only one thread is allowed to do this:
+	 */
+	state = xchg(&p->state, TASK_DEAD);
+	if (state != TASK_ZOMBIE) {
+		BUG_ON(state != TASK_DEAD);
+		return 0;
+	}
+	if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
+		/*
+		 * This can only happen in a race with a ptraced thread
+		 * dying on another processor.
+		 */
+		return 0;
+
+	/*
+	 * Now we are sure this task is interesting, and no other
+	 * thread can reap it because we set its state to TASK_DEAD.
+	 */
+	read_unlock(&tasklist_lock);
+
+	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	if (!retval && stat_addr) {
+		if (p->signal->group_exit)
+			retval = put_user(p->signal->group_exit_code, stat_addr);
+		else
+			retval = put_user(p->exit_code, stat_addr);
+	}
+	if (retval) {
+		p->state = TASK_ZOMBIE;
+		return retval;
+	}
+	retval = p->pid;
+	if (p->real_parent != p->parent) {
+		write_lock_irq(&tasklist_lock);
+		/* Double-check with lock held.  */
+		if (p->real_parent != p->parent) {
+			__ptrace_unlink(p);
+			p->state = TASK_ZOMBIE;
+			/* If this is a detached thread, this is where it goes away.  */
+			if (p->exit_signal == -1) {
+				/* release_task takes the lock itself.  */
+				write_unlock_irq(&tasklist_lock);
+				release_task (p);
+			}
+			else {
+				do_notify_parent(p, p->exit_signal);
+				write_unlock_irq(&tasklist_lock);
+			}
+			p = NULL;
+		}
+		else
+			write_unlock_irq(&tasklist_lock);
+	}
+	if (p != NULL)
+		release_task(p);
+	BUG_ON(!retval);
+	return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
+ * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
+ * the lock and this task is uninteresting.  If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_stopped(task_t *p, int delayed_group_leader,
+			     unsigned int __user *stat_addr,
+			     struct rusage __user *ru)
+{
+	int retval, exit_code;
+
+	if (!p->exit_code)
+		return 0;
+	if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
+	    p->signal && p->signal->group_stop_count > 0)
+		/*
+		 * A group stop is in progress and this is the group leader.
+		 * We won't report until all threads have stopped.
+		 */
+		return 0;
+
+	/*
+	 * Now we are pretty sure this task is interesting.
+	 * Make sure it doesn't get reaped out from under us while we
+	 * give up the lock and then examine it below.  We don't want to
+	 * keep holding onto the tasklist_lock while we call getrusage and
+	 * possibly take page faults for user memory.
+	 */
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+	write_lock_irq(&tasklist_lock);
+
+	/*
+	 * This uses xchg to be atomic with the thread resuming and setting
+	 * it.  It must also be done with the write lock held to prevent a
+	 * race with the TASK_ZOMBIE case.
+	 */
+	exit_code = xchg(&p->exit_code, 0);
+	if (unlikely(p->state > TASK_STOPPED)) {
+		/*
+		 * The task resumed and then died.  Let the next iteration
+		 * catch it in TASK_ZOMBIE.  Note that exit_code might
+		 * already be zero here if it resumed and did _exit(0).
+		 * The task itself is dead and won't touch exit_code again;
+		 * other processors in this function are locked out.
+		 */
+		p->exit_code = exit_code;
+		exit_code = 0;
+	}
+	if (unlikely(exit_code == 0)) {
+		/*
+		 * Another thread in this function got to it first, or it
+		 * resumed, or it resumed and then died.
+		 */
+		write_unlock_irq(&tasklist_lock);
+		put_task_struct(p);
+		read_lock(&tasklist_lock);
+		return 0;
+	}
+
+	/* move to end of parent's list to avoid starvation */
+	remove_parent(p);
+	add_parent(p, p->parent);
+
+	write_unlock_irq(&tasklist_lock);
+
+	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	if (!retval && stat_addr)
+		retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+	if (!retval)
+		retval = p->pid;
+	put_task_struct(p);
+
+	BUG_ON(!retval);
+	return retval;
+}
+
+asmlinkage long sys_wait4(pid_t pid,unsigned int __user *stat_addr, int options, struct rusage __user *ru)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct task_struct *tsk;
+	int flag, retval;
+
+	if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
+		return -EINVAL;
+
+	add_wait_queue(&current->wait_chldexit,&wait);
+repeat:
+	flag = 0;
+	current->state = TASK_INTERRUPTIBLE;
+	read_lock(&tasklist_lock);
+	tsk = current;
+	do {
+		struct task_struct *p;
+		struct list_head *_p;
+		int ret;
+
+		list_for_each(_p,&tsk->children) {
+			p = list_entry(_p,struct task_struct,sibling);
+
+			ret = eligible_child(pid, options, p);
+			if (!ret)
+				continue;
+			flag = 1;
+
+			switch (p->state) {
+			case TASK_STOPPED:
+				if (!(options & WUNTRACED) &&
+				    !(p->ptrace & PT_PTRACED))
+					continue;
+				retval = wait_task_stopped(p, ret == 2,
+							   stat_addr, ru);
+				if (retval != 0) /* He released the lock.  */
+					goto end_wait4;
+				break;
+			case TASK_ZOMBIE:
+				/*
+				 * Eligible but we cannot release it yet:
+				 */
+				if (ret == 2)
+					continue;
+				retval = wait_task_zombie(p, stat_addr, ru);
+				if (retval != 0) /* He released the lock.  */
+					goto end_wait4;
+				break;
+			}
+		}
+		if (!flag) {
+			list_for_each (_p,&tsk->ptrace_children) {
+				p = list_entry(_p,struct task_struct,ptrace_list);
+				if (!eligible_child(pid, options, p))
+					continue;
+				flag = 1;
+				break;
+			}
+		}
+		if (options & __WNOTHREAD)
+			break;
+		tsk = next_thread(tsk);
+		if (tsk->signal != current->signal)
+			BUG();
+	} while (tsk != current);
+	read_unlock(&tasklist_lock);
+	if (flag) {
+		retval = 0;
+		if (options & WNOHANG)
+			goto end_wait4;
+		retval = -ERESTARTSYS;
+		if (signal_pending(current))
+			goto end_wait4;
+		schedule();
+		goto repeat;
+	}
+	retval = -ECHILD;
+end_wait4:
+	current->state = TASK_RUNNING;
+	remove_wait_queue(&current->wait_chldexit,&wait);
+	return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+asmlinkage long sys_waitpid(pid_t pid, unsigned __user *stat_addr, int options)
+{
+	return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif
-- 
2.47.0