* kernel subsystems and hints as to where to find out what things do.
*/
-#include <linux/config.h>
+#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/vs_memory.h>
+int sysctl_panic_on_oom;
/* #define DEBUG */
/**
- * oom_badness - calculate a numeric value for how bad this task has been
+ * badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
* @uptime: current uptime in seconds
*
*/
points = mm->total_vm;
+ /*
+ * add points for context badness
+ */
+
+ points += vx_badness(p, mm);
+
/*
* After this unlock we can no longer dereference local variable `mm'
*/
task_unlock(p);
- /* FIXME: add vserver badness ;) */
+ /*
+ * swapoff can easily use up all memory, so kill those first.
+ */
+ if (p->flags & PF_SWAPOFF)
+ return ULONG_MAX;
/*
* Processes which fork a lot of child processes are likely
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
points /= 4;
+ /*
+ * If p's nodes don't overlap ours, it may still help to kill p
+ * because p may have allocated or otherwise mapped memory on
+ * this node before. However it will be less likely.
+ */
+ if (!cpuset_excl_nodes_overlap(p))
+ points /= 8;
+
/*
* Adjust the score by oomkilladj.
*/
}
#ifdef DEBUG
- printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
- p->pid, p->comm, points);
+ printk(KERN_DEBUG "OOMkill: task %d:#%u (%s) got %d points\n",
+ p->pid, p->xid, p->comm, points);
#endif
return points;
}
{
#ifdef CONFIG_NUMA
struct zone **z;
- nodemask_t nodes = node_online_map;
+ nodemask_t nodes;
+ int node;
+
+ nodes_clear(nodes);
+ /* node has memory ? */
+ for_each_online_node(node)
+ if (NODE_DATA(node)->node_present_pages)
+ node_set(node, nodes);
for (z = zonelist->zones; *z; z++)
- if (cpuset_zone_allowed(*z, gfp_mask))
- node_clear((*z)->zone_pgdat->node_id,
- nodes);
+ if (cpuset_zone_allowed_softwall(*z, gfp_mask))
+ node_clear(zone_to_nid(*z), nodes);
else
return CONSTRAINT_CPUSET;
do_posix_clock_monotonic_gettime(&uptime);
do_each_thread(g, p) {
unsigned long points;
- int releasing;
- /* skip the init task with pid == 1 */
- if (p->pid == 1)
- continue;
- if (p->oomkilladj == OOM_DISABLE)
+ /*
+ * skip kernel threads and tasks which have already released
+ * their mm.
+ */
+ if (!p->mm)
continue;
- /* If p's nodes don't overlap ours, it won't help to kill p. */
- if (!cpuset_excl_nodes_overlap(p))
+ /* skip the init task */
+ if (is_init(p))
continue;
/*
- * This is in the process of releasing memory so for wait it
- * to finish before killing some other task by mistake.
+ * This task already has access to memory reserves and is
+ * being killed. Don't allow any other task access to the
+ * memory reserve.
+ *
+ * Note: this may have a chance of deadlock if it gets
+ * blocked waiting for another task which itself is waiting
+ * for memory. Is there a better alternative?
*/
- releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
- p->flags & PF_EXITING;
- if (releasing && !(p->flags & PF_DEAD))
+ if (test_tsk_thread_flag(p, TIF_MEMDIE))
return ERR_PTR(-1UL);
- if (p->flags & PF_SWAPOFF)
- return p;
+
+ /*
+ * This is in the process of releasing memory so wait for it
+ * to finish before killing some other task by mistake.
+ *
+ * However, if p is the current task, we allow the 'kill' to
+ * go ahead if it is exiting: this will simply set TIF_MEMDIE,
+ * which will allow it to gain access to memory reserves in
+ * the process of exiting and releasing its resources.
+ * Otherwise we could get an easy OOM deadlock.
+ */
+ if (p->flags & PF_EXITING) {
+ if (p != current)
+ return ERR_PTR(-1UL);
+
+ chosen = p;
+ *ppoints = ULONG_MAX;
+ }
+
+ if (p->oomkilladj == OOM_DISABLE)
+ continue;
points = badness(p, uptime.tv_sec);
if (points > *ppoints || !chosen) {
*ppoints = points;
}
} while_each_thread(g, p);
+
return chosen;
}
/**
- * We must be careful though to never send SIGKILL a process with
- * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
- * we select a process with CAP_SYS_RAW_IO set).
+ * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
+ * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
+ * set.
*/
-static void __oom_kill_task(task_t *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, int verbose)
{
- if (p->pid == 1) {
+ if (is_init(p)) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill init!\n");
return;
}
- task_lock(p);
- if (!p->mm || p->mm == &init_mm) {
+ if (!p->mm) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill an mm-less task!\n");
- task_unlock(p);
return;
}
- task_unlock(p);
- printk(KERN_ERR "%s: Killed process %d (%s).\n",
- message, p->pid, p->comm);
+
+ if (verbose)
+ printk(KERN_ERR "Killed process %d:#%u (%s)\n",
+ p->pid, p->xid, p->comm);
/*
* We give our sacrificial lamb high priority and access to
force_sig(SIGKILL, p);
}
-static int oom_kill_task(task_t *p, const char *message)
+static int oom_kill_task(struct task_struct *p)
{
struct mm_struct *mm;
- task_t * g, * q;
+ struct task_struct *g, *q;
mm = p->mm;
* However, this is of no concern to us.
*/
- if (mm == NULL || mm == &init_mm)
+ if (mm == NULL)
return 1;
- __oom_kill_task(p, message);
+ /*
+ * Don't kill the process if any threads are set to OOM_DISABLE
+ */
+ do_each_thread(g, q) {
+ if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+ return 1;
+ } while_each_thread(g, q);
+
+ __oom_kill_task(p, 1);
+
/*
* kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * but are in a different thread group. Don't let them have access
+ * to memory reserves though, otherwise we might deplete all memory.
*/
- do_each_thread(g, q)
+ do_each_thread(g, q) {
if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q, message);
- while_each_thread(g, q);
+ force_sig(SIGKILL, q);
+ } while_each_thread(g, q);
return 0;
}
struct task_struct *c;
struct list_head *tsk;
- printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
- "children.\n", p->pid, p->comm, points);
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just set TIF_MEMDIE so it can die quickly
+ */
+ if (p->flags & PF_EXITING) {
+ __oom_kill_task(p, 0);
+ return 0;
+ }
+
+ printk(KERN_ERR "%s: kill process %d:#%u (%s) score %li or a child\n",
+ message, p->pid, p->xid, p->comm, points);
+
/* Try to kill a child first */
list_for_each(tsk, &p->children) {
c = list_entry(tsk, struct task_struct, sibling);
if (c->mm == p->mm)
continue;
- if (!oom_kill_task(c, message))
+ if (!oom_kill_task(c))
return 0;
}
- return oom_kill_task(p, message);
+ return oom_kill_task(p);
+}
+
+static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
+
+int register_oom_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_oom_notifier);
+
+int unregister_oom_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
+EXPORT_SYMBOL_GPL(unregister_oom_notifier);
/**
- * oom_kill - kill the "best" process when we run out of memory
+ * out_of_memory - kill the "best" process when we run out of memory
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
*/
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
- task_t *p;
+ struct task_struct *p;
unsigned long points = 0;
+ unsigned long freed = 0;
+
+ blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+ if (freed > 0)
+ /* Got some memory back in the last second. */
+ return;
if (printk_ratelimit()) {
- printk("oom-killer: gfp_mask=0x%x, order=%d\n",
- gfp_mask, order);
+ printk(KERN_WARNING "%s invoked oom-killer: "
+ "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+ current->comm, gfp_mask, order, current->oomkilladj);
dump_stack();
show_mem();
}
break;
case CONSTRAINT_NONE:
+ if (sysctl_panic_on_oom)
+ panic("out of memory. panic_on_oom is selected\n");
retry:
/*
* Rambo mode: Shoot down a process and hope it solves whatever