mm/oom_kill.c

   1 /*
   2  *  linux/mm/oom_kill.c
   3  *
   4  *  Copyright (C)  1998,2000  Rik van Riel
   5  *      Thanks go out to Claus Fischer for some serious inspiration and
   6  *      for goading me into coding this file...
   7  *
   8  *  The routines in this file are used to kill a process when
   9  *  we're seriously out of memory. This gets called from __alloc_pages()
  10  *  in mm/page_alloc.c when we really run out of memory.
  11  *
  12  *  Since we won't call these routines often (on a well-configured
  13  *  machine) this file will double as a 'coding guide' and a signpost
  14  *  for newbie kernel hackers. It features several pointers to major
  15  *  kernel subsystems and hints as to where to find out what things do.
  16  */
  17
  18 #include <linux/mm.h>
  19 #include <linux/sched.h>
  20 #include <linux/swap.h>
  21 #include <linux/timex.h>
  22 #include <linux/jiffies.h>
  23 #include <linux/cpuset.h>
  24
  25 /* #define DEBUG */
  26
  27 /**
  28  * oom_badness - calculate a numeric value for how bad this task has been
  29  * @p: task struct of which task we should calculate
  30  * @uptime: current uptime in seconds
  31  *
  32  * The formula used is relatively simple and documented inline in the
  33  * function. The main rationale is that we want to select a good task
  34  * to kill when we run out of memory.
  35  *
  36  * Good in this context means that:
  37  * 1) we lose the minimum amount of work done
  38  * 2) we recover a large amount of memory
  39  * 3) we don't kill anything innocent of eating tons of memory
  40  * 4) we want to kill the minimum amount of processes (one)
  41  * 5) we try to kill the process the user expects us to kill, this
  42  *    algorithm has been meticulously tuned to meet the principle
  43  *    of least surprise ... (be careful when you change it)
  44  */
  45
  46 unsigned long badness(struct task_struct *p, unsigned long uptime)
  47 {
  48         unsigned long points, cpu_time, run_time, s;
  49         struct mm_struct *mm;
  50         struct task_struct *child;
  51
  52         task_lock(p);
  53         mm = p->mm;
  54         if (!mm) {
  55                 task_unlock(p);
  56                 return 0;
  57         }
  58
  59         /*
  60          * The memory size of the process is the basis for the badness.
  61          */
  62         points = mm->total_vm;
  63
  64         /*
  65          * After this unlock we can no longer dereference local variable `mm'
  66          */
  67         task_unlock(p);
  68
  69         /* FIXME: add vserver badness ;) */
  70
  71         /*
  72          * Processes which fork a lot of child processes are likely
  73          * a good choice. We add half the vmsize of the children if they
  74          * have an own mm. This prevents forking servers to flood the
  75          * machine with an endless amount of children. In case a single
  76          * child is eating the vast majority of memory, adding only half
  77          * to the parents will make the child our kill candidate of choice.
  78          */
  79         list_for_each_entry(child, &p->children, sibling) {
  80                 task_lock(child);
  81                 if (child->mm != mm && child->mm)
  82                         points += child->mm->total_vm/2 + 1;
  83                 task_unlock(child);
  84         }
  85
  86         /*
  87          * CPU time is in tens of seconds and run time is in thousands
  88          * of seconds. There is no particular reason for this other than
  89          * that it turned out to work very well in practice.
  90          */
  91         cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
  92                 >> (SHIFT_HZ + 3);
  93
  94         if (uptime >= p->start_time.tv_sec)
  95                 run_time = (uptime - p->start_time.tv_sec) >> 10;
  96         else
  97                 run_time = 0;
  98
  99         s = int_sqrt(cpu_time);
 100         if (s)
 101                 points /= s;
 102         s = int_sqrt(int_sqrt(run_time));
 103         if (s)
 104                 points /= s;
 105
 106         /*
 107          * Niced processes are most likely less important, so double
 108          * their badness points.
 109          */
 110         if (task_nice(p) > 0)
 111                 points *= 2;
 112
 113         /*
 114          * Superuser processes are usually more important, so we make it
 115          * less likely that we kill those.
 116          */
 117         if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
 118                                 p->uid == 0 || p->euid == 0)
 119                 points /= 4;
 120
 121         /*
 122          * We don't want to kill a process with direct hardware access.
 123          * Not only could that mess up the hardware, but usually users
 124          * tend to only have this flag set on applications they think
 125          * of as important.
 126          */
 127         if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
 128                 points /= 4;
 129
 130         /*
 131          * Adjust the score by oomkilladj.
 132          */
 133         if (p->oomkilladj) {
 134                 if (p->oomkilladj > 0)
 135                         points <<= p->oomkilladj;
 136                 else
 137                         points >>= -(p->oomkilladj);
 138         }
 139
 140 #ifdef DEBUG
 141         printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
 142         p->pid, p->comm, points);
 143 #endif
 144         return points;
 145 }
 146
 147 /*
 148  * Types of limitations to the nodes from which allocations may occur
 149  */
 150 #define CONSTRAINT_NONE 1
 151 #define CONSTRAINT_MEMORY_POLICY 2
 152 #define CONSTRAINT_CPUSET 3
 153
 154 /*
 155  * Determine the type of allocation constraint.
 156  */
 157 static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 158 {
 159 #ifdef CONFIG_NUMA
 160         struct zone **z;
 161         nodemask_t nodes = node_online_map;
 162
 163         for (z = zonelist->zones; *z; z++)
 164                 if (cpuset_zone_allowed(*z, gfp_mask))
 165                         node_clear((*z)->zone_pgdat->node_id,
 166                                         nodes);
 167                 else
 168                         return CONSTRAINT_CPUSET;
 169
 170         if (!nodes_empty(nodes))
 171                 return CONSTRAINT_MEMORY_POLICY;
 172 #endif
 173
 174         return CONSTRAINT_NONE;
 175 }
 176
 177 /*
 178  * Simple selection loop. We chose the process with the highest
 179  * number of 'points'. We expect the caller will lock the tasklist.
 180  *
 181  * (not docbooked, we don't want this one cluttering up the manual)
 182  */
 183 static struct task_struct *select_bad_process(unsigned long *ppoints)
 184 {
 185         struct task_struct *g, *p;
 186         struct task_struct *chosen = NULL;
 187         struct timespec uptime;
 188         *ppoints = 0;
 189
 190         do_posix_clock_monotonic_gettime(&uptime);
 191         do_each_thread(g, p) {
 192                 unsigned long points;
 193                 int releasing;
 194
 195                 /* skip the init task with pid == 1 */
 196                 if (p->pid == 1)
 197                         continue;
 198                 if (p->oomkilladj == OOM_DISABLE)
 199                         continue;
 200                 /* If p's nodes don't overlap ours, it won't help to kill p. */
 201                 if (!cpuset_excl_nodes_overlap(p))
 202                         continue;
 203
 204                 /*
 205                  * This is in the process of releasing memory so for wait it
 206                  * to finish before killing some other task by mistake.
 207                  */
 208                 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
 209                                                 p->flags & PF_EXITING;
 210                 if (releasing && !(p->flags & PF_DEAD))
 211                         return ERR_PTR(-1UL);
 212                 if (p->flags & PF_SWAPOFF)
 213                         return p;
 214
 215                 points = badness(p, uptime.tv_sec);
 216                 if (points > *ppoints || !chosen) {
 217                         chosen = p;
 218                         *ppoints = points;
 219                 }
 220         } while_each_thread(g, p);
 221         return chosen;
 222 }
 223
 224 /**
 225  * We must be careful though to never send SIGKILL a process with
 226  * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
 227  * we select a process with CAP_SYS_RAW_IO set).
 228  */
 229 static void __oom_kill_task(task_t *p, const char *message)
 230 {
 231         if (p->pid == 1) {
 232                 WARN_ON(1);
 233                 printk(KERN_WARNING "tried to kill init!\n");
 234                 return;
 235         }
 236
 237         task_lock(p);
 238         if (!p->mm || p->mm == &init_mm) {
 239                 WARN_ON(1);
 240                 printk(KERN_WARNING "tried to kill an mm-less task!\n");
 241                 task_unlock(p);
 242                 return;
 243         }
 244         task_unlock(p);
 245         printk(KERN_ERR "%s: Killed process %d (%s).\n",
 246                                 message, p->pid, p->comm);
 247
 248         /*
 249          * We give our sacrificial lamb high priority and access to
 250          * all the memory it needs. That way it should be able to
 251          * exit() and clear out its resources quickly...
 252          */
 253         p->time_slice = HZ;
 254         set_tsk_thread_flag(p, TIF_MEMDIE);
 255
 256         force_sig(SIGKILL, p);
 257 }
 258
 259 static int oom_kill_task(task_t *p, const char *message)
 260 {
 261         struct mm_struct *mm;
 262         task_t * g, * q;
 263
 264         mm = p->mm;
 265
 266         /* WARNING: mm may not be dereferenced since we did not obtain its
 267          * value from get_task_mm(p).  This is OK since all we need to do is
 268          * compare mm to q->mm below.
 269          *
 270          * Furthermore, even if mm contains a non-NULL value, p->mm may
 271          * change to NULL at any time since we do not hold task_lock(p).
 272          * However, this is of no concern to us.
 273          */
 274
 275         if (mm == NULL || mm == &init_mm)
 276                 return 1;
 277
 278         __oom_kill_task(p, message);
 279         /*
 280          * kill all processes that share the ->mm (i.e. all threads),
 281          * but are in a different thread group
 282          */
 283         do_each_thread(g, q)
 284                 if (q->mm == mm && q->tgid != p->tgid)
 285                         __oom_kill_task(q, message);
 286         while_each_thread(g, q);
 287
 288         return 0;
 289 }
 290
 291 static int oom_kill_process(struct task_struct *p, unsigned long points,
 292                 const char *message)
 293 {
 294         struct task_struct *c;
 295         struct list_head *tsk;
 296
 297         printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
 298                 "children.\n", p->pid, p->comm, points);
 299         /* Try to kill a child first */
 300         list_for_each(tsk, &p->children) {
 301                 c = list_entry(tsk, struct task_struct, sibling);
 302                 if (c->mm == p->mm)
 303                         continue;
 304                 if (!oom_kill_task(c, message))
 305                         return 0;
 306         }
 307         return oom_kill_task(p, message);
 308 }
 309
 310 /**
 311  * oom_kill - kill the "best" process when we run out of memory
 312  *
 313  * If we run out of memory, we have the choice between either
 314  * killing a random task (bad), letting the system crash (worse)
 315  * OR try to be smart about which process to kill. Note that we
 316  * don't have to be perfect here, we just have to be good.
 317  */
 318 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 319 {
 320         task_t *p;
 321         unsigned long points = 0;
 322
 323         if (printk_ratelimit()) {
 324                 printk("oom-killer: gfp_mask=0x%x, order=%d\n",
 325                         gfp_mask, order);
 326                 dump_stack();
 327                 show_mem();
 328         }
 329
 330         cpuset_lock();
 331         read_lock(&tasklist_lock);
 332
 333         /*
 334          * Check if there were limitations on the allocation (only relevant for
 335          * NUMA) that may require different handling.
 336          */
 337         switch (constrained_alloc(zonelist, gfp_mask)) {
 338         case CONSTRAINT_MEMORY_POLICY:
 339                 oom_kill_process(current, points,
 340                                 "No available memory (MPOL_BIND)");
 341                 break;
 342
 343         case CONSTRAINT_CPUSET:
 344                 oom_kill_process(current, points,
 345                                 "No available memory in cpuset");
 346                 break;
 347
 348         case CONSTRAINT_NONE:
 349 retry:
 350                 /*
 351                  * Rambo mode: Shoot down a process and hope it solves whatever
 352                  * issues we may have.
 353                  */
 354                 p = select_bad_process(&points);
 355
 356                 if (PTR_ERR(p) == -1UL)
 357                         goto out;
 358
 359                 /* Found nothing?!?! Either we hang forever, or we panic. */
 360                 if (!p) {
 361                         read_unlock(&tasklist_lock);
 362                         cpuset_unlock();
 363                         panic("Out of memory and no killable processes...\n");
 364                 }
 365
 366                 if (oom_kill_process(p, points, "Out of memory"))
 367                         goto retry;
 368
 369                 break;
 370         }
 371
 372 out:
 373         read_unlock(&tasklist_lock);
 374         cpuset_unlock();
 375
 376         /*
 377          * Give "p" a good chance of killing itself before we
 378          * retry to allocate memory unless "p" is current
 379          */
 380         if (!test_thread_flag(TIF_MEMDIE))
 381                 schedule_timeout_uninterruptible(1);
 382 }