make it compile
[linux-2.6.git] / linux-2.6-execshield.patch
1 diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
2 index c45f415..3a6dbad 100644
3 --- a/arch/x86/include/asm/desc.h
4 +++ b/arch/x86/include/asm/desc.h
5 @@ -6,6 +6,7 @@
6  #include <asm/ldt.h>
7  #include <asm/mmu.h>
8  #include <linux/smp.h>
9 +#include <linux/mm_types.h>
10  
11  static inline void fill_ldt(struct desc_struct *desc,
12                             const struct user_desc *info)
13 @@ -94,6 +95,9 @@ static inline int desc_empty(const void *ptr)
14  
15  #define load_TLS(t, cpu) native_load_tls(t, cpu)
16  #define set_ldt native_set_ldt
17 +#ifdef CONFIG_X86_32
18 +#define load_user_cs_desc native_load_user_cs_desc
19 +#endif /*CONFIG_X86_32*/
20  
21  #define write_ldt_entry(dt, entry, desc)       \
22         native_write_ldt_entry(dt, entry, desc)
23 @@ -380,4 +384,25 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
24         _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
25  }
26  
27 +#ifdef CONFIG_X86_32
28 +static inline void set_user_cs(struct desc_struct *desc, unsigned long limit)
29 +{
30 +       limit = (limit - 1) / PAGE_SIZE;
31 +       desc->a = limit & 0xffff;
32 +       desc->b = (limit & 0xf0000) | 0x00c0fb00;
33 +}
34 +
35 +static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm)
36 +{
37 +       get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs;
38 +}
39 +
40 +#define arch_add_exec_range arch_add_exec_range
41 +#define arch_remove_exec_range arch_remove_exec_range
42 +#define arch_flush_exec_range arch_flush_exec_range
43 +extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit);
44 +extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit);
45 +extern void arch_flush_exec_range(struct mm_struct *mm);
46 +#endif /* CONFIG_X86_32 */
47 +
48  #endif /* _ASM_X86_DESC_H */
49 diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
50 index 80a1dee..8314c66 100644
51 --- a/arch/x86/include/asm/mmu.h
52 +++ b/arch/x86/include/asm/mmu.h
53 @@ -7,12 +7,19 @@
54  /*
55   * The x86 doesn't have a mmu context, but
56   * we put the segment information here.
57 + *
58 + * exec_limit is used to track the range PROT_EXEC
59 + * mappings span.
60   */
61  typedef struct {
62         void *ldt;
63         int size;
64         struct mutex lock;
65         void *vdso;
66 +#ifdef CONFIG_X86_32
67 +       struct desc_struct user_cs;
68 +       unsigned long exec_limit;
69 +#endif
70  } mm_context_t;
71  
72  #ifdef CONFIG_SMP
73 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
74 index 4fb37c8..d5cc31c 100644
75 --- a/arch/x86/include/asm/paravirt.h
76 +++ b/arch/x86/include/asm/paravirt.h
77 @@ -139,6 +139,9 @@ struct pv_cpu_ops {
78         void (*store_gdt)(struct desc_ptr *);
79         void (*store_idt)(struct desc_ptr *);
80         void (*set_ldt)(const void *desc, unsigned entries);
81 +#ifdef CONFIG_X86_32
82 +       void (*load_user_cs_desc)(int cpu, struct mm_struct *mm);
83 +#endif /*CONFIG_X86_32*/
84         unsigned long (*store_tr)(void);
85         void (*load_tls)(struct thread_struct *t, unsigned int cpu);
86  #ifdef CONFIG_X86_64
87 @@ -955,6 +958,12 @@ static inline void set_ldt(const void *addr, unsigned entries)
88  {
89         PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
90  }
91 +#ifdef CONFIG_X86_32
92 +static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm)
93 +{
94 +       PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm);
95 +}
96 +#endif /*CONFIG_X86_32*/
97  static inline void store_gdt(struct desc_ptr *dtr)
98  {
99         PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
100 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
101 index c776826..fb6b579 100644
102 --- a/arch/x86/include/asm/processor.h
103 +++ b/arch/x86/include/asm/processor.h
104 @@ -160,6 +160,9 @@ static inline int hlt_works(int cpu)
105  
106  #define cache_line_size()      (boot_cpu_data.x86_cache_alignment)
107  
108 +#define __HAVE_ARCH_ALIGN_STACK
109 +extern unsigned long arch_align_stack(unsigned long sp);
110 +
111  extern void cpu_detect(struct cpuinfo_x86 *c);
112  
113  extern struct pt_regs *idle_regs(struct pt_regs *);
114 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
115 index 3ffdcfa..62cba96 100644
116 --- a/arch/x86/kernel/cpu/common.c
117 +++ b/arch/x86/kernel/cpu/common.c
118 @@ -804,6 +804,20 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
119         /* Filter out anything that depends on CPUID levels we don't have */
120         filter_cpuid_features(c, true);
121  
122 +       /*
123 +        *  emulation of NX with segment limits unfortunately means
124 +        *  we have to disable the fast system calls, due to the way that
125 +        *  sysexit clears the segment limits on return.
126 +        *  If we have either disabled exec-shield on the boot command line,
127 +        *  or we have NX, then we don't need to do this.
128 +        */
129 +       if (exec_shield != 0) {
130 +#ifdef CONFIG_X86_PAE
131 +               if (!test_cpu_cap(c, X86_FEATURE_NX))
132 +#endif
133 +                       clear_cpu_cap(c, X86_FEATURE_SEP);
134 +       }
135 +
136         /* If the model name is still unset, do table lookup. */
137         if (!c->x86_model_id[0]) {
138                 const char *p;
139 diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
140 index 70ec9b9..d956b8c 100644
141 --- a/arch/x86/kernel/paravirt.c
142 +++ b/arch/x86/kernel/paravirt.c
143 @@ -369,6 +369,9 @@ struct pv_cpu_ops pv_cpu_ops = {
144         .read_tscp = native_read_tscp,
145         .load_tr_desc = native_load_tr_desc,
146         .set_ldt = native_set_ldt,
147 +#ifdef CONFIG_X86_32
148 +       .load_user_cs_desc = native_load_user_cs_desc,
149 +#endif /*CONFIG_X86_32*/
150         .load_gdt = native_load_gdt,
151         .load_idt = native_load_idt,
152         .store_gdt = native_store_gdt,
153 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
154 index 59f4524..068e286 100644
155 --- a/arch/x86/kernel/process_32.c
156 +++ b/arch/x86/kernel/process_32.c
157 @@ -299,7 +299,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
158  void
159  start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
160  {
161 +       int cpu;
162 +
163         set_user_gs(regs, 0);
164 +
165         regs->fs                = 0;
166         set_fs(USER_DS);
167         regs->ds                = __USER_DS;
168 @@ -308,6 +311,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
169         regs->cs                = __USER_CS;
170         regs->ip                = new_ip;
171         regs->sp                = new_sp;
172 +
173 +       cpu = get_cpu();
174 +       load_user_cs_desc(cpu, current->mm);
175 +       put_cpu();
176 +
177         /*
178          * Free the old FP and other extended state
179          */
180 @@ -354,7 +362,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
181         /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
182  
183         __unlazy_fpu(prev_p);
184 -
185 +       if (next_p->mm)
186 +               load_user_cs_desc(cpu, next_p->mm);
187  
188         /* we're going to use this soon, after a few expensive things */
189         if (next_p->fpu_counter > 5)
190 @@ -495,3 +504,40 @@ unsigned long get_wchan(struct task_struct *p)
191         return 0;
192  }
193  
194 +static void modify_cs(struct mm_struct *mm, unsigned long limit)
195 +{
196 +       mm->context.exec_limit = limit;
197 +       set_user_cs(&mm->context.user_cs, limit);
198 +       if (mm == current->mm) {
199 +               int cpu;
200 +
201 +               cpu = get_cpu();
202 +               load_user_cs_desc(cpu, mm);
203 +               put_cpu();
204 +       }
205 +}
206 +
207 +void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
208 +{
209 +       if (limit > mm->context.exec_limit)
210 +               modify_cs(mm, limit);
211 +}
212 +
213 +void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
214 +{
215 +       struct vm_area_struct *vma;
216 +       unsigned long limit = PAGE_SIZE;
217 +
218 +       if (old_end == mm->context.exec_limit) {
219 +               for (vma = mm->mmap; vma; vma = vma->vm_next)
220 +                       if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
221 +                               limit = vma->vm_end;
222 +               modify_cs(mm, limit);
223 +       }
224 +}
225 +
226 +void arch_flush_exec_range(struct mm_struct *mm)
227 +{
228 +       mm->context.exec_limit = 0;
229 +       set_user_cs(&mm->context.user_cs, 0);
230 +}
231 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
232 index 07d60c8..41e9129 100644
233 --- a/arch/x86/kernel/traps.c
234 +++ b/arch/x86/kernel/traps.c
235 @@ -118,6 +118,76 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)
236         if (!user_mode_vm(regs))
237                 die(str, regs, err);
238  }
239 +
240 +static inline int
241 +__compare_user_cs_desc(const struct desc_struct *desc1,
242 +       const struct desc_struct *desc2)
243 +{
244 +       return ((desc1->limit0 != desc2->limit0) ||
245 +               (desc1->limit != desc2->limit) ||
246 +               (desc1->base0 != desc2->base0) ||
247 +               (desc1->base1 != desc2->base1) ||
248 +               (desc1->base2 != desc2->base2));
249 +}
250 +
251 +/*
252 + * lazy-check for CS validity on exec-shield binaries:
253 + *
254 + * the original non-exec stack patch was written by
255 + * Solar Designer <solar at openwall.com>. Thanks!
256 + */
257 +static int
258 +check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code)
259 +{
260 +       struct desc_struct *desc1, *desc2;
261 +       struct vm_area_struct *vma;
262 +       unsigned long limit;
263 +
264 +       if (current->mm == NULL)
265 +               return 0;
266 +
267 +       limit = -1UL;
268 +       if (current->mm->context.exec_limit != -1UL) {
269 +               limit = PAGE_SIZE;
270 +               spin_lock(&current->mm->page_table_lock);
271 +               for (vma = current->mm->mmap; vma; vma = vma->vm_next)
272 +                       if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
273 +                               limit = vma->vm_end;
274 +               vma = get_gate_vma(current);
275 +               if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
276 +                       limit = vma->vm_end;
277 +               spin_unlock(&current->mm->page_table_lock);
278 +               if (limit >= TASK_SIZE)
279 +                       limit = -1UL;
280 +               current->mm->context.exec_limit = limit;
281 +       }
282 +       set_user_cs(&current->mm->context.user_cs, limit);
283 +
284 +       desc1 = &current->mm->context.user_cs;
285 +       desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS;
286 +
287 +       if (__compare_user_cs_desc(desc1, desc2)) {
288 +               /*
289 +                * The CS was not in sync - reload it and retry the
290 +                * instruction. If the instruction still faults then
291 +                * we won't hit this branch next time around.
292 +                */
293 +               if (print_fatal_signals >= 2) {
294 +                       printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n",
295 +                               error_code, error_code/8, regs->ip,
296 +                               smp_processor_id());
297 +                       printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n",
298 +                               current->mm->context.exec_limit,
299 +                               desc1->a, desc1->b, desc2->a, desc2->b);
300 +               }
301 +
302 +               load_user_cs_desc(cpu, current->mm);
303 +
304 +               return 1;
305 +       }
306 +
307 +       return 0;
308 +}
309  #endif
310  
311  static void __kprobes
312 @@ -276,6 +346,29 @@ do_general_protection(struct pt_regs *regs, long error_code)
313         if (!user_mode(regs))
314                 goto gp_in_kernel;
315  
316 +#ifdef CONFIG_X86_32
317 +{
318 +       int cpu;
319 +       int ok;
320 +
321 +       cpu = get_cpu();
322 +       ok = check_lazy_exec_limit(cpu, regs, error_code);
323 +       put_cpu();
324 +
325 +       if (ok)
326 +               return;
327 +
328 +       if (print_fatal_signals) {
329 +               printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n",
330 +                       error_code, error_code/8, regs->ip, smp_processor_id());
331 +               printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n",
332 +                       current->mm->context.exec_limit,
333 +                       current->mm->context.user_cs.a,
334 +                       current->mm->context.user_cs.b);
335 +       }
336 +}
337 +#endif /*CONFIG_X86_32*/
338 +
339         tsk->thread.error_code = error_code;
340         tsk->thread.trap_no = 13;
341  
342 @@ -885,19 +978,37 @@ do_device_not_available(struct pt_regs *regs, long error_code)
343  }
344  
345  #ifdef CONFIG_X86_32
346 +/*
347 + * The fixup code for errors in iret jumps to here (iret_exc). It loses
348 + * the original trap number and error code. The bogus trap 32 and error
349 + * code 0 are what the vanilla kernel delivers via:
350 + * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
351 + *
352 + * NOTE: Because of the final "1" in the macro we need to enable interrupts.
353 + *
354 + * In case of a general protection fault in the iret instruction, we
355 + * need to check for a lazy CS update for exec-shield.
356 + */
357  dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
358  {
359 -       siginfo_t info;
360 +       int ok;
361 +       int cpu;
362 +
363         local_irq_enable();
364  
365 -       info.si_signo = SIGILL;
366 -       info.si_errno = 0;
367 -       info.si_code = ILL_BADSTK;
368 -       info.si_addr = NULL;
369 -       if (notify_die(DIE_TRAP, "iret exception",
370 -                       regs, error_code, 32, SIGILL) == NOTIFY_STOP)
371 -               return;
372 -       do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
373 +       cpu = get_cpu();
374 +       ok = check_lazy_exec_limit(cpu, regs, error_code);
375 +       put_cpu();
376 +
377 +       if (!ok && notify_die(DIE_TRAP, "iret exception", regs,
378 +               error_code, 32, SIGSEGV) != NOTIFY_STOP) {
379 +                       siginfo_t info;
380 +                       info.si_signo = SIGSEGV;
381 +                       info.si_errno = 0;
382 +                       info.si_code = ILL_BADSTK;
383 +                       info.si_addr = 0;
384 +                       do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info);
385 +       }
386  }
387  #endif
388  
389 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
390 index 34c1bfb..32c3d8d 100644
391 --- a/arch/x86/mm/init.c
392 +++ b/arch/x86/mm/init.c
393 @@ -228,6 +228,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
394         set_nx();
395         if (nx_enabled)
396                 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
397 +#ifdef CONFIG_X86_32
398 +       else
399 +       if (exec_shield)
400 +               printk(KERN_INFO "Using x86 segment limits to approximate "
401 +                       "NX protection\n");
402 +#endif
403  
404         /* Enable PSE if available */
405         if (cpu_has_pse)
406 diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
407 index 949708d..c1373b6 100644
408 --- a/arch/x86/mm/init_32.c
409 +++ b/arch/x86/mm/init_32.c
410 @@ -587,6 +587,54 @@ void zap_low_mappings(void)
411  pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
412  EXPORT_SYMBOL_GPL(__supported_pte_mask);
413  
414 +#ifdef CONFIG_X86_PAE
415 +
416 +static int disable_nx __initdata;
417 +
418 +/*
419 + * noexec = on|off
420 + *
421 + * Control non executable mappings.
422 + *
423 + * on      Enable
424 + * off     Disable (disables exec-shield too)
425 + */
426 +static int __init noexec_setup(char *str)
427 +{
428 +       if (!str || !strcmp(str, "on")) {
429 +               if (cpu_has_nx) {
430 +                       __supported_pte_mask |= _PAGE_NX;
431 +                       disable_nx = 0;
432 +               }
433 +       } else if (!strcmp(str, "off")) {
434 +               disable_nx = 1;
435 +               __supported_pte_mask &= ~_PAGE_NX;
436 +               exec_shield = 0;
437 +       } else
438 +               return -EINVAL;
439 +
440 +       return 0;
441 +}
442 +early_param("noexec", noexec_setup);
443 +
444 +void __init set_nx(void)
445 +{
446 +       unsigned int v[4], l, h;
447 +
448 +       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
449 +               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
450 +
451 +               if ((v[3] & (1 << 20)) && !disable_nx) {
452 +                       rdmsr(MSR_EFER, l, h);
453 +                       l |= EFER_NX;
454 +                       wrmsr(MSR_EFER, l, h);
455 +                       nx_enabled = 1;
456 +                       __supported_pte_mask |= _PAGE_NX;
457 +               }
458 +       }
459 +}
460 +#endif
461 +
462  /* user-defined highmem size */
463  static unsigned int highmem_pages = -1;
464  
465 diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
466 index 1658296..72056cf 100644
467 --- a/arch/x86/mm/mmap.c
468 +++ b/arch/x86/mm/mmap.c
469 @@ -111,13 +111,16 @@ static unsigned long mmap_legacy_base(void)
470   */
471  void arch_pick_mmap_layout(struct mm_struct *mm)
472  {
473 -       if (mmap_is_legacy()) {
474 +       if (!(2 & exec_shield) && mmap_is_legacy()) {
475                 mm->mmap_base = mmap_legacy_base();
476                 mm->get_unmapped_area = arch_get_unmapped_area;
477                 mm->unmap_area = arch_unmap_area;
478         } else {
479                 mm->mmap_base = mmap_base();
480                 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
481 +               if (!(current->personality & READ_IMPLIES_EXEC)
482 +                   && mmap_is_ia32())
483 +                       mm->get_unmapped_exec_area = arch_get_unmapped_exec_area;
484                 mm->unmap_area = arch_unmap_area_topdown;
485         }
486  }
487 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
488 index 821e970..ea5a4c3 100644
489 --- a/arch/x86/mm/tlb.c
490 +++ b/arch/x86/mm/tlb.c
491 @@ -6,6 +6,7 @@
492  #include <linux/interrupt.h>
493  #include <linux/module.h>
494  
495 +#include <asm/desc.h>
496  #include <asm/tlbflush.h>
497  #include <asm/mmu_context.h>
498  #include <asm/apic.h>
499 @@ -129,6 +130,12 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
500         union smp_flush_state *f;
501  
502         cpu = smp_processor_id();
503 +
504 +#ifdef CONFIG_X86_32
505 +       if (current->active_mm)
506 +               load_user_cs_desc(cpu, current->active_mm);
507 +#endif
508 +
509         /*
510          * orig_rax contains the negated interrupt vector.
511          * Use that to determine where the sender put the data.
512 diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
513 index 58bc00f..1fdafb5 100644
514 --- a/arch/x86/vdso/vdso32-setup.c
515 +++ b/arch/x86/vdso/vdso32-setup.c
516 @@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
517         if (compat)
518                 addr = VDSO_HIGH_BASE;
519         else {
520 -               addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
521 +               addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1);
522                 if (IS_ERR_VALUE(addr)) {
523                         ret = addr;
524                         goto up_fail;
525 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
526 index 0a1700a..37b8744 100644
527 --- a/arch/x86/xen/enlighten.c
528 +++ b/arch/x86/xen/enlighten.c
529 @@ -321,6 +321,24 @@ static void xen_set_ldt(const void *addr, unsigned entries)
530         xen_mc_issue(PARAVIRT_LAZY_CPU);
531  }
532  
533 +#ifdef CONFIG_X86_32
534 +static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm)
535 +{
536 +       void *gdt;
537 +       xmaddr_t mgdt;
538 +       u64 descriptor;
539 +       struct desc_struct user_cs;
540 +
541 +       gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS];
542 +       mgdt = virt_to_machine(gdt);
543 +
544 +       user_cs = mm->context.user_cs;
545 +       descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32;
546 +
547 +       HYPERVISOR_update_descriptor(mgdt.maddr, descriptor);
548 +}
549 +#endif /*CONFIG_X86_32*/
550 +
551  static void xen_load_gdt(const struct desc_ptr *dtr)
552  {
553         unsigned long va = dtr->address;
554 @@ -886,6 +904,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
555  
556         .load_tr_desc = paravirt_nop,
557         .set_ldt = xen_set_ldt,
558 +#ifdef CONFIG_X86_32
559 +       .load_user_cs_desc = xen_load_user_cs_desc,
560 +#endif /*CONFIG_X86_32*/
561         .load_gdt = xen_load_gdt,
562         .load_idt = xen_load_idt,
563         .load_tls = xen_load_tls,
564 diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
565 index 40381df..f856fab 100644
566 --- a/fs/binfmt_elf.c
567 +++ b/fs/binfmt_elf.c
568 @@ -73,7 +73,7 @@ static struct linux_binfmt elf_format = {
569                 .hasvdso        = 1
570  };
571  
572 -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
573 +#define BAD_ADDR(x) IS_ERR_VALUE(x)
574  
575  static int set_brk(unsigned long start, unsigned long end)
576  {
577 @@ -721,6 +721,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
578                         break;
579                 }
580  
581 +       if (current->personality == PER_LINUX && (exec_shield & 2)) {
582 +               executable_stack = EXSTACK_DISABLE_X;
583 +               current->flags |= PF_RANDOMIZE;
584 +       }
585 +
586         /* Some simple consistency checks for the interpreter */
587         if (elf_interpreter) {
588                 retval = -ELIBBAD;
589 @@ -740,6 +745,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
590         if (retval)
591                 goto out_free_dentry;
592  
593 +#ifdef CONFIG_X86_32
594 +       /*
595 +        * Turn off the CS limit completely if exec-shield is disabled,
596 +        * the stack is not marked non-executable, or NX is active:
597 +        */
598 +       if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled)
599 +               arch_add_exec_range(current->mm, -1);
600 +#endif
601 +
602         /* OK, This is the point of no return */
603         current->flags &= ~PF_FORKNOEXEC;
604         current->mm->def_flags = def_flags;
605 @@ -747,7 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
606         /* Do this immediately, since STACK_TOP as used in setup_arg_pages
607            may depend on the personality.  */
608         SET_PERSONALITY(loc->elf_ex);
609 -       if (elf_read_implies_exec(loc->elf_ex, executable_stack))
610 +       if (!(exec_shield & 2) &&
611 +                       elf_read_implies_exec(loc->elf_ex, executable_stack))
612                 current->personality |= READ_IMPLIES_EXEC;
613  
614         if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
615 @@ -912,7 +927,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
616                                             interpreter,
617                                             &interp_map_addr,
618                                             load_bias);
619 -               if (!IS_ERR((void *)elf_entry)) {
620 +               if (!BAD_ADDR(elf_entry)) {
621                         /*
622                          * load_elf_interp() returns relocation
623                          * adjustment
624 diff --git a/include/linux/mm.h b/include/linux/mm.h
625 index ad613ed..08f08d0 100644
626 --- a/include/linux/mm.h
627 +++ b/include/linux/mm.h
628 @@ -1135,7 +1135,13 @@ extern int install_special_mapping(struct mm_struct *mm,
629                                    unsigned long addr, unsigned long len,
630                                    unsigned long flags, struct page **pages);
631  
632 -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
633 +extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int);
634 +
635 +static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr,
636 +               unsigned long len, unsigned long pgoff, unsigned long flags)
637 +{
638 +       return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0);
639 +}
640  
641  extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
642         unsigned long len, unsigned long prot,
643 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
644 index 0e80e26..af904ea 100644
645 --- a/include/linux/mm_types.h
646 +++ b/include/linux/mm_types.h
647 @@ -198,6 +198,9 @@ struct mm_struct {
648         unsigned long (*get_unmapped_area) (struct file *filp,
649                                 unsigned long addr, unsigned long len,
650                                 unsigned long pgoff, unsigned long flags);
651 +       unsigned long (*get_unmapped_exec_area) (struct file *filp,
652 +                               unsigned long addr, unsigned long len,
653 +                               unsigned long pgoff, unsigned long flags);
654         void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
655         unsigned long mmap_base;                /* base of mmap area */
656         unsigned long task_size;                /* size of task vm space */
657 diff --git a/include/linux/resource.h b/include/linux/resource.h
658 index 40fc7e6..68c2549 100644
659 --- a/include/linux/resource.h
660 +++ b/include/linux/resource.h
661 @@ -55,8 +55,11 @@ struct rlimit {
662  /*
663   * Limit the stack by to some sane default: root can always
664   * increase this limit if needed..  8MB seems reasonable.
665 + *
666 + * (2MB more to cover randomization effects.)
667   */
668 -#define _STK_LIM       (8*1024*1024)
669 +#define _STK_LIM       (10*1024*1024)
670 +#define EXEC_STACK_BIAS        (2*1024*1024)
671  
672  /*
673   * GPG2 wants 64kB of mlocked memory, to make sure pass phrases
674 diff --git a/include/linux/sched.h b/include/linux/sched.h
675 index 4896fdf..3513e03 100644
676 --- a/include/linux/sched.h
677 +++ b/include/linux/sched.h
678 @@ -101,6 +101,9 @@ struct fs_struct;
679  struct bts_context;
680  struct perf_counter_context;
681  
682 +extern int exec_shield;
683 +extern int print_fatal_signals;
684 +
685  /*
686   * List of flags we want to share for kernel threads,
687   * if only because they are not used by them anyway.
688 @@ -359,6 +362,10 @@ extern int sysctl_max_map_count;
689  extern unsigned long
690  arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
691                        unsigned long, unsigned long);
692 +
693 +extern unsigned long
694 +arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long,
695 +                      unsigned long, unsigned long);
696  extern unsigned long
697  arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
698                           unsigned long len, unsigned long pgoff,
699 diff --git a/kernel/sysctl.c b/kernel/sysctl.c
700 index ce664f9..1905e22 100644
701 --- a/kernel/sysctl.c
702 +++ b/kernel/sysctl.c
703 @@ -87,6 +87,26 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
704  #ifndef CONFIG_MMU
705  extern int sysctl_nr_trim_pages;
706  #endif
707 +
708 +int exec_shield = (1<<0);
709 +/* exec_shield is a bitmask:
710 + * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE
711 + * (1<<0) 1: on [also on if !=0]
712 + * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK
713 + * The old settings
714 + * (1<<2) 4: vdso just below .text of main (unless too low)
715 + * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low)
716 + * are ignored because the vdso is placed completely randomly
717 + */
718 +
719 +static int __init setup_exec_shield(char *str)
720 +{
721 +       get_option(&str, &exec_shield);
722 +
723 +       return 1;
724 +}
725 +__setup("exec-shield=", setup_exec_shield);
726 +
727  #ifdef CONFIG_RCU_TORTURE_TEST
728  extern int rcutorture_runnable;
729  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
730 @@ -382,6 +402,14 @@ static struct ctl_table kern_table[] = {
731                 .proc_handler   = &proc_dointvec,
732         },
733         {
734 +               .ctl_name       = CTL_UNNUMBERED,
735 +               .procname       = "exec-shield",
736 +               .data           = &exec_shield,
737 +               .maxlen         = sizeof(int),
738 +               .mode           = 0644,
739 +               .proc_handler   = &proc_dointvec,
740 +       },
741 +       {
742                 .ctl_name       = KERN_CORE_USES_PID,
743                 .procname       = "core_uses_pid",
744                 .data           = &core_uses_pid,
745 diff --git a/mm/mmap.c b/mm/mmap.c
746 index 34579b2..260bb3c 100644
747 --- a/mm/mmap.c
748 +++ b/mm/mmap.c
749 @@ -29,6 +29,7 @@
750  #include <linux/rmap.h>
751  #include <linux/mmu_notifier.h>
752  #include <linux/perf_counter.h>
753 +#include <linux/random.h>
754  
755  #include <asm/uaccess.h>
756  #include <asm/cacheflush.h>
757 @@ -45,6 +46,18 @@
758  #define arch_rebalance_pgtables(addr, len)             (addr)
759  #endif
760  
761 +/* No sane architecture will #define these to anything else */
762 +#ifndef arch_add_exec_range
763 +#define arch_add_exec_range(mm, limit) do { ; } while (0)
764 +#endif
765 +#ifndef arch_flush_exec_range
766 +#define arch_flush_exec_range(mm)      do { ; } while (0)
767 +#endif
768 +#ifndef arch_remove_exec_range
769 +#define arch_remove_exec_range(mm, limit)      do { ; } while (0)
770 +#endif
771 +
772 +
773  static void unmap_region(struct mm_struct *mm,
774                 struct vm_area_struct *vma, struct vm_area_struct *prev,
775                 unsigned long start, unsigned long end);
776 @@ -392,6 +405,8 @@ static inline void
777  __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
778                 struct vm_area_struct *prev, struct rb_node *rb_parent)
779  {
780 +       if (vma->vm_flags & VM_EXEC)
781 +               arch_add_exec_range(mm, vma->vm_end);
782         if (prev) {
783                 vma->vm_next = prev->vm_next;
784                 prev->vm_next = vma;
785 @@ -494,6 +509,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
786         rb_erase(&vma->vm_rb, &mm->mm_rb);
787         if (mm->mmap_cache == vma)
788                 mm->mmap_cache = prev;
789 +       if (vma->vm_flags & VM_EXEC)
790 +               arch_remove_exec_range(mm, vma->vm_end);
791  }
792  
793  /*
794 @@ -803,6 +820,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
795                 } else                                  /* cases 2, 5, 7 */
796                         vma_adjust(prev, prev->vm_start,
797                                 end, prev->vm_pgoff, NULL);
798 +               if (prev->vm_flags & VM_EXEC)
799 +                       arch_add_exec_range(mm, prev->vm_end);
800                 return prev;
801         }
802  
803 @@ -957,7 +976,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
804         /* Obtain the address to map to. we verify (or select) it and ensure
805          * that it represents a valid section of the address space.
806          */
807 -       addr = get_unmapped_area(file, addr, len, pgoff, flags);
808 +       addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
809 +               prot & PROT_EXEC);
810         if (addr & ~PAGE_MASK)
811                 return addr;
812  
813 @@ -1442,13 +1462,17 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
814  }
815  
816  unsigned long
817 -get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
818 -               unsigned long pgoff, unsigned long flags)
819 +get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
820 +               unsigned long pgoff, unsigned long flags, int exec)
821  {
822         unsigned long (*get_area)(struct file *, unsigned long,
823                                   unsigned long, unsigned long, unsigned long);
824  
825 -       get_area = current->mm->get_unmapped_area;
826 +       if (exec && current->mm->get_unmapped_exec_area)
827 +               get_area = current->mm->get_unmapped_exec_area;
828 +       else
829 +               get_area = current->mm->get_unmapped_area;
830 +
831         if (file && file->f_op && file->f_op->get_unmapped_area)
832                 get_area = file->f_op->get_unmapped_area;
833         addr = get_area(file, addr, len, pgoff, flags);
834 @@ -1462,8 +1486,76 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
835  
836         return arch_rebalance_pgtables(addr, len);
837  }
838 +EXPORT_SYMBOL(get_unmapped_area_prot);
839 +
840 +#define SHLIB_BASE     0x00110000
841 +/* Prefer a low, randomized address below brk for an executable mapping. */
842 +unsigned long
843 +arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0,
844 +               unsigned long len0, unsigned long pgoff, unsigned long flags)
845 +{
846 +       unsigned long addr = addr0, len = len0;
847 +       struct mm_struct *mm = current->mm;
848 +       struct vm_area_struct *vma;
849 +       unsigned long tmp;
850 +
851 +       if (len > TASK_SIZE)
852 +               return -ENOMEM;
853 +
854 +       if (flags & MAP_FIXED)
855 +               return addr;
856 +
857 +       if (!addr)
858 +               addr = randomize_range(SHLIB_BASE, 0x01000000, len);
859 +
860 +       if (addr) {
861 +               addr = PAGE_ALIGN(addr);
862 +               vma = find_vma(mm, addr);
863 +               if (TASK_SIZE - len >= addr &&
864 +                   (!vma || addr + len <= vma->vm_start))
865 +                       return addr;
866 +       }
867 +
868 +       addr = SHLIB_BASE;
869 +       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
870 +               /* At this point:  (!vma || addr < vma->vm_end). */
871 +               if (TASK_SIZE - len < addr)
872 +                       return -ENOMEM;
873 +
874 +               if (!vma || addr + len <= vma->vm_start) {
875 +                       /*
876 +                        * Must not let a PROT_EXEC mapping get into the
877 +                        * brk area; use the mm's regular allocator instead:
878 +                        */
879 +                       if (addr + len > mm->brk)
880 +                               goto failed;
881 +
882 +                       /*
883 +                        * Up until the brk area we randomize addresses as much
884 +                        * as possible; tmp == 0 means randomize_range() failed:
885 +                        */
886 +                       if (addr >= 0x01000000) {
887 +                               tmp = randomize_range(0x01000000,
888 +                                       PAGE_ALIGN(max(mm->start_brk,
889 +                                       (unsigned long)0x08000000)), len);
890 +                               vma = find_vma(mm, tmp);
891 +                               if (tmp && TASK_SIZE - len >= tmp &&
892 +                                   (!vma || tmp + len <= vma->vm_start))
893 +                                       return tmp;
894 +                       }
895 +                       /*
896 +                        * Ok, randomization didn't work out - return
897 +                        * the result of the linear search:
898 +                        */
899 +                       return addr;
900 +               }
901 +               addr = vma->vm_end;
902 +       }
903 +
904 +failed:
905 +       return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags);
906 +}
907  
908 -EXPORT_SYMBOL(get_unmapped_area);
909  
910  /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
911  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
912 @@ -1538,6 +1630,14 @@ out:
913         return prev ? prev->vm_next : vma;
914  }
915  
916 +static int over_stack_limit(unsigned long sz)
917 +{
918 +       unsigned long limit = current->signal->rlim[RLIMIT_STACK].rlim_cur;
919 +
920 +       /* The first EXEC_STACK_BIAS bytes are exempt from RLIMIT_STACK. */
921 +       return sz >= EXEC_STACK_BIAS && sz - EXEC_STACK_BIAS > limit;
922 +}
923 +
924  /*
925   * Verify that the stack growth is acceptable and
926   * update accounting. This is shared with both the
927 @@ -1554,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
928                 return -ENOMEM;
929  
930         /* Stack limit test */
931 -       if (size > rlim[RLIMIT_STACK].rlim_cur)
932 +       if (over_stack_limit(size))
933                 return -ENOMEM;
934  
935         /* mlock limit tests */
936 @@ -1864,10 +1964,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
937         if (new->vm_ops && new->vm_ops->open)
938                 new->vm_ops->open(new);
939  
940 -       if (new_below)
941 +       if (new_below) {
942 +               unsigned long old_end = vma->vm_end;
943 +
944                 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
945                         ((addr - new->vm_start) >> PAGE_SHIFT), new);
946 -       else
947 +               if (vma->vm_flags & VM_EXEC)
948 +                       arch_remove_exec_range(mm, old_end);
949 +       } else
950                 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
951  
952         return 0;
953 @@ -2116,6 +2220,7 @@ void exit_mmap(struct mm_struct *mm)
954         vm_unacct_memory(nr_accounted);
955         free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
956         tlb_finish_mmu(tlb, 0, end);
957 +       arch_flush_exec_range(mm);
958  }
959  
960  /*
961 diff --git a/mm/mprotect.c b/mm/mprotect.c
962 index d80311b..032423d 100644
963 --- a/mm/mprotect.c
964 +++ b/mm/mprotect.c
965 @@ -26,9 +26,14 @@
966  #include <linux/perf_counter.h>
967  #include <asm/uaccess.h>
968  #include <asm/pgtable.h>
969 +#include <asm/pgalloc.h>
970  #include <asm/cacheflush.h>
971  #include <asm/tlbflush.h>
972  
973 +#ifndef arch_remove_exec_range /* no-op fallback if the arch has no hook */
974 +#define arch_remove_exec_range(mm, limit)      do { ; } while (0)
975 +#endif
976 +
977  #ifndef pgprot_modify
978  static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
979  {
980 @@ -139,7 +144,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
981         struct mm_struct *mm = vma->vm_mm;
982         unsigned long oldflags = vma->vm_flags;
983         long nrpages = (end - start) >> PAGE_SHIFT;
984 -       unsigned long charged = 0;
985 +       unsigned long charged = 0, old_end = vma->vm_end;
986         pgoff_t pgoff;
987         int error;
988         int dirty_accountable = 0;
989 @@ -204,6 +209,9 @@ success:
990                 dirty_accountable = 1;
991         }
992  
993 +       if (oldflags & VM_EXEC)
994 +               arch_remove_exec_range(current->mm, old_end);
995 +
996         mmu_notifier_invalidate_range_start(mm, start, end);
997         if (is_vm_hugetlb_page(vma))
998                 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
999 diff --git a/mm/mremap.c b/mm/mremap.c
1000 index a39b7b9..6bebfde 100644
1001 --- a/mm/mremap.c
1002 +++ b/mm/mremap.c
1003 @@ -400,8 +400,8 @@ unsigned long do_mremap(unsigned long addr,
1004                         if (vma->vm_flags & VM_MAYSHARE)
1005                                 map_flags |= MAP_SHARED;
1006  
1007 -                       new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1008 -                                               vma->vm_pgoff, map_flags);
1009 +                       new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len,
1010 +                               vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC);
1011                         if (new_addr & ~PAGE_MASK) {
1012                                 ret = new_addr;
1013                                 goto out;