vserver 1.9.5.x5
[linux-2.6.git] / arch / um / kernel / tt / process_kern.c
1 /* 
2  * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
3  * Licensed under the GPL
4  */
5
6 #include "linux/sched.h"
7 #include "linux/signal.h"
8 #include "linux/kernel.h"
9 #include "linux/interrupt.h"
10 #include "linux/ptrace.h"
11 #include "asm/system.h"
12 #include "asm/pgalloc.h"
13 #include "asm/ptrace.h"
14 #include "asm/tlbflush.h"
15 #include "irq_user.h"
16 #include "signal_user.h"
17 #include "kern_util.h"
18 #include "user_util.h"
19 #include "os.h"
20 #include "kern.h"
21 #include "sigcontext.h"
22 #include "time_user.h"
23 #include "mem_user.h"
24 #include "tlb.h"
25 #include "mode.h"
26 #include "init.h"
27 #include "tt.h"
28
/* Context switch between two UML tasks in tt (tracing thread) mode.
 * Each task is backed by its own host process; a switch wakes the
 * incoming task by writing a byte to its switch_pipe, then blocks the
 * outgoing task by reading from its own switch_pipe until it is next
 * scheduled.  Returns the previously running task (consumed by
 * schedule_tail via thread.prev_sched).
 */
void *switch_to_tt(void *prev, void *next, void *last)
{
	struct task_struct *from, *to, *prev_sched;
	unsigned long flags;
	int err, vtalrm, alrm, prof, cpu;
	char c;
	/* jailing and SMP are incompatible, so this doesn't need to be 
	 * made per-cpu 
	 */
	static int reading;

	from = prev;
	to = next;

	to->thread.prev_sched = from;

	/* Redirect interrupt delivery to the incoming task's host process.
	 * NOTE(review): interrupts appear to be forwarded only from CPU 0's
	 * process (plus IPIs on SMP) - confirm against forward_interrupts().
	 */
	cpu = from->thread_info->cpu;
	if(cpu == 0)
		forward_interrupts(to->thread.mode.tt.extern_pid);
#ifdef CONFIG_SMP
	forward_ipi(cpu_data[cpu].ipi_pipe[0], to->thread.mode.tt.extern_pid);
#endif
	local_irq_save(flags);

	/* Block the timer signals across the switch; their previous state
	 * is restored below.
	 */
	vtalrm = change_sig(SIGVTALRM, 0);
	alrm = change_sig(SIGALRM, 0);
	prof = change_sig(SIGPROF, 0);

	forward_pending_sigio(to->thread.mode.tt.extern_pid);

	c = 0;
	set_current(to);

	/* Wake the incoming process by writing a byte to its pipe... */
	reading = 0;
	err = os_write_file(to->thread.mode.tt.switch_pipe[1], &c, sizeof(c));
	if(err != sizeof(c))
		panic("write of switch_pipe failed, err = %d", -err);

	/* ...and sleep on our own pipe until we are rescheduled.  If this
	 * task has already exited, kill its host process instead of
	 * letting it sleep.
	 */
	reading = 1;
	if((from->exit_state == EXIT_ZOMBIE) ||
	   (from->exit_state == EXIT_DEAD))
		os_kill_process(os_getpid(), 0);

	err = os_read_file(from->thread.mode.tt.switch_pipe[0], &c, sizeof(c));
	if(err != sizeof(c))
		panic("read of switch_pipe failed, errno = %d", -err);

	/* If the process that we have just scheduled away from has exited,
	 * then it needs to be killed here.  The reason is that, even though
	 * it will kill itself when it next runs, that may be too late.  Its
	 * stack will be freed, possibly before then, and if that happens,
	 * we have a use-after-free situation.  So, it gets killed here
	 * in case it has not already killed itself.
	 */
	prev_sched = current->thread.prev_sched;
	if((prev_sched->exit_state == EXIT_ZOMBIE) ||
	   (prev_sched->exit_state == EXIT_DEAD))
		os_kill_process(prev_sched->thread.mode.tt.extern_pid, 1);

	/* This works around a nasty race with 'jail'.  If we are switching
	 * between two threads of a threaded app and the incoming process 
	 * runs before the outgoing process reaches the read, and it makes
	 * it all the way out to userspace, then it will have write-protected 
	 * the outgoing process stack.  Then, when the outgoing process 
	 * returns from the write, it will segfault because it can no longer
	 * write its own stack.  So, in order to avoid that, the incoming 
	 * thread sits in a loop yielding until 'reading' is set.  This 
	 * isn't entirely safe, since there may be a reschedule from a timer
	 * happening between setting 'reading' and sleeping in read.  But,
	 * it should get a whole quantum in which to reach the read and sleep,
	 * which should be enough.
	 */

	if(jail){
		while(!reading) sched_yield();
	}

	change_sig(SIGVTALRM, vtalrm);
	change_sig(SIGALRM, alrm);
	change_sig(SIGPROF, prof);

	arch_switch();

	flush_tlb_all();
	local_irq_restore(flags);

	return(current->thread.prev_sched);
}
117
118 void release_thread_tt(struct task_struct *task)
119 {
120         int pid = task->thread.mode.tt.extern_pid;
121
122         if(os_getpid() != pid)
123                 os_kill_process(pid, 0);
124 }
125
126 void exit_thread_tt(void)
127 {
128         os_close_file(current->thread.mode.tt.switch_pipe[0]);
129         os_close_file(current->thread.mode.tt.switch_pipe[1]);
130 }
131
/* Park a freshly created thread: stop its host process, then block on
 * the given switch-pipe fd until the scheduler wakes it with a one-byte
 * write.
 */
void suspend_new_thread(int fd)
{
	char dummy;
	int n;

	os_stop_process(os_getpid());

	n = os_read_file(fd, &dummy, sizeof(dummy));
	if(n != sizeof(dummy))
		panic("read failed in suspend_new_thread, err = %d", -n);
}
142
143 void schedule_tail(task_t *prev);
144
/* SIGUSR1 handler in which a new kernel thread starts life (raised by
 * new_thread_proc against itself).  Waits until first scheduled, then
 * completes the scheduling handover and runs the requested function.
 */
static void new_thread_handler(int sig)
{
	unsigned long disable;
	int (*fn)(void *);
	void *arg;

	/* The thread's entry point and argument were stashed in the request
	 * union by the creator (copied in copy_thread_tt).
	 */
	fn = current->thread.request.u.thread.proc;
	arg = current->thread.request.u.thread.arg;

	/* Record the sigcontext location (assumed to sit just above the
	 * handler argument on the stack) and clear the timer and I/O
	 * signals from the saved mask - new_thread_proc left them blocked.
	 */
	UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1);
	disable = (1 << (SIGVTALRM - 1)) | (1 << (SIGALRM - 1)) |
		(1 << (SIGIO - 1)) | (1 << (SIGPROF - 1));
	SC_SIGMASK(UPT_SC(&current->thread.regs.regs)) &= ~disable;

	/* Sleep until the scheduler picks this thread for the first time. */
	suspend_new_thread(current->thread.mode.tt.switch_pipe[0]);

	force_flush_all();
	if(current->thread.prev_sched != NULL)
		schedule_tail(current->thread.prev_sched);
	current->thread.prev_sched = NULL;

	init_new_thread_signals(1);
	enable_timer();
	/* The temporary stack used during creation is no longer needed. */
	free_page(current->thread.temp_stack);
	set_cmdline("(kernel thread)");

	change_sig(SIGUSR1, 1);
	change_sig(SIGVTALRM, 1);
	change_sig(SIGPROF, 1);
	local_irq_enable();
	if(!run_kernel_thread(fn, arg, &current->thread.exec_buf))
		do_exit(0);

	/* XXX No set_user_mode here because a newly execed process will
	 * immediately segfault on its non-existent IP, coming straight back
	 * to the signal handler, which will call set_user_mode on its way
	 * out.  This should probably change since it's confusing.
	 */
}
184
/* Entry point of the host process created for a new kernel thread.
 * Arranges for new_thread_handler to run on the given stack via a
 * self-inflicted SIGUSR1.
 */
static int new_thread_proc(void *stack)
{
	/* local_irq_disable is needed to block out signals until this thread is
	 * properly scheduled.  Otherwise, the tracing thread will get mighty
	 * upset about any signals that arrive before that.
	 * This has the complication that it sets the saved signal mask in
	 * the sigcontext to block signals.  This gets restored when this
	 * thread (or a descendant, since they get a copy of this sigcontext)
	 * returns to userspace.
	 * So, this is compensated for elsewhere.
	 * XXX There is still a small window until local_irq_disable() actually
	 * finishes where signals are possible - shouldn't be a problem in
	 * practice since SIGIO hasn't been forwarded here yet, and the
	 * local_irq_disable should finish before a SIGVTALRM has time to be
	 * delivered.
	 */

	local_irq_disable();
	init_new_thread_stack(stack, new_thread_handler);
	os_usr1_process(os_getpid());
	/* Unblock SIGUSR1 so the queued signal is delivered, dropping us
	 * into new_thread_handler.
	 */
	change_sig(SIGUSR1, 1);
	return(0);
}
208
209 /* Signal masking - signals are blocked at the start of fork_tramp.  They
210  * are re-enabled when finish_fork_handler is entered by fork_tramp hitting
211  * itself with a SIGUSR1.  set_user_mode has to be run with SIGUSR1 off,
212  * so it is blocked before it's called.  They are re-enabled on sigreturn
213  * despite the fact that they were blocked when the SIGUSR1 was issued because
214  * copy_thread copies the parent's sigcontext, including the signal mask
215  * onto the signal frame.
216  */
217
/* SIGUSR1 handler in which a newly forked process starts life (raised by
 * fork_tramp against itself - see the signal masking comment above).
 * Waits until first scheduled, completes the scheduling handover, sets up
 * memory protections, and drops back to user mode.
 */
void finish_fork_handler(int sig)
{
	/* The sigcontext is assumed to sit just above the handler's
	 * argument on the stack.
	 */
	UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1);
	/* Sleep until the scheduler runs this process for the first time. */
	suspend_new_thread(current->thread.mode.tt.switch_pipe[0]);

	force_flush_all();
	if(current->thread.prev_sched != NULL)
		schedule_tail(current->thread.prev_sched);
	current->thread.prev_sched = NULL;

	enable_timer();
	change_sig(SIGVTALRM, 1);
	local_irq_enable();
	/* A separate address space (fork rather than clone(CLONE_VM))
	 * gets its own kernel-memory protection set up here.
	 * NOTE(review): confirm protect_memory()'s flag order (r, w, x?).
	 */
	if(current->mm != current->parent->mm)
		protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 
			       1, 0, 1);
	task_protections((unsigned long) current_thread);

	/* The temporary stack used during fork is no longer needed. */
	free_page(current->thread.temp_stack);
	local_irq_disable();
	/* SIGUSR1 must be blocked around set_user_mode - see the comment
	 * block above this function.
	 */
	change_sig(SIGUSR1, 0);
	set_user_mode(current);
}
241
/* Entry point of the host process created by fork.  Runs on a temporary
 * stack and arranges for finish_fork_handler to run on the given stack
 * via a self-inflicted SIGUSR1.
 */
int fork_tramp(void *stack)
{
	/* Block signals until this process is properly scheduled - see the
	 * equivalent comment in new_thread_proc.
	 */
	local_irq_disable();
	arch_init_thread();
	init_new_thread_stack(stack, finish_fork_handler);

	os_usr1_process(os_getpid());
	/* Unblock SIGUSR1 so the queued signal is delivered, dropping us
	 * into finish_fork_handler.
	 */
	change_sig(SIGUSR1, 1);
	return(0);
}
252
253 int copy_thread_tt(int nr, unsigned long clone_flags, unsigned long sp,
254                    unsigned long stack_top, struct task_struct * p, 
255                    struct pt_regs *regs)
256 {
257         int (*tramp)(void *);
258         int new_pid, err;
259         unsigned long stack;
260         
261         if(current->thread.forking)
262                 tramp = fork_tramp;
263         else {
264                 tramp = new_thread_proc;
265                 p->thread.request.u.thread = current->thread.request.u.thread;
266         }
267
268         err = os_pipe(p->thread.mode.tt.switch_pipe, 1, 1);
269         if(err < 0){
270                 printk("copy_thread : pipe failed, err = %d\n", -err);
271                 return(err);
272         }
273
274         stack = alloc_stack(0, 0);
275         if(stack == 0){
276                 printk(KERN_ERR "copy_thread : failed to allocate "
277                        "temporary stack\n");
278                 return(-ENOMEM);
279         }
280
281         clone_flags &= CLONE_VM;
282         p->thread.temp_stack = stack;
283         new_pid = start_fork_tramp(p->thread_info, stack, clone_flags, tramp);
284         if(new_pid < 0){
285                 printk(KERN_ERR "copy_thread : clone failed - errno = %d\n", 
286                        -new_pid);
287                 return(new_pid);
288         }
289
290         if(current->thread.forking){
291                 sc_to_sc(UPT_SC(&p->thread.regs.regs), 
292                          UPT_SC(&current->thread.regs.regs));
293                 SC_SET_SYSCALL_RETURN(UPT_SC(&p->thread.regs.regs), 0);
294                 if(sp != 0) SC_SP(UPT_SC(&p->thread.regs.regs)) = sp;
295         }
296         p->thread.mode.tt.extern_pid = new_pid;
297
298         current->thread.request.op = OP_FORK;
299         current->thread.request.u.fork.pid = new_pid;
300         os_usr1_process(os_getpid());
301
302         /* Enable the signal and then disable it to ensure that it is handled
303          * here, and nowhere else.
304          */
305         change_sig(SIGUSR1, 1);
306
307         change_sig(SIGUSR1, 0);
308         err = 0;
309         return(err);
310 }
311
312 void reboot_tt(void)
313 {
314         current->thread.request.op = OP_REBOOT;
315         os_usr1_process(os_getpid());
316         change_sig(SIGUSR1, 1);
317 }
318
319 void halt_tt(void)
320 {
321         current->thread.request.op = OP_HALT;
322         os_usr1_process(os_getpid());
323         change_sig(SIGUSR1, 1);
324 }
325
326 void kill_off_processes_tt(void)
327 {
328         struct task_struct *p;
329         int me;
330
331         me = os_getpid();
332         for_each_process(p){
333                 if(p->thread.mode.tt.extern_pid != me) 
334                         os_kill_process(p->thread.mode.tt.extern_pid, 0);
335         }
336         if(init_task.thread.mode.tt.extern_pid != me) 
337                 os_kill_process(init_task.thread.mode.tt.extern_pid, 0);
338 }
339
340 void initial_thread_cb_tt(void (*proc)(void *), void *arg)
341 {
342         if(os_getpid() == tracing_pid){
343                 (*proc)(arg);
344         }
345         else {
346                 current->thread.request.op = OP_CB;
347                 current->thread.request.u.cb.proc = proc;
348                 current->thread.request.u.cb.arg = arg;
349                 os_usr1_process(os_getpid());
350                 change_sig(SIGUSR1, 1);
351
352                 change_sig(SIGUSR1, 0);
353         }
354 }
355
356 int do_proc_op(void *t, int proc_id)
357 {
358         struct task_struct *task;
359         struct thread_struct *thread;
360         int op, pid;
361
362         task = t;
363         thread = &task->thread;
364         op = thread->request.op;
365         switch(op){
366         case OP_NONE:
367         case OP_TRACE_ON:
368                 break;
369         case OP_EXEC:
370                 pid = thread->request.u.exec.pid;
371                 do_exec(thread->mode.tt.extern_pid, pid);
372                 thread->mode.tt.extern_pid = pid;
373                 cpu_tasks[task->thread_info->cpu].pid = pid;
374                 break;
375         case OP_FORK:
376                 attach_process(thread->request.u.fork.pid);
377                 break;
378         case OP_CB:
379                 (*thread->request.u.cb.proc)(thread->request.u.cb.arg);
380                 break;
381         case OP_REBOOT:
382         case OP_HALT:
383                 break;
384         default:
385                 tracer_panic("Bad op in do_proc_op");
386                 break;
387         }
388         thread->request.op = OP_NONE;
389         return(op);
390 }
391
/* Idle setup for tt mode - nothing special is needed, so go straight
 * into the default idle loop.
 */
void init_idle_tt(void)
{
	default_idle();
}
396
/* Nonzero when kernel memory is protected from processes (see
 * mprotect_kernel_mem below).  Changed by jail_setup, which is a setup.
 */
int jail = 0;
399
400 int __init jail_setup(char *line, int *add)
401 {
402         int ok = 1;
403
404         if(jail) return(0);
405 #ifdef CONFIG_SMP
406         printf("'jail' may not used used in a kernel with CONFIG_SMP "
407                "enabled\n");
408         ok = 0;
409 #endif
410 #ifdef CONFIG_HOSTFS
411         printf("'jail' may not used used in a kernel with CONFIG_HOSTFS "
412                "enabled\n");
413         ok = 0;
414 #endif
415 #ifdef CONFIG_MODULES
416         printf("'jail' may not used used in a kernel with CONFIG_MODULES "
417                "enabled\n");
418         ok = 0;
419 #endif  
420         if(!ok) exit(1);
421
422         /* CAP_SYS_RAWIO controls the ability to open /dev/mem and /dev/kmem.
423          * Removing it from the bounding set eliminates the ability of anything
424          * to acquire it, and thus read or write kernel memory.
425          */
426         cap_lower(cap_bset, CAP_SYS_RAWIO);
427         jail = 1;
428         return(0);
429 }
430
/* Register the "jail" command line option and its help text. */
__uml_setup("jail", jail_setup,
"jail\n"
"    Enables the protection of kernel memory from processes.\n\n"
);
435
/* Change the protection of kernel memory for the current process.  'w'
 * is passed straight through to protect_memory - presumably the write
 * permission (unprotect_kernel_mem passes 1, protect_kernel_mem passes
 * 0; confirm against protect_memory's signature).  No-op unless 'jail'
 * is enabled, and never applied to init_task.
 */
static void mprotect_kernel_mem(int w)
{
	unsigned long start, end;
	int pages;

	if(!jail || (current == &init_task)) return;

	pages = (1 << CONFIG_KERNEL_STACK_ORDER);

	/* Physical memory on both sides of the current kernel stack.  The
	 * stack's first page is excluded from the writable hole -
	 * NOTE(review): presumably because it holds thread_info; confirm.
	 */
	start = (unsigned long) current_thread + PAGE_SIZE;
	end = (unsigned long) current_thread + PAGE_SIZE * pages;
	protect_memory(uml_reserved, start - uml_reserved, 1, w, 1, 1);
	protect_memory(end, high_physmem - end, 1, w, 1, 1);

	/* Kernel text. */
	start = (unsigned long) UML_ROUND_DOWN(&_stext);
	end = (unsigned long) UML_ROUND_UP(&_etext);
	protect_memory(start, end - start, 1, w, 1, 1);

	/* Initialized data above the deliberately unprotected region. */
	start = (unsigned long) UML_ROUND_DOWN(&_unprotected_end);
	end = (unsigned long) UML_ROUND_UP(&_edata);
	protect_memory(start, end - start, 1, w, 1, 1);

	/* BSS through the start of the kernel brk. */
	start = (unsigned long) UML_ROUND_DOWN(&__bss_start);
	end = (unsigned long) UML_ROUND_UP(brk_start);
	protect_memory(start, end - start, 1, w, 1, 1);

	/* And the kernel's vm-area mappings. */
	mprotect_kernel_vm(w);
}
464
/* Make kernel memory accessible to the current process (jail mode). */
void unprotect_kernel_mem(void)
{
	mprotect_kernel_mem(1);
}
469
/* Protect kernel memory from the current process (jail mode). */
void protect_kernel_mem(void)
{
	mprotect_kernel_mem(0);
}
474
475 extern void start_kernel(void);
476
477 static int start_kernel_proc(void *unused)
478 {
479         int pid;
480
481         block_signals();
482         pid = os_getpid();
483
484         cpu_tasks[0].pid = pid;
485         cpu_tasks[0].task = current;
486 #ifdef CONFIG_SMP
487         cpu_online_map = cpumask_of_cpu(0);
488 #endif
489         if(debug) os_stop_process(pid);
490         start_kernel();
491         return(0);
492 }
493
494 void set_tracing(void *task, int tracing)
495 {
496         ((struct task_struct *) task)->thread.mode.tt.tracing = tracing;
497 }
498
499 int is_tracing(void *t)
500 {
501         return (((struct task_struct *) t)->thread.mode.tt.tracing);
502 }
503
504 int set_user_mode(void *t)
505 {
506         struct task_struct *task;
507
508         task = t ? t : current;
509         if(task->thread.mode.tt.tracing) 
510                 return(1);
511         task->thread.request.op = OP_TRACE_ON;
512         os_usr1_process(os_getpid());
513         return(0);
514 }
515
516 void set_init_pid(int pid)
517 {
518         int err;
519
520         init_task.thread.mode.tt.extern_pid = pid;
521         err = os_pipe(init_task.thread.mode.tt.switch_pipe, 1, 1);
522         if(err)
523                 panic("Can't create switch pipe for init_task, errno = %d",
524                       -err);
525 }
526
527 int start_uml_tt(void)
528 {
529         void *sp;
530         int pages;
531
532         pages = (1 << CONFIG_KERNEL_STACK_ORDER);
533         sp = (void *) ((unsigned long) init_task.thread_info) +
534                 pages * PAGE_SIZE - sizeof(unsigned long);
535         return(tracer(start_kernel_proc, sp));
536 }
537
/* Return the host pid of the process backing this task. */
int external_pid_tt(struct task_struct *task)
{
	return(task->thread.mode.tt.extern_pid);
}
542
/* In tt mode each task IS its host process, so the thread pid is the
 * same extern_pid that external_pid_tt reports.
 */
int thread_pid_tt(struct task_struct *task)
{
	return(task->thread.mode.tt.extern_pid);
}
547
548 int is_valid_pid(int pid)
549 {
550         struct task_struct *task;
551
552         read_lock(&tasklist_lock);
553         for_each_process(task){
554                 if(task->thread.mode.tt.extern_pid == pid){
555                         read_unlock(&tasklist_lock);
556                         return(1);
557                 }
558         }
559         read_unlock(&tasklist_lock);
560         return(0);
561 }
562
563 /*
564  * Overrides for Emacs so that we follow Linus's tabbing style.
565  * Emacs will notice this stuff at the end of the file and automatically
566  * adjust the settings for this buffer only.  This must remain at the end
567  * of the file.
568  * ---------------------------------------------------------------------------
569  * Local variables:
570  * c-file-style: "linux"
571  * End:
572  */