This commit was manufactured by cvs2svn to create tag
[linux-2.6.git] / kernel / ckrm / rbce / rbcemod_ext.c
1 /* Data Collection Extension to Rule-based Classification Engine (RBCE) module
2  *
3  * Copyright (C) Hubertus Franke, IBM Corp. 2003
4  *
5  * Extension to be included into RBCE to collect delay and sample information
6  * requires user daemon <crbcedmn> to activate.
7  *
8  * Latest version, more details at http://ckrm.sf.net
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  */
16
17 /*******************************************************************************
18  *
19  *   User-Kernel Communication Channel (UKCC)
20  *   Protocol and communication handling
21  *
22  ******************************************************************************/
23
24 #include <linux/relayfs_fs.h>
25
/* Address of the sample member inside pdata's ext_data. */
#define PSAMPLE(pdata)    (&((pdata)->ext_data.sample))

/* Relay channel geometry: sub-buffer count, bytes per sub-buffer, total. */
#define UKCC_N_SUB_BUFFERS     (4)
#define UKCC_SUB_BUFFER_SIZE   (1<<15)
#define UKCC_TOTAL_BUFFER_SIZE (UKCC_N_SUB_BUFFERS * UKCC_SUB_BUFFER_SIZE)

/*
 * Debugging aid only.  When non-zero the module keeps sending data through
 * the UKCC whenever space frees up, instead of going into the recovery
 * driven mode.
 */
#define CHANNEL_AUTO_CONT  0
36
/* Protocol state of the user-kernel communication channel (UKCC). */
enum ukcc_state {
        UKCC_OK = 0,            /* channel active, records are written */
        UKCC_STANDBY = 1,       /* idle until the user daemon (re)starts us */
        UKCC_FULL = 2           /* last write failed, "full" notice pending */
};
42
43 int ukcc_channel = -1;
44 static enum ukcc_state chan_state = UKCC_STANDBY;
45
46 inline static int ukcc_ok(void)
47 {
48         return (chan_state == UKCC_OK);
49 }
50
51 static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len);
52 static void client_attached(void);
53 static void client_detached(void);
54
55 static int ukcc_fileop_notify(int rchan_id,
56                               struct file *filp, enum relay_fileop fileop)
57 {
58         static int readers = 0;
59         if (fileop == RELAY_FILE_OPEN) {
60                 // printk("got fileop_notify RELAY_FILE_OPEN for file %p\n", 
61                 //              filp);
62                 if (readers) {
63                         printk("only one client allowed, backoff .... \n");
64                         return -EPERM;
65                 }
66                 if (!try_module_get(THIS_MODULE))
67                         return -EPERM;
68                 readers++;
69                 client_attached();
70
71         } else if (fileop == RELAY_FILE_CLOSE) {
72                 // printk("got fileop_notify RELAY_FILE_CLOSE for file %p\n", 
73                 //              filp);
74                 client_detached();
75                 readers--;
76                 module_put(THIS_MODULE);
77         }
78         return 0;
79 }
80
81 static int create_ukcc_channel(void)
82 {
83         static struct rchan_callbacks ukcc_callbacks = {
84                 .buffer_start = NULL,
85                 .buffer_end = NULL,
86                 .deliver = NULL,
87                 .user_deliver = ukcc_cmd_deliver,
88                 .needs_resize = NULL,
89                 .fileop_notify = ukcc_fileop_notify,
90         };
91
92         u32 channel_flags =
93             RELAY_USAGE_GLOBAL | RELAY_SCHEME_ANY | RELAY_TIMESTAMP_ANY;
94
95         // notify on subbuffer full (through poll)
96         channel_flags |= RELAY_DELIVERY_BULK;
97         // channel_flags     |= RELAY_DELIVERY_PACKET;
98         // avoid overwrite, otherwise recovery will be nasty...
99         channel_flags |= RELAY_MODE_NO_OVERWRITE;
100
101         ukcc_channel = relay_open(CRBCE_UKCC_NAME,
102                                   UKCC_SUB_BUFFER_SIZE,
103                                   UKCC_N_SUB_BUFFERS,
104                                   channel_flags,
105                                   &ukcc_callbacks, 0, 0, 0, 0, 0, 0, NULL, 0);
106         if (ukcc_channel < 0)
107                 printk("crbce: ukcc creation failed, errcode: %d\n",
108                        ukcc_channel);
109         else
110                 printk("crbce: ukcc created (%u KB)\n",
111                        UKCC_TOTAL_BUFFER_SIZE >> 10);
112         return ukcc_channel;
113 }
114
115 static inline void close_ukcc_channel(void)
116 {
117         if (ukcc_channel >= 0) {
118                 relay_close(ukcc_channel);
119                 ukcc_channel = -1;
120                 chan_state = UKCC_STANDBY;
121         }
122 }
123
/* Fill in the record type and pid of record r's header. */
#define rec_set_hdr(r,t,p)                                              \
        ((r)->hdr.type = (t), (r)->hdr.pid = (p))

/* As rec_set_hdr, plus the current jiffies and the class c (as a ulong). */
#define rec_set_timehdr(r,t,p,c)                                        \
        (rec_set_hdr(r,t,p),                                            \
         (r)->hdr.jiffies = jiffies,                                    \
         (r)->hdr.cls = (unsigned long)(c))
127
128 #if CHANNEL_AUTO_CONT
129
/* We only provide this for debugging.  It allows us to send records
 * based on availability in the channel when the UKCC stalls, rather
 * than going through the UKCC recovery protocol.
 */
134
/*
 * Debug variant of rec_send_len: always attempt the write of l bytes of
 * record r, set chan_state to UKCC_OK or UKCC_STANDBY according to the
 * outcome, and log every transition between the two.
 *
 * Every continuation backslash must be the very last character on its
 * line; a blank after it is not a portable line splice.
 */
#define rec_send_len(r,l)                                               \
        do {                                                            \
                int chan_wasok = (chan_state == UKCC_OK);               \
                int chan_isok = (relay_write(ukcc_channel,              \
                                             (r),(l),-1,NULL) > 0);     \
                chan_state = chan_isok ? UKCC_OK : UKCC_STANDBY;        \
                if (chan_wasok && !chan_isok) {                         \
                        printk("Channel stalled\n");                    \
                } else if (!chan_wasok && chan_isok) {                  \
                        printk("Channel continues\n");                  \
                }                                                       \
        } while (0)

/* Send the whole record r. */
#define rec_send(r)     rec_send_len(r,sizeof(*(r)))
149
150 #else
151
/* Default UKCC channel protocol.
 * Though a UKCC buffer overflow should never happen, it is possible iff
 * the user daemon stops reading for some reason. Hence we provide a simple
 * protocol based on 3 states:
 *     UKCC_OK      :=  channel is active and working properly. When a channel
 *                      write fails we move to state UKCC_FULL.
 *     UKCC_FULL    :=  channel is active, but the last rec_send has failed. As
 *                      a result we will try to send an indication to the daemon
 *                      that this has happened. When that succeeds, we move to
 *                      state UKCC_STANDBY.
 *     UKCC_STANDBY :=  we are waiting to be restarted by the user daemon.
 *
 */
165
166 static void ukcc_full(void)
167 {
168         static spinlock_t ukcc_state_lock = SPIN_LOCK_UNLOCKED;
169         /* protect transition from OK -> FULL to ensure only one record is sent,
170            rest we do not need to protect, protocol implies that. we keep the 
171            channel OK until
172         */
173         int send = 0;
174         spin_lock(&ukcc_state_lock);
175         if ((send = (chan_state != UKCC_STANDBY)))
176                 chan_state = UKCC_STANDBY;      /* assume we can send */
177         spin_unlock(&ukcc_state_lock);
178
179         if (send) {
180                 struct crbce_ukcc_full rec;
181                 rec_set_timehdr(&rec, CRBCE_REC_UKCC_FULL, 0, 0);
182                 if (relay_write(ukcc_channel, &rec, 
183                                 sizeof(rec), -1, NULL) <= 0) {
184                         /* channel is remains full .. try with next one */
185                         chan_state = UKCC_FULL;
186                 }
187         }
188 }
189
/*
 * Send l bytes of record r according to the channel protocol:
 *   UKCC_OK      - write the record; a failed write is reported through
 *                  ukcc_full()
 *   UKCC_FULL    - only retry the overflow notification
 *   anything else - drop the record
 */
#define rec_send_len(r,l)                                               \
        do {                                                            \
                switch (chan_state) {                                   \
                case UKCC_OK:                                           \
                        if (relay_write(ukcc_channel, (r), (l),         \
                                        -1, NULL) <= 0)                 \
                                ukcc_full();                            \
                        break;                                          \
                case UKCC_FULL:                                         \
                        ukcc_full();                                    \
                        break;                                          \
                default:                                                \
                        break;                                          \
                }                                                       \
        } while (0)

/* Send the whole record r. */
#define rec_send(r)     rec_send_len(r,sizeof(*(r)))
206
207 #endif
208
/******************************************************************************
 *
 *  Callbacks for the CKRM engine.
 *    In each we do the necessary classification and event record generation.
 *    We generate 3 kinds of records in the callbacks:
 *    (a) FORK                  send the pid, the class and the ppid
 *    (b) RECLASSIFICATION      send the pid, the class and < sample data +
 *                              delay data >
 *    (c) EXIT                  send the pid
 *
 ******************************************************************************/
220
/*
 * Set from CRBCE_CMD_SEND_DATA.  When non-zero, send_task_record() sends
 * unforced records only for tasks whose sample or delay data is non-zero,
 * and resets that data after sending.
 */
int delta_mode = 0;
222
223 static inline void copy_delay(struct task_delay_info *delay,
224                               struct task_struct *tsk)
225 {
226         *delay = tsk->delays;
227 }
228
229 static inline void zero_delay(struct task_delay_info *delay)
230 {
231         memset(delay, 0, sizeof(struct task_delay_info));       
232         /* we need to think about doing this 64-bit atomic */
233 }
234
235 static inline void zero_sample(struct task_sample_info *sample)
236 {
237         memset(sample, 0, sizeof(struct task_sample_info));     
238         /* we need to think about doing this 64-bit atomic */
239 }
240
/*
 * Return 1 if the first len / sizeof(unsigned long) words at ptr are all
 * zero, 0 otherwise.  Scanning stops at the first non-zero word.  len is
 * assumed to be a multiple of the word size; trailing bytes are ignored.
 */
static inline int check_zero(void *ptr, int len)
{
        unsigned long *word = (unsigned long *)ptr;
        int remaining = len / sizeof(unsigned long);

        while (remaining--) {
                if (*word != 0)
                        return 0;
                word++;
        }
        return 1;
}
252
/*
 * Return 1 if any of the first len / sizeof(unsigned long) words at ptr
 * is non-zero, 0 otherwise.  len is assumed to be a multiple of the word
 * size; trailing bytes are ignored.
 */
static inline int check_not_zero(void *ptr, int len)
{
        unsigned long *word = (unsigned long *)ptr;
        int remaining = len / sizeof(unsigned long);

        while (remaining--) {
                if (*word++ != 0)
                        return 1;
        }
        return 0;
}
264
265 static inline int sample_changed(struct task_sample_info *s)
266 {
267         return check_not_zero(s, sizeof(*s));
268 }
269 static inline int delay_changed(struct task_delay_info *d)
270 {
271         return check_not_zero(d, sizeof(*d));
272 }
273
274 static inline int
275 send_task_record(struct task_struct *tsk, int event,
276                  struct ckrm_core_class *core, int send_forced)
277 {
278         struct crbce_rec_task_data rec;
279         struct rbce_private_data *pdata;
280         int send = 0;
281
282         if (!ukcc_ok())
283                 return 0;
284         pdata = RBCE_DATA(tsk);
285         if (pdata == NULL) {
286                 // printk("send [%d]<%s>: no pdata\n",tsk->pid,tsk->comm);
287                 return 0;
288         }
289         if (send_forced || (delta_mode == 0)
290             || sample_changed(PSAMPLE(RBCE_DATA(tsk)))
291             || delay_changed(&tsk->delays)) {
292                 rec_set_timehdr(&rec, event, tsk->pid,
293                                 core ? core : (struct ckrm_core_class *)tsk->
294                                 taskclass);
295                 rec.sample = *PSAMPLE(RBCE_DATA(tsk));
296                 copy_delay(&rec.delay, tsk);
297                 rec_send(&rec);
298                 if (delta_mode || send_forced) {
299                         // on reclassify or delta mode reset the counters  
300                         zero_sample(PSAMPLE(RBCE_DATA(tsk)));
301                         zero_delay(&tsk->delays);
302                 }
303                 send = 1;
304         }
305         return send;
306 }
307
308 static inline void send_exit_notification(struct task_struct *tsk)
309 {
310         send_task_record(tsk, CRBCE_REC_EXIT, NULL, 1);
311 }
312
313 static inline void
314 rbce_tc_ext_notify(int event, void *core, struct task_struct *tsk)
315 {
316         struct crbce_rec_fork rec;
317
318         switch (event) {
319         case CKRM_EVENT_FORK:
320                 if (ukcc_ok()) {
321                         rec.ppid = tsk->parent->pid;
322                         rec_set_timehdr(&rec, CKRM_EVENT_FORK, tsk->pid, core);
323                         rec_send(&rec);
324                 }
325                 break;
326         case CKRM_EVENT_MANUAL:
327                 rbce_tc_manual(tsk);
328
329         default:
330                 send_task_record(tsk, event, (struct ckrm_core_class *)core, 1);
331                 break;
332         }
333 }
334
335 /*====================== end classification engine =======================*/
336
337 static void sample_task_data(unsigned long unused);
338
339 struct timer_list sample_timer = {.expires = 0,.function = sample_task_data };
340 unsigned long timer_interval_length = (250 * HZ) / 1000;
341
342 inline void stop_sample_timer(void)
343 {
344         if (sample_timer.expires > 0) {
345                 del_timer_sync(&sample_timer);
346                 sample_timer.expires = 0;
347         }
348 }
349
350 inline void start_sample_timer(void)
351 {
352         if (timer_interval_length > 0) {
353                 sample_timer.expires =
354                     jiffies + (timer_interval_length * HZ) / 1000;
355                 add_timer(&sample_timer);
356         }
357 }
358
359 static void send_task_data(void)
360 {
361         struct crbce_rec_data_delim limrec;
362         struct task_struct *proc, *thread;
363         int sendcnt = 0;
364         int taskcnt = 0;
365         limrec.is_stop = 0;
366         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
367         rec_send(&limrec);
368
369         read_lock(&tasklist_lock);
370         do_each_thread(proc, thread) {
371                 taskcnt++;
372                 task_lock(thread);
373                 sendcnt += send_task_record(thread, CRBCE_REC_SAMPLE, NULL, 0);
374                 task_unlock(thread);
375         } while_each_thread(proc, thread);
376         read_unlock(&tasklist_lock);
377
378         limrec.is_stop = 1;
379         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
380         rec_send(&limrec);
381
382         // printk("send_task_data mode=%d t#=%d s#=%d\n",
383         //              delta_mode,taskcnt,sendcnt);
384 }
385
386 static void notify_class_action(struct rbce_class *cls, int action)
387 {
388         struct crbce_class_info cinfo;
389         int len;
390
391         rec_set_timehdr(&cinfo, CRBCE_REC_CLASS_INFO, 0, cls->classobj);
392         cinfo.action = action;
393         len = strnlen(cls->obj.name, CRBCE_MAX_CLASS_NAME_LEN - 1);
394         memcpy(&cinfo.name, cls->obj.name, len);
395         cinfo.name[len] = '\0';
396         len++;
397         cinfo.namelen = len;
398
399         len += sizeof(cinfo) - CRBCE_MAX_CLASS_NAME_LEN;
400         rec_send_len(&cinfo, len);
401 }
402
403 static void send_classlist(void)
404 {
405         struct rbce_class *cls;
406
407         read_lock(&global_rwlock);
408         list_for_each_entry(cls, &class_list, obj.link) {
409                 notify_class_action(cls, 1);
410         }
411         read_unlock(&global_rwlock);
412 }
413
414 /*
415  *  resend_task_info 
416  * 
417  *  This function resends all essential task information to the client.
418  *  
419  */
420 static void resend_task_info(void)
421 {
422         struct crbce_rec_data_delim limrec;
423         struct crbce_rec_fork rec;
424         struct task_struct *proc, *thread;
425
426         send_classlist();       // first send available class information
427
428         limrec.is_stop = 2;
429         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
430         rec_send(&limrec);
431
432         write_lock(&tasklist_lock);     // avoid any mods during this phase 
433         do_each_thread(proc, thread) {
434                 if (ukcc_ok()) {
435                         rec.ppid = thread->parent->pid;
436                         rec_set_timehdr(&rec, CRBCE_REC_TASKINFO, thread->pid,
437                                         thread->taskclass);
438                         rec_send(&rec);
439                 }
440         }
441         while_each_thread(proc, thread);
442         write_unlock(&tasklist_lock);
443
444         limrec.is_stop = 3;
445         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
446         rec_send(&limrec);
447 }
448
449 extern int task_running_sys(struct task_struct *);
450
451 static void add_all_private_data(void)
452 {
453         struct task_struct *proc, *thread;
454
455         write_lock(&tasklist_lock);
456         do_each_thread(proc, thread) {
457                 if (RBCE_DATA(thread) == NULL)
458                         RBCE_DATAP(thread) = create_private_data(NULL, 0);
459         }
460         while_each_thread(proc, thread);
461         write_unlock(&tasklist_lock);
462 }
463
464 static void sample_task_data(unsigned long unused)
465 {
466         struct task_struct *proc, *thread;
467
468         int run = 0;
469         int wait = 0;
470         read_lock(&tasklist_lock);
471         do_each_thread(proc, thread) {
472                 struct rbce_private_data *pdata = RBCE_DATA(thread);
473
474                 if (pdata == NULL) {
475                         // some wierdo race condition .. simply ignore 
476                         continue;
477                 }
478                 if (thread->state == TASK_RUNNING) {
479                         if (task_running_sys(thread)) {
480                                 atomic_inc((atomic_t *) &
481                                            (PSAMPLE(pdata)->cpu_running));
482                                 run++;
483                         } else {
484                                 atomic_inc((atomic_t *) &
485                                            (PSAMPLE(pdata)->cpu_waiting));
486                                 wait++;
487                         }
488                 }
489                 /* update IO state */
490                 if (thread->flags & PF_IOWAIT) {
491                         if (thread->flags & PF_MEMIO)
492                                 atomic_inc((atomic_t *) &
493                                            (PSAMPLE(pdata)->memio_delayed));
494                         else
495                                 atomic_inc((atomic_t *) &
496                                            (PSAMPLE(pdata)->io_delayed));
497                 }
498         }
499         while_each_thread(proc, thread);
500         read_unlock(&tasklist_lock);
501 //      printk("sample_timer: run=%d wait=%d\n",run,wait);
502         start_sample_timer();
503 }
504
505 static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len)
506 {
507         struct crbce_command *cmdrec = (struct crbce_command *)from;
508         struct crbce_cmd_done cmdret;
509         int rc = 0;
510
511 //      printk("ukcc_cmd_deliver: %d %d len=%d:%d\n",cmdrec->type, 
512 //              cmdrec->cmd,cmdrec->len,len);
513
514         cmdrec->len = len;      // add this to reflection so the user doesn't 
515                                 // accidently write the wrong length and the 
516                                 // protocol is getting screwed up 
517
518         if (cmdrec->type != CRBCE_REC_KERNEL_CMD) {
519                 rc = EINVAL;
520                 goto out;
521         }
522
523         switch (cmdrec->cmd) {
524         case CRBCE_CMD_SET_TIMER:
525                 {
526                         struct crbce_cmd_settimer *cptr =
527                             (struct crbce_cmd_settimer *)cmdrec;
528                         if (len != sizeof(*cptr)) {
529                                 rc = EINVAL;
530                                 break;
531                         }
532                         stop_sample_timer();
533                         timer_interval_length = cptr->interval;
534                         if ((timer_interval_length > 0)
535                             && (timer_interval_length < 10))
536                                 timer_interval_length = 10;   
537                                 // anything finer can create problems 
538                         printk(KERN_INFO "CRBCE set sample collect timer %lu\n",
539                                timer_interval_length);
540                         start_sample_timer();
541                         break;
542                 }
543         case CRBCE_CMD_SEND_DATA:
544                 {
545                         struct crbce_cmd_send_data *cptr =
546                             (struct crbce_cmd_send_data *)cmdrec;
547                         if (len != sizeof(*cptr)) {
548                                 rc = EINVAL;
549                                 break;
550                         }
551                         delta_mode = cptr->delta_mode;
552                         send_task_data();
553                         break;
554                 }
555         case CRBCE_CMD_START:
556                 add_all_private_data();
557                 chan_state = UKCC_OK;
558                 resend_task_info();
559                 break;
560
561         case CRBCE_CMD_STOP:
562                 chan_state = UKCC_STANDBY;
563                 free_all_private_data();
564                 break;
565
566         default:
567                 rc = EINVAL;
568                 break;
569         }
570
571       out:
572         cmdret.hdr.type = CRBCE_REC_KERNEL_CMD_DONE;
573         cmdret.hdr.cmd = cmdrec->cmd;
574         cmdret.rc = rc;
575         rec_send(&cmdret);
576 //      printk("ukcc_cmd_deliver ACK: %d %d rc=%d %d\n",cmdret.hdr.type,
577 //                      cmdret.hdr.cmd,rc,sizeof(cmdret));
578 }
579
580 static void client_attached(void)
581 {
582         printk("client [%d]<%s> attached to UKCC\n", current->pid,
583                current->comm);
584         relay_reset(ukcc_channel);
585 }
586
587 static void client_detached(void)
588 {
589         printk("client [%d]<%s> detached to UKCC\n", current->pid,
590                current->comm);
591         chan_state = UKCC_STANDBY;
592         stop_sample_timer();
593         relay_reset(ukcc_channel);
594         free_all_private_data();
595 }
596
/*
 * First init stage: create the UKCC channel.
 * Returns 0 on success or the negative value from create_ukcc_channel().
 */
static int init_rbce_ext_pre(void)
{
        int chan = create_ukcc_channel();

        if (chan < 0)
                return chan;
        return 0;
}
604
605 static int init_rbce_ext_post(void)
606 {
607         init_timer(&sample_timer);
608         return 0;
609 }
610
/* Module teardown: stop sampling first, then close the UKCC channel. */
static void exit_rbce_ext(void)
{
        stop_sample_timer();

        close_ukcc_channel();
}