3cae550f752e0912fb3093b1910b4d1376c6b553
[linux-2.6.git] / kernel / ckrm / rbce / rbcemod_ext.c
1 /* Data Collection Extension to Rule-based Classification Engine (RBCE) module
2  *
3  * Copyright (C) Hubertus Franke, IBM Corp. 2003
4  *
5  * Extension to be included into RBCE to collect delay and sample information
6  * Requires user daemon e.g. crbcedmn to activate.
7  *
8  * Latest version, more details at http://ckrm.sf.net
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it would be useful, but
16  * WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18  *
19  */
20
21
22 /*******************************************************************************
23  *
24  *   User-Kernel Communication Channel (UKCC)
25  *   Protocol and communication handling
26  *
27  ******************************************************************************/
28
29 #include <linux/relayfs_fs.h>
30
/* Address of the sample counters kept inside a task's RBCE private data. */
#define PSAMPLE(pdata)    (&((pdata)->ext_data.sample))

/* Geometry of the UKCC relay channel: number and size of the sub-buffers. */
#define UKCC_N_SUB_BUFFERS     4
#define UKCC_SUB_BUFFER_SIZE   (1 << 15)
#define UKCC_TOTAL_BUFFER_SIZE (UKCC_N_SUB_BUFFERS * UKCC_SUB_BUFFER_SIZE)

/*
 * Debugging aid only: when non-zero the module keeps sending data through
 * the UKCC as soon as space frees up, instead of entering the recovery
 * driven mode.
 */
#define CHANNEL_AUTO_CONT  0

/* States of the user-kernel communication channel. */
enum ukcc_state {
        UKCC_OK = 0,
        UKCC_STANDBY = 1,
        UKCC_FULL = 2
};

/* relay channel id; negative while no channel is open */
int ukcc_channel = -1;
/* current protocol state; records are only generated in UKCC_OK */
static enum ukcc_state chan_state = UKCC_STANDBY;
50
51 inline static int ukcc_ok(void)
52 {
53         return (chan_state == UKCC_OK);
54 }
55
56 static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len);
57 static void client_attached(void);
58 static void client_detached(void);
59
60 static int ukcc_fileop_notify(int rchan_id,
61                               struct file *filp, enum relay_fileop fileop)
62 {
63         static int readers = 0;
64         if (fileop == RELAY_FILE_OPEN) {
65                 // printk(KERN_DEBUG "got fileop_notify RELAY_FILE_OPEN for file %p\n", 
66                 //              filp);
67                 if (readers) {
68                         printk(KERN_DEBUG "only one client allowed, backoff .... \n");
69                         return -EPERM;
70                 }
71                 if (!try_module_get(THIS_MODULE))
72                         return -EPERM;
73                 readers++;
74                 client_attached();
75
76         } else if (fileop == RELAY_FILE_CLOSE) {
77                 // printk(KERN_DEBUG "got fileop_notify RELAY_FILE_CLOSE for file %p\n", 
78                 //              filp);
79                 client_detached();
80                 readers--;
81                 module_put(THIS_MODULE);
82         }
83         return 0;
84 }
85
86 static int create_ukcc_channel(void)
87 {
88         static struct rchan_callbacks ukcc_callbacks = {
89                 .buffer_start = NULL,
90                 .buffer_end = NULL,
91                 .deliver = NULL,
92                 .user_deliver = ukcc_cmd_deliver,
93                 .needs_resize = NULL,
94                 .fileop_notify = ukcc_fileop_notify,
95         };
96
97         u32 channel_flags =
98             RELAY_USAGE_GLOBAL | RELAY_SCHEME_ANY | RELAY_TIMESTAMP_ANY;
99
100         // notify on subbuffer full (through poll)
101         channel_flags |= RELAY_DELIVERY_BULK;
102         // channel_flags     |= RELAY_DELIVERY_PACKET;
103         // avoid overwrite, otherwise recovery will be nasty...
104         channel_flags |= RELAY_MODE_NO_OVERWRITE;
105
106         ukcc_channel = relay_open(CRBCE_UKCC_NAME,
107                                   UKCC_SUB_BUFFER_SIZE,
108                                   UKCC_N_SUB_BUFFERS,
109                                   channel_flags,
110                                   &ukcc_callbacks, 0, 0, 0, 0, 0, 0, NULL, 0);
111         if (ukcc_channel < 0)
112                 printk(KERN_DEBUG "crbce: ukcc creation failed, errcode: %d\n",
113                        ukcc_channel);
114         else
115                 printk(KERN_DEBUG "crbce: ukcc created (%u KB)\n",
116                        UKCC_TOTAL_BUFFER_SIZE >> 10);
117         return ukcc_channel;
118 }
119
120 static inline void close_ukcc_channel(void)
121 {
122         if (ukcc_channel >= 0) {
123                 relay_close(ukcc_channel);
124                 ukcc_channel = -1;
125                 chan_state = UKCC_STANDBY;
126         }
127 }
128
/* Fill in record type and pid of a record header. */
#define rec_set_hdr(r, t, p) \
        ((r)->hdr.type = (t), (r)->hdr.pid = (p))

/* As rec_set_hdr, and additionally stamp the current jiffies and the class. */
#define rec_set_timehdr(r, t, p, c) \
        (rec_set_hdr(r, t, p), \
         (r)->hdr.jiffies = jiffies, \
         (r)->hdr.cls = (unsigned long)(c))
132
133 #if CHANNEL_AUTO_CONT
134
/* We only provide this for debugging. It allows us to send records
 * based on availability in the channel when the UKCC stalls, rather
 * than going through the UKCC recovery protocol.
 */

/* Write l bytes of record r; track OK/STANDBY purely by write success. */
#define rec_send_len(r,l)                                               \
        do {                                                            \
                int ukcc_was_ok_ = (chan_state == UKCC_OK);             \
                int ukcc_is_ok_ = (relay_write(ukcc_channel,            \
                                               (r),(l),-1,NULL) > 0);   \
                chan_state = ukcc_is_ok_ ? UKCC_OK : UKCC_STANDBY;      \
                if (ukcc_was_ok_ != ukcc_is_ok_) {                      \
                        /* log only on a state transition */            \
                        if (ukcc_is_ok_)                                \
                                printk(KERN_DEBUG "Channel continues\n");       \
                        else                                            \
                                printk(KERN_DEBUG "Channel stalled\n"); \
                }                                                       \
        } while (0)

/* Send a complete fixed-size record. */
#define rec_send(r)     rec_send_len(r,sizeof(*(r)))
154
155 #else
156
/* Default UKCC channel protocol.
 * Though a UKCC buffer overflow should never happen, it is possible if
 * the user daemon stops reading for some reason. Hence we provide a simple
 * protocol based on 3 states
 *     UKCC_OK      :=  channel is active and properly working. When a channel
 *                      write fails we move to state UKCC_FULL.
 *     UKCC_FULL    :=  channel is active, but the last rec_send has failed. As
 *                      a result we will try to send an indication to the daemon
 *                      that this has happened. When that succeeds, we move to
 *                      state UKCC_STANDBY.
 *     UKCC_STANDBY :=  we are waiting to be restarted by the user daemon
 *
 */
170
171 static void ukcc_full(void)
172 {
173         static spinlock_t ukcc_state_lock = SPIN_LOCK_UNLOCKED;
174         /* protect transition from OK -> FULL to ensure only one record is sent,
175            rest we do not need to protect, protocol implies that. we keep the 
176            channel OK until
177         */
178         int send = 0;
179         spin_lock(&ukcc_state_lock);
180         if ((send = (chan_state != UKCC_STANDBY)))
181                 chan_state = UKCC_STANDBY;      /* assume we can send */
182         spin_unlock(&ukcc_state_lock);
183
184         if (send) {
185                 struct crbce_ukcc_full rec;
186                 rec_set_timehdr(&rec, CRBCE_REC_UKCC_FULL, 0, 0);
187                 if (relay_write(ukcc_channel, &rec, 
188                                 sizeof(rec), -1, NULL) <= 0) {
189                         /* channel is remains full .. try with next one */
190                         chan_state = UKCC_FULL;
191                 }
192         }
193 }
194
/*
 * Write l bytes of record r according to the channel state:
 *   UKCC_OK      - write; on failure run the overflow handling
 *   UKCC_FULL    - retry the overflow handling
 *   UKCC_STANDBY - drop the record
 */
#define rec_send_len(r,l)                                               \
        do {                                                            \
                enum ukcc_state ukcc_st_ = chan_state;                  \
                if (ukcc_st_ == UKCC_OK) {                              \
                        if (relay_write(ukcc_channel,(r),               \
                                        (l),-1,NULL) <= 0)              \
                                ukcc_full();                            \
                } else if (ukcc_st_ == UKCC_FULL) {                     \
                        ukcc_full();                                    \
                }                                                       \
        } while (0)

/* Send a complete fixed-size record. */
#define rec_send(r)     rec_send_len(r,sizeof(*(r)))
211
212 #endif
213
/******************************************************************************
 *
 *  Callbacks for the CKRM engine.
 *    In each we do the necessary classification and event record generation.
 *    We generate 3 kinds of records in the callbacks:
 *    (a) FORK                  send the pid, the class and the ppid
 *    (b) RECLASSIFICATION      send the pid, the class and < sample data +
 *                              delay data >
 *    (c) EXIT                  send the pid
 *
 ******************************************************************************/
225
/*
 * Non-zero: task records are only sent when their counters changed, and the
 * counters are reset after each send.  Set by CRBCE_CMD_SEND_DATA.
 */
int delta_mode; /* zero-initialized */
227
228 static inline void copy_delay(struct task_delay_info *delay,
229                               struct task_struct *tsk)
230 {
231         *delay = tsk->delays;
232 }
233
234 static inline void zero_delay(struct task_delay_info *delay)
235 {
236         memset(delay, 0, sizeof(struct task_delay_info));       
237         /* we need to think about doing this 64-bit atomic */
238 }
239
240 static inline void zero_sample(struct task_sample_info *sample)
241 {
242         memset(sample, 0, sizeof(struct task_sample_info));     
243         /* we need to think about doing this 64-bit atomic */
244 }
245
/*
 * Returns 1 when the first len / sizeof(unsigned long) words at ptr are all
 * zero, 0 otherwise.  len is assumed to be a multiple of the word size;
 * trailing bytes beyond the last full word are not inspected.
 */
static inline int check_zero(void *ptr, int len)
{
        unsigned long *word = (unsigned long *)ptr;
        int nwords = len / sizeof(unsigned long);

        while (nwords--) {
                if (*word++ != 0)
                        return 0;
        }
        return 1;
}
257
/*
 * Returns 1 as soon as one of the first len / sizeof(unsigned long) words
 * at ptr is non-zero, 0 when all of them are zero.  len is assumed to be a
 * multiple of the word size.
 */
static inline int check_not_zero(void *ptr, int len)
{
        unsigned long *word = (unsigned long *)ptr;
        int left;

        for (left = len / sizeof(unsigned long); left != 0; left--, word++) {
                if (*word != 0)
                        return 1;
        }
        return 0;
}
269
270 static inline int sample_changed(struct task_sample_info *s)
271 {
272         return check_not_zero(s, sizeof(*s));
273 }
274 static inline int delay_changed(struct task_delay_info *d)
275 {
276         return check_not_zero(d, sizeof(*d));
277 }
278
279 static inline int
280 send_task_record(struct task_struct *tsk, int event,
281                  struct ckrm_core_class *core, int send_forced)
282 {
283         struct crbce_rec_task_data rec;
284         struct rbce_private_data *pdata;
285         int send = 0;
286
287         if (!ukcc_ok())
288                 return 0;
289         pdata = RBCE_DATA(tsk);
290         if (pdata == NULL) {
291                 // printk(KERN_DEBUG "send [%d]<%s>: no pdata\n",tsk->pid,tsk->comm);
292                 return 0;
293         }
294         if (send_forced || (delta_mode == 0)
295             || sample_changed(PSAMPLE(RBCE_DATA(tsk)))
296             || delay_changed(&tsk->delays)) {
297                 rec_set_timehdr(&rec, event, tsk->pid,
298                                 core ? core : (struct ckrm_core_class *)tsk->
299                                 taskclass);
300                 rec.sample = *PSAMPLE(RBCE_DATA(tsk));
301                 copy_delay(&rec.delay, tsk);
302                 rec_send(&rec);
303                 if (delta_mode || send_forced) {
304                         // on reclassify or delta mode reset the counters  
305                         zero_sample(PSAMPLE(RBCE_DATA(tsk)));
306                         zero_delay(&tsk->delays);
307                 }
308                 send = 1;
309         }
310         return send;
311 }
312
313 static inline void send_exit_notification(struct task_struct *tsk)
314 {
315         send_task_record(tsk, CRBCE_REC_EXIT, NULL, 1);
316 }
317
318 static inline void
319 rbce_tc_ext_notify(int event, void *core, struct task_struct *tsk)
320 {
321         struct crbce_rec_fork rec;
322
323         switch (event) {
324         case CKRM_EVENT_FORK:
325                 if (ukcc_ok()) {
326                         rec.ppid = tsk->parent->pid;
327                         rec_set_timehdr(&rec, CKRM_EVENT_FORK, tsk->pid, core);
328                         rec_send(&rec);
329                 }
330                 break;
331         case CKRM_EVENT_MANUAL:
332                 rbce_tc_manual(tsk);
333
334         default:
335                 send_task_record(tsk, event, (struct ckrm_core_class *)core, 1);
336                 break;
337         }
338 }
339
340 /*====================== end classification engine =======================*/
341
342 static void sample_task_data(unsigned long unused);
343
344 struct timer_list sample_timer = {.expires = 0,.function = sample_task_data };
345 unsigned long timer_interval_length = (250 * HZ) / 1000;
346
347 inline void stop_sample_timer(void)
348 {
349         if (sample_timer.expires > 0) {
350                 del_timer_sync(&sample_timer);
351                 sample_timer.expires = 0;
352         }
353 }
354
355 inline void start_sample_timer(void)
356 {
357         if (timer_interval_length > 0) {
358                 sample_timer.expires =
359                     jiffies + (timer_interval_length * HZ) / 1000;
360                 add_timer(&sample_timer);
361         }
362 }
363
364 static void send_task_data(void)
365 {
366         struct crbce_rec_data_delim limrec;
367         struct task_struct *proc, *thread;
368         int sendcnt = 0;
369         int taskcnt = 0;
370         limrec.is_stop = 0;
371         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
372         rec_send(&limrec);
373
374         read_lock(&tasklist_lock);
375         do_each_thread(proc, thread) {
376                 taskcnt++;
377                 task_lock(thread);
378                 sendcnt += send_task_record(thread, CRBCE_REC_SAMPLE, NULL, 0);
379                 task_unlock(thread);
380         } while_each_thread(proc, thread);
381         read_unlock(&tasklist_lock);
382
383         limrec.is_stop = 1;
384         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
385         rec_send(&limrec);
386
387         // printk(KERN_DEBUG "send_task_data mode=%d t#=%d s#=%d\n",
388         //              delta_mode,taskcnt,sendcnt);
389 }
390
391 static void notify_class_action(struct rbce_class *cls, int action)
392 {
393         struct crbce_class_info cinfo;
394         int len;
395
396         rec_set_timehdr(&cinfo, CRBCE_REC_CLASS_INFO, 0, cls->classobj);
397         cinfo.action = action;
398         len = strnlen(cls->obj.name, CRBCE_MAX_CLASS_NAME_LEN - 1);
399         memcpy(&cinfo.name, cls->obj.name, len);
400         cinfo.name[len] = '\0';
401         len++;
402         cinfo.namelen = len;
403
404         len += sizeof(cinfo) - CRBCE_MAX_CLASS_NAME_LEN;
405         rec_send_len(&cinfo, len);
406 }
407
408 static void send_classlist(void)
409 {
410         struct rbce_class *cls;
411
412         read_lock(&global_rwlock);
413         list_for_each_entry(cls, &class_list, obj.link) {
414                 notify_class_action(cls, 1);
415         }
416         read_unlock(&global_rwlock);
417 }
418
419 /*
420  *  resend_task_info 
421  * 
422  *  This function resends all essential task information to the client.
423  *  
424  */
425 static void resend_task_info(void)
426 {
427         struct crbce_rec_data_delim limrec;
428         struct crbce_rec_fork rec;
429         struct task_struct *proc, *thread;
430
431         send_classlist();       // first send available class information
432
433         limrec.is_stop = 2;
434         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
435         rec_send(&limrec);
436
437         write_lock(&tasklist_lock);     // avoid any mods during this phase 
438         do_each_thread(proc, thread) {
439                 if (ukcc_ok()) {
440                         rec.ppid = thread->parent->pid;
441                         rec_set_timehdr(&rec, CRBCE_REC_TASKINFO, thread->pid,
442                                         thread->taskclass);
443                         rec_send(&rec);
444                 }
445         }
446         while_each_thread(proc, thread);
447         write_unlock(&tasklist_lock);
448
449         limrec.is_stop = 3;
450         rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0);
451         rec_send(&limrec);
452 }
453
454 extern int task_running_sys(struct task_struct *);
455
456 static void add_all_private_data(void)
457 {
458         struct task_struct *proc, *thread;
459
460         write_lock(&tasklist_lock);
461         do_each_thread(proc, thread) {
462                 if (RBCE_DATA(thread) == NULL)
463                         RBCE_DATAP(thread) = create_private_data(NULL, 0);
464         }
465         while_each_thread(proc, thread);
466         write_unlock(&tasklist_lock);
467 }
468
469 static void sample_task_data(unsigned long unused)
470 {
471         struct task_struct *proc, *thread;
472
473         int run = 0;
474         int wait = 0;
475         read_lock(&tasklist_lock);
476         do_each_thread(proc, thread) {
477                 struct rbce_private_data *pdata = RBCE_DATA(thread);
478
479                 if (pdata == NULL) {
480                         // some wierdo race condition .. simply ignore 
481                         continue;
482                 }
483                 if (thread->state == TASK_RUNNING) {
484                         if (task_running_sys(thread)) {
485                                 atomic_inc((atomic_t *) &
486                                            (PSAMPLE(pdata)->cpu_running));
487                                 run++;
488                         } else {
489                                 atomic_inc((atomic_t *) &
490                                            (PSAMPLE(pdata)->cpu_waiting));
491                                 wait++;
492                         }
493                 }
494                 /* update IO state */
495                 if (thread->flags & PF_IOWAIT) {
496                         if (thread->flags & PF_MEMIO)
497                                 atomic_inc((atomic_t *) &
498                                            (PSAMPLE(pdata)->memio_delayed));
499                         else
500                                 atomic_inc((atomic_t *) &
501                                            (PSAMPLE(pdata)->io_delayed));
502                 }
503         }
504         while_each_thread(proc, thread);
505         read_unlock(&tasklist_lock);
506 //      printk(KERN_DEBUG "sample_timer: run=%d wait=%d\n",run,wait);
507         start_sample_timer();
508 }
509
510 static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len)
511 {
512         struct crbce_command *cmdrec = (struct crbce_command *)from;
513         struct crbce_cmd_done cmdret;
514         int rc = 0;
515
516 //      printk(KERN_DEBUG "ukcc_cmd_deliver: %d %d len=%d:%d\n",cmdrec->type, 
517 //              cmdrec->cmd,cmdrec->len,len);
518
519         cmdrec->len = len;      // add this to reflection so the user doesn't 
520                                 // accidently write the wrong length and the 
521                                 // protocol is getting screwed up 
522
523         if (cmdrec->type != CRBCE_REC_KERNEL_CMD) {
524                 rc = EINVAL;
525                 goto out;
526         }
527
528         switch (cmdrec->cmd) {
529         case CRBCE_CMD_SET_TIMER:
530                 {
531                         struct crbce_cmd_settimer *cptr =
532                             (struct crbce_cmd_settimer *)cmdrec;
533                         if (len != sizeof(*cptr)) {
534                                 rc = EINVAL;
535                                 break;
536                         }
537                         stop_sample_timer();
538                         timer_interval_length = cptr->interval;
539                         if ((timer_interval_length > 0)
540                             && (timer_interval_length < 10))
541                                 timer_interval_length = 10;   
542                                 // anything finer can create problems 
543                         printk(KERN_INFO "CRBCE set sample collect timer %lu\n",
544                                timer_interval_length);
545                         start_sample_timer();
546                         break;
547                 }
548         case CRBCE_CMD_SEND_DATA:
549                 {
550                         struct crbce_cmd_send_data *cptr =
551                             (struct crbce_cmd_send_data *)cmdrec;
552                         if (len != sizeof(*cptr)) {
553                                 rc = EINVAL;
554                                 break;
555                         }
556                         delta_mode = cptr->delta_mode;
557                         send_task_data();
558                         break;
559                 }
560         case CRBCE_CMD_START:
561                 add_all_private_data();
562                 chan_state = UKCC_OK;
563                 resend_task_info();
564                 break;
565
566         case CRBCE_CMD_STOP:
567                 chan_state = UKCC_STANDBY;
568                 free_all_private_data();
569                 break;
570
571         default:
572                 rc = EINVAL;
573                 break;
574         }
575
576       out:
577         cmdret.hdr.type = CRBCE_REC_KERNEL_CMD_DONE;
578         cmdret.hdr.cmd = cmdrec->cmd;
579         cmdret.rc = rc;
580         rec_send(&cmdret);
581 //      printk(KERN_DEBUG "ukcc_cmd_deliver ACK: %d %d rc=%d %d\n",cmdret.hdr.type,
582 //                      cmdret.hdr.cmd,rc,sizeof(cmdret));
583 }
584
585 static void client_attached(void)
586 {
587         printk(KERN_DEBUG "client [%d]<%s> attached to UKCC\n", current->pid,
588                current->comm);
589         relay_reset(ukcc_channel);
590 }
591
592 static void client_detached(void)
593 {
594         printk(KERN_DEBUG "client [%d]<%s> detached to UKCC\n", current->pid,
595                current->comm);
596         chan_state = UKCC_STANDBY;
597         stop_sample_timer();
598         relay_reset(ukcc_channel);
599         free_all_private_data();
600 }
601
/*
 * First stage of extension setup: create the UKCC channel.
 * Returns 0 on success or the negative error from create_ukcc_channel().
 */
static int init_rbce_ext_pre(void)
{
        int rc = create_ukcc_channel();

        if (rc < 0)
                return rc;
        return 0;
}
609
610 static int init_rbce_ext_post(void)
611 {
612         init_timer(&sample_timer);
613         return 0;
614 }
615
/*
 * Extension teardown: stop the sampling timer first so that it cannot fire
 * any more, then close the UKCC channel.
 */
static void exit_rbce_ext(void)
{
        stop_sample_timer();
        close_ukcc_channel();
}