#include <sys/socket.h>
#include <sys/time.h>
#include <sys/sysctl.h>
+
#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
#include <net/netisr.h>
+#include <net/vnet.h>
+
#include <netinet/in.h>
#include <netinet/ip.h> /* ip_len, ip_off */
#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
*/
struct dn_parms dn_cfg;
+//VNET_DEFINE(struct dn_parms, _base_dn_cfg);
static long tick_last; /* Last tick duration (usec). */
static long tick_delta; /* Last vs standard tick diff (usec). */
SYSCTL_DECL(_net_inet_ip);
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+/* wrapper to pass dn_cfg fields to SYSCTL_* */
+//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x))
+#define DC(x) (&(dn_cfg.x))
/* parameters */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
- CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size");
+ CTLFLAG_RW, DC(hash_size), 0, "Default hash table size");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
- CTLFLAG_RW, &dn_cfg.slot_limit, 0,
+ CTLFLAG_RW, DC(slot_limit), 0,
"Upper limit in slots for pipe queue.");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
- CTLFLAG_RW, &dn_cfg.byte_limit, 0,
+ CTLFLAG_RW, DC(byte_limit), 0,
"Upper limit in bytes for pipe queue.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
- CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io.");
+ CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
- CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
- CTLFLAG_RW, &dn_cfg.expire, 0, "Expire empty queues/pipes");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
- CTLFLAG_RD, &dn_cfg.expire_cycle, 0, "Expire cycle for queues/pipes");
+ CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");
/* RED parameters */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
- CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table");
+ CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
- CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size");
+ CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
- CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size");
+ CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");
/* time adjustment */
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
	CTLFLAG_RD, &tick_delta, 0,
	"Last vs standard tick difference (usec).");
+/* Drain parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+ CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+ CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_object,
+	CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before starting the drain routine");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick,
+	CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to consider an object idle");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio,
+	CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() time dedicated to the drain routine");
+
/* statistics */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
- CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers");
+ CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
- CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances");
+ CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
- CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets");
+ CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
- CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues");
+ CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
CTLFLAG_RD, &io_pkt, 0,
"Number of packets passed to dummynet.");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
CTLFLAG_RD, &io_pkt_drop, 0,
"Number of packets dropped by dummynet.");
-
+#undef DC
SYSEND
#endif
goto drop;
}
mq_append(&q->mq, m);
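+	/* Track idle objects: this queue (and possibly its scheduler
+	 * instance) is no longer empty, so it stops counting toward the
+	 * drain thresholds checked in do_drain(). */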
+ if (q->ni.length == 0) { /* queue was idle */
+ dn_cfg.idle_queue--;
+ if (ni->length == 0) /* scheduler was idle */
+ dn_cfg.idle_si--;
+ }
q->ni.length++;
q->ni.len_bytes += len;
ni->length++;
si->sched_time = now;
done = 0;
while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+ uint64_t len_scaled;
+
+	/*
+	 * Some schedulers may want the instance to be woken up again
+	 * later. To support this, the dequeue callback returns an mbuf
+	 * with len < 0; the scheduler instance will then be woken up
+	 * after -m->m_pkthdr.len ticks.
+	 */
if (m->m_pkthdr.len < 0) {
- /* Received a packet with negative length.
- * the scheduler instance will be waken up after
- * -m->m_pkthdr.len ticks.
- */
si->kflags |= DN_ACTIVE;
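+		/* m_pkthdr.len is negative, so "now - len" schedules the
+		 * wakeup |len| ticks in the future. */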
heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si);
-
- /* Delete the fake packet */
- free(m, M_DUMMYNET);
-
- /* Dont' touch credit, exit from the function */
+ if (delay_line_idle && done)
+ transmit_event(q, &si->dline, now);
return NULL;
- } else { /* normal behaviour */
- uint64_t len_scaled;
- done++;
- len_scaled = (bw == 0) ? 0 : hz *
- (m->m_pkthdr.len * 8 + extra_bits(m, s));
- si->credit -= len_scaled;
- /* Move packet in the delay line */
- dn_tag_get(m)->output_time += s->link.delay ;
- mq_append(&si->dline.mq, m);
}
+
+ /* a regular mbuf received */
+ done++;
+ if (bw == 0) printf("bw is null\n");
+ len_scaled = (bw == 0) ? 0 : hz *
+ (m->m_pkthdr.len * 8 + extra_bits(m, s));
+ si->credit -= len_scaled;
+ /* Move packet in the delay line */
+ dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay;
+ mq_append(&si->dline.mq, m);
}
+
/*
* If credit >= 0 the instance is idle, mark time.
* Otherwise put back in the heap, and adjust the output
return q->head;
}
+/*
+ * Support function to read the TSC (or equivalent). We use this
+ * high-resolution timer to adapt the amount of work done per tick
+ * by the drain routine.
+ * Supported on Linux and FreeBSD, on both the i386 and amd64
+ * platforms, and on the OpenWrt MIPS architecture.
+ *
+ * On SMP no special handling is needed:
+ * - in Linux 2.6 a timer always runs on the same CPU that added it, see
+ *   (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html)
+ * - FreeBSD 8 has a new callout_reset_on() which specifies the CPU
+ *   on which the timer must run
+ * - Windows runs dummynet_task() on cpu0.
+ *
+ * Linux 2.4, however, does not guarantee that a timer runs on the same
+ * CPU every time; see do_update_cycle() below for the workaround.
+ */
+#ifdef HAVE_TSC
+uint64_t
+readTSC (void)
+{
+	uint64_t a = 0;
+
+#ifdef __linux__
+	/* Linux and OpenWrt provide a macro to read the TSC on i386 and
+	 * amd64. OpenWrt patches the kernel so that the TSC can also be
+	 * used on MIPS and other platforms.
+	 * rdtscll() is a macro defined in include/asm-xxx/msr.h,
+	 * where xxx is the architecture (x86, mips).
+	 */
+ rdtscll(a);
+#elif defined(_WIN32)
+	/* Microsoft recommends the use of KeQueryPerformanceCounter()
+	 * instead of rdtsc().
+	 */
+ KeQueryPerformanceCounter((PLARGE_INTEGER)&a); //XXX not tested!
+#elif defined(__FreeBSD__)
+	/* FreeBSD (i386/amd64) defines the rdtsc() macro in
+	 * machine/cpufunc.h, so we simply use it.
+	 */
+ return rdtsc();
+#endif
+ return a;
+}
+#endif /* HAVE_TSC */
+
+/*
+ * Compute the average task period. We could do something more
+ * sophisticated here, if needed.
+ */
+static void
+do_update_cycle(void)
+{
+#ifdef HAVE_TSC
+ uint64_t tmp = readTSC();
+#if defined (LINUX_24) && defined(CONFIG_SMP)
+	/* On Linux 2.4 with SMP there is no guarantee about which CPU
+	 * runs the timer callbacks. If the difference between the new
+	 * reading and the previous one is negative, we assume the two
+	 * values come from different CPUs and adjust the previous reading
+	 * so that the computed cycle_task keeps its last value.
+	 */
+ if (tmp <= dn_cfg.cycle_task_new)
+ dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task;
+#endif /* !(linux24 && SMP) */
+ dn_cfg.cycle_task_old = dn_cfg.cycle_task_new;
+ dn_cfg.cycle_task_new = tmp;
+ dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old;
+
+	/* Update the average:
+	 *   avg = ((2^N * avg) + new - avg) / 2^N
+	 * N == 4 seems to be a good compromise between tracking genuine
+	 * clock changes and filtering out spurious cycle_task values.
+	 */
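+	/* Example, with N == 4: if avg == 1600 and the new cycle_task is
+	 * 3200, avg becomes (16*1600 + 3200 - 1600) >> 4 == 1700. */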
+#define DN_N 4
+ dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) +
+ dn_cfg.cycle_task - dn_cfg.cycle_task_avg;
+ dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N;
+#undef DN_N
+
+#endif /* HAVE_TSC */
+}
+
+static void
+do_drain(void)
+{
+#ifdef HAVE_TSC
+ uint64_t dt_max;
+#endif
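+	/* dn_cfg.expire doubles as the enable flag and the period (in
+	 * ticks) between drain checks; expire_cycle counts the ticks
+	 * elapsed since the last check. */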
+ if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire)
+ return;
+ /* It's time to check if drain routines should be called */
+ dn_cfg.expire_cycle = 0;
+
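+	/* The *_wait counters track idle objects that are not yet old
+	 * enough to delete; they are recomputed by the drain routines
+	 * called below. */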
+ dn_cfg.idle_queue_wait = 0;
+ dn_cfg.idle_si_wait = 0;
+	/* Always run at least one drain pass, even if no time budget is left */
+#ifdef HAVE_TSC
+ dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio;
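+	/* drain_ratio is expressed as a percentage, so the elapsed time
+	 * in the loop below is multiplied by 100 before comparing it
+	 * with dt_max. */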
+#endif
+ for (;;) {
+ int done = 0;
+
+ if (dn_cfg.idle_queue > dn_cfg.expire_object &&
+ dn_cfg.idle_queue_wait < dn_cfg.idle_queue) {
+ dn_drain_queue();
+ done = 1;
+ }
+ if (dn_cfg.idle_si > dn_cfg.expire_object &&
+ dn_cfg.idle_si_wait < dn_cfg.idle_si) {
+ dn_drain_scheduler();
+ done = 1;
+ }
+ /* time to end ? */
+#ifndef HAVE_TSC
+ /* If tsc does not exist, do only one drain cycle and exit */
+ break;
+#else
+ /* Exit when nothing was done or we have consumed all time */
+ if ( (done == 0) ||
+ ((readTSC() - dn_cfg.cycle_task_new) * 100 > dt_max) )
+ break;
+#endif /* HAVE_TSC */
+ }
+}
+
/*
 * The timer handler for dummynet. Time is computed in ticks,
 * but the code is tolerant to the actual rate at which this is called.
{
struct timeval t;
struct mq q = { NULL, NULL }; /* queue to accumulate results */
-
+
+ CURVNET_SET((struct vnet *)context);
+
+ do_update_cycle(); /* compute avg. tick duration */
+
DN_BH_WLOCK();
/* Update number of lost(coalesced) ticks. */
transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
}
}
- if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
- dn_cfg.expire_cycle = 0;
- dn_drain_scheduler();
- dn_drain_queue();
- }
+ do_drain();
DN_BH_WUNLOCK();
dn_reschedule();
if (q.head != NULL)
dummynet_send(q.head);
+ CURVNET_RESTORE();
}
/*
goto dropit; /* This queue/pipe does not exist! */
if (fs->sched == NULL) /* should not happen */
goto dropit;
- /* find scheduler instance, possibly applying sched_mask */
- si = ipdn_si_find(fs->sched, &(fwa->f_id));
- if (si == NULL)
- goto dropit;
/*
* If the scheduler supports multiple queues, find the right one
* (otherwise it will be ignored by enqueue).
*/
if (fs->sched->fp->flags & DN_MULTIQUEUE) {
- q = ipdn_q_find(fs, si, &(fwa->f_id));
+ q = ipdn_q_find(fs, &(fwa->f_id));
if (q == NULL)
goto dropit;
- }
+		/* The scheduler instance lookup is done only when the
+		 * queue is created; the q_new() callback creates the
+		 * scheduler instance if needed.
+		 */
+ si = q->_si;
+ } else
+ si = ipdn_si_find(fs->sched, &(fwa->f_id));
+
+ if (si == NULL)
+ goto dropit;
if (fs->sched->fp->enqueue(si, q, m)) {
- printf("%s dropped by enqueue\n", __FUNCTION__);
/* packet was dropped by enqueue() */
m = *m0 = NULL;
goto dropit;
}
/* compute the initial allowance */
- {
+ if (si->idle_time < dn_cfg.curr_time) {
+ /* Do this only on the first packet on an idle pipe */
struct dn_link *p = &fs->sched->link;
+
+ si->sched_time = dn_cfg.curr_time;
si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
if (p->burst) {
uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;