#include <sys/socket.h>
#include <sys/time.h>
#include <sys/sysctl.h>
+
#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
#include <net/netisr.h>
+#include <net/vnet.h>
+
#include <netinet/in.h>
#include <netinet/ip.h> /* ip_len, ip_off */
#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
*/
struct dn_parms dn_cfg;
+//VNET_DEFINE(struct dn_parms, _base_dn_cfg);
static long tick_last; /* Last tick duration (usec). */
static long tick_delta; /* Last vs standard tick diff (usec). */
SYSCTL_DECL(_net_inet_ip);
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+/* wrapper to pass dn_cfg fields to SYSCTL_* */
+//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x))
+#define DC(x) (&(dn_cfg.x))
/* parameters */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
- CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size");
+ CTLFLAG_RW, DC(hash_size), 0, "Default hash table size");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
- CTLFLAG_RW, &dn_cfg.slot_limit, 0,
+ CTLFLAG_RW, DC(slot_limit), 0,
"Upper limit in slots for pipe queue.");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
- CTLFLAG_RW, &dn_cfg.byte_limit, 0,
+ CTLFLAG_RW, DC(byte_limit), 0,
"Upper limit in bytes for pipe queue.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
- CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io.");
+ CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
- CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
- CTLFLAG_RW, &dn_cfg.expire, 0, "Expire empty queues/pipes");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
- CTLFLAG_RD, &dn_cfg.expire_cycle, 0, "Expire cycle for queues/pipes");
+ CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");
/* RED parameters */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
- CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table");
+ CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
- CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size");
+ CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
- CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size");
+ CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");
/* time adjustment */
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
	CTLFLAG_RD, &tick_delta, 0,
	"Last vs standard tick difference (usec).");
+/* Drain parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+ CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+ CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_object,
+	CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before starting the drain routine");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick,
+	CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to consider an object idle");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio,
+	CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() time dedicated to the drain routine");
+
/* statistics */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
- CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers");
+ CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
- CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances");
+ CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
- CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets");
+ CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
- CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues");
+ CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
CTLFLAG_RD, &io_pkt, 0,
"Number of packets passed to dummynet.");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
CTLFLAG_RD, &io_pkt_drop, 0,
"Number of packets dropped by dummynet.");
-
+#undef DC
SYSEND
#endif
goto drop;
}
mq_append(&q->mq, m);
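+	/* Track idle objects: this queue (and possibly its scheduler
+	 * instance) is no longer empty, so it stops counting toward the
+	 * drain thresholds checked in do_drain(). */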
+ if (q->ni.length == 0) { /* queue was idle */
+ dn_cfg.idle_queue--;
+ if (ni->length == 0) /* scheduler was idle */
+ dn_cfg.idle_si--;
+ }
q->ni.length++;
q->ni.len_bytes += len;
ni->length++;
si->sched_time = now;
done = 0;
while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+ uint64_t len_scaled;
+
+	/*
+	 * Some schedulers may want the instance to be woken up again
+	 * later. To support this, the dequeue callback returns an mbuf
+	 * with len < 0; the scheduler instance will then be woken up
+	 * after -m->m_pkthdr.len ticks.
+	 */
if (m->m_pkthdr.len < 0) {
- /* Received a packet with negative length.
- * the scheduler instance will be waken up after
- * -m->m_pkthdr.len ticks.
- */
si->kflags |= DN_ACTIVE;
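+		/* m_pkthdr.len is negative, so "now - len" schedules the
+		 * wakeup |len| ticks in the future. */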
heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si);
-
- /* Delete the fake packet */
- free(m, M_DUMMYNET);
-
- /* Dont' touch credit, exit from the function */
+ if (delay_line_idle && done)
+ transmit_event(q, &si->dline, now);
return NULL;
- } else { /* normal behaviour */
- uint64_t len_scaled;
- done++;
- len_scaled = (bw == 0) ? 0 : hz *
- (m->m_pkthdr.len * 8 + extra_bits(m, s));
- si->credit -= len_scaled;
- /* Move packet in the delay line */
- dn_tag_get(m)->output_time += s->link.delay ;
- mq_append(&si->dline.mq, m);
}
+
+ /* a regular mbuf received */
+ done++;
+ if (bw == 0) printf("bw is null\n");
+ len_scaled = (bw == 0) ? 0 : hz *
+ (m->m_pkthdr.len * 8 + extra_bits(m, s));
+ si->credit -= len_scaled;
+ /* Move packet in the delay line */
+ dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay;
+ mq_append(&si->dline.mq, m);
}
+
/*
* If credit >= 0 the instance is idle, mark time.
* Otherwise put back in the heap, and adjust the output
return q->head;
}
+/*
+ * Support function to read the TSC (or equivalent). We use this
+ * high-resolution timer to adapt the amount of work done per tick
+ * by the drain routine.
+ * Supported on Linux and FreeBSD, on both the i386 and amd64
+ * platforms, and on the OpenWrt MIPS architecture.
+ *
+ * On SMP no special handling is needed:
+ * - in Linux 2.6 a timer always runs on the same CPU that added it, see
+ *   (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html)
+ * - FreeBSD 8 has a new callout_reset_on() which specifies the CPU
+ *   on which the timer must run
+ * - Windows runs dummynet_task() on cpu0.
+ *
+ * Linux 2.4, however, does not guarantee that a timer runs on the same
+ * CPU every time; see do_update_cycle() below for the workaround.
+ */
+#ifdef HAVE_TSC
+uint64_t
+readTSC (void)
+{
+	uint64_t a = 0;
+
+#ifdef __linux__
+	/* Linux and OpenWrt provide a macro to read the TSC on i386 and
+	 * amd64. OpenWrt patches the kernel so that the TSC can also be
+	 * used on MIPS and other platforms.
+	 * rdtscll() is a macro defined in include/asm-xxx/msr.h,
+	 * where xxx is the architecture (x86, mips).
+	 */
+ rdtscll(a);
+#elif defined(_WIN32)
+	/* Microsoft recommends the use of KeQueryPerformanceCounter()
+	 * instead of rdtsc().
+	 */
+ KeQueryPerformanceCounter((PLARGE_INTEGER)&a); //XXX not tested!
+#elif defined(__FreeBSD__)
+	/* FreeBSD (i386/amd64) defines the rdtsc() macro in
+	 * machine/cpufunc.h, so we simply use it.
+	 */
+ return rdtsc();
+#endif
+ return a;
+}
+#endif /* HAVE_TSC */
+
+/*
+ * Compute the average task period. We could do something more
+ * sophisticated here, if needed.
+ */
+static void
+do_update_cycle(void)
+{
+#ifdef HAVE_TSC
+ uint64_t tmp = readTSC();
+#if defined (LINUX_24) && defined(CONFIG_SMP)
+	/* On Linux 2.4 with SMP there is no guarantee about which CPU
+	 * runs the timer callbacks. If the difference between the new
+	 * reading and the previous one is negative, we assume the two
+	 * values come from different CPUs and adjust the previous reading
+	 * so that the computed cycle_task keeps its last value.
+	 */
+ if (tmp <= dn_cfg.cycle_task_new)
+ dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task;
+#endif /* !(linux24 && SMP) */
+ dn_cfg.cycle_task_old = dn_cfg.cycle_task_new;
+ dn_cfg.cycle_task_new = tmp;
+ dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old;
+
+	/* Update the average:
+	 *   avg = ((2^N * avg) + new - avg) / 2^N
+	 * N == 4 seems to be a good compromise between tracking genuine
+	 * clock changes and filtering out spurious cycle_task values.
+	 */
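+	/* Example, with N == 4: if avg == 1600 and the new cycle_task is
+	 * 3200, avg becomes (16*1600 + 3200 - 1600) >> 4 == 1700. */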
+#define DN_N 4
+ dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) +
+ dn_cfg.cycle_task - dn_cfg.cycle_task_avg;
+ dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N;
+#undef DN_N
+
+#endif /* HAVE_TSC */
+}
+
+static void
+do_drain(void)
+{
+#ifdef HAVE_TSC
+ uint64_t dt_max;
+#endif
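+	/* dn_cfg.expire doubles as the enable flag and the period (in
+	 * ticks) between drain checks; expire_cycle counts the ticks
+	 * elapsed since the last check. */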
+ if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire)
+ return;
+ /* It's time to check if drain routines should be called */
+ dn_cfg.expire_cycle = 0;
+
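+	/* The *_wait counters track idle objects that are not yet old
+	 * enough to delete; they are recomputed by the drain routines
+	 * called below. */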
+ dn_cfg.idle_queue_wait = 0;
+ dn_cfg.idle_si_wait = 0;
+	/* Always run at least one drain pass, even if no time budget is left */
+#ifdef HAVE_TSC
+ dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio;
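+	/* drain_ratio is expressed as a percentage, so the elapsed time
+	 * in the loop below is multiplied by 100 before comparing it
+	 * with dt_max. */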
+#endif
+ for (;;) {
+ int done = 0;
+
+ if (dn_cfg.idle_queue > dn_cfg.expire_object &&
+ dn_cfg.idle_queue_wait < dn_cfg.idle_queue) {
+ dn_drain_queue();
+ done = 1;
+ }
+ if (dn_cfg.idle_si > dn_cfg.expire_object &&
+ dn_cfg.idle_si_wait < dn_cfg.idle_si) {
+ dn_drain_scheduler();
+ done = 1;
+ }
+ /* time to end ? */
+#ifndef HAVE_TSC
+ /* If tsc does not exist, do only one drain cycle and exit */
+ break;
+#else
+ /* Exit when nothing was done or we have consumed all time */
+ if ( (done == 0) ||
+ ((readTSC() - dn_cfg.cycle_task_new) * 100 > dt_max) )
+ break;
+#endif /* HAVE_TSC */
+ }
+}
+
/*
 * The timer handler for dummynet. Time is computed in ticks,
 * but the code is tolerant to the actual rate at which this is called.
{
struct timeval t;
struct mq q = { NULL, NULL }; /* queue to accumulate results */
-
+
+ CURVNET_SET((struct vnet *)context);
+
+ do_update_cycle(); /* compute avg. tick duration */
+
DN_BH_WLOCK();
/* Update number of lost(coalesced) ticks. */
transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
}
}
- if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
- dn_cfg.expire_cycle = 0;
- dn_drain_scheduler();
- dn_drain_queue();
- }
+ do_drain();
DN_BH_WUNLOCK();
dn_reschedule();
if (q.head != NULL)
dummynet_send(q.head);
+ CURVNET_RESTORE();
}
/*
goto dropit; /* This queue/pipe does not exist! */
if (fs->sched == NULL) /* should not happen */
goto dropit;
- /* find scheduler instance, possibly applying sched_mask */
- si = ipdn_si_find(fs->sched, &(fwa->f_id));
- if (si == NULL)
- goto dropit;
/*
* If the scheduler supports multiple queues, find the right one
* (otherwise it will be ignored by enqueue).
*/
if (fs->sched->fp->flags & DN_MULTIQUEUE) {
- q = ipdn_q_find(fs, si, &(fwa->f_id));
+ q = ipdn_q_find(fs, &(fwa->f_id));
if (q == NULL)
goto dropit;
- }
+		/* The scheduler instance lookup is done only when the
+		 * queue is created; the q_new() callback creates the
+		 * scheduler instance if needed.
+		 */
+ si = q->_si;
+ } else
+ si = ipdn_si_find(fs->sched, &(fwa->f_id));
+
+ if (si == NULL)
+ goto dropit;
if (fs->sched->fp->enqueue(si, q, m)) {
- printf("%s dropped by enqueue\n", __FUNCTION__);
/* packet was dropped by enqueue() */
m = *m0 = NULL;
goto dropit;
}
/* compute the initial allowance */
- {
+ if (si->idle_time < dn_cfg.curr_time) {
+ /* Do this only on the first packet on an idle pipe */
struct dn_link *p = &fs->sched->link;
+
+ si->sched_time = dn_cfg.curr_time;
si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
if (p->burst) {
uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;