X-Git-Url: http://git.onelab.eu/?p=ipfw.git;a=blobdiff_plain;f=dummynet2%2Fip_dn_io.c;fp=dummynet2%2Fip_dn_io.c;h=6672424ee0e9b524fed91730ed7bb24b74c3823c;hp=34504665fbab9b0e113a64a3a840eff8dca23f81;hb=28a7fe9d930667786b902af6697c01eb87694173;hpb=2a8b6c544cf5ea3c84f763144c7ecfa79daea969

diff --git a/dummynet2/ip_dn_io.c b/dummynet2/ip_dn_io.c
index 3450466..6672424 100644
--- a/dummynet2/ip_dn_io.c
+++ b/dummynet2/ip_dn_io.c
@@ -45,8 +45,11 @@ __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dn_io.c 203321 201
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
+
 #include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
 #include <net/netisr.h>
+#include <net/vnet.h>
+
 #include <netinet/in.h>
 #include <netinet/ip.h>		/* ip_len, ip_off */
 #include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
@@ -69,6 +72,7 @@ __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dn_io.c 203321 201
  */

 struct dn_parms dn_cfg;
+//VNET_DEFINE(struct dn_parms, _base_dn_cfg);

 static long tick_last;		/* Last tick duration (usec). */
 static long tick_delta;		/* Last vs standard tick diff (usec). */
@@ -100,31 +104,30 @@ SYSCTL_DECL(_net_inet);
 SYSCTL_DECL(_net_inet_ip);
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");

+/* wrapper to pass dn_cfg fields to SYSCTL_* */
+//#define DC(x)	(&(VNET_NAME(_base_dn_cfg).x))
+#define DC(x)	(&(dn_cfg.x))
 /* parameters */
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
-    CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size");
+    CTLFLAG_RW, DC(hash_size), 0, "Default hash table size");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
-    CTLFLAG_RW, &dn_cfg.slot_limit, 0,
+    CTLFLAG_RW, DC(slot_limit), 0,
     "Upper limit in slots for pipe queue.");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
-    CTLFLAG_RW, &dn_cfg.byte_limit, 0,
+    CTLFLAG_RW, DC(byte_limit), 0,
     "Upper limit in bytes for pipe queue.");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
-    CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io.");
+    CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
-    CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
-    CTLFLAG_RW, &dn_cfg.expire, 0, "Expire empty queues/pipes");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
-    CTLFLAG_RD, &dn_cfg.expire_cycle, 0, "Expire cycle for queues/pipes");
+    CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");

 /* RED parameters */
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
-    CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table");
+    CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
-    CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size");
+    CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
-    CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size");
+    CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");

 /* time adjustment */
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
@@ -140,15 +143,27 @@ SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
     CTLFLAG_RD, &tick_lost, 0,
     "Number of ticks coalesced by dummynet taskqueue.");

+/* Drain parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+    CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+    CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_object,
+    CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before starting the drain routine");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick,
+    CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to consider an object as idle");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio,
+    CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() dedicated to the drain routine");
+
 /* statistics */
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
-    CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers");
+    CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
-    CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances");
+    CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
-    CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets");
+    CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
-    CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues");
+    CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, CTLFLAG_RD,
     &io_pkt, 0, "Number of packets passed to dummynet.");
@@ -158,7 +173,7 @@ SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
     CTLFLAG_RD, &io_pkt_drop, 0,
     "Number of packets dropped by dummynet.");
-
+#undef DC
 SYSEND

 #endif
@@ -364,6 +379,11 @@ dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
 		goto drop;
 	}
 	mq_append(&q->mq, m);
+	if (q->ni.length == 0) {	/* queue was idle */
+		dn_cfg.idle_queue--;
+		if (ni->length == 0)	/* scheduler was idle */
+			dn_cfg.idle_si--;
+	}
 	q->ni.length++;
 	q->ni.len_bytes += len;
 	ni->length++;
@@ -455,30 +475,33 @@ serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
 	si->sched_time = now;
 	done = 0;
 	while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+		uint64_t len_scaled;
+
+		/*
+		 * Some schedulers might want to be woken up again later.
+		 * To support this, the dequeue callback returns an mbuf
+		 * with len < 0, which causes the scheduler instance to
+		 * be woken up again after -m->m_pkthdr.len ticks.
+		 */
 		if (m->m_pkthdr.len < 0) {
-			/* Received a packet with negative length.
-			 * the scheduler instance will be waken up after
-			 * -m->m_pkthdr.len ticks.
-			 */
 			si->kflags |= DN_ACTIVE;
 			heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si);
-
-			/* Delete the fake packet */
-			free(m, M_DUMMYNET);
-
-			/* Dont' touch credit, exit from the function */
+			if (delay_line_idle && done)
+				transmit_event(q, &si->dline, now);
 			return NULL;
-		} else { /* normal behaviour */
-			uint64_t len_scaled;
-			done++;
-			len_scaled = (bw == 0) ? 0 : hz *
-				(m->m_pkthdr.len * 8 + extra_bits(m, s));
-			si->credit -= len_scaled;
-			/* Move packet in the delay line */
-			dn_tag_get(m)->output_time += s->link.delay ;
-			mq_append(&si->dline.mq, m);
 		}
+
+		/* a regular mbuf was received */
+		done++;
+		if (bw == 0) printf("bw is null\n");
+		len_scaled = (bw == 0) ? 0 : hz *
+			(m->m_pkthdr.len * 8 + extra_bits(m, s));
+		si->credit -= len_scaled;
+		/* Move the packet into the delay line */
+		dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay;
+		mq_append(&si->dline.mq, m);
 	}
+
 	/*
 	 * If credit >= 0 the instance is idle, mark time.
 	 * Otherwise put back in the heap, and adjust the output
@@ -500,6 +523,131 @@ serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
 	return q->head;
 }

+/*
+ * Support function to read the TSC (or equivalent). We use this
+ * high resolution timer to adapt the amount of work done for
+ * expiring the clock.
+ * Supports Linux and FreeBSD on both the i386 and amd64 platforms,
+ * plus the OpenWRT mips architecture.
+ *
+ * On SMP no special work is needed:
+ * - on Linux 2.6 a timer always runs on the same cpu that added it, see
+ *   (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html)
+ * - FreeBSD 8 has a new callout_reset_on() which specifies the cpu
+ *   on which the timer must run;
+ * - Windows runs dummynet_task() on cpu0.
+ *
+ * - Linux 2.4 does not guarantee that a timer runs on the same cpu
+ *   every time.
+ */
+#ifdef HAVE_TSC
+uint64_t
+readTSC(void)
+{
+	uint64_t a = 0;
+
+#ifdef __linux__
+	/* Linux and OpenWRT have a macro to read the tsc for i386 and
+	 * amd64. OpenWRT has patched the kernel so the tsc can also be
+	 * used on mips and other platforms.
+	 * rdtscll() is a macro defined in include/asm-xxx/msr.h,
+	 * where xxx is the architecture (x86, mips).
+	 */
+	rdtscll(a);
+#elif defined(_WIN32)
+	/* Microsoft recommends the use of KeQueryPerformanceCounter()
+	 * instead of rdtsc().
+	 */
+	KeQueryPerformanceCounter((PLARGE_INTEGER)&a); //XXX not tested!
+#elif defined(__FreeBSD__)
+	/* FreeBSD (i386/amd64) has the rdtsc() macro defined in
+	 * machine/cpufunc.h, so we use it instead of explicit assembly.
+	 */
+	return rdtsc();
+#endif
+	return a;
+}
+#endif /* HAVE_TSC */
+
+/*
+ * Compute the average task period.
+ * We could do something more complex, possibly.
+ */
+static void
+do_update_cycle(void)
+{
+#ifdef HAVE_TSC
+	uint64_t tmp = readTSC();
+#if defined (LINUX_24) && defined(CONFIG_SMP)
+	/* On LINUX_24 with SMP we have no guarantees on which cpu runs
+	 * the timer callbacks. If the difference between the new and the
+	 * old value is negative, we assume that the values come from
+	 * different cpus, so we adjust 'new' accordingly.
+	 */
+	if (tmp <= dn_cfg.cycle_task_new)
+		dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task;
+#endif /* LINUX_24 && CONFIG_SMP */
+	dn_cfg.cycle_task_old = dn_cfg.cycle_task_new;
+	dn_cfg.cycle_task_new = tmp;
+	dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old;

+	/* Update the average:
+	 *   avg = (2^N * avg + new - avg) / 2^N
+	 * N == 4 seems to be a good compromise between reacting to
+	 * clock changes and filtering 'spurious' cycle_task values.
+	 */
+#define DN_N 4
+	dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) +
+				dn_cfg.cycle_task - dn_cfg.cycle_task_avg;
+	dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N;
+#undef DN_N
+
+#endif /* HAVE_TSC */
+}
+
+static void
+do_drain(void)
+{
+#ifdef HAVE_TSC
+	uint64_t dt_max;
+#endif
+	if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire)
+		return;
+	/* It's time to check whether the drain routines should run */
+	dn_cfg.expire_cycle = 0;
+
+	dn_cfg.idle_queue_wait = 0;
+	dn_cfg.idle_si_wait = 0;
+	/* Always do at least one drain cycle, even if no time is left */
+#ifdef HAVE_TSC
+	dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio;
+#endif
+	for (;;) {
+		int done = 0;
+
+		if (dn_cfg.idle_queue > dn_cfg.expire_object &&
+		    dn_cfg.idle_queue_wait < dn_cfg.idle_queue) {
+			dn_drain_queue();
+			done = 1;
+		}
+		if (dn_cfg.idle_si > dn_cfg.expire_object &&
+		    dn_cfg.idle_si_wait < dn_cfg.idle_si) {
+			dn_drain_scheduler();
+			done = 1;
+		}
+		/* time to end? */
+#ifndef HAVE_TSC
+		/* If the tsc is not available, do only one drain cycle and exit */
+		break;
+#else
+		/* Exit when nothing was done or we have consumed all the time */
+		if ((done == 0) ||
+		    ((readTSC() - dn_cfg.cycle_task_new) * 100 > dt_max))
+			break;
+#endif /* HAVE_TSC */
+	}
+}
+
 /*
  * The timer handler for dummynet. Time is computed in ticks,
  * but the code is tolerant to the actual rate at which this is called.
@@ -510,7 +658,11 @@ dummynet_task(void *context, int pending)
 {
 	struct timeval t;
 	struct mq q = { NULL, NULL }; /* queue to accumulate results */
-
+
+	CURVNET_SET((struct vnet *)context);
+
+	do_update_cycle();	/* compute avg. tick duration */
+
 	DN_BH_WLOCK();

 	/* Update number of lost(coalesced) ticks. */
@@ -565,16 +717,13 @@ dummynet_task(void *context, int pending)
 			transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
 		}
 	}
-	if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
-		dn_cfg.expire_cycle = 0;
-		dn_drain_scheduler();
-		dn_drain_queue();
-	}
+	do_drain();

 	DN_BH_WUNLOCK();
 	dn_reschedule();
 	if (q.head != NULL)
 		dummynet_send(q.head);
+
+	CURVNET_RESTORE();
 }
@@ -732,21 +881,25 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
 		goto dropit;	/* This queue/pipe does not exist! */
 	if (fs->sched == NULL)	/* should not happen */
 		goto dropit;
-	/* find scheduler instance, possibly applying sched_mask */
-	si = ipdn_si_find(fs->sched, &(fwa->f_id));
-	if (si == NULL)
-		goto dropit;
 	/*
 	 * If the scheduler supports multiple queues, find the right one
 	 * (otherwise it will be ignored by enqueue).
 	 */
 	if (fs->sched->fp->flags & DN_MULTIQUEUE) {
-		q = ipdn_q_find(fs, si, &(fwa->f_id));
+		q = ipdn_q_find(fs, &(fwa->f_id));
 		if (q == NULL)
 			goto dropit;
-	}
+		/* The scheduler instance lookup is done only for a new
+		 * queue; the q_new() callback will create the scheduler
+		 * instance if needed.
+		 */
+		si = q->_si;
+	} else
+		si = ipdn_si_find(fs->sched, &(fwa->f_id));
+
+	if (si == NULL)
+		goto dropit;
 	if (fs->sched->fp->enqueue(si, q, m)) {
-		printf("%s dropped by enqueue\n", __FUNCTION__);
 		/* packet was dropped by enqueue() */
 		m = *m0 = NULL;
 		goto dropit;
@@ -758,8 +911,11 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
 	}

 	/* compute the initial allowance */
-	{
+	if (si->idle_time < dn_cfg.curr_time) {
+		/* Do this only on the first packet on an idle pipe */
 		struct dn_link *p = &fs->sched->link;
+
+		si->sched_time = dn_cfg.curr_time;
 		si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
 		if (p->burst) {
 			uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
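
The do_update_cycle() hunk above smooths the measured task period with a
shift-based exponential moving average, avg = (2^N * avg + new - avg) / 2^N
with N == 4, i.e. avg += (new - avg)/16, computed with shifts only. Below is
a minimal user-space sketch of the same recurrence; the function name and
the sample values are illustrative, not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define DN_N	4	/* same smoothing factor as the patch */

/* One EWMA step: avg = ((avg << N) + sample - avg) >> N */
static uint64_t
ewma_update(uint64_t avg, uint64_t sample)
{
	return ((avg << DN_N) + sample - avg) >> DN_N;
}

int
main(void)
{
	/* hypothetical per-tick cycle counts, with one outlier */
	uint64_t samples[] = { 1000, 1000, 5000, 1000, 1000 };
	uint64_t avg = samples[0];
	size_t i;

	for (i = 1; i < sizeof(samples) / sizeof(samples[0]); i++) {
		avg = ewma_update(avg, samples[i]);
		printf("sample %4llu -> avg %llu\n",
		    (unsigned long long)samples[i],
		    (unsigned long long)avg);
	}
	return 0;
}

A single 5000-cycle outlier moves the average by only (5000 - 1000)/16 = 250
cycles, which is why the comment calls N == 4 a good compromise between
reacting to real clock changes and filtering spurious values.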
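do_drain() bounds its cleanup work with a time budget: drain_ratio percent of
the average task period, measured in TSC cycles. The loop below sketches that
pattern only; read_cycles(), work_pending() and do_one_unit() are hypothetical
stand-ins for readTSC() and the dn_drain_*() routines, not functions from the
patch.

#include <stdint.h>

extern uint64_t read_cycles(void);	/* e.g. readTSC() */
extern int work_pending(void);		/* idle queues/instances left? */
extern void do_one_unit(void);		/* one dn_drain_*() pass */

static void
budgeted_drain(uint64_t start, uint64_t avg_period, unsigned ratio_pct)
{
	/* budget expressed as avg_period * ratio, like dt_max above */
	uint64_t budget = avg_period * ratio_pct;

	for (;;) {
		int done = 0;

		if (work_pending()) {
			do_one_unit();
			done = 1;
		}
		/* same test as the patch: elapsed * 100 > avg * ratio */
		if (done == 0 ||
		    (read_cycles() - start) * 100 > budget)
			break;
	}
}

The guarantee matches do_drain(): at least one pass always runs, and the loop
stops as soon as nothing is left to do or the elapsed cycles exceed the
budgeted fraction of the average dummynet_task() period.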
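Finally, the last hunk makes the initial allowance run only for the first
packet arriving on an idle scheduler instance, and lets an idle pipe
accumulate burst credit. The diff is truncated right after the burst
computation; the sketch below shows the arithmetic, with the cap at p->burst
assumed from dummynet's usual behaviour (the function is hypothetical, the
field names follow the patch).

#include <stdint.h>

struct link_cfg {
	uint64_t bandwidth;	/* credit earned per tick */
	uint64_t burst;		/* max credit accumulated while idle */
};

static uint64_t
initial_credit(const struct link_cfg *p, uint64_t curr_time,
    uint64_t idle_time, int io_fast)
{
	uint64_t credit = io_fast ? p->bandwidth : 0;

	if (p->burst) {
		/* credit grows with idle time, capped at p->burst */
		uint64_t burst = (curr_time - idle_time) * p->bandwidth;
		if (burst > p->burst)
			burst = p->burst;
		credit += burst;
	}
	return credit;
}

For example, with io_fast off, bandwidth = 1000 bits/tick, burst = 8000 and
10 idle ticks, the instance starts with min(10 * 1000, 8000) = 8000 bits of
credit, so the first packets of a new flow are not delayed as if the pipe had
been busy all along.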