X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fcore%2Fdev.c;h=81a3ae26fb4b139a0f0a832cdfff7c711643c8e6;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=9ac8c41ea1c68a3f1b857e436839d44440ae369d;hpb=6a77f38946aaee1cd85eeec6cf4229b204c15071;p=linux-2.6.git diff --git a/net/core/dev.c b/net/core/dev.c index 9ac8c41ea..81a3ae26f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. * * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro, + * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, * @@ -75,11 +75,12 @@ #include #include #include -#include +#include #include #include #include #include +#include #include #include #include @@ -97,7 +98,6 @@ #include #include #include -#include #include #include #include @@ -109,24 +109,21 @@ #include #include #include -#ifdef CONFIG_NET_RADIO -#include /* Note : will define WIRELESS_EXT */ +#include #include -#endif /* CONFIG_NET_RADIO */ -#include #include +#include +#include +#include +#include +#include /* remove with NXF_HIDE_NETIF */ +#include -/* This define, if set, will randomly drop a packet when congestion - * is more than moderate. It helps fairness in the multi-interface - * case when one of them is a hog, but it kills performance for the - * single interface case so it is off now by default. - */ -#undef RAND_LIE - -/* Setting this will sample the queue lengths and thus congestion - * via a timer instead of as each packet is received. - */ -#undef OFFLINE_SAMPLE +#ifdef CONFIG_XEN +#include +#include +#include +#endif /* * The list of packet types we will receive (as opposed to discard) @@ -139,7 +136,7 @@ * sure which should go first, but I bet it won't make much * difference if we are running VLANs. The good news is that * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversly affected. + * the average user (w/out VLANs) will not be adversely affected. * --BLG * * 0800 IP @@ -160,13 +157,14 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy); -static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +static spinlock_t net_dma_event_lock; #endif /* - * The @dev_base list is protected by @dev_base_lock and the rtln + * The @dev_base list is protected by @dev_base_lock and the rtnl * semaphore. * * Pure readers hold dev_base_lock for reading. @@ -210,13 +208,13 @@ static inline struct hlist_head *dev_index_hash(int ifindex) * Our notifier list */ -static struct notifier_block *netdev_chain; +static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL }; #ifdef CONFIG_SYSFS extern int netdev_sysfs_init(void); @@ -239,7 +237,7 @@ extern void netdev_unregister_sysfs(struct net_device *); * For efficiency */ -int netdev_nit; +static int netdev_nit; /* * Add a protocol ID to the list. Now that the input handler is @@ -285,10 +283,6 @@ void dev_add_pack(struct packet_type *pt) spin_unlock_bh(&ptype_lock); } -extern void linkwatch_run_queue(void); - - - /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration @@ -596,6 +590,8 @@ struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) return dev; } +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(unsigned short type) { struct net_device *dev; @@ -644,14 +640,24 @@ struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mas * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work + * to allow sysfs to work. We also disallow any kind of + * whitespace. */ -static int dev_valid_name(const char *name) +int dev_valid_name(const char *name) { - return !(*name == '\0' - || !strcmp(name, ".") - || !strcmp(name, "..") - || strchr(name, '/')); + if (*name == '\0') + return 0; + if (strlen(name) >= IFNAMSIZ) + return 0; + if (!strcmp(name, ".") || !strcmp(name, "..")) + return 0; + + while (*name) { + if (*name == '/' || isspace(*name)) + return 0; + name++; + } + return 1; } /** @@ -660,10 +666,12 @@ static int dev_valid_name(const char *name) * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable - * id. Not efficient for many devices, not called a lot. The caller - * must hold the dev_base or rtnl lock while allocating the name and - * adding the device in order to avoid duplicates. Returns the number - * of the unit assigned or a negative errno code. + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. */ int dev_alloc_name(struct net_device *dev, const char *name) @@ -755,12 +763,25 @@ int dev_change_name(struct net_device *dev, char *newname) if (!err) { hlist_del(&dev->name_hlist); hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); - notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + raw_notifier_call_chain(&netdev_chain, + NETDEV_CHANGENAME, dev); } return err; } +/** + * netdev_features_change - device changes features + * @dev: device to cause notification + * + * Called to indicate a device has changed features. + */ +void netdev_features_change(struct net_device *dev) +{ + raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL(netdev_features_change); + /** * netdev_state_change - device changes state * @dev: device to cause notification @@ -772,7 +793,8 @@ int dev_change_name(struct net_device *dev, char *newname) void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + raw_notifier_call_chain(&netdev_chain, + NETDEV_CHANGE, dev); rtmsg_ifinfo(RTM_NEWLINK, dev, 0); } } @@ -869,7 +891,7 @@ int dev_open(struct net_device *dev) /* * ... and announce new interface. */ - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev); } return ret; } @@ -892,7 +914,7 @@ int dev_close(struct net_device *dev) * Tell people we are going down, so that they can * prepare to death, when device is still operating. */ - notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); + raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); dev_deactivate(dev); @@ -907,8 +929,7 @@ int dev_close(struct net_device *dev) smp_mb__after_clear_bit(); /* Commit netif_running(). */ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { /* No hurry. */ - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(1); + msleep(1); } /* @@ -930,7 +951,7 @@ int dev_close(struct net_device *dev) /* * Tell people we are down */ - notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); return 0; } @@ -961,7 +982,7 @@ int register_netdevice_notifier(struct notifier_block *nb) int err; rtnl_lock(); - err = notifier_chain_register(&netdev_chain, nb); + err = raw_notifier_chain_register(&netdev_chain, nb); if (!err) { for (dev = dev_base; dev; dev = dev->next) { nb->notifier_call(nb, NETDEV_REGISTER, dev); @@ -986,7 +1007,12 @@ int register_netdevice_notifier(struct notifier_block *nb) int unregister_netdevice_notifier(struct notifier_block *nb) { - return notifier_chain_unregister(&netdev_chain, nb); + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&netdev_chain, nb); + rtnl_unlock(); + return err; } /** @@ -995,12 +1021,12 @@ int unregister_netdevice_notifier(struct notifier_block *nb) * @v: pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value - * are as for notifier_call_chain(). + * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers(unsigned long val, void *v) { - return notifier_call_chain(&netdev_chain, val, v); + return raw_notifier_call_chain(&netdev_chain, val, v); } /* When > 0 there are consumers of rx skb time stamps */ @@ -1016,13 +1042,22 @@ void net_disable_timestamp(void) atomic_dec(&netstamp_needed); } -static inline void net_timestamp(struct timeval *stamp) +void __net_timestamp(struct sk_buff *skb) +{ + struct timeval tv; + + do_gettimeofday(&tv); + skb_set_timestamp(skb, &tv); +} +EXPORT_SYMBOL(__net_timestamp); + +static inline void net_timestamp(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) - do_gettimeofday(stamp); + __net_timestamp(skb); else { - stamp->tv_sec = 0; - stamp->tv_usec = 0; + skb->tstamp.off_sec = 0; + skb->tstamp.off_usec = 0; } } @@ -1031,10 +1066,11 @@ static inline void net_timestamp(struct timeval *stamp) * taps currently in use. */ -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; - net_timestamp(&skb->stamp); + + net_timestamp(skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1065,24 +1101,91 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb2->h.raw = skb2->nh.raw; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype); + ptype->func(skb2, skb->dev, ptype, skb->dev); } } rcu_read_unlock(); } + +void __netif_schedule(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { + unsigned long flags; + struct softnet_data *sd; + + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + dev->next_sched = sd->output_queue; + sd->output_queue = dev; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(__netif_schedule); + +void __netif_rx_schedule(struct net_device *dev) +{ + unsigned long flags; + + local_irq_save(flags); + dev_hold(dev); + list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list); + if (dev->quota < 0) + dev->quota += dev->weight; + else + dev->quota = dev->weight; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__netif_rx_schedule); + +void dev_kfree_skb_any(struct sk_buff *skb) +{ + if (in_irq() || irqs_disabled()) + dev_kfree_skb_irq(skb); + else + dev_kfree_skb(skb); +} +EXPORT_SYMBOL(dev_kfree_skb_any); + + +/* Hot-plugging. */ +void netif_device_detach(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_stop_queue(dev); + } +} +EXPORT_SYMBOL(netif_device_detach); + +void netif_device_attach(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_wake_queue(dev); + __netdev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(netif_device_attach); + + /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ -int skb_checksum_help(struct sk_buff *skb, int inward) +int skb_checksum_help(struct sk_buff *skb) { - unsigned int csum; + __wsum csum; int ret = 0, offset = skb->h.raw - skb->data; - if (inward) { - skb->ip_summed = CHECKSUM_NONE; - goto out; + if (skb->ip_summed == CHECKSUM_COMPLETE) + goto out_set_summed; + + if (unlikely(skb_shinfo(skb)->gso_size)) { + /* Let GSO fix up the checksum. */ + goto out_set_summed; } if (skb_cloned(skb)) { @@ -1091,23 +1194,86 @@ int skb_checksum_help(struct sk_buff *skb, int inward) goto out; } - if (offset > (int)skb->len) - BUG(); + BUG_ON(offset > (int)skb->len); csum = skb_checksum(skb, offset, skb->len-offset, 0); offset = skb->tail - skb->h.raw; - if (offset <= 0) - BUG(); - if (skb->csum + 2 > offset) - BUG(); + BUG_ON(offset <= 0); + BUG_ON(skb->csum_offset + 2 > offset); + + *(__sum16*)(skb->h.raw + skb->csum_offset) = csum_fold(csum); - *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); +out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } -#ifdef CONFIG_HIGHMEM +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + __be16 type = skb->protocol; + int err; + + BUG_ON(skb_shinfo(skb)->frag_list); + + skb->mac.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->data; + __skb_pull(skb, skb->mac_len); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + if (skb_header_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + err = ptype->gso_send_check(skb); + segs = ERR_PTR(err); + if (err || skb_gso_ok(skb, features)) + break; + __skb_push(skb, skb->data - skb->nh.raw); + } + segs = ptype->gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb->mac.raw); + + return segs; +} + +EXPORT_SYMBOL(skb_gso_segment); + +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? dev->name : ""); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + /* Actually, we should eliminate this check as soon as we know, that: * 1. IOMMU is present and allows to map all the memory. * 2. No high memory really exists on this machine. @@ -1115,6 +1281,7 @@ out: static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { +#ifdef CONFIG_HIGHMEM int i; if (dev->features & NETIF_F_HIGHDMA) @@ -1124,86 +1291,152 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (PageHighMem(skb_shinfo(skb)->frags[i].page)) return 1; +#endif return 0; } -#else -#define illegal_highdma(dev, skb) (0) -#endif -extern void skb_release_data(struct sk_buff *); +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) -/* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, int gfp_mask) +static void dev_gso_skb_destructor(struct sk_buff *skb) { - unsigned int size; - u8 *data; - long offset; - struct skb_shared_info *ninfo; - int headerlen = skb->data - skb->head; - int expand = (skb->tail + skb->data_len) - skb->end; + struct dev_gso_cb *cb; - if (skb_shared(skb)) - BUG(); + do { + struct sk_buff *nskb = skb->next; - if (expand <= 0) - expand = 0; + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); - size = skb->end - skb->head + expand; - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - return -ENOMEM; + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} - /* Copy entire thing */ - if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) - BUG(); +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * + * This function segments the given skb and stores the list of segments + * in skb->next. + */ +static int dev_gso_segment(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct sk_buff *segs; + int features = dev->features & ~(illegal_highdma(dev, skb) ? + NETIF_F_SG : 0); - /* Set up shinfo */ - ninfo = (struct skb_shared_info*)(data + size); - atomic_set(&ninfo->dataref, 1); - ninfo->tso_size = skb_shinfo(skb)->tso_size; - ninfo->tso_segs = skb_shinfo(skb)->tso_segs; - ninfo->nr_frags = 0; - ninfo->frag_list = NULL; + segs = skb_gso_segment(skb, features); - /* Offset between the two in bytes */ - offset = data - skb->head; + /* Verifying header integrity only. */ + if (!segs) + return 0; - /* Free old data. */ - skb_release_data(skb); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); - skb->head = data; - skb->end = data + size; + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; - /* Set up new pointers */ - skb->h.raw += offset; - skb->nh.raw += offset; - skb->mac.raw += offset; - skb->tail += offset; - skb->data += offset; + return 0; +} - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + if (likely(!skb->next)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + if (netif_needs_gso(dev, skb)) { + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + if (skb->next) + goto gso; + } + + return dev->hard_start_xmit(skb, dev); + } + +gso: + do { + struct sk_buff *nskb = skb->next; + int rc; + + skb->next = nskb->next; + nskb->next = NULL; + rc = dev->hard_start_xmit(nskb, dev); + if (unlikely(rc)) { + nskb->next = skb->next; + skb->next = nskb; + return rc; + } + if (unlikely(netif_queue_stopped(dev) && skb->next)) + return NETDEV_TX_BUSY; + } while (skb->next); + + skb->destructor = DEV_GSO_CB(skb)->destructor; - skb->tail += skb->data_len; - skb->data_len = 0; +out_kfree_skb: + kfree_skb(skb); return 0; } #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - spin_lock(&dev->xmit_lock); \ - dev->xmit_lock_owner = cpu; \ + netif_tx_lock(dev); \ } \ } #define HARD_TX_UNLOCK(dev) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - dev->xmit_lock_owner = -1; \ - spin_unlock(&dev->xmit_lock); \ + netif_tx_unlock(dev); \ } \ } +#ifdef CONFIG_XEN +inline int skb_checksum_setup(struct sk_buff *skb) +{ + if (skb->proto_csum_blank) { + if (skb->protocol != htons(ETH_P_IP)) + goto out; + skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl; + if (skb->h.raw >= skb->tail) + goto out; + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + skb->csum = offsetof(struct tcphdr, check); + break; + case IPPROTO_UDP: + skb->csum = offsetof(struct udphdr, check); + break; + default: + if (net_ratelimit()) + printk(KERN_ERR "Attempting to checksum a non-" + "TCP/UDP packet, dropping a protocol" + " %d packet", skb->nh.iph->protocol); + goto out; + } + if ((skb->h.raw + skb->csum + 2) > skb->tail) + goto out; + skb->ip_summed = CHECKSUM_PARTIAL; + skb->proto_csum_blank = 0; + } + return 0; +out: + return -EPROTO; +} +#else +inline int skb_checksum_setup(struct sk_buff *skb) { return 0; } +#endif + + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -1215,6 +1448,19 @@ int __skb_linearize(struct sk_buff *skb, int gfp_mask) * A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. + * + * ----------------------------------------------------------------------------------- + * I notice this method can also return errors from the queue disciplines, + * including NET_XMIT_DROP, which is a positive value. So, errors can also + * be positive. + * + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) + * + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. + * --BLG */ int dev_queue_xmit(struct sk_buff *skb) @@ -1223,9 +1469,19 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + /* If a checksum-deferred packet is forwarded to a device that needs a + * checksum, correct the pointers and force checksumming. + */ + if (skb_checksum_setup(skb)) + goto out_kfree_skb; + + /* GSO will handle the following emulations directly. */ + if (netif_needs_gso(dev, skb)) + goto gso; + if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* Fragmented skb is linearized if device does not support SG, @@ -1234,23 +1490,26 @@ int dev_queue_xmit(struct sk_buff *skb) */ if (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not support * checksumming for this protocol, complete checksumming here. */ - if (skb->ip_summed == CHECKSUM_HW && - (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && + if (skb->ip_summed == CHECKSUM_PARTIAL && + (!(dev->features & NETIF_F_GEN_CSUM) && (!(dev->features & NETIF_F_IP_CSUM) || skb->protocol != htons(ETH_P_IP)))) - if (skb_checksum_help(skb, 0)) + if (skb_checksum_help(skb)) goto out_kfree_skb; +gso: + spin_lock_prefetch(&dev->queue_lock); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ - local_bh_disable(); + rcu_read_lock_bh(); /* Updates of qdisc are serialized by queue_lock. * The struct Qdisc which is pointed to by qdisc is now a @@ -1271,21 +1530,23 @@ int dev_queue_xmit(struct sk_buff *skb) if (q->enqueue) { /* Grab device queue */ spin_lock(&dev->queue_lock); + q = dev->qdisc; + if (q->enqueue) { + rc = q->enqueue(skb, q); + qdisc_run(dev); + spin_unlock(&dev->queue_lock); - rc = q->enqueue(skb, q); - - qdisc_run(dev); - + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; + } spin_unlock(&dev->queue_lock); - rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; - goto out; } /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... - Really, it is unlikely that xmit_lock protection is necessary here. - (f.e. loopback and IP tunnels are clean ignoring statistics + Really, it is unlikely that netif_tx_lock protection is necessary + here. (f.e. loopback and IP tunnels are clean ignoring statistics counters.) However, it is possible, that they rely on protection made by us here. @@ -1301,11 +1562,8 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - rc = 0; - if (!dev->hard_start_xmit(skb, dev)) { + if (!dev_hard_start_xmit(skb, dev)) { HARD_TX_UNLOCK(dev); goto out; } @@ -1324,13 +1582,13 @@ int dev_queue_xmit(struct sk_buff *skb) } rc = -ENETDOWN; - local_bh_enable(); + rcu_read_unlock_bh(); out_kfree_skb: kfree_skb(skb); return rc; out: - local_bh_enable(); + rcu_read_unlock_bh(); return rc; } @@ -1339,71 +1597,13 @@ out: Receiver routines =======================================================================*/ -int netdev_max_backlog = 300; +int netdev_max_backlog = 1000; +int netdev_budget = 300; int weight_p = 64; /* old backlog weight */ -/* These numbers are selected based on intuition and some - * experimentatiom, if you have more scientific way of doing this - * please go ahead and fix things. - */ -int no_cong_thresh = 10; -int no_cong = 20; -int lo_cong = 100; -int mod_cong = 290; DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; -static void get_sample_stats(int cpu) -{ -#ifdef RAND_LIE - unsigned long rd; - int rq; -#endif - struct softnet_data *sd = &per_cpu(softnet_data, cpu); - int blog = sd->input_pkt_queue.qlen; - int avg_blog = sd->avg_blog; - - avg_blog = (avg_blog >> 1) + (blog >> 1); - - if (avg_blog > mod_cong) { - /* Above moderate congestion levels. */ - sd->cng_level = NET_RX_CN_HIGH; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_DROP; -#endif - } else if (avg_blog > lo_cong) { - sd->cng_level = NET_RX_CN_MOD; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_CN_HIGH; -#endif - } else if (avg_blog > no_cong) - sd->cng_level = NET_RX_CN_LOW; - else /* no congestion */ - sd->cng_level = NET_RX_SUCCESS; - - sd->avg_blog = avg_blog; -} - -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy) -{ -/* 10 ms 0r 1ms -- i don't care -- JHS */ - int next_tick = 1; - int cpu = smp_processor_id(); - - get_sample_stats(cpu); - next_tick += jiffies; - mod_timer(&samp_timer, next_tick); -} -#endif - - /** * netif_rx - post buffer to the network code * @skb: buffer to post @@ -1424,57 +1624,37 @@ static void sample_queue(unsigned long dummy) int netif_rx(struct sk_buff *skb) { - int this_cpu; struct softnet_data *queue; unsigned long flags; -#ifdef CONFIG_NETPOLL - if (skb->dev->netpoll_rx && netpoll_rx(skb)) { - kfree_skb(skb); + /* if netpoll wants it, pretend we never saw it */ + if (netpoll_rx(skb)) return NET_RX_DROP; - } -#endif - - if (!skb->stamp.tv_sec) - net_timestamp(&skb->stamp); + + if (!skb->tstamp.off_sec) + net_timestamp(skb); /* * The code is rearranged so that the path is the most * short when CPU is congested, but is still operating. */ local_irq_save(flags); - this_cpu = smp_processor_id(); queue = &__get_cpu_var(softnet_data); __get_cpu_var(netdev_rx_stat).total++; if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { if (queue->input_pkt_queue.qlen) { - if (queue->throttle) - goto drop; - enqueue: dev_hold(skb->dev); __skb_queue_tail(&queue->input_pkt_queue, skb); -#ifndef OFFLINE_SAMPLE - get_sample_stats(this_cpu); -#endif local_irq_restore(flags); - return queue->cng_level; + return NET_RX_SUCCESS; } - if (queue->throttle) - queue->throttle = 0; - netif_rx_schedule(&queue->backlog_dev); goto enqueue; } - if (!queue->throttle) { - queue->throttle = 1; - __get_cpu_var(netdev_rx_stat).throttled++; - } - -drop: __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); @@ -1497,14 +1677,19 @@ int netif_rx_ni(struct sk_buff *skb) EXPORT_SYMBOL(netif_rx_ni); -static __inline__ void skb_bond(struct sk_buff *skb) +static inline struct net_device *skb_bond(struct sk_buff *skb) { struct net_device *dev = skb->dev; if (dev->master) { - skb->real_dev = skb->dev; + if (skb_bond_should_drop(skb)) { + kfree_skb(skb); + return NULL; + } skb->dev = dev->master; } + + return dev; } static void net_tx_action(struct softirq_action *h) @@ -1554,17 +1739,23 @@ static void net_tx_action(struct softirq_action *h) } static __inline__ int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev) + struct packet_type *pt_prev, + struct net_device *orig_dev) { atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); +struct net_bridge; +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, + unsigned char *addr); +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); static __inline__ int handle_bridge(struct sk_buff **pskb, - struct packet_type **pt_prev, int *ret) + struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { struct net_bridge_port *port; @@ -1573,14 +1764,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb, return 0; if (*pt_prev) { - *ret = deliver_skb(*pskb, *pt_prev); + *ret = deliver_skb(*pskb, *pt_prev, orig_dev); *pt_prev = NULL; } return br_handle_frame_hook(port, pskb); } #else -#define handle_bridge(skb, pt_prev, ret) (0) +#define handle_bridge(skb, pt_prev, ret, orig_dev) (0) #endif #ifdef CONFIG_NET_CLS_ACT @@ -1601,22 +1792,19 @@ static int ing_filter(struct sk_buff *skb) if (dev->qdisc_ingress) { __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); if (MAX_RED_LOOP < ttl++) { - printk("Redir loop detected Dropping packet (%s->%s)\n", - skb->input_dev?skb->input_dev->name:"??",skb->dev->name); + printk(KERN_WARNING "Redir loop detected Dropping packet (%d->%d)\n", + skb->iif, skb->dev->ifindex); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); - if (NULL == skb->input_dev) { - skb->input_dev = skb->dev; - printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); - } - spin_lock(&dev->ingress_lock); + + spin_lock(&dev->queue_lock); if ((q = dev->qdisc_ingress) != NULL) result = q->enqueue(skb, q); - spin_unlock(&dev->ingress_lock); + spin_unlock(&dev->queue_lock); } @@ -1627,20 +1815,24 @@ static int ing_filter(struct sk_buff *skb) int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; + struct net_device *orig_dev; int ret = NET_RX_DROP; - unsigned short type; + __be16 type; -#ifdef CONFIG_NETPOLL - if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) { - kfree_skb(skb); + /* if we've gotten here through NAPI, check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) return NET_RX_DROP; - } -#endif - if (!skb->stamp.tv_sec) - net_timestamp(&skb->stamp); + if (!skb->tstamp.off_sec) + net_timestamp(skb); - skb_bond(skb); + if (!skb->iif) + skb->iif = skb->dev->ifindex; + + orig_dev = skb_bond(skb); + + if (!orig_dev) + return NET_RX_DROP; __get_cpu_var(netdev_rx_stat).total++; @@ -1658,17 +1850,30 @@ int netif_receive_skb(struct sk_buff *skb) } #endif +#ifdef CONFIG_XEN + switch (skb->ip_summed) { + case CHECKSUM_UNNECESSARY: + skb->proto_data_valid = 1; + break; + case CHECKSUM_PARTIAL: + /* XXX Implement me. */ + default: + skb->proto_data_valid = 0; + break; + } +#endif + list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } #ifdef CONFIG_NET_CLS_ACT if (pt_prev) { - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; /* noone else should process this after*/ } else { skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -1685,9 +1890,7 @@ int netif_receive_skb(struct sk_buff *skb) ncls: #endif - handle_diverter(skb); - - if (handle_bridge(&skb, &pt_prev, &ret)) + if (handle_bridge(&skb, &pt_prev, &ret, orig_dev)) goto out; type = skb->protocol; @@ -1695,13 +1898,13 @@ ncls: if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev)) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev); + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { kfree_skb(skb); /* Jamal, now you will not able to escape explaining @@ -1722,6 +1925,7 @@ static int process_backlog(struct net_device *backlog_dev, int *budget) struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; + backlog_dev->weight = weight_p; for (;;) { struct sk_buff *skb; struct net_device *dev; @@ -1757,8 +1961,6 @@ job_done: smp_mb__before_clear_bit(); netif_poll_enable(backlog_dev); - if (queue->throttle) - queue->throttle = 0; local_irq_enable(); return 0; } @@ -1767,9 +1969,9 @@ static void net_rx_action(struct softirq_action *h) { struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; - int budget = netdev_max_backlog; + int budget = netdev_budget; + void *have; - local_irq_disable(); while (!list_empty(&queue->poll_list)) { @@ -1782,21 +1984,36 @@ static void net_rx_action(struct softirq_action *h) dev = list_entry(queue->poll_list.next, struct net_device, poll_list); + have = netpoll_poll_lock(dev); if (dev->quota <= 0 || dev->poll(dev, &budget)) { + netpoll_poll_unlock(have); local_irq_disable(); - list_del(&dev->poll_list); - list_add_tail(&dev->poll_list, &queue->poll_list); + list_move_tail(&dev->poll_list, &queue->poll_list); if (dev->quota < 0) dev->quota += dev->weight; else dev->quota = dev->weight; } else { + netpoll_poll_unlock(have); dev_put(dev); local_irq_disable(); } } out: +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -2036,15 +2253,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) struct netif_rx_stats *s = v; seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, s->throttled, - s->fastroute_hit, s->fastroute_success, s->fastroute_defer, - s->fastroute_deferred_out, -#if 0 - s->fastroute_latency_reduction -#else - s->cpu_collision -#endif - ); + s->total, s->dropped, s->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + s->cpu_collision ); return 0; } @@ -2088,7 +2299,7 @@ static struct file_operations softnet_seq_fops = { .release = seq_release, }; -#ifdef WIRELESS_EXT +#ifdef CONFIG_WIRELESS_EXT extern int wireless_proc_init(void); #else #define wireless_proc_init() 0 @@ -2162,7 +2373,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) * @dev: device * @inc: modifier * - * Add or remove promsicuity from a device. While the count in the device + * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. @@ -2171,14 +2382,21 @@ void dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; - dev->flags |= IFF_PROMISC; if ((dev->promiscuity += inc) == 0) dev->flags &= ~IFF_PROMISC; - if (dev->flags ^ old_flags) { + else + dev->flags |= IFF_PROMISC; + if (dev->flags != old_flags) { dev_mc_upload(dev); printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : "left"); + audit_log(current->audit_context, GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + audit_get_loginuid(current->audit_context)); } } @@ -2211,12 +2429,20 @@ unsigned dev_get_flags(const struct net_device *dev) flags = (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI | - IFF_RUNNING)) | + IFF_RUNNING | + IFF_LOWER_UP | + IFF_DORMANT)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); - if (netif_running(dev) && netif_carrier_ok(dev)) - flags |= IFF_RUNNING; + if (netif_running(dev)) { + if (netif_oper_up(dev)) + flags |= IFF_RUNNING; + if (netif_carrier_ok(dev)) + flags |= IFF_LOWER_UP; + if (netif_dormant(dev)) + flags |= IFF_DORMANT; + } return flags; } @@ -2259,7 +2485,8 @@ int dev_change_flags(struct net_device *dev, unsigned flags) if (dev->flags & IFF_UP && ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + raw_notifier_call_chain(&netdev_chain, + NETDEV_CHANGE, dev); if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? +1 : -1; @@ -2303,11 +2530,27 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) else dev->mtu = new_mtu; if (!err && dev->flags & IFF_UP) - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEMTU, dev); + raw_notifier_call_chain(&netdev_chain, + NETDEV_CHANGEMTU, dev); return err; } +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ + int err; + + if (!dev->set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = dev->set_mac_address(dev, sa); + if (!err) + raw_notifier_call_chain(&netdev_chain, + NETDEV_CHANGEADDR, dev); + return err; +} /* * Perform the SIOCxIFxxx calls. @@ -2354,24 +2597,14 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) return 0; case SIOCSIFHWADDR: - if (!dev->set_mac_address) - return -EOPNOTSUPP; - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - err = dev->set_mac_address(dev, &ifr->ifr_hwaddr); - if (!err) - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEADDR, dev); - return err; + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - notifier_call_chain(&netdev_chain, + raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); return 0; @@ -2490,9 +2723,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg) */ if (cmd == SIOCGIFCONF) { - rtnl_shlock(); + rtnl_lock(); ret = dev_ifconf((char __user *) arg); - rtnl_shunlock(); + rtnl_unlock(); return ret; } if (cmd == SIOCGIFNAME) @@ -2597,13 +2830,14 @@ int dev_ioctl(unsigned int cmd, void __user *arg) case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: case SIOCBONDCHANGEACTIVE: case SIOCBRADDIF: case SIOCBRDELIF: if (!capable(CAP_NET_ADMIN)) return -EPERM; + /* fall through */ + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: dev_load(ifr.ifr_name); rtnl_lock(); ret = dev_ifsioc(&ifr, cmd); @@ -2635,13 +2869,14 @@ int dev_ioctl(unsigned int cmd, void __user *arg) ret = -EFAULT; return ret; } -#ifdef WIRELESS_EXT +#ifdef CONFIG_WIRELESS_EXT /* Take care of Wireless Extensions */ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { /* If command is `set a parameter', or * `get the encoding parameters', check if * the user has the right to do it */ - if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { + if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE + || cmd == SIOCGIWENCODEEXT) { if (!capable(CAP_NET_ADMIN)) return -EPERM; } @@ -2656,7 +2891,7 @@ int dev_ioctl(unsigned int cmd, void __user *arg) ret = -EFAULT; return ret; } -#endif /* WIRELESS_EXT */ +#endif /* CONFIG_WIRELESS_EXT */ return -EINVAL; } } @@ -2719,20 +2954,18 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev_boot_phase); ASSERT_RTNL(); + might_sleep(); + /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); spin_lock_init(&dev->queue_lock); - spin_lock_init(&dev->xmit_lock); + spin_lock_init(&dev->_xmit_lock); dev->xmit_lock_owner = -1; #ifdef CONFIG_NET_CLS_ACT spin_lock_init(&dev->ingress_lock); #endif - ret = alloc_divert_blk(dev); - if (ret) - goto out; - dev->iflink = -1; /* Init, if this function is available */ @@ -2741,13 +2974,13 @@ int register_netdevice(struct net_device *dev) if (ret) { if (ret > 0) ret = -EIO; - goto out_err; + goto out; } } if (!dev_valid_name(dev->name)) { ret = -EINVAL; - goto out_err; + goto out; } dev->ifindex = dev_new_index(); @@ -2761,16 +2994,14 @@ int register_netdevice(struct net_device *dev) = hlist_entry(p, struct net_device, name_hlist); if (!strncmp(d->name, dev->name, IFNAMSIZ)) { ret = -EEXIST; - goto out_err; + goto out; } } /* Fix illegal SG+CSUM combinations. */ if ((dev->features & NETIF_F_SG) && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) { - printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", + !(dev->features & NETIF_F_ALL_CSUM)) { + printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n", dev->name); dev->features &= ~NETIF_F_SG; } @@ -2778,10 +3009,24 @@ int register_netdevice(struct net_device *dev) /* TSO requires that SG is present as well. */ if ((dev->features & NETIF_F_TSO) && !(dev->features & NETIF_F_SG)) { - printk("%s: Dropping NETIF_F_TSO since no SG feature.\n", + printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n", dev->name); dev->features &= ~NETIF_F_TSO; } + if (dev->features & NETIF_F_UFO) { + if (!(dev->features & NETIF_F_HW_CSUM)) { + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " + "NETIF_F_HW_CSUM feature.\n", + dev->name); + dev->features &= ~NETIF_F_UFO; + } + if (!(dev->features & NETIF_F_SG)) { + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " + "NETIF_F_SG feature.\n", + dev->name); + dev->features &= ~NETIF_F_UFO; + } + } /* * nil rebuild_header routine, @@ -2791,6 +3036,11 @@ int register_netdevice(struct net_device *dev) if (!dev->rebuild_header) dev->rebuild_header = default_rebuild_header; + ret = netdev_register_sysfs(dev); + if (ret) + goto out; + dev->reg_state = NETREG_REGISTERED; + /* * Default initial state at registry is that the * device is present. @@ -2806,21 +3056,15 @@ int register_netdevice(struct net_device *dev) hlist_add_head(&dev->name_hlist, head); hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); dev_hold(dev); - dev->reg_state = NETREG_REGISTERING; write_unlock_bh(&dev_base_lock); /* Notify protocols, that a new device appeared. */ - notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); - /* Finish registration after unlock */ - net_set_todo(dev); ret = 0; out: return ret; -out_err: - free_divert_blk(dev); - goto out; } /** @@ -2852,15 +3096,6 @@ int register_netdev(struct net_device *dev) goto out; } - /* - * Back compatibility hook. Kill this one in 2.5 - */ - if (dev->name[0] == 0 || dev->name[0] == ' ') { - err = dev_alloc_name(dev, "eth%d"); - if (err < 0) - goto out; - } - err = register_netdevice(dev); out: rtnl_unlock(); @@ -2886,10 +3121,10 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = warning_time = jiffies; while (atomic_read(&dev->refcnt) != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { - rtnl_shlock(); + rtnl_lock(); /* Rebroadcast unregister notification */ - notifier_call_chain(&netdev_chain, + raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, @@ -2903,7 +3138,7 @@ static void netdev_wait_allrefs(struct net_device *dev) linkwatch_run_queue(); } - rtnl_shunlock(); + __rtnl_unlock(); rebroadcast_time = jiffies; } @@ -2936,20 +3171,18 @@ static void netdev_wait_allrefs(struct net_device *dev) * * We are invoked by rtnl_unlock() after it drops the semaphore. * This allows us to deal with problems: - * 1) We can create/delete sysfs objects which invoke hotplug + * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. */ -static DECLARE_MUTEX(net_todo_run_mutex); +static DEFINE_MUTEX(net_todo_run_mutex); void netdev_run_todo(void) { - struct list_head list = LIST_HEAD_INIT(list); - int err; - + struct list_head list; /* Need to guard against multiple cpu's getting out of order. */ - down(&net_todo_run_mutex); + mutex_lock(&net_todo_run_mutex); /* Not safe to do outside the semaphore. We must not return * until all unregister events invoked by the local processor @@ -2961,52 +3194,41 @@ void netdev_run_todo(void) /* Snapshot list, allow later requests */ spin_lock(&net_todo_list_lock); - list_splice_init(&net_todo_list, &list); + list_replace_init(&net_todo_list, &list); spin_unlock(&net_todo_list_lock); - + while (!list_empty(&list)) { struct net_device *dev = list_entry(list.next, struct net_device, todo_list); list_del(&dev->todo_list); - switch(dev->reg_state) { - case NETREG_REGISTERING: - err = netdev_register_sysfs(dev); - if (err) - printk(KERN_ERR "%s: failed sysfs registration (%d)\n", - dev->name, err); - dev->reg_state = NETREG_REGISTERED; - break; - - case NETREG_UNREGISTERING: - netdev_unregister_sysfs(dev); - dev->reg_state = NETREG_UNREGISTERED; + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { + printk(KERN_ERR "network todo '%s' but state %d\n", + dev->name, dev->reg_state); + dump_stack(); + continue; + } - netdev_wait_allrefs(dev); + netdev_unregister_sysfs(dev); + dev->reg_state = NETREG_UNREGISTERED; - /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); - BUG_TRAP(!dev->ip_ptr); - BUG_TRAP(!dev->ip6_ptr); - BUG_TRAP(!dev->dn_ptr); + netdev_wait_allrefs(dev); + /* paranoia */ + BUG_ON(atomic_read(&dev->refcnt)); + BUG_TRAP(!dev->ip_ptr); + BUG_TRAP(!dev->ip6_ptr); + BUG_TRAP(!dev->dn_ptr); - /* It must be the very last action, - * after this 'dev' may point to freed up memory. - */ - if (dev->destructor) - dev->destructor(dev); - break; - - default: - printk(KERN_ERR "network todo '%s' but state %d\n", - dev->name, dev->reg_state); - break; - } + /* It must be the very last action, + * after this 'dev' may point to freed up memory. + */ + if (dev->destructor) + dev->destructor(dev); } out: - up(&net_todo_run_mutex); + mutex_unlock(&net_todo_run_mutex); } /** @@ -3025,16 +3247,17 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name, struct net_device *dev; int alloc_size; + BUG_ON(strlen(name) >= sizeof(dev->name)); + /* ensure 32-byte alignment of both the device and private area */ alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; - p = kmalloc(alloc_size, GFP_KERNEL); + p = kzalloc(alloc_size, GFP_KERNEL); if (!p) { - printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); + printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); return NULL; } - memset(p, 0, alloc_size); dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); @@ -3060,7 +3283,7 @@ EXPORT_SYMBOL(alloc_netdev); void free_netdev(struct net_device *dev) { #ifdef CONFIG_SYSFS - /* Compatiablity with error handling in drivers */ + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); return; @@ -3080,7 +3303,7 @@ void free_netdev(struct net_device *dev) void synchronize_net(void) { might_sleep(); - synchronize_kernel(); + synchronize_rcu(); } /** @@ -3145,7 +3368,7 @@ int unregister_netdevice(struct net_device *dev) /* Notify protocols, that we are about to destroy this device. They should clean all the things. */ - notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); /* * Flush the multicast chain @@ -3158,8 +3381,6 @@ int unregister_netdevice(struct net_device *dev) /* Notifier chain MUST detach us from master device. */ BUG_TRAP(!dev->master); - free_divert_blk(dev); - /* Finish processing unregister after unlock */ net_set_todo(dev); @@ -3190,7 +3411,6 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); -#ifdef CONFIG_HOTPLUG_CPU static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) @@ -3234,8 +3454,84 @@ static int dev_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU. + */ +static void net_dma_rebalance(void) +{ + unsigned int cpu, i, n; + struct dma_chan *chan; + + if (net_dma_count == 0) { + for_each_online_cpu(cpu) + rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); + return; + } + + i = 0; + cpu = first_cpu(cpu_online_map); + + rcu_read_lock(); + list_for_each_entry(chan, &net_dma_client->channels, client_node) { + n = ((num_online_cpus() / net_dma_count) + + (i < (num_online_cpus() % net_dma_count) ? 1 : 0)); + + while(n) { + per_cpu(softnet_data, cpu).net_dma = chan; + cpu = next_cpu(cpu, cpu_online_map); + n--; + } + i++; + } + rcu_read_unlock(); +} + +/** + * netdev_dma_event - event callback for the net_dma_client + * @client: should always be net_dma_client + * @chan: DMA channel for the event + * @event: event type + */ +static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, + enum dma_event event) +{ + spin_lock(&net_dma_event_lock); + switch (event) { + case DMA_RESOURCE_ADDED: + net_dma_count++; + net_dma_rebalance(); + break; + case DMA_RESOURCE_REMOVED: + net_dma_count--; + net_dma_rebalance(); + break; + default: + break; + } + spin_unlock(&net_dma_event_lock); +} + +/** + * netdev_dma_regiser - register the networking subsystem as a DMA client + */ +static int __init netdev_dma_register(void) +{ + spin_lock_init(&net_dma_event_lock); + net_dma_client = dma_async_client_register(netdev_dma_event); + if (net_dma_client == NULL) + return -ENOMEM; + + dma_async_client_chan_request(net_dma_client, num_online_cpus()); + return 0; +} + +#else +static int __init netdev_dma_register(void) { return -ENODEV; } +#endif /* CONFIG_NET_DMA */ /* * Initialize the DEV module. At boot time this walks the device list and @@ -3254,8 +3550,6 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); - net_random_init(); - if (dev_proc_init()) goto out; @@ -3276,14 +3570,11 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. */ - for (i = 0; i < NR_CPUS; i++) { + for_each_possible_cpu(i) { struct softnet_data *queue; queue = &per_cpu(softnet_data, i); skb_queue_head_init(&queue->input_pkt_queue); - queue->throttle = 0; - queue->cng_level = 0; - queue->avg_blog = 10; /* arbitrary non-zero */ queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); set_bit(__LINK_STATE_START, &queue->backlog_dev.state); @@ -3292,10 +3583,7 @@ static int __init net_dev_init(void) atomic_set(&queue->backlog_dev.refcnt, 1); } -#ifdef OFFLINE_SAMPLE - samp_timer.expires = jiffies + (10 * HZ); - add_timer(&samp_timer); -#endif + netdev_dma_register(); dev_boot_phase = 0; @@ -3315,14 +3603,13 @@ subsys_initcall(net_dev_init); EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(__skb_linearize); +EXPORT_SYMBOL(dev_valid_name); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_alloc_name); EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_ioctl); EXPORT_SYMBOL(dev_open); EXPORT_SYMBOL(dev_queue_xmit); EXPORT_SYMBOL(dev_remove_pack); @@ -3330,6 +3617,7 @@ EXPORT_SYMBOL(dev_set_allmulti); EXPORT_SYMBOL(dev_set_promiscuity); EXPORT_SYMBOL(dev_change_flags); EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(dev_set_mac_address); EXPORT_SYMBOL(free_netdev); EXPORT_SYMBOL(netdev_boot_setup_check); EXPORT_SYMBOL(netdev_set_master); @@ -3345,9 +3633,13 @@ EXPORT_SYMBOL(unregister_netdevice); EXPORT_SYMBOL(unregister_netdevice_notifier); EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); +EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(skb_checksum_setup); #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_fdb_get_hook); +EXPORT_SYMBOL(br_fdb_put_hook); #endif #ifdef CONFIG_KMOD