2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
51 * Rudi Cilibrasi : Pass the right thing to
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/notifier.h>
94 #include <linux/skbuff.h>
96 #include <linux/rtnetlink.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/stat.h>
100 #include <linux/if_bridge.h>
101 #include <linux/divert.h>
103 #include <net/pkt_sched.h>
104 #include <net/checksum.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/kmod.h>
108 #include <linux/module.h>
109 #include <linux/kallsyms.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <linux/wireless.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/vs_base.h>
121 #include <linux/vs_network.h>
125 #include <linux/tcp.h>
126 #include <linux/udp.h>
130 * The list of packet types we will receive (as opposed to discard)
131 * and the routines to invoke.
133 * Why 16. Because with 16 the only overlap we get on a hash of the
134 * low nibble of the protocol value is RARP/SNAP/X.25.
136 * NOTE: That is no longer true with the addition of VLAN tags. Not
137 * sure which should go first, but I bet it won't make much
138 * difference if we are running VLANs. The good news is that
139 * this protocol won't be in the list unless compiled in, so
140 * the average user (w/out VLANs) will not be adversely affected.
157 static DEFINE_SPINLOCK(ptype_lock);
158 static struct list_head ptype_base[16]; /* 16 way hashed list */
159 static struct list_head ptype_all; /* Taps */
#ifdef CONFIG_NET_DMA
/* DMA-offload copy engine state; guarded so builds without
 * CONFIG_NET_DMA carry no dead globals. */
static struct dma_client *net_dma_client;
static unsigned int net_dma_count;
static spinlock_t net_dma_event_lock;
#endif
168 * The @dev_base list is protected by @dev_base_lock and the rtnl
171 * Pure readers hold dev_base_lock for reading.
173 * Writers must hold the rtnl semaphore while they loop through the
174 * dev_base list, and hold dev_base_lock for writing when they do the
175 * actual updates. This allows pure readers to access the list even
176 * while a writer is preparing to update it.
178 * To put it another way, dev_base_lock is held for writing only to
179 * protect against pure readers; the rtnl semaphore provides the
180 * protection against other writers.
182 * See, for example usages, register_netdevice() and
183 * unregister_netdevice(), which must be called with the rtnl
186 struct net_device *dev_base;
187 static struct net_device **dev_tail = &dev_base;
188 DEFINE_RWLOCK(dev_base_lock);
190 EXPORT_SYMBOL(dev_base);
191 EXPORT_SYMBOL(dev_base_lock);
193 #define NETDEV_HASHBITS 8
194 static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
195 static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
197 static inline struct hlist_head *dev_name_hash(const char *name)
199 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
200 return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
203 static inline struct hlist_head *dev_index_hash(int ifindex)
205 return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
212 static RAW_NOTIFIER_HEAD(netdev_chain);
215 * Device drivers call our routines to queue packets here. We empty the
216 * queue in the local softnet handler.
218 DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
extern int netdev_register_sysfs(struct net_device *);
extern void netdev_unregister_sysfs(struct net_device *);
#else
/* Without sysfs the registration hooks collapse to no-ops that
 * still "succeed" (return 0) so callers need no #ifdefs. */
#define netdev_sysfs_init()	 	(0)
#define netdev_register_sysfs(dev)	(0)
#define	netdev_unregister_sysfs(dev)	do { } while(0)
#endif
231 /*******************************************************************************
233 Protocol management and registration routines
235 *******************************************************************************/
241 static int netdev_nit;
244 * Add a protocol ID to the list. Now that the input handler is
245 * smarter we can dispense with all the messy stuff that used to be
248 * BEWARE!!! Protocol handlers, mangling input packets,
249 * MUST BE last in hash buckets and checking protocol handlers
250 * MUST start from promiscuous ptype_all chain in net_bh.
251 * It is true now, do not change it.
252 * Explanation follows: if protocol handler, mangling packet, will
253 * be the first on list, it is not able to sense, that packet
254 * is cloned and should be copied-on-write, so that it will
255 * change it and subsequent readers will get broken packet.
260 * dev_add_pack - add packet handler
261 * @pt: packet type declaration
263 * Add a protocol handler to the networking stack. The passed &packet_type
264 * is linked into kernel lists and may not be freed until it has been
265 * removed from the kernel lists.
267 * This call does not sleep therefore it can not
268 * guarantee all CPU's that are in middle of receiving packets
269 * will see the new packet type (until the next received packet).
272 void dev_add_pack(struct packet_type *pt)
276 spin_lock_bh(&ptype_lock);
277 if (pt->type == htons(ETH_P_ALL)) {
279 list_add_rcu(&pt->list, &ptype_all);
281 hash = ntohs(pt->type) & 15;
282 list_add_rcu(&pt->list, &ptype_base[hash]);
284 spin_unlock_bh(&ptype_lock);
288 * __dev_remove_pack - remove packet handler
289 * @pt: packet type declaration
291 * Remove a protocol handler that was previously added to the kernel
292 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
293 * from the kernel lists and can be freed or reused once this function
296 * The packet type might still be in use by receivers
297 * and must not be freed until after all the CPU's have gone
298 * through a quiescent state.
300 void __dev_remove_pack(struct packet_type *pt)
302 struct list_head *head;
303 struct packet_type *pt1;
305 spin_lock_bh(&ptype_lock);
307 if (pt->type == htons(ETH_P_ALL)) {
311 head = &ptype_base[ntohs(pt->type) & 15];
313 list_for_each_entry(pt1, head, list) {
315 list_del_rcu(&pt->list);
320 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
322 spin_unlock_bh(&ptype_lock);
325 * dev_remove_pack - remove packet handler
326 * @pt: packet type declaration
328 * Remove a protocol handler that was previously added to the kernel
329 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
330 * from the kernel lists and can be freed or reused once this function
333 * This call sleeps to guarantee that no CPU is looking at the packet
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	/* Wait for all in-flight RCU readers so the caller may free pt. */
	synchronize_net();
}
343 /******************************************************************************
345 Device Boot-time Settings Routines
347 *******************************************************************************/
349 /* Boot time configuration table */
350 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
353 * netdev_boot_setup_add - add new setup entry
354 * @name: name of the device
355 * @map: configured settings for the device
357 * Adds new setup entry to the dev_boot_setup list. The function
358 * returns 0 on error and 1 on success. This is a generic routine to
361 static int netdev_boot_setup_add(char *name, struct ifmap *map)
363 struct netdev_boot_setup *s;
367 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
368 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
369 memset(s[i].name, 0, sizeof(s[i].name));
370 strcpy(s[i].name, name);
371 memcpy(&s[i].map, map, sizeof(s[i].map));
376 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
380 * netdev_boot_setup_check - check boot time settings
381 * @dev: the netdevice
383 * Check boot time settings for the device.
384 * The found settings are set for the device to be used
385 * later in the device probing.
386 * Returns 0 if no settings found, 1 if they are.
388 int netdev_boot_setup_check(struct net_device *dev)
390 struct netdev_boot_setup *s = dev_boot_setup;
393 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
394 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
395 !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
396 dev->irq = s[i].map.irq;
397 dev->base_addr = s[i].map.base_addr;
398 dev->mem_start = s[i].map.mem_start;
399 dev->mem_end = s[i].map.mem_end;
408 * netdev_boot_base - get address from boot time settings
409 * @prefix: prefix for network device
410 * @unit: id for network device
412 * Check boot time settings for the base address of device.
413 * The found settings are set for the device to be used
414 * later in the device probing.
415 * Returns 0 if no settings found.
417 unsigned long netdev_boot_base(const char *prefix, int unit)
419 const struct netdev_boot_setup *s = dev_boot_setup;
423 sprintf(name, "%s%d", prefix, unit);
426 * If device already registered then return base of 1
427 * to indicate not to probe for this interface
429 if (__dev_get_by_name(name))
432 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
433 if (!strcmp(name, s[i].name))
434 return s[i].map.base_addr;
439 * Saves at boot time configured settings for any netdevice.
441 int __init netdev_boot_setup(char *str)
446 str = get_options(str, ARRAY_SIZE(ints), ints);
451 memset(&map, 0, sizeof(map));
455 map.base_addr = ints[2];
457 map.mem_start = ints[3];
459 map.mem_end = ints[4];
461 /* Add new entry to the list */
462 return netdev_boot_setup_add(str, &map);
465 __setup("netdev=", netdev_boot_setup);
467 /*******************************************************************************
469 Device Interface Subroutines
471 *******************************************************************************/
474 * __dev_get_by_name - find a device by its name
475 * @name: name to find
477 * Find an interface by name. Must be called under RTNL semaphore
478 * or @dev_base_lock. If the name is found a pointer to the device
479 * is returned. If the name is not found then %NULL is returned. The
480 * reference counters are not incremented so the caller must be
481 * careful with locks.
484 struct net_device *__dev_get_by_name(const char *name)
486 struct hlist_node *p;
488 hlist_for_each(p, dev_name_hash(name)) {
489 struct net_device *dev
490 = hlist_entry(p, struct net_device, name_hlist);
491 if (!strncmp(dev->name, name, IFNAMSIZ))
498 * dev_get_by_name - find a device by its name
499 * @name: name to find
501 * Find an interface by name. This can be called from any
502 * context and does its own locking. The returned handle has
503 * the usage count incremented and the caller must use dev_put() to
504 * release it when it is no longer needed. %NULL is returned if no
505 * matching device is found.
508 struct net_device *dev_get_by_name(const char *name)
510 struct net_device *dev;
512 read_lock(&dev_base_lock);
513 dev = __dev_get_by_name(name);
516 read_unlock(&dev_base_lock);
521 * __dev_get_by_index - find a device by its ifindex
522 * @ifindex: index of device
524 * Search for an interface by index. Returns %NULL if the device
525 * is not found or a pointer to the device. The device has not
526 * had its reference counter increased so the caller must be careful
527 * about locking. The caller must hold either the RTNL semaphore
531 struct net_device *__dev_get_by_index(int ifindex)
533 struct hlist_node *p;
535 hlist_for_each(p, dev_index_hash(ifindex)) {
536 struct net_device *dev
537 = hlist_entry(p, struct net_device, index_hlist);
538 if (dev->ifindex == ifindex)
546 * dev_get_by_index - find a device by its ifindex
547 * @ifindex: index of device
549 * Search for an interface by index. Returns NULL if the device
550 * is not found or a pointer to the device. The device returned has
551 * had a reference added and the pointer is safe until the user calls
552 * dev_put to indicate they have finished with it.
555 struct net_device *dev_get_by_index(int ifindex)
557 struct net_device *dev;
559 read_lock(&dev_base_lock);
560 dev = __dev_get_by_index(ifindex);
563 read_unlock(&dev_base_lock);
568 * dev_getbyhwaddr - find a device by its hardware address
569 * @type: media type of device
570 * @ha: hardware address
572 * Search for an interface by MAC address. Returns NULL if the device
573 * is not found or a pointer to the device. The caller must hold the
574 * rtnl semaphore. The returned device has not had its ref count increased
575 * and the caller must therefore be careful about locking
578 * If the API was consistent this would be __dev_get_by_hwaddr
581 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
583 struct net_device *dev;
587 for (dev = dev_base; dev; dev = dev->next)
588 if (dev->type == type &&
589 !memcmp(dev->dev_addr, ha, dev->addr_len))
594 EXPORT_SYMBOL(dev_getbyhwaddr);
596 struct net_device *dev_getfirstbyhwtype(unsigned short type)
598 struct net_device *dev;
601 for (dev = dev_base; dev; dev = dev->next) {
602 if (dev->type == type) {
611 EXPORT_SYMBOL(dev_getfirstbyhwtype);
614 * dev_get_by_flags - find any device with given flags
615 * @if_flags: IFF_* values
616 * @mask: bitmask of bits in if_flags to check
618 * Search for any interface with the given flags. Returns NULL if a device
619 * is not found or a pointer to the device. The device returned has
620 * had a reference added and the pointer is safe until the user calls
621 * dev_put to indicate they have finished with it.
624 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
626 struct net_device *dev;
628 read_lock(&dev_base_lock);
629 for (dev = dev_base; dev != NULL; dev = dev->next) {
630 if (((dev->flags ^ if_flags) & mask) == 0) {
635 read_unlock(&dev_base_lock);
640 * dev_valid_name - check if name is okay for network device
643 * Network device names need to be valid file names to
644 * to allow sysfs to work. We also disallow any kind of
647 int dev_valid_name(const char *name)
651 if (!strcmp(name, ".") || !strcmp(name, ".."))
655 if (*name == '/' || isspace(*name))
663 * dev_alloc_name - allocate a name for a device
665 * @name: name format string
667 * Passed a format string - eg "lt%d" it will try and find a suitable
668 * id. It scans list of devices to build up a free map, then chooses
669 * the first empty slot. The caller must hold the dev_base or rtnl lock
670 * while allocating the name and adding the device in order to avoid
672 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
673 * Returns the number of the unit assigned or a negative errno code.
676 int dev_alloc_name(struct net_device *dev, const char *name)
681 const int max_netdevices = 8*PAGE_SIZE;
683 struct net_device *d;
685 p = strnchr(name, IFNAMSIZ-1, '%');
688 * Verify the string as this thing may have come from
689 * the user. There must be either one "%d" and no other "%"
692 if (p[1] != 'd' || strchr(p + 2, '%'))
695 /* Use one page as a bit array of possible slots */
696 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
700 for (d = dev_base; d; d = d->next) {
701 if (!sscanf(d->name, name, &i))
703 if (i < 0 || i >= max_netdevices)
706 /* avoid cases where sscanf is not exact inverse of printf */
707 snprintf(buf, sizeof(buf), name, i);
708 if (!strncmp(buf, d->name, IFNAMSIZ))
712 i = find_first_zero_bit(inuse, max_netdevices);
713 free_page((unsigned long) inuse);
716 snprintf(buf, sizeof(buf), name, i);
717 if (!__dev_get_by_name(buf)) {
718 strlcpy(dev->name, buf, IFNAMSIZ);
722 /* It is possible to run out of possible slots
723 * when the name is long and there isn't enough space left
724 * for the digits, or if all bits are used.
731 * dev_change_name - change name of a device
733 * @newname: name (or format string) must be at least IFNAMSIZ
735 * Change name of a device, can pass format strings "eth%d".
738 int dev_change_name(struct net_device *dev, char *newname)
744 if (dev->flags & IFF_UP)
747 if (!dev_valid_name(newname))
750 if (strchr(newname, '%')) {
751 err = dev_alloc_name(dev, newname);
754 strcpy(newname, dev->name);
756 else if (__dev_get_by_name(newname))
759 strlcpy(dev->name, newname, IFNAMSIZ);
761 err = class_device_rename(&dev->class_dev, dev->name);
763 hlist_del(&dev->name_hlist);
764 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
765 raw_notifier_call_chain(&netdev_chain,
766 NETDEV_CHANGENAME, dev);
773 * netdev_features_change - device changes features
774 * @dev: device to cause notification
776 * Called to indicate a device has changed features.
778 void netdev_features_change(struct net_device *dev)
780 raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
782 EXPORT_SYMBOL(netdev_features_change);
785 * netdev_state_change - device changes state
786 * @dev: device to cause notification
788 * Called to indicate a device has changed state. This function calls
789 * the notifier chains for netdev_chain and sends a NEWLINK message
790 * to the routing socket.
792 void netdev_state_change(struct net_device *dev)
794 if (dev->flags & IFF_UP) {
795 raw_notifier_call_chain(&netdev_chain,
797 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
802 * dev_load - load a network module
803 * @name: name of interface
805 * If a network interface is not present and the process has suitable
806 * privileges this function loads the module. If module loading is not
807 * available in this kernel then it becomes a nop.
810 void dev_load(const char *name)
812 struct net_device *dev;
814 read_lock(&dev_base_lock);
815 dev = __dev_get_by_name(name);
816 read_unlock(&dev_base_lock);
818 if (!dev && capable(CAP_SYS_MODULE))
819 request_module("%s", name);
822 static int default_rebuild_header(struct sk_buff *skb)
824 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
825 skb->dev ? skb->dev->name : "NULL!!!");
832 * dev_open - prepare an interface for use.
833 * @dev: device to open
835 * Takes a device from down to up state. The device's private open
836 * function is invoked and then the multicast lists are loaded. Finally
837 * the device is moved into the up state and a %NETDEV_UP message is
838 * sent to the netdev notifier chain.
840 * Calling this function on an active interface is a nop. On a failure
841 * a negative errno code is returned.
843 int dev_open(struct net_device *dev)
851 if (dev->flags & IFF_UP)
855 * Is it even present?
857 if (!netif_device_present(dev))
861 * Call device private open method
863 set_bit(__LINK_STATE_START, &dev->state);
865 ret = dev->open(dev);
867 clear_bit(__LINK_STATE_START, &dev->state);
871 * If it went open OK then:
878 dev->flags |= IFF_UP;
881 * Initialize multicasting status
886 * Wakeup transmit queue engine
891 * ... and announce new interface.
893 raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
899 * dev_close - shutdown an interface.
900 * @dev: device to shutdown
902 * This function moves an active device into down state. A
903 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
904 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
907 int dev_close(struct net_device *dev)
909 if (!(dev->flags & IFF_UP))
913 * Tell people we are going down, so that they can
914 * prepare to death, when device is still operating.
916 raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
920 clear_bit(__LINK_STATE_START, &dev->state);
922 /* Synchronize to scheduled poll. We cannot touch poll list,
923 * it can be even on different cpu. So just clear netif_running(),
924 * and wait when poll really will happen. Actually, the best place
925 * for this is inside dev->stop() after device stopped its irq
926 * engine, but this requires more changes in devices. */
928 smp_mb__after_clear_bit(); /* Commit netif_running(). */
929 while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
935 * Call the device specific close. This cannot fail.
936 * Only if device is UP
938 * We allow it to be called even after a DETACH hot-plug
945 * Device is now down.
948 dev->flags &= ~IFF_UP;
951 * Tell people we are down
953 raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
960 * Device change register/unregister. These are not inline or static
961 * as we export them to the world.
965 * register_netdevice_notifier - register a network notifier block
968 * Register a notifier to be called when network device events occur.
969 * The notifier passed is linked into the kernel structures and must
970 * not be reused until it has been unregistered. A negative errno code
971 * is returned on a failure.
973 * When registered all registration and up events are replayed
974 * to the new notifier to allow device to have a race free
975 * view of the network device list.
978 int register_netdevice_notifier(struct notifier_block *nb)
980 struct net_device *dev;
984 err = raw_notifier_chain_register(&netdev_chain, nb);
986 for (dev = dev_base; dev; dev = dev->next) {
987 nb->notifier_call(nb, NETDEV_REGISTER, dev);
989 if (dev->flags & IFF_UP)
990 nb->notifier_call(nb, NETDEV_UP, dev);
998 * unregister_netdevice_notifier - unregister a network notifier block
1001 * Unregister a notifier previously registered by
1002 * register_netdevice_notifier(). The notifier is unlinked into the
1003 * kernel structures and may then be reused. A negative errno code
1004 * is returned on a failure.
1007 int unregister_netdevice_notifier(struct notifier_block *nb)
1012 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1018 * call_netdevice_notifiers - call all network notifier blocks
1019 * @val: value passed unmodified to notifier function
1020 * @v: pointer passed unmodified to notifier function
1022 * Call all network notifier blocks. Parameters and return value
1023 * are as for raw_notifier_call_chain().
1026 int call_netdevice_notifiers(unsigned long val, void *v)
1028 return raw_notifier_call_chain(&netdev_chain, val, v);
1031 /* When > 0 there are consumers of rx skb time stamps */
1032 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1034 void net_enable_timestamp(void)
1036 atomic_inc(&netstamp_needed);
1039 void net_disable_timestamp(void)
1041 atomic_dec(&netstamp_needed);
1044 void __net_timestamp(struct sk_buff *skb)
1048 do_gettimeofday(&tv);
1049 skb_set_timestamp(skb, &tv);
1051 EXPORT_SYMBOL(__net_timestamp);
1053 static inline void net_timestamp(struct sk_buff *skb)
1055 if (atomic_read(&netstamp_needed))
1056 __net_timestamp(skb);
1058 skb->tstamp.off_sec = 0;
1059 skb->tstamp.off_usec = 0;
1064 * Support routine. Sends outgoing frames to any network
1065 * taps currently in use.
1068 #if !((defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)))
1071 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1073 struct packet_type *ptype;
1078 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1079 /* Never send packets back to the socket
1080 * they originated from - MvS (miquels@drinkel.ow.org)
1082 if ((ptype->dev == dev || !ptype->dev) &&
1083 (ptype->af_packet_priv == NULL ||
1084 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1085 struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1089 /* skb->nh should be correctly
1090 set by sender, so that the second statement is
1091 just protection against buggy protocols.
1093 skb2->mac.raw = skb2->data;
1095 if (skb2->nh.raw < skb2->data ||
1096 skb2->nh.raw > skb2->tail) {
1097 if (net_ratelimit())
1098 printk(KERN_CRIT "protocol %04x is "
1100 skb2->protocol, dev->name);
1101 skb2->nh.raw = skb2->data;
1104 skb2->h.raw = skb2->nh.raw;
1105 skb2->pkt_type = PACKET_OUTGOING;
1106 ptype->func(skb2, skb->dev, ptype, skb->dev);
1113 void __netif_schedule(struct net_device *dev)
1115 if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1116 unsigned long flags;
1117 struct softnet_data *sd;
1119 local_irq_save(flags);
1120 sd = &__get_cpu_var(softnet_data);
1121 dev->next_sched = sd->output_queue;
1122 sd->output_queue = dev;
1123 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1124 local_irq_restore(flags);
1127 EXPORT_SYMBOL(__netif_schedule);
1129 void __netif_rx_schedule(struct net_device *dev)
1131 unsigned long flags;
1133 local_irq_save(flags);
1135 list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
1137 dev->quota += dev->weight;
1139 dev->quota = dev->weight;
1140 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1141 local_irq_restore(flags);
1143 EXPORT_SYMBOL(__netif_rx_schedule);
/* Free an skb from any context: defer to the irq-safe variant when
 * called with interrupts disabled or from hard irq context. */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1156 void netif_device_detach(struct net_device *dev)
1158 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1159 netif_running(dev)) {
1160 netif_stop_queue(dev);
1163 EXPORT_SYMBOL(netif_device_detach);
1165 void netif_device_attach(struct net_device *dev)
1167 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1168 netif_running(dev)) {
1169 netif_wake_queue(dev);
1170 __netdev_watchdog_up(dev);
1173 EXPORT_SYMBOL(netif_device_attach);
1177 * Invalidate hardware checksum when packet is to be mangled, and
1178 * complete checksum manually on outgoing path.
1180 int skb_checksum_help(struct sk_buff *skb, int inward)
1183 int ret = 0, offset = skb->h.raw - skb->data;
1186 goto out_set_summed;
1188 if (unlikely(skb_shinfo(skb)->gso_size)) {
1189 /* Let GSO fix up the checksum. */
1190 goto out_set_summed;
1193 if (skb_cloned(skb)) {
1194 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1199 BUG_ON(offset > (int)skb->len);
1200 csum = skb_checksum(skb, offset, skb->len-offset, 0);
1202 offset = skb->tail - skb->h.raw;
1203 BUG_ON(offset <= 0);
1204 BUG_ON(skb->csum + 2 > offset);
1206 *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1209 skb->ip_summed = CHECKSUM_NONE;
1215 * skb_gso_segment - Perform segmentation on skb.
1216 * @skb: buffer to segment
1217 * @features: features for the output path (see dev->features)
1219 * This function segments the given skb and returns a list of segments.
1221 * It may return NULL if the skb requires no segmentation. This is
1222 * only possible when GSO is used for verifying header integrity.
1224 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1226 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1227 struct packet_type *ptype;
1228 int type = skb->protocol;
1231 BUG_ON(skb_shinfo(skb)->frag_list);
1233 skb->mac.raw = skb->data;
1234 skb->mac_len = skb->nh.raw - skb->data;
1235 __skb_pull(skb, skb->mac_len);
1237 if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
1238 if (skb_header_cloned(skb) &&
1239 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1240 return ERR_PTR(err);
1244 list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
1245 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1246 if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
1247 err = ptype->gso_send_check(skb);
1248 segs = ERR_PTR(err);
1249 if (err || skb_gso_ok(skb, features))
1251 __skb_push(skb, skb->data - skb->nh.raw);
1253 segs = ptype->gso_segment(skb, features);
1259 __skb_push(skb, skb->data - skb->mac.raw);
1264 EXPORT_SYMBOL(skb_gso_segment);
1266 /* Take action when hardware reception checksum errors are detected. */
1268 void netdev_rx_csum_fault(struct net_device *dev)
1270 if (net_ratelimit()) {
1271 printk(KERN_ERR "%s: hw csum failure.\n",
1272 dev ? dev->name : "<unknown>");
1276 EXPORT_SYMBOL(netdev_rx_csum_fault);
1279 /* Actually, we should eliminate this check as soon as we know, that:
1280 * 1. IOMMU is present and allows to map all the memory.
1281 * 2. No high memory really exists on this machine.
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	/* Device cannot DMA from high memory: any highmem frag makes
	 * the skb illegal for scatter/gather on this device. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
			return 1;

#endif
	return 0;
}
/* Per-skb state for software GSO: stashes the original destructor in
 * the skb control block while the segment list is in flight. */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1306 static void dev_gso_skb_destructor(struct sk_buff *skb)
1308 struct dev_gso_cb *cb;
1311 struct sk_buff *nskb = skb->next;
1313 skb->next = nskb->next;
1316 } while (skb->next);
1318 cb = DEV_GSO_CB(skb);
1320 cb->destructor(skb);
1324 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1325 * @skb: buffer to segment
1327 * This function segments the given skb and stores the list of segments
1330 static int dev_gso_segment(struct sk_buff *skb)
1332 struct net_device *dev = skb->dev;
1333 struct sk_buff *segs;
1334 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1337 segs = skb_gso_segment(skb, features);
1339 /* Verifying header integrity only. */
1343 if (unlikely(IS_ERR(segs)))
1344 return PTR_ERR(segs);
1347 DEV_GSO_CB(skb)->destructor = skb->destructor;
1348 skb->destructor = dev_gso_skb_destructor;
/* Hand one skb (or, after GSO emulation, a chain of segments linked via
 * skb->next) to the driver's hard_start_xmit.  Caller holds the TX lock
 * as appropriate.  Returns the driver's TX code; NETDEV_TX_BUSY if the
 * queue stops mid-chain with segments still pending. */
1353 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1355 if (likely(!skb->next)) {
/* deliver a copy to any taps (packet sockets) before transmit */
1357 dev_queue_xmit_nit(skb, dev);
1359 if (netif_needs_gso(dev, skb)) {
1360 if (unlikely(dev_gso_segment(skb)))
1366 return dev->hard_start_xmit(skb, dev);
/* GSO-emulation path: transmit each segment in turn */
1371 struct sk_buff *nskb = skb->next;
1374 skb->next = nskb->next;
1376 rc = dev->hard_start_xmit(nskb, dev);
/* driver rejected the segment: re-chain it for a later retry */
1378 nskb->next = skb->next;
1382 if (unlikely(netif_queue_stopped(dev) && skb->next))
1383 return NETDEV_TX_BUSY;
1384 } while (skb->next);
/* all segments sent: restore the destructor saved by dev_gso_segment */
1386 skb->destructor = DEV_GSO_CB(skb)->destructor;
/* Take/release the device TX lock only for drivers that do not do their
 * own TX locking (NETIF_F_LLTX set means "lockless TX", so skip it). */
1393 #define HARD_TX_LOCK(dev, cpu) { \
1394 if ((dev->features & NETIF_F_LLTX) == 0) { \
1395 netif_tx_lock(dev); \
1399 #define HARD_TX_UNLOCK(dev) { \
1400 if ((dev->features & NETIF_F_LLTX) == 0) { \
1401 netif_tx_unlock(dev); \
/* Xen checksum-offload fixup: a guest-originated packet may arrive with
 * proto_csum_blank set (checksum deferred).  Recompute h.raw/csum so the
 * normal CHECKSUM_HW machinery can finish it.  Returns 0 on success;
 * elided branches presumably return an error for malformed packets. */
1406 inline int skb_checksum_setup(struct sk_buff *skb)
1408 if (skb->proto_csum_blank) {
/* only IPv4 is handled by this fixup */
1409 if (skb->protocol != htons(ETH_P_IP))
/* transport header = IP header + 4*ihl bytes */
1411 skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
1412 if (skb->h.raw >= skb->tail)
1414 switch (skb->nh.iph->protocol) {
/* csum holds the offset of the checksum field within the L4 header */
1416 skb->csum = offsetof(struct tcphdr, check);
1419 skb->csum = offsetof(struct udphdr, check);
1422 if (net_ratelimit())
1423 printk(KERN_ERR "Attempting to checksum a non-"
1424 "TCP/UDP packet, dropping a protocol"
1425 " %d packet", skb->nh.iph->protocol);
/* checksum field (2 bytes) must lie within the buffer */
1428 if ((skb->h.raw + skb->csum + 2) > skb->tail)
1430 skb->ip_summed = CHECKSUM_HW;
1431 skb->proto_csum_blank = 0;
/* Non-Xen build: nothing to fix up. */
1438 inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
1443 * dev_queue_xmit - transmit a buffer
1444 * @skb: buffer to transmit
1446 * Queue a buffer for transmission to a network device. The caller must
1447 * have set the device and priority and built the buffer before calling
1448 * this function. The function can be called from an interrupt.
1450 * A negative errno code is returned on a failure. A success does not
1451 * guarantee the frame will be transmitted as it may be dropped due
1452 * to congestion or traffic shaping.
1454 * -----------------------------------------------------------------------------------
1455 * I notice this method can also return errors from the queue disciplines,
1456 * including NET_XMIT_DROP, which is a positive value. So, errors can also
1459 * Regardless of the return value, the skb is consumed, so it is currently
1460 * difficult to retry a send to this method. (You can bump the ref count
1461 * before sending to hold a reference for retry if you are careful.)
1463 * When calling this method, interrupts MUST be enabled. This is because
1464 * the BH enable code must have IRQs enabled so that it will not deadlock.
1468 int dev_queue_xmit(struct sk_buff *skb)
1470 struct net_device *dev = skb->dev;
1474 /* If a checksum-deferred packet is forwarded to a device that needs a
1475 * checksum, correct the pointers and force checksumming.
1477 if (skb_checksum_setup(skb))
1480 /* GSO will handle the following emulations directly. */
1481 if (netif_needs_gso(dev, skb))
/* linearize if the device cannot take a frag_list */
1484 if (skb_shinfo(skb)->frag_list &&
1485 !(dev->features & NETIF_F_FRAGLIST) &&
1486 __skb_linearize(skb))
1489 /* Fragmented skb is linearized if device does not support SG,
1490 * or if at least one of fragments is in highmem and device
1491 * does not support DMA from it.
1493 if (skb_shinfo(skb)->nr_frags &&
1494 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1495 __skb_linearize(skb))
1498 /* If packet is not checksummed and device does not support
1499 * checksumming for this protocol, complete checksumming here.
1501 if (skb->ip_summed == CHECKSUM_HW &&
1502 (!(dev->features & NETIF_F_GEN_CSUM) &&
1503 (!(dev->features & NETIF_F_IP_CSUM) ||
1504 skb->protocol != htons(ETH_P_IP))))
1505 if (skb_checksum_help(skb, 0))
1509 spin_lock_prefetch(&dev->queue_lock);
1511 /* Disable soft irqs for various locks below. Also
1512 * stops preemption for RCU.
1516 /* Updates of qdisc are serialized by queue_lock.
1517 * The struct Qdisc which is pointed to by qdisc is now a
1518 * rcu structure - it may be accessed without acquiring
1519 * a lock (but the structure may be stale.) The freeing of the
1520 * qdisc will be deferred until it's known that there are no
1521 * more references to it.
1523 * If the qdisc has an enqueue function, we still need to
1524 * hold the queue_lock before calling it, since queue_lock
1525 * also serializes access to the device queue.
1528 q = rcu_dereference(dev->qdisc);
1529 #ifdef CONFIG_NET_CLS_ACT
1530 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1533 /* Grab device queue */
1534 spin_lock(&dev->queue_lock);
1537 rc = q->enqueue(skb, q);
1539 spin_unlock(&dev->queue_lock);
/* NET_XMIT_BYPASS is reported to the caller as plain success */
1541 rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1544 spin_unlock(&dev->queue_lock);
1547 /* The device has no queue. Common case for software devices:
1548 loopback, all the sorts of tunnels...
1550 Really, it is unlikely that netif_tx_lock protection is necessary
1551 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1553 However, it is possible, that they rely on protection
1556 Check this and shot the lock. It is not prone from deadlocks.
1557 Either shot noqueue qdisc, it is even simpler 8)
1559 if (dev->flags & IFF_UP) {
1560 int cpu = smp_processor_id(); /* ok because BHs are off */
/* xmit_lock_owner == cpu would mean we are recursing into ourselves */
1562 if (dev->xmit_lock_owner != cpu) {
1564 HARD_TX_LOCK(dev, cpu);
1566 if (!netif_queue_stopped(dev)) {
1568 if (!dev_hard_start_xmit(skb, dev)) {
1569 HARD_TX_UNLOCK(dev);
1573 HARD_TX_UNLOCK(dev);
1574 if (net_ratelimit())
1575 printk(KERN_CRIT "Virtual device %s asks to "
1576 "queue packet!\n", dev->name);
1578 /* Recursion is detected! It is possible,
1580 if (net_ratelimit())
1581 printk(KERN_CRIT "Dead loop on virtual device "
1582 "%s, fix it urgently!\n", dev->name);
1587 rcu_read_unlock_bh();
1593 rcu_read_unlock_bh();
1598 /*=======================================================================
1600 =======================================================================*/
/* Receive-path tunables (exported via sysctl elsewhere in the tree):
 * max per-CPU input queue length, per-softirq packet budget, and the
 * legacy backlog device weight. */
1602 int netdev_max_backlog = 1000;
1603 int netdev_budget = 300;
1604 int weight_p = 64; /* old backlog weight */
/* per-CPU RX statistics (total/dropped/time_squeeze counters) */
1606 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1610 * netif_rx - post buffer to the network code
1611 * @skb: buffer to post
1613 * This function receives a packet from a device driver and queues it for
1614 * the upper (protocol) levels to process. It always succeeds. The buffer
1615 * may be dropped during processing for congestion control or by the
1619 * NET_RX_SUCCESS (no congestion)
1620 * NET_RX_CN_LOW (low congestion)
1621 * NET_RX_CN_MOD (moderate congestion)
1622 * NET_RX_CN_HIGH (high congestion)
1623 * NET_RX_DROP (packet was dropped)
1627 int netif_rx(struct sk_buff *skb)
1629 struct softnet_data *queue;
1630 unsigned long flags;
1632 /* if netpoll wants it, pretend we never saw it */
1633 if (netpoll_rx(skb))
/* stamp arrival time only if the driver didn't already */
1636 if (!skb->tstamp.off_sec)
1640 * The code is rearranged so that the path is the most
1641 * short when CPU is congested, but is still operating.
/* irqs off: input_pkt_queue is also touched from hard-irq context */
1643 local_irq_save(flags);
1644 queue = &__get_cpu_var(softnet_data);
1646 __get_cpu_var(netdev_rx_stat).total++;
1647 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
/* non-empty queue: backlog softirq is already scheduled, just append */
1648 if (queue->input_pkt_queue.qlen) {
1651 __skb_queue_tail(&queue->input_pkt_queue, skb);
1652 local_irq_restore(flags);
1653 return NET_RX_SUCCESS;
/* queue was empty: arm the backlog NAPI device */
1656 netif_rx_schedule(&queue->backlog_dev);
/* backlog full: count the drop (the elided lines free the skb) */
1660 __get_cpu_var(netdev_rx_stat).dropped++;
1661 local_irq_restore(flags);
/* Process-context variant of netif_rx(): queues the packet and then
 * kicks pending softirqs so it is processed without waiting for the
 * next interrupt. */
1667 int netif_rx_ni(struct sk_buff *skb)
1672 err = netif_rx(skb);
1673 if (local_softirq_pending())
1680 EXPORT_SYMBOL(netif_rx_ni);
/* Bonding ingress helper: if the skb arrived on a bonding slave, either
 * drop it (per bonding policy) or redirect skb->dev to the master.
 * Returns the original device (return path elided in this extract). */
1682 static inline struct net_device *skb_bond(struct sk_buff *skb)
1684 struct net_device *dev = skb->dev;
1687 if (skb_bond_should_drop(skb)) {
1691 skb->dev = dev->master;
/* NET_TX softirq handler: (1) free skbs queued for deferred destruction
 * on this CPU's completion_queue, (2) run the qdiscs of devices queued
 * on output_queue.  Both lists are detached under irq-off so producers
 * in irq context stay safe. */
1697 static void net_tx_action(struct softirq_action *h)
1699 struct softnet_data *sd = &__get_cpu_var(softnet_data);
1701 if (sd->completion_queue) {
1702 struct sk_buff *clist;
1704 local_irq_disable();
1705 clist = sd->completion_queue;
1706 sd->completion_queue = NULL;
1710 struct sk_buff *skb = clist;
1711 clist = clist->next;
/* deferred-free skbs must have dropped their last reference */
1713 BUG_TRAP(!atomic_read(&skb->users));
1718 if (sd->output_queue) {
1719 struct net_device *head;
1721 local_irq_disable();
1722 head = sd->output_queue;
1723 sd->output_queue = NULL;
1727 struct net_device *dev = head;
1728 head = head->next_sched;
/* clear SCHED before running so a concurrent netif_schedule re-queues */
1730 smp_mb__before_clear_bit();
1731 clear_bit(__LINK_STATE_SCHED, &dev->state);
/* trylock: if someone else holds queue_lock, reschedule instead of spin */
1733 if (spin_trylock(&dev->queue_lock)) {
1735 spin_unlock(&dev->queue_lock);
1737 netif_schedule(dev);
/* Hand skb to one packet_type handler.  Takes an extra reference because
 * the handler consumes one; caller keeps its own for further delivery. */
1743 static __inline__ int deliver_skb(struct sk_buff *skb,
1744 struct packet_type *pt_prev,
1745 struct net_device *orig_dev)
1747 atomic_inc(&skb->users);
1748 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1751 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* Hooks filled in by the bridge module when it loads. */
1752 int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
1754 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
1755 unsigned char *addr);
1756 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
/* Divert skb into the bridge if its device is an enslaved bridge port.
 * Flushes any pending pt_prev delivery first.  Loopback packets are
 * never bridged. */
1758 static __inline__ int handle_bridge(struct sk_buff **pskb,
1759 struct packet_type **pt_prev, int *ret,
1760 struct net_device *orig_dev)
1762 struct net_bridge_port *port;
1764 if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
1765 (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
1769 *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
1773 return br_handle_frame_hook(port, pskb);
/* No bridge support compiled in: never consumes the skb. */
1776 #define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
1779 #ifdef CONFIG_NET_CLS_ACT
1780 /* TODO: Maybe we should just force sch_ingress to be compiled in
1781 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
1782 * a compare and 2 stores extra right now if we dont have it on
1783 * but have CONFIG_NET_CLS_ACT
1784 * NOTE: This doesnt stop any functionality; if you dont have
1785 * the ingress scheduler, you just cant add policies on ingress.
/* Run the device's ingress qdisc over skb and return its TC_ACT_*
 * verdict (TC_ACT_OK when no ingress qdisc is configured).  A redirect
 * TTL in tc_verd guards against packet-redirection loops. */
1788 static int ing_filter(struct sk_buff *skb)
1791 struct net_device *dev = skb->dev;
1792 int result = TC_ACT_OK;
1794 if (dev->qdisc_ingress) {
1795 __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
1796 if (MAX_RED_LOOP < ttl++) {
1797 printk(KERN_WARNING "Redir loop detected Dropping packet (%s->%s)\n",
1798 skb->input_dev->name, skb->dev->name);
1802 skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
1804 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
/* re-read under the lock; qdisc_ingress may change concurrently */
1806 spin_lock(&dev->ingress_lock);
1807 if ((q = dev->qdisc_ingress) != NULL)
1808 result = q->enqueue(skb, q);
1809 spin_unlock(&dev->ingress_lock);
/* Core RX dispatch: stamp/normalize the skb, run ingress filtering,
 * bridging and diversion, deliver copies to ETH_P_ALL taps, then hand
 * the skb to the matching L3 protocol handler.  Returns a NET_RX_*
 * code (NET_RX_DROP when nothing claimed the packet). */
1817 int netif_receive_skb(struct sk_buff *skb)
1819 struct packet_type *ptype, *pt_prev;
1820 struct net_device *orig_dev;
1821 int ret = NET_RX_DROP;
1822 unsigned short type;
1824 /* if we've gotten here through NAPI, check netpoll */
1825 if (skb->dev->poll && netpoll_rx(skb))
1828 if (!skb->tstamp.off_sec)
1831 if (!skb->input_dev)
1832 skb->input_dev = skb->dev;
/* may retarget skb->dev to a bonding master; keep the original for taps */
1834 orig_dev = skb_bond(skb);
1839 __get_cpu_var(netdev_rx_stat).total++;
1841 skb->h.raw = skb->nh.raw = skb->data;
1842 skb->mac_len = skb->nh.raw - skb->mac.raw;
1848 #ifdef CONFIG_NET_CLS_ACT
/* TC_NCLS: classification already done earlier; skip ingress this pass */
1849 if (skb->tc_verd & TC_NCLS) {
1850 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
/* Xen: translate ip_summed into the proto_data_valid hint */
1856 switch (skb->ip_summed) {
1857 case CHECKSUM_UNNECESSARY:
1858 skb->proto_data_valid = 1;
1861 /* XXX Implement me. */
1863 skb->proto_data_valid = 0;
/* deliver to all-protocol taps (e.g. tcpdump) bound to any/this device */
1868 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1869 if (!ptype->dev || ptype->dev == skb->dev) {
1871 ret = deliver_skb(skb, pt_prev, orig_dev);
1876 #ifdef CONFIG_NET_CLS_ACT
1878 ret = deliver_skb(skb, pt_prev, orig_dev);
1879 pt_prev = NULL; /* noone else should process this after*/
1881 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1884 ret = ing_filter(skb);
/* ingress policy dropped or stole the packet: stop delivery */
1886 if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
1895 handle_diverter(skb);
1897 if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
/* protocol-specific delivery, hashed by the low 4 bits of the type */
1900 type = skb->protocol;
1901 list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
1902 if (ptype->type == type &&
1903 (!ptype->dev || ptype->dev == skb->dev)) {
1905 ret = deliver_skb(skb, pt_prev, orig_dev);
/* last matching handler gets the skb itself, no extra reference */
1911 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1914 /* Jamal, now you will not able to escape explaining
1915 * me how you were going to use this. :-)
/* NAPI poll function of the per-CPU backlog pseudo-device: drain
 * input_pkt_queue through netif_receive_skb() until the quota/budget is
 * spent or one jiffy has elapsed.  When the queue empties, take the
 * device off the poll list and re-enable it. */
1925 static int process_backlog(struct net_device *backlog_dev, int *budget)
1928 int quota = min(backlog_dev->quota, *budget);
1929 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1930 unsigned long start_time = jiffies;
1932 backlog_dev->weight = weight_p;
1934 struct sk_buff *skb;
1935 struct net_device *dev;
/* the queue is refilled from irq context, so dequeue with irqs off */
1937 local_irq_disable();
1938 skb = __skb_dequeue(&queue->input_pkt_queue);
1945 netif_receive_skb(skb);
/* stop after the quota or roughly one tick of work */
1951 if (work >= quota || jiffies - start_time > 1)
1956 backlog_dev->quota -= work;
1961 backlog_dev->quota -= work;
1964 list_del(&backlog_dev->poll_list);
1965 smp_mb__before_clear_bit();
1966 netif_poll_enable(backlog_dev);
/* NET_RX softirq: round-robin the devices on this CPU's poll list,
 * invoking each driver's ->poll() until the global budget or a jiffy is
 * exhausted; devices with remaining work are rotated to the tail.  If we
 * run out of budget, reschedule ourselves (time_squeeze). */
1972 static void net_rx_action(struct softirq_action *h)
1974 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1975 unsigned long start_time = jiffies;
1976 int budget = netdev_budget;
1979 local_irq_disable();
1981 while (!list_empty(&queue->poll_list)) {
1982 struct net_device *dev;
1984 if (budget <= 0 || jiffies - start_time > 1)
1989 dev = list_entry(queue->poll_list.next,
1990 struct net_device, poll_list);
1991 have = netpoll_poll_lock(dev);
/* nonzero ->poll() means "more work": rotate the device to the tail */
1993 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1994 netpoll_poll_unlock(have);
1995 local_irq_disable();
1996 list_move_tail(&dev->poll_list, &queue->poll_list);
1998 dev->quota += dev->weight;
2000 dev->quota = dev->weight;
2002 netpoll_poll_unlock(have);
2004 local_irq_disable();
2008 #ifdef CONFIG_NET_DMA
2010 * There may not be any more sk_buffs coming right now, so push
2011 * any pending DMA copies to hardware
2013 if (net_dma_client) {
2014 struct dma_chan *chan;
2016 list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
2017 dma_async_memcpy_issue_pending(chan);
/* out of budget with devices still pending: account and re-raise */
2025 __get_cpu_var(netdev_rx_stat).time_squeeze++;
2026 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
/* Per-address-family SIOCGIFCONF handlers, indexed by protocol family. */
2030 static gifconf_func_t * gifconf_list [NPROTO];
2033 * register_gifconf - register a SIOCGIF handler
2034 * @family: Address family
2035 * @gifconf: Function handler
2037 * Register protocol dependent address dumping routines. The handler
2038 * that is passed must not be freed or reused until it has been replaced
2039 * by another handler.
2041 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2043 if (family >= NPROTO)
2045 gifconf_list[family] = gifconf;
2051 * Map an interface index to its name (SIOCGIFNAME)
2055 * We need this ioctl for efficient implementation of the
2056 * if_indextoname() function required by the IPv6 API. Without
2057 * it, we would have to search all the interfaces to find a
/* Copies the ifreq from userspace, looks up the device by ifr_ifindex
 * under dev_base_lock, writes its name back to userspace.  Returns
 * -EFAULT on copy failures (error paths partially elided). */
2061 static int dev_ifname(struct ifreq __user *arg)
2063 struct net_device *dev;
2067 * Fetch the caller's info block.
2070 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2073 read_lock(&dev_base_lock);
2074 dev = __dev_get_by_index(ifr.ifr_ifindex);
/* not found: drop the lock before returning (error return elided) */
2076 read_unlock(&dev_base_lock);
2080 strcpy(ifr.ifr_name, dev->name);
2081 read_unlock(&dev_base_lock);
2083 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2089 * Perform a SIOCGIFCONF call. This structure will change
2090 * size eventually, and there is nothing I can do about it.
2091 * Thus we will need a 'compatibility mode'.
/* Walks the global device list and lets each registered per-family
 * gifconf handler append entries to the user buffer.  VServer patch:
 * devices hidden from the current network context are skipped. */
2094 static int dev_ifconf(char __user *arg)
2097 struct net_device *dev;
2104 * Fetch the caller's info block.
2107 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2114 * Loop over the interfaces, and write an info block for each.
2118 for (dev = dev_base; dev; dev = dev->next) {
2119 if (vx_flags(VXF_HIDE_NETIF, 0) &&
2120 !dev_in_nx_info(dev, current->nx_info))
2122 for (i = 0; i < NPROTO; i++) {
2123 if (gifconf_list[i]) {
/* NULL buffer: size-query mode, handlers just count bytes */
2126 done = gifconf_list[i](dev, NULL, 0);
2128 done = gifconf_list[i](dev, pos + total,
2138 * All done. Write the updated control block back to the caller.
2140 ifc.ifc_len = total;
2143 * Both BSD and Solaris return 0 here, so we do too.
2145 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2148 #ifdef CONFIG_PROC_FS
2150 * This is invoked by the /proc filesystem handler to display a device
/* Returns the pos'th device on the dev_base list, or NULL if pos is
 * past the end.  Caller must hold dev_base_lock. */
2153 static __inline__ struct net_device *dev_get_idx(loff_t pos)
2155 struct net_device *dev;
2158 for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
2160 return i == pos ? dev : NULL;
/* seq_file iterator for /proc/net/dev.  start() takes dev_base_lock and
 * returns SEQ_START_TOKEN for the header row (pos 0); stop() drops the
 * lock.  pos is offset by one to make room for the header. */
2163 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2165 read_lock(&dev_base_lock);
2166 return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
2169 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* after the header token, start from dev_base; else follow ->next */
2172 return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
2175 void dev_seq_stop(struct seq_file *seq, void *v)
2177 read_unlock(&dev_base_lock);
/* Emit one /proc/net/dev line for dev.  VServer patch: devices not
 * visible to the current network context are silently skipped.  Falls
 * back to a "No statistics available" line when the driver provides no
 * get_stats hook. */
2180 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2182 struct nx_info *nxi = current->nx_info;
2184 if (vx_flags(VXF_HIDE_NETIF, 0) && !dev_in_nx_info(dev, nxi))
2186 if (dev->get_stats) {
2187 struct net_device_stats *stats = dev->get_stats(dev);
2189 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2190 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2191 dev->name, stats->rx_bytes, stats->rx_packets,
2193 stats->rx_dropped + stats->rx_missed_errors,
2194 stats->rx_fifo_errors,
2195 stats->rx_length_errors + stats->rx_over_errors +
2196 stats->rx_crc_errors + stats->rx_frame_errors,
2197 stats->rx_compressed, stats->multicast,
2198 stats->tx_bytes, stats->tx_packets,
2199 stats->tx_errors, stats->tx_dropped,
2200 stats->tx_fifo_errors, stats->collisions,
2201 stats->tx_carrier_errors +
2202 stats->tx_aborted_errors +
2203 stats->tx_window_errors +
2204 stats->tx_heartbeat_errors,
2205 stats->tx_compressed);
2207 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
2211 * Called from the PROCfs module. This now uses the new arbitrary sized
2212 * /proc/net interface to create /proc/net/dev
/* show(): print the two-line column header for the start token,
 * otherwise one stats line for the device in v. */
2214 static int dev_seq_show(struct seq_file *seq, void *v)
2216 if (v == SEQ_START_TOKEN)
2217 seq_puts(seq, "Inter-| Receive "
2219 " face |bytes packets errs drop fifo frame "
2220 "compressed multicast|bytes packets errs "
2221 "drop fifo colls carrier compressed\n");
2223 dev_seq_printf_stats(seq, v);
/* /proc/net/softnet_stat iterator: *pos indexes CPUs; advance past
 * offline CPUs and return that CPU's netdev_rx_stat. */
2227 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2229 struct netif_rx_stats *rc = NULL;
2231 while (*pos < NR_CPUS)
2232 if (cpu_online(*pos)) {
2233 rc = &per_cpu(netdev_rx_stat, *pos);
2240 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2242 return softnet_get_online(pos);
2245 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2248 return softnet_get_online(pos);
/* nothing to release: per-CPU data needs no lock here */
2251 static void softnet_seq_stop(struct seq_file *seq, void *v)
/* one hex line per CPU; middle fields are zero placeholders for the
 * removed fastroute counters */
2255 static int softnet_seq_show(struct seq_file *seq, void *v)
2257 struct netif_rx_stats *s = v;
2259 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2260 s->total, s->dropped, s->time_squeeze, 0,
2261 0, 0, 0, 0, /* was fastroute */
/* Wiring of the two proc files to the seq_file machinery:
 * /proc/net/dev -> dev_seq_ops, /proc/net/softnet_stat -> softnet_seq_ops. */
2266 static struct seq_operations dev_seq_ops = {
2267 .start = dev_seq_start,
2268 .next = dev_seq_next,
2269 .stop = dev_seq_stop,
2270 .show = dev_seq_show,
2273 static int dev_seq_open(struct inode *inode, struct file *file)
2275 return seq_open(file, &dev_seq_ops);
2278 static struct file_operations dev_seq_fops = {
2279 .owner = THIS_MODULE,
2280 .open = dev_seq_open,
2282 .llseek = seq_lseek,
2283 .release = seq_release,
2286 static struct seq_operations softnet_seq_ops = {
2287 .start = softnet_seq_start,
2288 .next = softnet_seq_next,
2289 .stop = softnet_seq_stop,
2290 .show = softnet_seq_show,
2293 static int softnet_seq_open(struct inode *inode, struct file *file)
2295 return seq_open(file, &softnet_seq_ops);
2298 static struct file_operations softnet_seq_fops = {
2299 .owner = THIS_MODULE,
2300 .open = softnet_seq_open,
2302 .llseek = seq_lseek,
2303 .release = seq_release,
2306 #ifdef CONFIG_WIRELESS_EXT
2307 extern int wireless_proc_init(void);
/* no wireless extensions: make the init call a successful no-op */
2309 #define wireless_proc_init() 0
/* Create /proc/net/dev, /proc/net/softnet_stat and the wireless proc
 * entries; error paths (elided) unwind the entries created so far. */
2312 static int __init dev_proc_init(void)
2316 if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2318 if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2320 if (wireless_proc_init())
2326 proc_net_remove("softnet_stat");
2328 proc_net_remove("dev");
2332 #define dev_proc_init() 0
2333 #endif /* CONFIG_PROC_FS */
2337 * netdev_set_master - set up master/slave pair
2338 * @slave: slave device
2339 * @master: new master device
2341 * Changes the master device of the slave. Pass %NULL to break the
2342 * bonding. The caller must hold the RTNL semaphore. On a failure
2343 * a negative errno code is returned. On success the reference counts
2344 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2345 * function returns zero.
2347 int netdev_set_master(struct net_device *slave, struct net_device *master)
2349 struct net_device *old = slave->master;
2359 slave->master = master;
/* IFF_SLAVE tracks whether a master is currently set */
2367 slave->flags |= IFF_SLAVE;
2369 slave->flags &= ~IFF_SLAVE;
2371 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2376 * dev_set_promiscuity - update promiscuity count on a device
2380 * Add or remove promiscuity from a device. While the count in the device
2381 * remains above zero the interface remains promiscuous. Once it hits zero
2382 * the device reverts back to normal filtering operation. A negative inc
2383 * value is used to drop promiscuity on the device.
2385 void dev_set_promiscuity(struct net_device *dev, int inc)
2387 unsigned short old_flags = dev->flags;
/* refcounted flag: IFF_PROMISC reflects promiscuity == 0 / != 0 */
2389 if ((dev->promiscuity += inc) == 0)
2390 dev->flags &= ~IFF_PROMISC;
2392 dev->flags |= IFF_PROMISC;
/* only log/audit an actual transition, not a nested inc/dec */
2393 if (dev->flags != old_flags) {
2395 printk(KERN_INFO "device %s %s promiscuous mode\n",
2396 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
/* promiscuous mode is security-relevant: record it in the audit log */
2398 audit_log(current->audit_context, GFP_ATOMIC,
2399 AUDIT_ANOM_PROMISCUOUS,
2400 "dev=%s prom=%d old_prom=%d auid=%u",
2401 dev->name, (dev->flags & IFF_PROMISC),
2402 (old_flags & IFF_PROMISC),
2403 audit_get_loginuid(current->audit_context));
2408 * dev_set_allmulti - update allmulti count on a device
2412 * Add or remove reception of all multicast frames to a device. While the
2413 * count in the device remains above zero the interface remains listening
2414 * to all interfaces. Once it hits zero the device reverts back to normal
2415 * filtering operation. A negative @inc value is used to drop the counter
2416 * when releasing a resource needing all multicasts.
2419 void dev_set_allmulti(struct net_device *dev, int inc)
2421 unsigned short old_flags = dev->flags;
/* refcounted flag, mirroring dev_set_promiscuity's scheme */
2423 dev->flags |= IFF_ALLMULTI;
2424 if ((dev->allmulti += inc) == 0)
2425 dev->flags &= ~IFF_ALLMULTI;
/* push the change to the driver only on an actual transition */
2426 if (dev->flags ^ old_flags)
/* Compose the user-visible interface flags: device flags with the
 * refcounted bits replaced by their gflags shadow, plus the volatile
 * RUNNING/LOWER_UP/DORMANT state bits derived from operstate. */
2430 unsigned dev_get_flags(const struct net_device *dev)
2434 flags = (dev->flags & ~(IFF_PROMISC |
2439 (dev->gflags & (IFF_PROMISC |
2442 if (netif_running(dev)) {
2443 if (netif_oper_up(dev))
2444 flags |= IFF_RUNNING;
2445 if (netif_carrier_ok(dev))
2446 flags |= IFF_LOWER_UP;
2447 if (netif_dormant(dev))
2448 flags |= IFF_DORMANT;
/* SIOCSIFFLAGS backend: apply the writable subset of flags, bring the
 * interface up/down on IFF_UP transitions, translate IFF_PROMISC /
 * IFF_ALLMULTI requests into refcount updates via their gflags shadow,
 * and notify (netdev_chain + rtnetlink) on any resulting change. */
2454 int dev_change_flags(struct net_device *dev, unsigned flags)
2457 int old_flags = dev->flags;
2460 * Set the flags on our device.
/* only the listed bits are user-settable; the rest are preserved */
2463 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2464 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2466 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2470 * Load in the correct multicast list now the flags have changed.
2476 * Have we downed the interface. We handle IFF_UP ourselves
2477 * according to user attempts to set it, rather than blindly
2482 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
2483 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2489 if (dev->flags & IFF_UP &&
2490 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2492 raw_notifier_call_chain(&netdev_chain,
2493 NETDEV_CHANGE, dev);
/* gflags shadows the user-requested bit; convert the toggle to +1/-1 */
2495 if ((flags ^ dev->gflags) & IFF_PROMISC) {
2496 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2497 dev->gflags ^= IFF_PROMISC;
2498 dev_set_promiscuity(dev, inc);
2501 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2502 is important. Some (broken) drivers set IFF_PROMISC, when
2503 IFF_ALLMULTI is requested not asking us and not reporting.
2505 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2506 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2507 dev->gflags ^= IFF_ALLMULTI;
2508 dev_set_allmulti(dev, inc);
2511 if (old_flags ^ dev->flags)
2512 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
/* Change the device MTU, via the driver's change_mtu hook when one is
 * provided.  No-op when unchanged; notifies NETDEV_CHANGEMTU on success
 * for a running interface.  Validation error returns are elided here. */
2517 int dev_set_mtu(struct net_device *dev, int new_mtu)
2521 if (new_mtu == dev->mtu)
2524 /* MTU must be positive. */
2528 if (!netif_device_present(dev))
2532 if (dev->change_mtu)
2533 err = dev->change_mtu(dev, new_mtu);
2536 if (!err && dev->flags & IFF_UP)
2537 raw_notifier_call_chain(&netdev_chain,
2538 NETDEV_CHANGEMTU, dev);
/* Change the hardware address through the driver's set_mac_address hook.
 * Fails when the driver has no hook, the address family does not match
 * the device type, or the device is detached; notifies NETDEV_CHANGEADDR
 * on success. */
2542 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2546 if (!dev->set_mac_address)
2548 if (sa->sa_family != dev->type)
2550 if (!netif_device_present(dev))
2552 err = dev->set_mac_address(dev, sa);
2554 raw_notifier_call_chain(&netdev_chain,
2555 NETDEV_CHANGEADDR, dev);
2560 * Perform the SIOCxIFxxx calls.
/* Dispatch a per-interface ioctl on the device named in ifr->ifr_name.
 * Caller (dev_ioctl) holds the appropriate locks and has already done
 * permission checks.  Unknown commands in the private/bonding/MII/bridge
 * ranges are forwarded to the driver's do_ioctl hook. */
2562 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2565 struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2571 case SIOCGIFFLAGS: /* Get interface flags */
2572 ifr->ifr_flags = dev_get_flags(dev);
2575 case SIOCSIFFLAGS: /* Set interface flags */
2576 return dev_change_flags(dev, ifr->ifr_flags);
2578 case SIOCGIFMETRIC: /* Get the metric on the interface
2579 (currently unused) */
2580 ifr->ifr_metric = 0;
2583 case SIOCSIFMETRIC: /* Set the metric on the interface
2584 (currently unused) */
2587 case SIOCGIFMTU: /* Get the MTU of a device */
2588 ifr->ifr_mtu = dev->mtu;
2591 case SIOCSIFMTU: /* Set the MTU of a device */
2592 return dev_set_mtu(dev, ifr->ifr_mtu);
/* SIOCGIFHWADDR (case label elided): copy out the hardware address */
2596 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2598 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2599 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2600 ifr->ifr_hwaddr.sa_family = dev->type;
2604 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2606 case SIOCSIFHWBROADCAST:
2607 if (ifr->ifr_hwaddr.sa_family != dev->type)
2609 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2610 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2611 raw_notifier_call_chain(&netdev_chain,
2612 NETDEV_CHANGEADDR, dev);
/* SIOCGIFMAP (case label elided): report the I/O resource map */
2616 ifr->ifr_map.mem_start = dev->mem_start;
2617 ifr->ifr_map.mem_end = dev->mem_end;
2618 ifr->ifr_map.base_addr = dev->base_addr;
2619 ifr->ifr_map.irq = dev->irq;
2620 ifr->ifr_map.dma = dev->dma;
2621 ifr->ifr_map.port = dev->if_port;
2625 if (dev->set_config) {
2626 if (!netif_device_present(dev))
2628 return dev->set_config(dev, &ifr->ifr_map);
/* SIOCADDMULTI (case label elided): hw address family must be AF_UNSPEC */
2633 if (!dev->set_multicast_list ||
2634 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2636 if (!netif_device_present(dev))
2638 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
/* SIOCDELMULTI (case label elided): mirror of the add path */
2642 if (!dev->set_multicast_list ||
2643 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2645 if (!netif_device_present(dev))
2647 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2651 ifr->ifr_ifindex = dev->ifindex;
2655 ifr->ifr_qlen = dev->tx_queue_len;
2659 if (ifr->ifr_qlen < 0)
2661 dev->tx_queue_len = ifr->ifr_qlen;
/* SIOCSIFNAME (case label elided): NUL-terminate before renaming */
2665 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2666 return dev_change_name(dev, ifr->ifr_newname);
2669 * Unknown or private ioctl
2673 if ((cmd >= SIOCDEVPRIVATE &&
2674 cmd <= SIOCDEVPRIVATE + 15) ||
2675 cmd == SIOCBONDENSLAVE ||
2676 cmd == SIOCBONDRELEASE ||
2677 cmd == SIOCBONDSETHWADDR ||
2678 cmd == SIOCBONDSLAVEINFOQUERY ||
2679 cmd == SIOCBONDINFOQUERY ||
2680 cmd == SIOCBONDCHANGEACTIVE ||
2681 cmd == SIOCGMIIPHY ||
2682 cmd == SIOCGMIIREG ||
2683 cmd == SIOCSMIIREG ||
2684 cmd == SIOCBRADDIF ||
2685 cmd == SIOCBRDELIF ||
2686 cmd == SIOCWANDEV) {
2688 if (dev->do_ioctl) {
2689 if (netif_device_present(dev))
2690 err = dev->do_ioctl(dev, ifr,
2703 * This function handles all "interface"-type I/O control requests. The actual
2704 * 'doing' part of this is dev_ifsioc above.
2708 * dev_ioctl - network device ioctl
2709 * @cmd: command to issue
2710 * @arg: pointer to a struct ifreq in user space
2712 * Issue ioctl functions to devices. This is normally called by the
2713 * user space syscall interfaces but can sometimes be useful for
2714 * other purposes. The return value is the return from the syscall if
2715 * positive or a negative errno code on error.
2718 int dev_ioctl(unsigned int cmd, void __user *arg)
2724 /* One special case: SIOCGIFCONF takes ifconf argument
2725 and requires shared lock, because it sleeps writing
2729 if (cmd == SIOCGIFCONF) {
2731 ret = dev_ifconf((char __user *) arg);
2735 if (cmd == SIOCGIFNAME)
2736 return dev_ifname((struct ifreq __user *)arg);
2738 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2741 ifr.ifr_name[IFNAMSIZ-1] = 0;
/* "eth0:1" alias notation: remember the colon for later restoration
 * (handling of the alias suffix is elided in this extract) */
2743 colon = strchr(ifr.ifr_name, ':');
2748 * See which interface the caller is talking about.
2753 * These ioctl calls:
2754 * - can be done by all.
2755 * - atomic and do not require locking.
/* read-only commands (case labels elided): shared dev_base_lock only */
2766 dev_load(ifr.ifr_name);
2767 read_lock(&dev_base_lock);
2768 ret = dev_ifsioc(&ifr, cmd);
2769 read_unlock(&dev_base_lock);
2773 if (copy_to_user(arg, &ifr,
2774 sizeof(struct ifreq)))
/* SIOCETHTOOL (case label elided): ethtool has its own dispatcher */
2780 dev_load(ifr.ifr_name);
2782 ret = dev_ethtool(&ifr);
2787 if (copy_to_user(arg, &ifr,
2788 sizeof(struct ifreq)))
2794 * These ioctl calls:
2795 * - require superuser power.
2796 * - require strict serialization.
/* privileged read-modify commands: CAP_NET_ADMIN + rtnl (elided) */
2802 if (!capable(CAP_NET_ADMIN))
2804 dev_load(ifr.ifr_name);
2806 ret = dev_ifsioc(&ifr, cmd);
2811 if (copy_to_user(arg, &ifr,
2812 sizeof(struct ifreq)))
2818 * These ioctl calls:
2819 * - require superuser power.
2820 * - require strict serialization.
2821 * - do not return a value
2831 case SIOCSIFHWBROADCAST:
2834 case SIOCBONDENSLAVE:
2835 case SIOCBONDRELEASE:
2836 case SIOCBONDSETHWADDR:
2837 case SIOCBONDCHANGEACTIVE:
2840 if (!capable(CAP_NET_ADMIN))
/* fall through: the two bond query ioctls need no CAP_NET_ADMIN */
2843 case SIOCBONDSLAVEINFOQUERY:
2844 case SIOCBONDINFOQUERY:
2845 dev_load(ifr.ifr_name);
2847 ret = dev_ifsioc(&ifr, cmd);
2852 /* Get the per device memory space. We can add this but
2853 * currently do not support it */
2855 /* Set the per device memory buffer space.
2856 * Not applicable in our case */
2861 * Unknown or private ioctl.
2864 if (cmd == SIOCWANDEV ||
2865 (cmd >= SIOCDEVPRIVATE &&
2866 cmd <= SIOCDEVPRIVATE + 15)) {
2867 dev_load(ifr.ifr_name);
2869 ret = dev_ifsioc(&ifr, cmd);
2871 if (!ret && copy_to_user(arg, &ifr,
2872 sizeof(struct ifreq)))
2876 #ifdef CONFIG_WIRELESS_EXT
2877 /* Take care of Wireless Extensions */
2878 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2879 /* If command is `set a parameter', or
2880 * `get the encoding parameters', check if
2881 * the user has the right to do it */
2882 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE
2883 || cmd == SIOCGIWENCODEEXT) {
2884 if (!capable(CAP_NET_ADMIN))
2887 dev_load(ifr.ifr_name);
2889 /* Follow me in net/core/wireless.c */
2890 ret = wireless_process_ioctl(&ifr, cmd);
2892 if (IW_IS_GET(cmd) &&
2893 copy_to_user(arg, &ifr,
2894 sizeof(struct ifreq)))
2898 #endif /* CONFIG_WIRELESS_EXT */
2905 * dev_new_index - allocate an ifindex
2907 * Returns a suitable unique value for a new device interface
2908 * number. The caller must hold the rtnl semaphore or the
2909 * dev_base_lock to be sure it remains unique.
2911 static int dev_new_index(void)
/* loop body (elided) increments a static counter until an unused index
 * is found */
2917 if (!__dev_get_by_index(ifindex))
/* Nonzero until net_dev_init() has run; registering earlier is a bug. */
2922 static int dev_boot_phase = 1;
2924 /* Delayed registration/unregisteration */
2925 static DEFINE_SPINLOCK(net_todo_list_lock);
2926 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
/* Queue dev for deferred (un)registration work processed later by the
 * netdev todo machinery (netdev_run_todo, outside this extract). */
2928 static inline void net_set_todo(struct net_device *dev)
2930 spin_lock(&net_todo_list_lock);
2931 list_add_tail(&dev->todo_list, &net_todo_list);
2932 spin_unlock(&net_todo_list_lock);
2936 * register_netdevice - register a network device
2937 * @dev: device to register
2939 * Take a completed network device structure and add it to the kernel
2940 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2941 * chain. 0 is returned on success. A negative errno code is returned
2942 * on a failure to set up the device, or if the name is a duplicate.
2944 * Callers must hold the rtnl semaphore. You may want
2945 * register_netdev() instead of this.
2948 * The locking appears insufficient to guarantee two parallel registers
2949 * will not get the same name.
/* NOTE(review): many original lines (error paths, 'goto'/'out' labels and
 * closing braces) are elided from this fragment; the comments below cover
 * only the statements that are visible. */
2952 int register_netdevice(struct net_device *dev)
2954 struct hlist_head *head;
2955 struct hlist_node *p;
/* Registering before net_dev_init() has run is a programming error. */
2958 BUG_ON(dev_boot_phase);
2963 /* When net_device's are persistent, this will be fatal. */
2964 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
/* Initialise the per-device locks before the device becomes visible. */
2966 spin_lock_init(&dev->queue_lock);
2967 spin_lock_init(&dev->_xmit_lock);
/* -1 means no CPU currently owns the xmit lock. */
2968 dev->xmit_lock_owner = -1;
2969 #ifdef CONFIG_NET_CLS_ACT
2970 spin_lock_init(&dev->ingress_lock);
2973 ret = alloc_divert_blk(dev);
2979 /* Init, if this function is available */
/* Driver-supplied init hook; presumably guarded by a NULL check on an
 * elided line -- confirm against the full source. */
2981 ret = dev->init(dev);
/* Reject names containing illegal characters. */
2989 if (!dev_valid_name(dev->name)) {
2994 dev->ifindex = dev_new_index();
/* iflink defaults to the device's own ifindex unless the driver set it. */
2995 if (dev->iflink == -1)
2996 dev->iflink = dev->ifindex;
2998 /* Check for existence of name */
2999 head = dev_name_hash(dev->name);
3000 hlist_for_each(p, head) {
3001 struct net_device *d
3002 = hlist_entry(p, struct net_device, name_hlist);
/* Duplicate name found -- the (elided) branch fails the registration. */
3003 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3009 /* Fix illegal SG+CSUM combinations. */
3010 if ((dev->features & NETIF_F_SG) &&
3011 !(dev->features & NETIF_F_ALL_CSUM)) {
3012 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3014 dev->features &= ~NETIF_F_SG;
3017 /* TSO requires that SG is present as well. */
3018 if ((dev->features & NETIF_F_TSO) &&
3019 !(dev->features & NETIF_F_SG)) {
3020 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3022 dev->features &= ~NETIF_F_TSO;
/* UFO requires both hardware checksumming and scatter/gather. */
3024 if (dev->features & NETIF_F_UFO) {
3025 if (!(dev->features & NETIF_F_HW_CSUM)) {
3026 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3027 "NETIF_F_HW_CSUM feature.\n",
3029 dev->features &= ~NETIF_F_UFO;
3031 if (!(dev->features & NETIF_F_SG)) {
3032 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3033 "NETIF_F_SG feature.\n",
3035 dev->features &= ~NETIF_F_UFO;
/* Install a default rebuild_header that acts purely as a bug trap -- it
 * should never actually be invoked. */
3040 * nil rebuild_header routine,
3041 * that should be never called and used as just bug trap.
3044 if (!dev->rebuild_header)
3045 dev->rebuild_header = default_rebuild_header;
3047 ret = netdev_register_sysfs(dev);
3050 dev->reg_state = NETREG_REGISTERED;
3053 * Default initial state at registry is that the
3054 * device is present.
3057 set_bit(__LINK_STATE_PRESENT, &dev->state);
3060 dev_init_scheduler(dev);
/* Link the device into the global list and both hash tables under
 * dev_base_lock so readers always see a consistent view. */
3061 write_lock_bh(&dev_base_lock);
3063 dev_tail = &dev->next;
3064 hlist_add_head(&dev->name_hlist, head);
3065 hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
3067 write_unlock_bh(&dev_base_lock);
3069 /* Notify protocols, that a new device appeared. */
3070 raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
/* Error-path cleanup (its label is on an elided line above). */
3077 free_divert_blk(dev);
3082 * register_netdev - register a network device
3083 * @dev: device to register
3085 * Take a completed network device structure and add it to the kernel
3086 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3087 * chain. 0 is returned on success. A negative errno code is returned
3088 * on a failure to set up the device, or if the name is a duplicate.
3090 * This is a wrapper around register_netdevice that takes the rtnl semaphore
3091 * and expands the device name if you passed a format string to
/* Convenience wrapper: takes the rtnl semaphore (on an elided line),
 * resolves a "%d"-style name template, then calls register_netdevice(). */
3094 int register_netdev(struct net_device *dev)
3101 * If the name is a format string the caller wants us to do a
/* e.g. "eth%d" -- allocate the first free matching name. */
3104 if (strchr(dev->name, '%')) {
3105 err = dev_alloc_name(dev, dev->name);
3111 * Back compatibility hook. Kill this one in 2.5
/* Empty or blank name: legacy drivers get an "eth%d" name assigned. */
3113 if (dev->name[0] == 0 || dev->name[0] == ' ') {
3114 err = dev_alloc_name(dev, "eth%d");
3119 err = register_netdevice(dev);
3124 EXPORT_SYMBOL(register_netdev);
3127 * netdev_wait_allrefs - wait until all references are gone.
3129 * This is called when unregistering network devices.
3131 * Any protocol or device that holds a reference should register
3132 * for netdevice notification, and cleanup and put back the
3133 * reference if they receive an UNREGISTER event.
3134 * We can get stuck here if buggy protocols don't correctly
/* Busy-wait (with periodic rebroadcasts) until every reference to @dev is
 * dropped.  Called from the unregister path; may take arbitrarily long if
 * a protocol leaks a reference. */
3137 static void netdev_wait_allrefs(struct net_device *dev)
3139 unsigned long rebroadcast_time, warning_time;
3141 rebroadcast_time = warning_time = jiffies;
3142 while (atomic_read(&dev->refcnt) != 0) {
/* Once a second, re-send NETDEV_UNREGISTER to nudge laggard holders. */
3143 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3146 /* Rebroadcast unregister notification */
3147 raw_notifier_call_chain(&netdev_chain,
3148 NETDEV_UNREGISTER, dev);
3150 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3152 /* We must not have linkwatch events
3153 * pending on unregister. If this
3154 * happens, we simply run the queue
3155 * unscheduled, resulting in a noop
3158 linkwatch_run_queue();
3163 rebroadcast_time = jiffies;
/* Every 10 seconds, log who we are still waiting for. */
3168 if (time_after(jiffies, warning_time + 10 * HZ)) {
3169 printk(KERN_EMERG "unregister_netdevice: "
3170 "waiting for %s to become free. Usage "
3172 dev->name, atomic_read(&dev->refcnt));
3173 warning_time = jiffies;
3182 * register_netdevice(x1);
3183 * register_netdevice(x2);
3185 * unregister_netdevice(y1);
3186 * unregister_netdevice(y2);
3192 * We are invoked by rtnl_unlock() after it drops the semaphore.
3193 * This allows us to deal with problems:
3194 * 1) We can delete sysfs objects which invoke hotplug
3195 * without deadlocking with linkwatch via keventd.
3196 * 2) Since we run with the RTNL semaphore not held, we can sleep
3197 * safely in order to wait for the netdev refcnt to drop to zero.
/* Serialises concurrent todo runs so unregister events complete in order. */
3199 static DEFINE_MUTEX(net_todo_run_mutex);
/* Drain net_todo_list: finish deferred unregistration for each queued
 * device.  Invoked by rtnl_unlock() after the RTNL semaphore is dropped,
 * so it may sleep (see the comment block above). */
3200 void netdev_run_todo(void)
3202 struct list_head list;
3204 /* Need to guard against multiple cpu's getting out of order. */
3205 mutex_lock(&net_todo_run_mutex);
3207 /* Not safe to do outside the semaphore. We must not return
3208 * until all unregister events invoked by the local processor
3209 * have been completed (either by this todo run, or one on
3212 if (list_empty(&net_todo_list))
3215 /* Snapshot list, allow later requests */
3216 spin_lock(&net_todo_list_lock);
3217 list_replace_init(&net_todo_list, &list);
3218 spin_unlock(&net_todo_list_lock);
3220 while (!list_empty(&list)) {
3221 struct net_device *dev
3222 = list_entry(list.next, struct net_device, todo_list);
3223 list_del(&dev->todo_list);
/* Only devices mid-unregister belong here; anything else is a bug. */
3225 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3226 printk(KERN_ERR "network todo '%s' but state %d\n",
3227 dev->name, dev->reg_state);
3232 netdev_unregister_sysfs(dev);
3233 dev->reg_state = NETREG_UNREGISTERED;
/* Block until every outstanding reference to the device is dropped. */
3235 netdev_wait_allrefs(dev);
3238 BUG_ON(atomic_read(&dev->refcnt));
/* Per-protocol private pointers must already be detached. */
3239 BUG_TRAP(!dev->ip_ptr);
3240 BUG_TRAP(!dev->ip6_ptr);
3241 BUG_TRAP(!dev->dn_ptr);
3243 /* It must be the very last action,
3244 * after this 'dev' may point to freed up memory.
3246 if (dev->destructor)
3247 dev->destructor(dev);
3251 mutex_unlock(&net_todo_run_mutex);
3255 * alloc_netdev - allocate network device
3256 * @sizeof_priv: size of private data to allocate space for
3257 * @name: device name format string
3258 * @setup: callback to initialize device
3260 * Allocates a struct net_device with private data area for driver use
3261 * and performs basic initialization.
/* Allocate a zeroed net_device plus @sizeof_priv bytes of driver-private
 * data in one block, aligned via NETDEV_ALIGN_CONST.  Returns NULL on
 * allocation failure.  NOTE(review): the NULL check after kzalloc and the
 * setup() invocation are on elided lines. */
3263 struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3264 void (*setup)(struct net_device *))
3267 struct net_device *dev;
3270 /* ensure 32-byte alignment of both the device and private area */
3271 alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
/* Extra NETDEV_ALIGN_CONST slack lets us round the base pointer up. */
3272 alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3274 p = kzalloc(alloc_size, GFP_KERNEL);
3276 printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
/* Round the raw allocation up to the alignment boundary... */
3280 dev = (struct net_device *)
3281 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
/* ...and remember the offset so free_netdev() can recover 'p'. */
3282 dev->padded = (char *)dev - (char *)p;
3285 dev->priv = netdev_priv(dev);
3288 strcpy(dev->name, name);
3291 EXPORT_SYMBOL(alloc_netdev);
3294 * free_netdev - free network device
3297 * This function does the last stage of destroying an allocated device
3298 * interface. The reference to the device object is released.
3299 * If this is the last reference then it will be freed.
/* Final stage of device destruction; releases the alloc_netdev() memory
 * (directly, or via the class-device refcount once registered). */
3301 void free_netdev(struct net_device *dev)
3304 /* Compatibility with error handling in drivers */
/* Never registered: no sysfs object exists, free the raw block now.
 * 'padded' undoes the alignment offset applied in alloc_netdev(). */
3305 if (dev->reg_state == NETREG_UNINITIALIZED) {
3306 kfree((char *)dev - dev->padded);
3310 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3311 dev->reg_state = NETREG_RELEASED;
3313 /* will free via class release */
3314 class_device_put(&dev->class_dev);
/* NOTE(review): this second kfree is presumably on an unreachable-after-
 * put path or under an elided conditional -- confirm with full source. */
3316 kfree((char *)dev - dev->padded);
3320 /* Synchronize with packet receive processing. */
/* NOTE(review): the body is elided in this fragment -- presumably waits
 * for an RCU grace period so in-flight receive paths finish; confirm. */
3321 void synchronize_net(void)
3328 * unregister_netdevice - remove device from the kernel
3331 * This function shuts down a device interface and removes it
3332 * from the kernel tables. On success 0 is returned, on a failure
3333 * a negative errno code is returned.
3335 * Callers must hold the rtnl semaphore. You may want
3336 * unregister_netdev() instead of this.
/* NOTE(review): several lines (returns, dev_close()/qdisc calls, the
 * final net_set_todo()) are elided; comments cover visible code only. */
3339 int unregister_netdevice(struct net_device *dev)
3341 struct net_device *d, **dp;
3343 BUG_ON(dev_boot_phase);
3346 /* Some devices call without registering for initialization unwind. */
3347 if (dev->reg_state == NETREG_UNINITIALIZED) {
3348 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3349 "was registered\n", dev->name, dev);
3353 BUG_ON(dev->reg_state != NETREG_REGISTERED);
3355 /* If device is running, close it first. */
3356 if (dev->flags & IFF_UP)
3359 /* And unlink it from device chain. */
/* Walk the singly linked dev_base chain looking for our entry. */
3360 for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3362 write_lock_bh(&dev_base_lock);
3363 hlist_del(&dev->name_hlist);
3364 hlist_del(&dev->index_hlist);
/* If we were the tail, the tail pointer must be rewound. */
3365 if (dev_tail == &dev->next)
3368 write_unlock_bh(&dev_base_lock);
/* Fell off the end of the chain without finding the device. */
3373 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3378 dev->reg_state = NETREG_UNREGISTERING;
3382 /* Shutdown queueing discipline. */
3386 /* Notify protocols, that we are about to destroy
3387 this device. They should clean all the things.
3389 raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3392 * Flush the multicast chain
3394 dev_mc_discard(dev);
3399 /* Notifier chain MUST detach us from master device. */
3400 BUG_TRAP(!dev->master);
3402 free_divert_blk(dev);
3404 /* Finish processing unregister after unlock */
3414 * unregister_netdev - remove device from the kernel
3417 * This function shuts down a device interface and removes it
3418 * from the kernel tables. On success 0 is returned, on a failure
3419 * a negative errno code is returned.
3421 * This is just a wrapper for unregister_netdevice that takes
3422 * the rtnl semaphore. In general you want to use this and not
3423 * unregister_netdevice.
/* Wrapper around unregister_netdevice() that takes the rtnl semaphore
 * (the rtnl_lock/rtnl_unlock calls are on elided lines). */
3425 void unregister_netdev(struct net_device *dev)
3428 unregister_netdevice(dev);
3432 EXPORT_SYMBOL(unregister_netdev);
3434 #ifdef CONFIG_HOTPLUG_CPU
/* CPU-hotplug notifier: when a CPU dies, splice its softnet queues
 * (completion, output, input) onto the current CPU's so no packets or
 * pending frees are stranded. */
3435 static int dev_cpu_callback(struct notifier_block *nfb,
3436 unsigned long action,
3439 struct sk_buff **list_skb;
3440 struct net_device **list_net;
3441 struct sk_buff *skb;
3442 unsigned int cpu, oldcpu = (unsigned long)ocpu;
3443 struct softnet_data *sd, *oldsd;
/* Only interested in the CPU_DEAD transition. */
3445 if (action != CPU_DEAD)
/* IRQs stay off while we splice the per-cpu queues. */
3448 local_irq_disable();
3449 cpu = smp_processor_id();
3450 sd = &per_cpu(softnet_data, cpu);
3451 oldsd = &per_cpu(softnet_data, oldcpu);
3453 /* Find end of our completion_queue. */
3454 list_skb = &sd->completion_queue;
3456 list_skb = &(*list_skb)->next;
3457 /* Append completion queue from offline CPU. */
3458 *list_skb = oldsd->completion_queue;
3459 oldsd->completion_queue = NULL;
3461 /* Find end of our output_queue. */
3462 list_net = &sd->output_queue;
3464 list_net = &(*list_net)->next_sched;
3465 /* Append output queue from offline CPU. */
3466 *list_net = oldsd->output_queue;
3467 oldsd->output_queue = NULL;
/* Kick TX softirq to process the newly adopted work. */
3469 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3472 /* Process offline CPU's input_pkt_queue */
3473 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3478 #endif /* CONFIG_HOTPLUG_CPU */
3480 #ifdef CONFIG_NET_DMA
3482 * net_dma_rebalance - rebalance DMA channel assignments across online CPUs
3483 * This is called when the number of channels allocated to the net_dma_client
3484 * changes. The net_dma_client tries to have one DMA channel per CPU.
/* Redistribute the available DMA channels across online CPUs, aiming for
 * one channel per CPU (channels may serve several CPUs if scarce). */
3486 static void net_dma_rebalance(void)
3488 unsigned int cpu, i, n;
3489 struct dma_chan *chan;
/* No channels at all: clear every CPU's assignment. */
3491 if (net_dma_count == 0) {
3492 for_each_online_cpu(cpu)
3493 rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
3498 cpu = first_cpu(cpu_online_map);
3501 list_for_each_entry(chan, &net_dma_client->channels, client_node) {
/* Give each channel ceil-or-floor of num_online_cpus()/net_dma_count
 * CPUs, distributing the remainder over the first channels. */
3502 n = ((num_online_cpus() / net_dma_count)
3503 + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
3506 per_cpu(softnet_data, cpu).net_dma = chan;
3507 cpu = next_cpu(cpu, cpu_online_map);
3516 * netdev_dma_event - event callback for the net_dma_client
3517 * @client: should always be net_dma_client
3518 * @chan: DMA channel for the event
3519 * @event: event type
/* Reacts to channel add/remove events by rebalancing the per-cpu channel
 * assignments; serialised by net_dma_event_lock.  NOTE(review): the
 * switch statement and net_dma_count updates sit on elided lines. */
3521 static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
3522 enum dma_event event)
3524 spin_lock(&net_dma_event_lock);
3526 case DMA_RESOURCE_ADDED:
3528 net_dma_rebalance();
3530 case DMA_RESOURCE_REMOVED:
3532 net_dma_rebalance();
3537 spin_unlock(&net_dma_event_lock);
3541 * netdev_dma_register - register the networking subsystem as a DMA client
/* Boot-time hook: register networking as a DMA-engine client and request
 * one channel per online CPU.  Returns an errno on registration failure
 * (return statements are on elided lines). */
3543 static int __init netdev_dma_register(void)
3545 spin_lock_init(&net_dma_event_lock);
3546 net_dma_client = dma_async_client_register(netdev_dma_event);
3547 if (net_dma_client == NULL)
3550 dma_async_client_chan_request(net_dma_client, num_online_cpus());
/* Stub used when CONFIG_NET_DMA is not configured. */
3555 static int __init netdev_dma_register(void) { return -ENODEV; }
3556 #endif /* CONFIG_NET_DMA */
3559 * Initialize the DEV module. At boot time this walks the device list and
3560 * unhooks any devices that fail to initialise (normally hardware not
3561 * present) and leaves us with a valid list of present and active devices.
3566 * This is called single threaded during boot, so no need
3567 * to take the rtnl semaphore.
/* Subsystem initcall: sets up procfs/sysfs, the packet-type lists, the
 * name/index hash tables, per-cpu softnet queues, NET_DMA, the TX/RX
 * softirqs and the CPU-hotplug notifier. */
3569 static int __init net_dev_init(void)
3571 int i, rc = -ENOMEM;
3573 BUG_ON(!dev_boot_phase);
3577 if (dev_proc_init())
3580 if (netdev_sysfs_init())
3583 INIT_LIST_HEAD(&ptype_all);
/* ptype_base is a 16-bucket hash of packet-type handlers. */
3584 for (i = 0; i < 16; i++)
3585 INIT_LIST_HEAD(&ptype_base[i]);
3587 for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3588 INIT_HLIST_HEAD(&dev_name_head[i]);
3590 for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3591 INIT_HLIST_HEAD(&dev_index_head[i]);
3594 * Initialise the packet receive queues.
3597 for_each_possible_cpu(i) {
3598 struct softnet_data *queue;
3600 queue = &per_cpu(softnet_data, i);
3601 skb_queue_head_init(&queue->input_pkt_queue);
3602 queue->completion_queue = NULL;
3603 INIT_LIST_HEAD(&queue->poll_list);
/* The per-cpu backlog pseudo-device feeds process_backlog() via NAPI. */
3604 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3605 queue->backlog_dev.weight = weight_p;
3606 queue->backlog_dev.poll = process_backlog;
3607 atomic_set(&queue->backlog_dev.refcnt, 1);
3610 netdev_dma_register();
3614 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3615 open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3617 hotcpu_notifier(dev_cpu_callback, 0);
3625 subsys_initcall(net_dev_init);
/* Symbols exported to modules; keep in sync with the public declarations
 * in <linux/netdevice.h>. */
3627 EXPORT_SYMBOL(__dev_get_by_index);
3628 EXPORT_SYMBOL(__dev_get_by_name);
3629 EXPORT_SYMBOL(__dev_remove_pack);
3630 EXPORT_SYMBOL(dev_valid_name);
3631 EXPORT_SYMBOL(dev_add_pack);
3632 EXPORT_SYMBOL(dev_alloc_name);
3633 EXPORT_SYMBOL(dev_close);
3634 EXPORT_SYMBOL(dev_get_by_flags);
3635 EXPORT_SYMBOL(dev_get_by_index);
3636 EXPORT_SYMBOL(dev_get_by_name);
3637 EXPORT_SYMBOL(dev_open);
3638 EXPORT_SYMBOL(dev_queue_xmit);
3639 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
3640 EXPORT_SYMBOL(dev_queue_xmit_nit);
3642 EXPORT_SYMBOL(dev_remove_pack);
3643 EXPORT_SYMBOL(dev_set_allmulti);
3644 EXPORT_SYMBOL(dev_set_promiscuity);
3645 EXPORT_SYMBOL(dev_change_flags);
3646 EXPORT_SYMBOL(dev_set_mtu);
3647 EXPORT_SYMBOL(dev_set_mac_address);
3648 EXPORT_SYMBOL(free_netdev);
3649 EXPORT_SYMBOL(netdev_boot_setup_check);
3650 EXPORT_SYMBOL(netdev_set_master);
3651 EXPORT_SYMBOL(netdev_state_change);
3652 EXPORT_SYMBOL(netif_receive_skb);
3653 EXPORT_SYMBOL(netif_rx);
3654 EXPORT_SYMBOL(register_gifconf);
3655 EXPORT_SYMBOL(register_netdevice);
3656 EXPORT_SYMBOL(register_netdevice_notifier);
3657 EXPORT_SYMBOL(skb_checksum_help);
3658 EXPORT_SYMBOL(synchronize_net);
3659 EXPORT_SYMBOL(unregister_netdevice);
3660 EXPORT_SYMBOL(unregister_netdevice_notifier);
3661 EXPORT_SYMBOL(net_enable_timestamp);
3662 EXPORT_SYMBOL(net_disable_timestamp);
3663 EXPORT_SYMBOL(dev_get_flags);
3664 EXPORT_SYMBOL(skb_checksum_setup);
/* Bridge hook points, only present when bridging is configured. */
3666 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3667 EXPORT_SYMBOL(br_handle_frame_hook);
3668 EXPORT_SYMBOL(br_fdb_get_hook);
3669 EXPORT_SYMBOL(br_fdb_put_hook);
3673 EXPORT_SYMBOL(dev_load);
3676 EXPORT_PER_CPU_SYMBOL(softnet_data);