net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/notifier.h>
  94 #include <linux/skbuff.h>
  95 #include <net/sock.h>
  96 #include <linux/rtnetlink.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/stat.h>
 100 #include <linux/if_bridge.h>
 101 #include <linux/divert.h>
 102 #include <net/dst.h>
 103 #include <net/pkt_sched.h>
 104 #include <net/checksum.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/kmod.h>
 108 #include <linux/module.h>
 109 #include <linux/kallsyms.h>
 110 #include <linux/netpoll.h>
 111 #include <linux/rcupdate.h>
 112 #include <linux/delay.h>
 113 #include <linux/wireless.h>
 114 #include <net/iw_handler.h>
 115 #include <asm/current.h>
 116 #include <linux/audit.h>
 117 #include <linux/dmaengine.h>
 118 #include <linux/err.h>
 119 #include <linux/ctype.h>
 120 #include <linux/vs_network.h>
 121
 122 #ifdef CONFIG_XEN
 123 #include <net/ip.h>
 124 #include <linux/tcp.h>
 125 #include <linux/udp.h>
 126 #endif
 127
 128 /*
 129  *      The list of packet types we will receive (as opposed to discard)
 130  *      and the routines to invoke.
 131  *
 132  *      Why 16. Because with 16 the only overlap we get on a hash of the
 133  *      low nibble of the protocol value is RARP/SNAP/X.25.
 134  *
 135  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 136  *             sure which should go first, but I bet it won't make much
 137  *             difference if we are running VLANs.  The good news is that
 138  *             this protocol won't be in the list unless compiled in, so
 139  *             the average user (w/out VLANs) will not be adversely affected.
 140  *             --BLG
 141  *
 142  *              0800    IP
 143  *              8100    802.1Q VLAN
 144  *              0001    802.3
 145  *              0002    AX.25
 146  *              0004    802.2
 147  *              8035    RARP
 148  *              0005    SNAP
 149  *              0805    X.25
 150  *              0806    ARP
 151  *              8137    IPX
 152  *              0009    Localtalk
 153  *              86DD    IPv6
 154  */
 155
/*
 * ptype_lock is taken (with bottom halves disabled) by dev_add_pack() and
 * __dev_remove_pack() while they modify the two handler lists below.
 * ptype_base is indexed by (ntohs(type) & 15); ptype_all holds the
 * handlers registered for ETH_P_ALL.
 */
static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[16]; /* 16 way hashed list */
static struct list_head ptype_all;              /* Taps */
 159
#ifdef CONFIG_NET_DMA
/*
 * Only compiled when CONFIG_NET_DMA is set.
 * NOTE(review): the users of these three variables are outside this part
 * of the file; presumably they track the networking DMA client, a count
 * related to it, and a lock for DMA events -- confirm against the users.
 */
static struct dma_client *net_dma_client;
static unsigned int net_dma_count;
static spinlock_t net_dma_event_lock;
#endif
 165
 166 /*
 167  * The @dev_base list is protected by @dev_base_lock and the rtnl
 168  * semaphore.
 169  *
 170  * Pure readers hold dev_base_lock for reading.
 171  *
 172  * Writers must hold the rtnl semaphore while they loop through the
 173  * dev_base list, and hold dev_base_lock for writing when they do the
 174  * actual updates.  This allows pure readers to access the list even
 175  * while a writer is preparing to update it.
 176  *
 177  * To put it another way, dev_base_lock is held for writing only to
 178  * protect against pure readers; the rtnl semaphore provides the
 179  * protection against other writers.
 180  *
 181  * See, for example usages, register_netdevice() and
 182  * unregister_netdevice(), which must be called with the rtnl
 183  * semaphore held.
 184  */
/* Head of the device list; entries are chained through dev->next. */
struct net_device *dev_base;
/*
 * Starts out pointing at dev_base itself.
 * NOTE(review): presumably the link through which new devices are
 * appended -- its users are outside this part of the file.
 */
static struct net_device **dev_tail = &dev_base;
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base);
EXPORT_SYMBOL(dev_base_lock);
 191
/*
 * Device lookup tables, each with 1 << NETDEV_HASHBITS buckets:
 * dev_name_head is indexed through dev_name_hash(), dev_index_head
 * through dev_index_hash().
 */
#define NETDEV_HASHBITS 8
static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
 195
 196 static inline struct hlist_head *dev_name_hash(const char *name)
 197 {
 198         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 199         return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 200 }
 201
 202 static inline struct hlist_head *dev_index_hash(int ifindex)
 203 {
 204         return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 205 }
 206
/*
 *	Our notifier list.  Device events (NETDEV_UP, NETDEV_CHANGE,
 *	NETDEV_CHANGENAME, NETDEV_FEAT_CHANGE, ...) are delivered to it
 *	with raw_notifier_call_chain().
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 212
/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.  There is one instance of this
 *	structure per CPU.
 */
DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 218
/*
 * sysfs hooks.  With CONFIG_SYSFS they are declared extern here; without
 * it they are stubbed out as macros: the two value-returning hooks
 * evaluate to 0 and the unregister hook is an empty statement.
 */
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
extern int netdev_register_sysfs(struct net_device *);
extern void netdev_unregister_sysfs(struct net_device *);
#else
#define netdev_sysfs_init()             (0)
#define netdev_register_sysfs(dev)      (0)
#define netdev_unregister_sysfs(dev)    do { } while(0)
#endif
 228
 229
 230 /*******************************************************************************
 231
 232                 Protocol management and registration routines
 233
 234 *******************************************************************************/
 235
/*
 *	For efficiency: a count of the ETH_P_ALL handlers, incremented in
 *	dev_add_pack() and decremented in __dev_remove_pack().
 *	NOTE(review): presumably lets callers skip the ptype_all walk when
 *	it is zero -- the readers are outside this part of the file.
 */

static int netdev_nit;
 241
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the walk over protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles the packet were
 *	first on the list, it could not tell that the packet
 *	is cloned and should be copied-on-write, so it would
 *	change it and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 257
 258 /**
 259  *      dev_add_pack - add packet handler
 260  *      @pt: packet type declaration
 261  *
 262  *      Add a protocol handler to the networking stack. The passed &packet_type
 263  *      is linked into kernel lists and may not be freed until it has been
 264  *      removed from the kernel lists.
 265  *
 266  *      This call does not sleep therefore it can not
 267  *      guarantee all CPU's that are in middle of receiving packets
 268  *      will see the new packet type (until the next received packet).
 269  */
 270
 271 void dev_add_pack(struct packet_type *pt)
 272 {
 273         int hash;
 274
 275         spin_lock_bh(&ptype_lock);
 276         if (pt->type == htons(ETH_P_ALL)) {
 277                 netdev_nit++;
 278                 list_add_rcu(&pt->list, &ptype_all);
 279         } else {
 280                 hash = ntohs(pt->type) & 15;
 281                 list_add_rcu(&pt->list, &ptype_base[hash]);
 282         }
 283         spin_unlock_bh(&ptype_lock);
 284 }
 285
 286 /**
 287  *      __dev_remove_pack        - remove packet handler
 288  *      @pt: packet type declaration
 289  *
 290  *      Remove a protocol handler that was previously added to the kernel
 291  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 292  *      from the kernel lists and can be freed or reused once this function
 293  *      returns.
 294  *
 295  *      The packet type might still be in use by receivers
 296  *      and must not be freed until after all the CPU's have gone
 297  *      through a quiescent state.
 298  */
 299 void __dev_remove_pack(struct packet_type *pt)
 300 {
 301         struct list_head *head;
 302         struct packet_type *pt1;
 303
 304         spin_lock_bh(&ptype_lock);
 305
 306         if (pt->type == htons(ETH_P_ALL)) {
 307                 netdev_nit--;
 308                 head = &ptype_all;
 309         } else
 310                 head = &ptype_base[ntohs(pt->type) & 15];
 311
 312         list_for_each_entry(pt1, head, list) {
 313                 if (pt == pt1) {
 314                         list_del_rcu(&pt->list);
 315                         goto out;
 316                 }
 317         }
 318
 319         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 320 out:
 321         spin_unlock_bh(&ptype_lock);
 322 }
 323 /**
 324  *      dev_remove_pack  - remove packet handler
 325  *      @pt: packet type declaration
 326  *
 327  *      Remove a protocol handler that was previously added to the kernel
 328  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 329  *      from the kernel lists and can be freed or reused once this function
 330  *      returns.
 331  *
 332  *      This call sleeps to guarantee that no CPU is looking at the packet
 333  *      type after return.
 334  */
 335 void dev_remove_pack(struct packet_type *pt)
 336 {
 337         __dev_remove_pack(pt);
 338
 339         synchronize_net();
 340 }
 341
 342 /******************************************************************************
 343
 344                       Device Boot-time Settings Routines
 345
 346 *******************************************************************************/
 347
/*
 * Boot time configuration table.  A slot whose name starts with '\0' or
 * ' ' is treated as unused by netdev_boot_setup_add() and
 * netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 350
 351 /**
 352  *      netdev_boot_setup_add   - add new setup entry
 353  *      @name: name of the device
 354  *      @map: configured settings for the device
 355  *
 356  *      Adds new setup entry to the dev_boot_setup list.  The function
 357  *      returns 0 on error and 1 on success.  This is a generic routine to
 358  *      all netdevices.
 359  */
 360 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 361 {
 362         struct netdev_boot_setup *s;
 363         int i;
 364
 365         s = dev_boot_setup;
 366         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 367                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 368                         memset(s[i].name, 0, sizeof(s[i].name));
 369                         strcpy(s[i].name, name);
 370                         memcpy(&s[i].map, map, sizeof(s[i].map));
 371                         break;
 372                 }
 373         }
 374
 375         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 376 }
 377
 378 /**
 379  *      netdev_boot_setup_check - check boot time settings
 380  *      @dev: the netdevice
 381  *
 382  *      Check boot time settings for the device.
 383  *      The found settings are set for the device to be used
 384  *      later in the device probing.
 385  *      Returns 0 if no settings found, 1 if they are.
 386  */
 387 int netdev_boot_setup_check(struct net_device *dev)
 388 {
 389         struct netdev_boot_setup *s = dev_boot_setup;
 390         int i;
 391
 392         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 393                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 394                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 395                         dev->irq        = s[i].map.irq;
 396                         dev->base_addr  = s[i].map.base_addr;
 397                         dev->mem_start  = s[i].map.mem_start;
 398                         dev->mem_end    = s[i].map.mem_end;
 399                         return 1;
 400                 }
 401         }
 402         return 0;
 403 }
 404
 405
 406 /**
 407  *      netdev_boot_base        - get address from boot time settings
 408  *      @prefix: prefix for network device
 409  *      @unit: id for network device
 410  *
 411  *      Check boot time settings for the base address of device.
 412  *      The found settings are set for the device to be used
 413  *      later in the device probing.
 414  *      Returns 0 if no settings found.
 415  */
 416 unsigned long netdev_boot_base(const char *prefix, int unit)
 417 {
 418         const struct netdev_boot_setup *s = dev_boot_setup;
 419         char name[IFNAMSIZ];
 420         int i;
 421
 422         sprintf(name, "%s%d", prefix, unit);
 423
 424         /*
 425          * If device already registered then return base of 1
 426          * to indicate not to probe for this interface
 427          */
 428         if (__dev_get_by_name(name))
 429                 return 1;
 430
 431         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 432                 if (!strcmp(name, s[i].name))
 433                         return s[i].map.base_addr;
 434         return 0;
 435 }
 436
 437 /*
 438  * Saves at boot time configured settings for any netdevice.
 439  */
 440 int __init netdev_boot_setup(char *str)
 441 {
 442         int ints[5];
 443         struct ifmap map;
 444
 445         str = get_options(str, ARRAY_SIZE(ints), ints);
 446         if (!str || !*str)
 447                 return 0;
 448
 449         /* Save settings */
 450         memset(&map, 0, sizeof(map));
 451         if (ints[0] > 0)
 452                 map.irq = ints[1];
 453         if (ints[0] > 1)
 454                 map.base_addr = ints[2];
 455         if (ints[0] > 2)
 456                 map.mem_start = ints[3];
 457         if (ints[0] > 3)
 458                 map.mem_end = ints[4];
 459
 460         /* Add new entry to the list */
 461         return netdev_boot_setup_add(str, &map);
 462 }
 463
 464 __setup("netdev=", netdev_boot_setup);
 465
 466 /*******************************************************************************
 467
 468                             Device Interface Subroutines
 469
 470 *******************************************************************************/
 471
 472 /**
 473  *      __dev_get_by_name       - find a device by its name
 474  *      @name: name to find
 475  *
 476  *      Find an interface by name. Must be called under RTNL semaphore
 477  *      or @dev_base_lock. If the name is found a pointer to the device
 478  *      is returned. If the name is not found then %NULL is returned. The
 479  *      reference counters are not incremented so the caller must be
 480  *      careful with locks.
 481  */
 482
 483 struct net_device *__dev_get_by_name(const char *name)
 484 {
 485         struct hlist_node *p;
 486
 487         hlist_for_each(p, dev_name_hash(name)) {
 488                 struct net_device *dev
 489                         = hlist_entry(p, struct net_device, name_hlist);
 490                 if (!strncmp(dev->name, name, IFNAMSIZ))
 491                         return dev;
 492         }
 493         return NULL;
 494 }
 495
 496 /**
 497  *      dev_get_by_name         - find a device by its name
 498  *      @name: name to find
 499  *
 500  *      Find an interface by name. This can be called from any
 501  *      context and does its own locking. The returned handle has
 502  *      the usage count incremented and the caller must use dev_put() to
 503  *      release it when it is no longer needed. %NULL is returned if no
 504  *      matching device is found.
 505  */
 506
 507 struct net_device *dev_get_by_name(const char *name)
 508 {
 509         struct net_device *dev;
 510
 511         read_lock(&dev_base_lock);
 512         dev = __dev_get_by_name(name);
 513         if (dev)
 514                 dev_hold(dev);
 515         read_unlock(&dev_base_lock);
 516         return dev;
 517 }
 518
 519 /**
 520  *      __dev_get_by_index - find a device by its ifindex
 521  *      @ifindex: index of device
 522  *
 523  *      Search for an interface by index. Returns %NULL if the device
 524  *      is not found or a pointer to the device. The device has not
 525  *      had its reference counter increased so the caller must be careful
 526  *      about locking. The caller must hold either the RTNL semaphore
 527  *      or @dev_base_lock.
 528  */
 529
 530 struct net_device *__dev_get_by_index(int ifindex)
 531 {
 532         struct hlist_node *p;
 533
 534         hlist_for_each(p, dev_index_hash(ifindex)) {
 535                 struct net_device *dev
 536                         = hlist_entry(p, struct net_device, index_hlist);
 537                 if (dev->ifindex == ifindex)
 538                         return dev;
 539         }
 540         return NULL;
 541 }
 542
 543
 544 /**
 545  *      dev_get_by_index - find a device by its ifindex
 546  *      @ifindex: index of device
 547  *
 548  *      Search for an interface by index. Returns NULL if the device
 549  *      is not found or a pointer to the device. The device returned has
 550  *      had a reference added and the pointer is safe until the user calls
 551  *      dev_put to indicate they have finished with it.
 552  */
 553
 554 struct net_device *dev_get_by_index(int ifindex)
 555 {
 556         struct net_device *dev;
 557
 558         read_lock(&dev_base_lock);
 559         dev = __dev_get_by_index(ifindex);
 560         if (dev)
 561                 dev_hold(dev);
 562         read_unlock(&dev_base_lock);
 563         return dev;
 564 }
 565
 566 /**
 567  *      dev_getbyhwaddr - find a device by its hardware address
 568  *      @type: media type of device
 569  *      @ha: hardware address
 570  *
 571  *      Search for an interface by MAC address. Returns NULL if the device
 572  *      is not found or a pointer to the device. The caller must hold the
 573  *      rtnl semaphore. The returned device has not had its ref count increased
 574  *      and the caller must therefore be careful about locking
 575  *
 576  *      BUGS:
 577  *      If the API was consistent this would be __dev_get_by_hwaddr
 578  */
 579
 580 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 581 {
 582         struct net_device *dev;
 583
 584         ASSERT_RTNL();
 585
 586         for (dev = dev_base; dev; dev = dev->next)
 587                 if (dev->type == type &&
 588                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 589                         break;
 590         return dev;
 591 }
 592
 593 EXPORT_SYMBOL(dev_getbyhwaddr);
 594
 595 struct net_device *dev_getfirstbyhwtype(unsigned short type)
 596 {
 597         struct net_device *dev;
 598
 599         rtnl_lock();
 600         for (dev = dev_base; dev; dev = dev->next) {
 601                 if (dev->type == type) {
 602                         dev_hold(dev);
 603                         break;
 604                 }
 605         }
 606         rtnl_unlock();
 607         return dev;
 608 }
 609
 610 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 611
 612 /**
 613  *      dev_get_by_flags - find any device with given flags
 614  *      @if_flags: IFF_* values
 615  *      @mask: bitmask of bits in if_flags to check
 616  *
 617  *      Search for any interface with the given flags. Returns NULL if a device
 618  *      is not found or a pointer to the device. The device returned has
 619  *      had a reference added and the pointer is safe until the user calls
 620  *      dev_put to indicate they have finished with it.
 621  */
 622
 623 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 624 {
 625         struct net_device *dev;
 626
 627         read_lock(&dev_base_lock);
 628         for (dev = dev_base; dev != NULL; dev = dev->next) {
 629                 if (((dev->flags ^ if_flags) & mask) == 0) {
 630                         dev_hold(dev);
 631                         break;
 632                 }
 633         }
 634         read_unlock(&dev_base_lock);
 635         return dev;
 636 }
 637
 638 /**
 639  *      dev_valid_name - check if name is okay for network device
 640  *      @name: name string
 641  *
 642  *      Network device names need to be valid file names to
 643  *      to allow sysfs to work.  We also disallow any kind of
 644  *      whitespace.
 645  */
 646 int dev_valid_name(const char *name)
 647 {
 648         if (*name == '\0')
 649                 return 0;
 650         if (!strcmp(name, ".") || !strcmp(name, ".."))
 651                 return 0;
 652
 653         while (*name) {
 654                 if (*name == '/' || isspace(*name))
 655                         return 0;
 656                 name++;
 657         }
 658         return 1;
 659 }
 660
 661 /**
 662  *      dev_alloc_name - allocate a name for a device
 663  *      @dev: device
 664  *      @name: name format string
 665  *
 666  *      Passed a format string - eg "lt%d" it will try and find a suitable
 667  *      id. It scans list of devices to build up a free map, then chooses
 668  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 669  *      while allocating the name and adding the device in order to avoid
 670  *      duplicates.
 671  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 672  *      Returns the number of the unit assigned or a negative errno code.
 673  */
 674
 675 int dev_alloc_name(struct net_device *dev, const char *name)
 676 {
 677         int i = 0;
 678         char buf[IFNAMSIZ];
 679         const char *p;
 680         const int max_netdevices = 8*PAGE_SIZE;
 681         long *inuse;
 682         struct net_device *d;
 683
 684         p = strnchr(name, IFNAMSIZ-1, '%');
 685         if (p) {
 686                 /*
 687                  * Verify the string as this thing may have come from
 688                  * the user.  There must be either one "%d" and no other "%"
 689                  * characters.
 690                  */
 691                 if (p[1] != 'd' || strchr(p + 2, '%'))
 692                         return -EINVAL;
 693
 694                 /* Use one page as a bit array of possible slots */
 695                 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
 696                 if (!inuse)
 697                         return -ENOMEM;
 698
 699                 for (d = dev_base; d; d = d->next) {
 700                         if (!sscanf(d->name, name, &i))
 701                                 continue;
 702                         if (i < 0 || i >= max_netdevices)
 703                                 continue;
 704
 705                         /*  avoid cases where sscanf is not exact inverse of printf */
 706                         snprintf(buf, sizeof(buf), name, i);
 707                         if (!strncmp(buf, d->name, IFNAMSIZ))
 708                                 set_bit(i, inuse);
 709                 }
 710
 711                 i = find_first_zero_bit(inuse, max_netdevices);
 712                 free_page((unsigned long) inuse);
 713         }
 714
 715         snprintf(buf, sizeof(buf), name, i);
 716         if (!__dev_get_by_name(buf)) {
 717                 strlcpy(dev->name, buf, IFNAMSIZ);
 718                 return i;
 719         }
 720
 721         /* It is possible to run out of possible slots
 722          * when the name is long and there isn't enough space left
 723          * for the digits, or if all bits are used.
 724          */
 725         return -ENFILE;
 726 }
 727
 728
 729 /**
 730  *      dev_change_name - change name of a device
 731  *      @dev: device
 732  *      @newname: name (or format string) must be at least IFNAMSIZ
 733  *
 734  *      Change name of a device, can pass format strings "eth%d".
 735  *      for wildcarding.
 736  */
 737 int dev_change_name(struct net_device *dev, char *newname)
 738 {
 739         int err = 0;
 740
 741         ASSERT_RTNL();
 742
 743         if (dev->flags & IFF_UP)
 744                 return -EBUSY;
 745
 746         if (!dev_valid_name(newname))
 747                 return -EINVAL;
 748
 749         if (strchr(newname, '%')) {
 750                 err = dev_alloc_name(dev, newname);
 751                 if (err < 0)
 752                         return err;
 753                 strcpy(newname, dev->name);
 754         }
 755         else if (__dev_get_by_name(newname))
 756                 return -EEXIST;
 757         else
 758                 strlcpy(dev->name, newname, IFNAMSIZ);
 759
 760         err = class_device_rename(&dev->class_dev, dev->name);
 761         if (!err) {
 762                 hlist_del(&dev->name_hlist);
 763                 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 764                 raw_notifier_call_chain(&netdev_chain,
 765                                 NETDEV_CHANGENAME, dev);
 766         }
 767
 768         return err;
 769 }
 770
 771 /**
 772  *      netdev_features_change - device changes features
 773  *      @dev: device to cause notification
 774  *
 775  *      Called to indicate a device has changed features.
 776  */
 777 void netdev_features_change(struct net_device *dev)
 778 {
 779         raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
 780 }
 781 EXPORT_SYMBOL(netdev_features_change);
 782
 783 /**
 784  *      netdev_state_change - device changes state
 785  *      @dev: device to cause notification
 786  *
 787  *      Called to indicate a device has changed state. This function calls
 788  *      the notifier chains for netdev_chain and sends a NEWLINK message
 789  *      to the routing socket.
 790  */
 791 void netdev_state_change(struct net_device *dev)
 792 {
 793         if (dev->flags & IFF_UP) {
 794                 raw_notifier_call_chain(&netdev_chain,
 795                                 NETDEV_CHANGE, dev);
 796                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 797         }
 798 }
 799
 800 /**
 801  *      dev_load        - load a network module
 802  *      @name: name of interface
 803  *
 804  *      If a network interface is not present and the process has suitable
 805  *      privileges this function loads the module. If module loading is not
 806  *      available in this kernel then it becomes a nop.
 807  */
 808
 809 void dev_load(const char *name)
 810 {
 811         struct net_device *dev;
 812
 813         read_lock(&dev_base_lock);
 814         dev = __dev_get_by_name(name);
 815         read_unlock(&dev_base_lock);
 816
 817         if (!dev && capable(CAP_SYS_MODULE))
 818                 request_module("%s", name);
 819 }
 820
 821 static int default_rebuild_header(struct sk_buff *skb)
 822 {
 823         printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 824                skb->dev ? skb->dev->name : "NULL!!!");
 825         kfree_skb(skb);
 826         return 1;
 827 }
 828
 829
 830 /**
 831  *      dev_open        - prepare an interface for use.
 832  *      @dev:   device to open
 833  *
 834  *      Takes a device from down to up state. The device's private open
 835  *      function is invoked and then the multicast lists are loaded. Finally
 836  *      the device is moved into the up state and a %NETDEV_UP message is
 837  *      sent to the netdev notifier chain.
 838  *
 839  *      Calling this function on an active interface is a nop. On a failure
 840  *      a negative errno code is returned.
 841  */
 842 int dev_open(struct net_device *dev)
 843 {
 844         int ret = 0;
 845
 846         /*
 847          *      Is it already up?
 848          */
 849
 850         if (dev->flags & IFF_UP)
 851                 return 0;
 852
 853         /*
 854          *      Is it even present?
 855          */
 856         if (!netif_device_present(dev))
 857                 return -ENODEV;
 858
 859         /*
 860          *      Call device private open method
 861          */
 862         set_bit(__LINK_STATE_START, &dev->state);
 863         if (dev->open) {
 864                 ret = dev->open(dev);
 865                 if (ret)
 866                         clear_bit(__LINK_STATE_START, &dev->state);
 867         }
 868
 869         /*
 870          *      If it went open OK then:
 871          */
 872
 873         if (!ret) {
 874                 /*
 875                  *      Set the flags.
 876                  */
 877                 dev->flags |= IFF_UP;
 878
 879                 /*
 880                  *      Initialize multicasting status
 881                  */
 882                 dev_mc_upload(dev);
 883
 884                 /*
 885                  *      Wakeup transmit queue engine
 886                  */
 887                 dev_activate(dev);
 888
 889                 /*
 890                  *      ... and announce new interface.
 891                  */
 892                 raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 893         }
 894         return ret;
 895 }
 896
 897 /**
 898  *      dev_close - shutdown an interface.
 899  *      @dev: device to shutdown
 900  *
 901  *      This function moves an active device into down state. A
 902  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 903  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 904  *      chain.
 905  */
 906 int dev_close(struct net_device *dev)
 907 {
 908         if (!(dev->flags & IFF_UP))
 909                 return 0;
 910
 911         /*
 912          *      Tell people we are going down, so that they can
 913          *      prepare to death, when device is still operating.
 914          */
 915         raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 916
 917         dev_deactivate(dev);
 918
 919         clear_bit(__LINK_STATE_START, &dev->state);
 920
 921         /* Synchronize to scheduled poll. We cannot touch poll list,
 922          * it can be even on different cpu. So just clear netif_running(),
 923          * and wait when poll really will happen. Actually, the best place
 924          * for this is inside dev->stop() after device stopped its irq
 925          * engine, but this requires more changes in devices. */
 926
 927         smp_mb__after_clear_bit(); /* Commit netif_running(). */
 928         while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
 929                 /* No hurry. */
 930                 msleep(1);
 931         }
 932
 933         /*
 934          *      Call the device specific close. This cannot fail.
 935          *      Only if device is UP
 936          *
 937          *      We allow it to be called even after a DETACH hot-plug
 938          *      event.
 939          */
 940         if (dev->stop)
 941                 dev->stop(dev);
 942
 943         /*
 944          *      Device is now down.
 945          */
 946
 947         dev->flags &= ~IFF_UP;
 948
 949         /*
 950          * Tell people we are down
 951          */
 952         raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 953
 954         return 0;
 955 }
 956
 957
 958 /*
 959  *      Device change register/unregister. These are not inline or static
 960  *      as we export them to the world.
 961  */
 962
 963 /**
 964  *      register_netdevice_notifier - register a network notifier block
 965  *      @nb: notifier
 966  *
 967  *      Register a notifier to be called when network device events occur.
 968  *      The notifier passed is linked into the kernel structures and must
 969  *      not be reused until it has been unregistered. A negative errno code
 970  *      is returned on a failure.
 971  *
 972  *      When registered all registration and up events are replayed
 973  *      to the new notifier to allow device to have a race free
 974  *      view of the network device list.
 975  */
 976
 977 int register_netdevice_notifier(struct notifier_block *nb)
 978 {
 979         struct net_device *dev;
 980         int err;
 981
 982         rtnl_lock();
 983         err = raw_notifier_chain_register(&netdev_chain, nb);
 984         if (!err) {
 985                 for (dev = dev_base; dev; dev = dev->next) {
 986                         nb->notifier_call(nb, NETDEV_REGISTER, dev);
 987
 988                         if (dev->flags & IFF_UP)
 989                                 nb->notifier_call(nb, NETDEV_UP, dev);
 990                 }
 991         }
 992         rtnl_unlock();
 993         return err;
 994 }
 995
 996 /**
 997  *      unregister_netdevice_notifier - unregister a network notifier block
 998  *      @nb: notifier
 999  *
1000  *      Unregister a notifier previously registered by
1001  *      register_netdevice_notifier(). The notifier is unlinked into the
1002  *      kernel structures and may then be reused. A negative errno code
1003  *      is returned on a failure.
1004  */
1005
1006 int unregister_netdevice_notifier(struct notifier_block *nb)
1007 {
1008         int err;
1009
1010         rtnl_lock();
1011         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1012         rtnl_unlock();
1013         return err;
1014 }
1015
1016 /**
1017  *      call_netdevice_notifiers - call all network notifier blocks
1018  *      @val: value passed unmodified to notifier function
1019  *      @v:   pointer passed unmodified to notifier function
1020  *
1021  *      Call all network notifier blocks.  Parameters and return value
1022  *      are as for raw_notifier_call_chain().
1023  */
1024
1025 int call_netdevice_notifiers(unsigned long val, void *v)
1026 {
1027         return raw_notifier_call_chain(&netdev_chain, val, v);
1028 }
1029
1030 /* When > 0 there are consumers of rx skb time stamps */
1031 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1032
1033 void net_enable_timestamp(void)
1034 {
1035         atomic_inc(&netstamp_needed);
1036 }
1037
1038 void net_disable_timestamp(void)
1039 {
1040         atomic_dec(&netstamp_needed);
1041 }
1042
1043 void __net_timestamp(struct sk_buff *skb)
1044 {
1045         struct timeval tv;
1046
1047         do_gettimeofday(&tv);
1048         skb_set_timestamp(skb, &tv);
1049 }
1050 EXPORT_SYMBOL(__net_timestamp);
1051
1052 static inline void net_timestamp(struct sk_buff *skb)
1053 {
1054         if (atomic_read(&netstamp_needed))
1055                 __net_timestamp(skb);
1056         else {
1057                 skb->tstamp.off_sec = 0;
1058                 skb->tstamp.off_usec = 0;
1059         }
1060 }
1061
1062 /*
1063  *      Support routine. Sends outgoing frames to any network
1064  *      taps currently in use.
1065  */
1066
1067 #if !((defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)))
1068 static
1069 #endif
1070 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1071 {
1072         struct packet_type *ptype;
1073
1074         net_timestamp(skb);
1075
1076         rcu_read_lock();
1077         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1078                 /* Never send packets back to the socket
1079                  * they originated from - MvS (miquels@drinkel.ow.org)
1080                  */
1081                 if ((ptype->dev == dev || !ptype->dev) &&
1082                     (ptype->af_packet_priv == NULL ||
1083                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1084                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1085                         if (!skb2)
1086                                 break;
1087
1088                         /* skb->nh should be correctly
1089                            set by sender, so that the second statement is
1090                            just protection against buggy protocols.
1091                          */
1092                         skb2->mac.raw = skb2->data;
1093
1094                         if (skb2->nh.raw < skb2->data ||
1095                             skb2->nh.raw > skb2->tail) {
1096                                 if (net_ratelimit())
1097                                         printk(KERN_CRIT "protocol %04x is "
1098                                                "buggy, dev %s\n",
1099                                                skb2->protocol, dev->name);
1100                                 skb2->nh.raw = skb2->data;
1101                         }
1102
1103                         skb2->h.raw = skb2->nh.raw;
1104                         skb2->pkt_type = PACKET_OUTGOING;
1105                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1106                 }
1107         }
1108         rcu_read_unlock();
1109 }
1110
1111
1112 void __netif_schedule(struct net_device *dev)
1113 {
1114         if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1115                 unsigned long flags;
1116                 struct softnet_data *sd;
1117
1118                 local_irq_save(flags);
1119                 sd = &__get_cpu_var(softnet_data);
1120                 dev->next_sched = sd->output_queue;
1121                 sd->output_queue = dev;
1122                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1123                 local_irq_restore(flags);
1124         }
1125 }
1126 EXPORT_SYMBOL(__netif_schedule);
1127
1128 void __netif_rx_schedule(struct net_device *dev)
1129 {
1130         unsigned long flags;
1131
1132         local_irq_save(flags);
1133         dev_hold(dev);
1134         list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
1135         if (dev->quota < 0)
1136                 dev->quota += dev->weight;
1137         else
1138                 dev->quota = dev->weight;
1139         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1140         local_irq_restore(flags);
1141 }
1142 EXPORT_SYMBOL(__netif_rx_schedule);
1143
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used in hard irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1152
1153
1154 /* Hot-plugging. */
1155 void netif_device_detach(struct net_device *dev)
1156 {
1157         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1158             netif_running(dev)) {
1159                 netif_stop_queue(dev);
1160         }
1161 }
1162 EXPORT_SYMBOL(netif_device_detach);
1163
1164 void netif_device_attach(struct net_device *dev)
1165 {
1166         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1167             netif_running(dev)) {
1168                 netif_wake_queue(dev);
1169                 __netdev_watchdog_up(dev);
1170         }
1171 }
1172 EXPORT_SYMBOL(netif_device_attach);
1173
1174
1175 /*
1176  * Invalidate hardware checksum when packet is to be mangled, and
1177  * complete checksum manually on outgoing path.
1178  */
1179 int skb_checksum_help(struct sk_buff *skb, int inward)
1180 {
1181         unsigned int csum;
1182         int ret = 0, offset = skb->h.raw - skb->data;
1183
1184         if (inward)
1185                 goto out_set_summed;
1186
1187         if (unlikely(skb_shinfo(skb)->gso_size)) {
1188                 /* Let GSO fix up the checksum. */
1189                 goto out_set_summed;
1190         }
1191
1192         if (skb_cloned(skb)) {
1193                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1194                 if (ret)
1195                         goto out;
1196         }
1197
1198         BUG_ON(offset > (int)skb->len);
1199         csum = skb_checksum(skb, offset, skb->len-offset, 0);
1200
1201         offset = skb->tail - skb->h.raw;
1202         BUG_ON(offset <= 0);
1203         BUG_ON(skb->csum + 2 > offset);
1204
1205         *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1206
1207 out_set_summed:
1208         skb->ip_summed = CHECKSUM_NONE;
1209 out:
1210         return ret;
1211 }
1212
1213 /**
1214  *      skb_gso_segment - Perform segmentation on skb.
1215  *      @skb: buffer to segment
1216  *      @features: features for the output path (see dev->features)
1217  *
1218  *      This function segments the given skb and returns a list of segments.
1219  *
1220  *      It may return NULL if the skb requires no segmentation.  This is
1221  *      only possible when GSO is used for verifying header integrity.
1222  */
1223 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1224 {
1225         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1226         struct packet_type *ptype;
1227         int type = skb->protocol;
1228         int err;
1229
1230         BUG_ON(skb_shinfo(skb)->frag_list);
1231
1232         skb->mac.raw = skb->data;
1233         skb->mac_len = skb->nh.raw - skb->data;
1234         __skb_pull(skb, skb->mac_len);
1235
1236         if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
1237                 if (skb_header_cloned(skb) &&
1238                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1239                         return ERR_PTR(err);
1240         }
1241
1242         rcu_read_lock();
1243         list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
1244                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1245                         if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
1246                                 err = ptype->gso_send_check(skb);
1247                                 segs = ERR_PTR(err);
1248                                 if (err || skb_gso_ok(skb, features))
1249                                         break;
1250                                 __skb_push(skb, skb->data - skb->nh.raw);
1251                         }
1252                         segs = ptype->gso_segment(skb, features);
1253                         break;
1254                 }
1255         }
1256         rcu_read_unlock();
1257
1258         __skb_push(skb, skb->data - skb->mac.raw);
1259
1260         return segs;
1261 }
1262
1263 EXPORT_SYMBOL(skb_gso_segment);
1264
/*
 * Report a hardware receive checksum error: rate limited, logs the device
 * name ("<unknown>" when @dev is NULL) and dumps the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *ifname;

        if (!net_ratelimit())
                return;

        ifname = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", ifname);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1277
/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise. Without CONFIG_HIGHMEM the
 * answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                int idx = 0;

                while (idx < skb_shinfo(skb)->nr_frags) {
                        if (PageHighMem(skb_shinfo(skb)->frags[idx].page))
                                return 1;
                        idx++;
                }
        }

#endif
        return 0;
}
1298
1299 struct dev_gso_cb {
1300         void (*destructor)(struct sk_buff *skb);
1301 };
1302
1303 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1304
1305 static void dev_gso_skb_destructor(struct sk_buff *skb)
1306 {
1307         struct dev_gso_cb *cb;
1308
1309         do {
1310                 struct sk_buff *nskb = skb->next;
1311
1312                 skb->next = nskb->next;
1313                 nskb->next = NULL;
1314                 kfree_skb(nskb);
1315         } while (skb->next);
1316
1317         cb = DEV_GSO_CB(skb);
1318         if (cb->destructor)
1319                 cb->destructor(skb);
1320 }
1321
1322 /**
1323  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1324  *      @skb: buffer to segment
1325  *
1326  *      This function segments the given skb and stores the list of segments
1327  *      in skb->next.
1328  */
1329 static int dev_gso_segment(struct sk_buff *skb)
1330 {
1331         struct net_device *dev = skb->dev;
1332         struct sk_buff *segs;
1333         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1334                                          NETIF_F_SG : 0);
1335
1336         segs = skb_gso_segment(skb, features);
1337
1338         /* Verifying header integrity only. */
1339         if (!segs)
1340                 return 0;
1341
1342         if (unlikely(IS_ERR(segs)))
1343                 return PTR_ERR(segs);
1344
1345         skb->next = segs;
1346         DEV_GSO_CB(skb)->destructor = skb->destructor;
1347         skb->destructor = dev_gso_skb_destructor;
1348
1349         return 0;
1350 }
1351
1352 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1353 {
1354         if (likely(!skb->next)) {
1355                 if (netdev_nit)
1356                         dev_queue_xmit_nit(skb, dev);
1357
1358                 if (netif_needs_gso(dev, skb)) {
1359                         if (unlikely(dev_gso_segment(skb)))
1360                                 goto out_kfree_skb;
1361                         if (skb->next)
1362                                 goto gso;
1363                 }
1364
1365                 return dev->hard_start_xmit(skb, dev);
1366         }
1367
1368 gso:
1369         do {
1370                 struct sk_buff *nskb = skb->next;
1371                 int rc;
1372
1373                 skb->next = nskb->next;
1374                 nskb->next = NULL;
1375                 rc = dev->hard_start_xmit(nskb, dev);
1376                 if (unlikely(rc)) {
1377                         nskb->next = skb->next;
1378                         skb->next = nskb;
1379                         return rc;
1380                 }
1381                 if (unlikely(netif_queue_stopped(dev) && skb->next))
1382                         return NETDEV_TX_BUSY;
1383         } while (skb->next);
1384
1385         skb->destructor = DEV_GSO_CB(skb)->destructor;
1386
1387 out_kfree_skb:
1388         kfree_skb(skb);
1389         return 0;
1390 }
1391
/*
 * Take / release the device transmit lock around a direct call into the
 * driver, unless the device advertises NETIF_F_LLTX. The cpu argument of
 * HARD_TX_LOCK() is accepted but not used.
 *
 * Both macros expand to a single do { } while (0) statement so that they
 * stay correct when used as the body of an if/else.
 */
#define HARD_TX_LOCK(dev, cpu) do {                     \
        if (((dev)->features & NETIF_F_LLTX) == 0) {    \
                netif_tx_lock(dev);                     \
        }                                               \
} while (0)

#define HARD_TX_UNLOCK(dev) do {                        \
        if (((dev)->features & NETIF_F_LLTX) == 0) {    \
                netif_tx_unlock(dev);                   \
        }                                               \
} while (0)
1403
#ifdef CONFIG_XEN
/*
 * Prepare an skb whose protocol checksum was left blank
 * (skb->proto_csum_blank set) for checksumming further down the path.
 *
 * Only IPv4 packets carrying TCP or UDP are handled: skb->h.raw is set
 * to the transport header (located through the IP header length),
 * skb->csum to the offset of the checksum field in that header, and
 * ip_summed to CHECKSUM_HW; proto_csum_blank is then cleared.
 *
 * Returns 0 when nothing had to be done or the skb was prepared, and
 * -EPROTO when the packet is not IPv4, carries another protocol, or the
 * transport header / checksum field does not lie before skb->tail.
 */
inline int skb_checksum_setup(struct sk_buff *skb)
{
        if (skb->proto_csum_blank) {
                if (skb->protocol != htons(ETH_P_IP))
                        goto out;
                skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
                if (skb->h.raw >= skb->tail)
                        goto out;
                switch (skb->nh.iph->protocol) {
                case IPPROTO_TCP:
                        skb->csum = offsetof(struct tcphdr, check);
                        break;
                case IPPROTO_UDP:
                        skb->csum = offsetof(struct udphdr, check);
                        break;
                default:
                        /* Terminate the message so that the next log
                         * record does not get appended to this line.
                         */
                        if (net_ratelimit())
                                printk(KERN_ERR "Attempting to checksum a non-"
                                       "TCP/UDP packet, dropping a protocol"
                                       " %d packet\n", skb->nh.iph->protocol);
                        goto out;
                }
                /* The two checksum bytes must lie inside the data. */
                if ((skb->h.raw + skb->csum + 2) > skb->tail)
                        goto out;
                skb->ip_summed = CHECKSUM_HW;
                skb->proto_csum_blank = 0;
        }
        return 0;
out:
        return -EPROTO;
}
#else
/* Without CONFIG_XEN there is nothing to set up. */
inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
#endif
1439
1440
1441 /**
1442  *      dev_queue_xmit - transmit a buffer
1443  *      @skb: buffer to transmit
1444  *
1445  *      Queue a buffer for transmission to a network device. The caller must
1446  *      have set the device and priority and built the buffer before calling
1447  *      this function. The function can be called from an interrupt.
1448  *
1449  *      A negative errno code is returned on a failure. A success does not
1450  *      guarantee the frame will be transmitted as it may be dropped due
1451  *      to congestion or traffic shaping.
1452  *
1453  * -----------------------------------------------------------------------------------
1454  *      I notice this method can also return errors from the queue disciplines,
1455  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1456  *      be positive.
1457  *
1458  *      Regardless of the return value, the skb is consumed, so it is currently
1459  *      difficult to retry a send to this method.  (You can bump the ref count
1460  *      before sending to hold a reference for retry if you are careful.)
1461  *
1462  *      When calling this method, interrupts MUST be enabled.  This is because
1463  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1464  *          --BLG
1465  */
1466
1467 int dev_queue_xmit(struct sk_buff *skb)
1468 {
1469         struct net_device *dev = skb->dev;
1470         struct Qdisc *q;
1471         int rc = -ENOMEM;
1472
1473         /* If a checksum-deferred packet is forwarded to a device that needs a
1474          * checksum, correct the pointers and force checksumming.
1475          */
1476         if (skb_checksum_setup(skb))
1477                 goto out_kfree_skb;
1478
1479         /* GSO will handle the following emulations directly. */
1480         if (netif_needs_gso(dev, skb))
1481                 goto gso;
1482
1483         if (skb_shinfo(skb)->frag_list &&
1484             !(dev->features & NETIF_F_FRAGLIST) &&
1485             __skb_linearize(skb))
1486                 goto out_kfree_skb;
1487
1488         /* Fragmented skb is linearized if device does not support SG,
1489          * or if at least one of fragments is in highmem and device
1490          * does not support DMA from it.
1491          */
1492         if (skb_shinfo(skb)->nr_frags &&
1493             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1494             __skb_linearize(skb))
1495                 goto out_kfree_skb;
1496
1497         /* If packet is not checksummed and device does not support
1498          * checksumming for this protocol, complete checksumming here.
1499          */
1500         if (skb->ip_summed == CHECKSUM_HW &&
1501             (!(dev->features & NETIF_F_GEN_CSUM) &&
1502              (!(dev->features & NETIF_F_IP_CSUM) ||
1503               skb->protocol != htons(ETH_P_IP))))
1504                 if (skb_checksum_help(skb, 0))
1505                         goto out_kfree_skb;
1506
1507 gso:
1508         spin_lock_prefetch(&dev->queue_lock);
1509
1510         /* Disable soft irqs for various locks below. Also
1511          * stops preemption for RCU.
1512          */
1513         rcu_read_lock_bh();
1514
1515         /* Updates of qdisc are serialized by queue_lock.
1516          * The struct Qdisc which is pointed to by qdisc is now a
1517          * rcu structure - it may be accessed without acquiring
1518          * a lock (but the structure may be stale.) The freeing of the
1519          * qdisc will be deferred until it's known that there are no
1520          * more references to it.
1521          *
1522          * If the qdisc has an enqueue function, we still need to
1523          * hold the queue_lock before calling it, since queue_lock
1524          * also serializes access to the device queue.
1525          */
1526
1527         q = rcu_dereference(dev->qdisc);
1528 #ifdef CONFIG_NET_CLS_ACT
1529         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1530 #endif
1531         if (q->enqueue) {
1532                 /* Grab device queue */
1533                 spin_lock(&dev->queue_lock);
1534                 q = dev->qdisc;
1535                 if (q->enqueue) {
1536                         rc = q->enqueue(skb, q);
1537                         qdisc_run(dev);
1538                         spin_unlock(&dev->queue_lock);
1539
1540                         rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1541                         goto out;
1542                 }
1543                 spin_unlock(&dev->queue_lock);
1544         }
1545
1546         /* The device has no queue. Common case for software devices:
1547            loopback, all the sorts of tunnels...
1548
1549            Really, it is unlikely that netif_tx_lock protection is necessary
1550            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1551            counters.)
1552            However, it is possible, that they rely on protection
1553            made by us here.
1554
1555            Check this and shot the lock. It is not prone from deadlocks.
1556            Either shot noqueue qdisc, it is even simpler 8)
1557          */
1558         if (dev->flags & IFF_UP) {
1559                 int cpu = smp_processor_id(); /* ok because BHs are off */
1560
1561                 if (dev->xmit_lock_owner != cpu) {
1562
1563                         HARD_TX_LOCK(dev, cpu);
1564
1565                         if (!netif_queue_stopped(dev)) {
1566                                 rc = 0;
1567                                 if (!dev_hard_start_xmit(skb, dev)) {
1568                                         HARD_TX_UNLOCK(dev);
1569                                         goto out;
1570                                 }
1571                         }
1572                         HARD_TX_UNLOCK(dev);
1573                         if (net_ratelimit())
1574                                 printk(KERN_CRIT "Virtual device %s asks to "
1575                                        "queue packet!\n", dev->name);
1576                 } else {
1577                         /* Recursion is detected! It is possible,
1578                          * unfortunately */
1579                         if (net_ratelimit())
1580                                 printk(KERN_CRIT "Dead loop on virtual device "
1581                                        "%s, fix it urgently!\n", dev->name);
1582                 }
1583         }
1584
1585         rc = -ENETDOWN;
1586         rcu_read_unlock_bh();
1587
1588 out_kfree_skb:
1589         kfree_skb(skb);
1590         return rc;
1591 out:
1592         rcu_read_unlock_bh();
1593         return rc;
1594 }
1595
1596
1597 /*=======================================================================
1598                         Receiver routines
1599   =======================================================================*/
1600
1601 int netdev_max_backlog = 1000;
1602 int netdev_budget = 300;
1603 int weight_p = 64;            /* old backlog weight */
1604
1605 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1606
1607
1608 /**
1609  *      netif_rx        -       post buffer to the network code
1610  *      @skb: buffer to post
1611  *
1612  *      This function receives a packet from a device driver and queues it for
1613  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1614  *      may be dropped during processing for congestion control or by the
1615  *      protocol layers.
1616  *
1617  *      return values:
1618  *      NET_RX_SUCCESS  (no congestion)
1619  *      NET_RX_CN_LOW   (low congestion)
1620  *      NET_RX_CN_MOD   (moderate congestion)
1621  *      NET_RX_CN_HIGH  (high congestion)
1622  *      NET_RX_DROP     (packet was dropped)
1623  *
1624  */
1625
1626 int netif_rx(struct sk_buff *skb)
1627 {
1628         struct softnet_data *queue;
1629         unsigned long flags;
1630
1631         /* if netpoll wants it, pretend we never saw it */
1632         if (netpoll_rx(skb))
1633                 return NET_RX_DROP;
1634
1635         if (!skb->tstamp.off_sec)
1636                 net_timestamp(skb);
1637
1638         /*
1639          * The code is rearranged so that the path is the most
1640          * short when CPU is congested, but is still operating.
1641          */
1642         local_irq_save(flags);
1643         queue = &__get_cpu_var(softnet_data);
1644
1645         __get_cpu_var(netdev_rx_stat).total++;
1646         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1647                 if (queue->input_pkt_queue.qlen) {
1648 enqueue:
1649                         dev_hold(skb->dev);
1650                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1651                         local_irq_restore(flags);
1652                         return NET_RX_SUCCESS;
1653                 }
1654
1655                 netif_rx_schedule(&queue->backlog_dev);
1656                 goto enqueue;
1657         }
1658
1659         __get_cpu_var(netdev_rx_stat).dropped++;
1660         local_irq_restore(flags);
1661
1662         kfree_skb(skb);
1663         return NET_RX_DROP;
1664 }
1665
/*
 * Variant of netif_rx() that runs pending softirqs itself: preemption is
 * disabled around netif_rx() and do_softirq() is called when softirqs are
 * pending afterwards. Returns what netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int verdict;

        preempt_disable();
        verdict = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();
        preempt_enable();

        return verdict;
}

EXPORT_SYMBOL(netif_rx_ni);
1680
1681 static inline struct net_device *skb_bond(struct sk_buff *skb)
1682 {
1683         struct net_device *dev = skb->dev;
1684
1685         if (dev->master) {
1686                 if (skb_bond_should_drop(skb)) {
1687                         kfree_skb(skb);
1688                         return NULL;
1689                 }
1690                 skb->dev = dev->master;
1691         }
1692
1693         return dev;
1694 }
1695
1696 static void net_tx_action(struct softirq_action *h)
1697 {
1698         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1699
1700         if (sd->completion_queue) {
1701                 struct sk_buff *clist;
1702
1703                 local_irq_disable();
1704                 clist = sd->completion_queue;
1705                 sd->completion_queue = NULL;
1706                 local_irq_enable();
1707
1708                 while (clist) {
1709                         struct sk_buff *skb = clist;
1710                         clist = clist->next;
1711
1712                         BUG_TRAP(!atomic_read(&skb->users));
1713                         __kfree_skb(skb);
1714                 }
1715         }
1716
1717         if (sd->output_queue) {
1718                 struct net_device *head;
1719
1720                 local_irq_disable();
1721                 head = sd->output_queue;
1722                 sd->output_queue = NULL;
1723                 local_irq_enable();
1724
1725                 while (head) {
1726                         struct net_device *dev = head;
1727                         head = head->next_sched;
1728
1729                         smp_mb__before_clear_bit();
1730                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1731
1732                         if (spin_trylock(&dev->queue_lock)) {
1733                                 qdisc_run(dev);
1734                                 spin_unlock(&dev->queue_lock);
1735                         } else {
1736                                 netif_schedule(dev);
1737                         }
1738                 }
1739         }
1740 }
1741
1742 static __inline__ int deliver_skb(struct sk_buff *skb,
1743                                   struct packet_type *pt_prev,
1744                                   struct net_device *orig_dev)
1745 {
1746         atomic_inc(&skb->users);
1747         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1748 }
1749
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* Hook pointers declared here; handle_bridge() below calls the first one. */
int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
                                                unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);

/*
 * Offer a received skb to the bridge hook.
 *
 * Loopback packets and packets whose device has no bridge port are left
 * alone and 0 is returned.  Otherwise a pending *pt_prev handler is
 * delivered to first (its result is stored in *ret and *pt_prev is
 * cleared), and the value of br_handle_frame_hook() is returned.
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
                                    struct packet_type **pt_prev, int *ret,
                                    struct net_device *orig_dev)
{
        struct sk_buff *skb = *pskb;
        struct net_bridge_port *port;

        if (skb->pkt_type == PACKET_LOOPBACK)
                return 0;

        port = rcu_dereference(skb->dev->br_port);
        if (port == NULL)
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, pskb);
}
#else
/* Without bridge support the call evaluates to 0. */
#define handle_bridge(skb, pt_prev, ret, orig_dev)      (0)
#endif
1777
1778 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 * instructions (currently an extra compare and 2 stores) when
 * CONFIG_NET_CLS_ACT is set but sch_ingress is not.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
1787 static int ing_filter(struct sk_buff *skb)
1788 {
1789         struct Qdisc *q;
1790         struct net_device *dev = skb->dev;
1791         int result = TC_ACT_OK;
1792
1793         if (dev->qdisc_ingress) {
1794                 __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
1795                 if (MAX_RED_LOOP < ttl++) {
1796                         printk(KERN_WARNING "Redir loop detected Dropping packet (%s->%s)\n",
1797                                 skb->input_dev->name, skb->dev->name);
1798                         return TC_ACT_SHOT;
1799                 }
1800
1801                 skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
1802
1803                 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
1804
1805                 spin_lock(&dev->ingress_lock);
1806                 if ((q = dev->qdisc_ingress) != NULL)
1807                         result = q->enqueue(skb, q);
1808                 spin_unlock(&dev->ingress_lock);
1809
1810         }
1811
1812         return result;
1813 }
1814 #endif
1815
1816 int netif_receive_skb(struct sk_buff *skb)
1817 {
1818         struct packet_type *ptype, *pt_prev;
1819         struct net_device *orig_dev;
1820         int ret = NET_RX_DROP;
1821         unsigned short type;
1822
1823         /* if we've gotten here through NAPI, check netpoll */
1824         if (skb->dev->poll && netpoll_rx(skb))
1825                 return NET_RX_DROP;
1826
1827         if (!skb->tstamp.off_sec)
1828                 net_timestamp(skb);
1829
1830         if (!skb->input_dev)
1831                 skb->input_dev = skb->dev;
1832
1833         orig_dev = skb_bond(skb);
1834
1835         if (!orig_dev)
1836                 return NET_RX_DROP;
1837
1838         __get_cpu_var(netdev_rx_stat).total++;
1839
1840         skb->h.raw = skb->nh.raw = skb->data;
1841         skb->mac_len = skb->nh.raw - skb->mac.raw;
1842
1843         pt_prev = NULL;
1844
1845         rcu_read_lock();
1846
1847 #ifdef CONFIG_NET_CLS_ACT
1848         if (skb->tc_verd & TC_NCLS) {
1849                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
1850                 goto ncls;
1851         }
1852 #endif
1853
1854 #ifdef CONFIG_XEN
1855         switch (skb->ip_summed) {
1856         case CHECKSUM_UNNECESSARY:
1857                 skb->proto_data_valid = 1;
1858                 break;
1859         case CHECKSUM_HW:
1860                 /* XXX Implement me. */
1861         default:
1862                 skb->proto_data_valid = 0;
1863                 break;
1864         }
1865 #endif
1866
1867         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1868                 if (!ptype->dev || ptype->dev == skb->dev) {
1869                         if (pt_prev)
1870                                 ret = deliver_skb(skb, pt_prev, orig_dev);
1871                         pt_prev = ptype;
1872                 }
1873         }
1874
1875 #ifdef CONFIG_NET_CLS_ACT
1876         if (pt_prev) {
1877                 ret = deliver_skb(skb, pt_prev, orig_dev);
1878                 pt_prev = NULL; /* noone else should process this after*/
1879         } else {
1880                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1881         }
1882
1883         ret = ing_filter(skb);
1884
1885         if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
1886                 kfree_skb(skb);
1887                 goto out;
1888         }
1889
1890         skb->tc_verd = 0;
1891 ncls:
1892 #endif
1893
1894         handle_diverter(skb);
1895
1896         if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
1897                 goto out;
1898
1899         type = skb->protocol;
1900         list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
1901                 if (ptype->type == type &&
1902                     (!ptype->dev || ptype->dev == skb->dev)) {
1903                         if (pt_prev)
1904                                 ret = deliver_skb(skb, pt_prev, orig_dev);
1905                         pt_prev = ptype;
1906                 }
1907         }
1908
1909         if (pt_prev) {
1910                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1911         } else {
1912                 kfree_skb(skb);
1913                 /* Jamal, now you will not able to escape explaining
1914                  * me how you were going to use this. :-)
1915                  */
1916                 ret = NET_RX_DROP;
1917         }
1918
1919 out:
1920         rcu_read_unlock();
1921         return ret;
1922 }
1923
1924 static int process_backlog(struct net_device *backlog_dev, int *budget)
1925 {
1926         int work = 0;
1927         int quota = min(backlog_dev->quota, *budget);
1928         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1929         unsigned long start_time = jiffies;
1930
1931         backlog_dev->weight = weight_p;
1932         for (;;) {
1933                 struct sk_buff *skb;
1934                 struct net_device *dev;
1935
1936                 local_irq_disable();
1937                 skb = __skb_dequeue(&queue->input_pkt_queue);
1938                 if (!skb)
1939                         goto job_done;
1940                 local_irq_enable();
1941
1942                 dev = skb->dev;
1943
1944                 netif_receive_skb(skb);
1945
1946                 dev_put(dev);
1947
1948                 work++;
1949
1950                 if (work >= quota || jiffies - start_time > 1)
1951                         break;
1952
1953         }
1954
1955         backlog_dev->quota -= work;
1956         *budget -= work;
1957         return -1;
1958
1959 job_done:
1960         backlog_dev->quota -= work;
1961         *budget -= work;
1962
1963         list_del(&backlog_dev->poll_list);
1964         smp_mb__before_clear_bit();
1965         netif_poll_enable(backlog_dev);
1966
1967         local_irq_enable();
1968         return 0;
1969 }
1970
1971 static void net_rx_action(struct softirq_action *h)
1972 {
1973         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1974         unsigned long start_time = jiffies;
1975         int budget = netdev_budget;
1976         void *have;
1977
1978         local_irq_disable();
1979
1980         while (!list_empty(&queue->poll_list)) {
1981                 struct net_device *dev;
1982
1983                 if (budget <= 0 || jiffies - start_time > 1)
1984                         goto softnet_break;
1985
1986                 local_irq_enable();
1987
1988                 dev = list_entry(queue->poll_list.next,
1989                                  struct net_device, poll_list);
1990                 have = netpoll_poll_lock(dev);
1991
1992                 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1993                         netpoll_poll_unlock(have);
1994                         local_irq_disable();
1995                         list_move_tail(&dev->poll_list, &queue->poll_list);
1996                         if (dev->quota < 0)
1997                                 dev->quota += dev->weight;
1998                         else
1999                                 dev->quota = dev->weight;
2000                 } else {
2001                         netpoll_poll_unlock(have);
2002                         dev_put(dev);
2003                         local_irq_disable();
2004                 }
2005         }
2006 out:
2007 #ifdef CONFIG_NET_DMA
2008         /*
2009          * There may not be any more sk_buffs coming right now, so push
2010          * any pending DMA copies to hardware
2011          */
2012         if (net_dma_client) {
2013                 struct dma_chan *chan;
2014                 rcu_read_lock();
2015                 list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
2016                         dma_async_memcpy_issue_pending(chan);
2017                 rcu_read_unlock();
2018         }
2019 #endif
2020         local_irq_enable();
2021         return;
2022
2023 softnet_break:
2024         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2025         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2026         goto out;
2027 }
2028
2029 static gifconf_func_t * gifconf_list [NPROTO];
2030
2031 /**
2032  *      register_gifconf        -       register a SIOCGIF handler
2033  *      @family: Address family
2034  *      @gifconf: Function handler
2035  *
2036  *      Register protocol dependent address dumping routines. The handler
2037  *      that is passed must not be freed or reused until it has been replaced
2038  *      by another handler.
2039  */
2040 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2041 {
2042         if (family >= NPROTO)
2043                 return -EINVAL;
2044         gifconf_list[family] = gifconf;
2045         return 0;
2046 }
2047
2048
2049 /*
2050  *      Map an interface index to its name (SIOCGIFNAME)
2051  */
2052
2053 /*
2054  *      We need this ioctl for efficient implementation of the
2055  *      if_indextoname() function required by the IPv6 API.  Without
2056  *      it, we would have to search all the interfaces to find a
2057  *      match.  --pb
2058  */
2059
2060 static int dev_ifname(struct ifreq __user *arg)
2061 {
2062         struct net_device *dev;
2063         struct ifreq ifr;
2064
2065         /*
2066          *      Fetch the caller's info block.
2067          */
2068
2069         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2070                 return -EFAULT;
2071
2072         read_lock(&dev_base_lock);
2073         dev = __dev_get_by_index(ifr.ifr_ifindex);
2074         if (!dev) {
2075                 read_unlock(&dev_base_lock);
2076                 return -ENODEV;
2077         }
2078
2079         strcpy(ifr.ifr_name, dev->name);
2080         read_unlock(&dev_base_lock);
2081
2082         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2083                 return -EFAULT;
2084         return 0;
2085 }
2086
2087 /*
2088  *      Perform a SIOCGIFCONF call. This structure will change
2089  *      size eventually, and there is nothing I can do about it.
2090  *      Thus we will need a 'compatibility mode'.
2091  */
2092
2093 static int dev_ifconf(char __user *arg)
2094 {
2095         struct ifconf ifc;
2096         struct net_device *dev;
2097         char __user *pos;
2098         int len;
2099         int total;
2100         int i;
2101
2102         /*
2103          *      Fetch the caller's info block.
2104          */
2105
2106         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2107                 return -EFAULT;
2108
2109         pos = ifc.ifc_buf;
2110         len = ifc.ifc_len;
2111
2112         /*
2113          *      Loop over the interfaces, and write an info block for each.
2114          */
2115
2116         total = 0;
2117         for (dev = dev_base; dev; dev = dev->next) {
2118                 if (vx_flags(VXF_HIDE_NETIF, 0) &&
2119                         !dev_in_nx_info(dev, current->nx_info))
2120                         continue;
2121                 for (i = 0; i < NPROTO; i++) {
2122                         if (gifconf_list[i]) {
2123                                 int done;
2124                                 if (!pos)
2125                                         done = gifconf_list[i](dev, NULL, 0);
2126                                 else
2127                                         done = gifconf_list[i](dev, pos + total,
2128                                                                len - total);
2129                                 if (done < 0)
2130                                         return -EFAULT;
2131                                 total += done;
2132                         }
2133                 }
2134         }
2135
2136         /*
2137          *      All done.  Write the updated control block back to the caller.
2138          */
2139         ifc.ifc_len = total;
2140
2141         /*
2142          *      Both BSD and Solaris return 0 here, so we do too.
2143          */
2144         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2145 }
2146
2147 #ifdef CONFIG_PROC_FS
2148 /*
2149  *      This is invoked by the /proc filesystem handler to display a device
2150  *      in detail.
2151  */
2152 static __inline__ struct net_device *dev_get_idx(loff_t pos)
2153 {
2154         struct net_device *dev;
2155         loff_t i;
2156
2157         for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
2158
2159         return i == pos ? dev : NULL;
2160 }
2161
2162 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2163 {
2164         read_lock(&dev_base_lock);
2165         return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
2166 }
2167
2168 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2169 {
2170         ++*pos;
2171         return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
2172 }
2173
2174 void dev_seq_stop(struct seq_file *seq, void *v)
2175 {
2176         read_unlock(&dev_base_lock);
2177 }
2178
2179 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2180 {
2181         struct nx_info *nxi = current->nx_info;
2182
2183         if (vx_flags(VXF_HIDE_NETIF, 0) && !dev_in_nx_info(dev, nxi))
2184                 return;
2185         if (dev->get_stats) {
2186                 struct net_device_stats *stats = dev->get_stats(dev);
2187
2188                 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2189                                 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2190                            dev->name, stats->rx_bytes, stats->rx_packets,
2191                            stats->rx_errors,
2192                            stats->rx_dropped + stats->rx_missed_errors,
2193                            stats->rx_fifo_errors,
2194                            stats->rx_length_errors + stats->rx_over_errors +
2195                              stats->rx_crc_errors + stats->rx_frame_errors,
2196                            stats->rx_compressed, stats->multicast,
2197                            stats->tx_bytes, stats->tx_packets,
2198                            stats->tx_errors, stats->tx_dropped,
2199                            stats->tx_fifo_errors, stats->collisions,
2200                            stats->tx_carrier_errors +
2201                              stats->tx_aborted_errors +
2202                              stats->tx_window_errors +
2203                              stats->tx_heartbeat_errors,
2204                            stats->tx_compressed);
2205         } else
2206                 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
2207 }
2208
2209 /*
2210  *      Called from the PROCfs module. This now uses the new arbitrary sized
2211  *      /proc/net interface to create /proc/net/dev
2212  */
2213 static int dev_seq_show(struct seq_file *seq, void *v)
2214 {
2215         if (v == SEQ_START_TOKEN)
2216                 seq_puts(seq, "Inter-|   Receive                            "
2217                               "                    |  Transmit\n"
2218                               " face |bytes    packets errs drop fifo frame "
2219                               "compressed multicast|bytes    packets errs "
2220                               "drop fifo colls carrier compressed\n");
2221         else
2222                 dev_seq_printf_stats(seq, v);
2223         return 0;
2224 }
2225
2226 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2227 {
2228         struct netif_rx_stats *rc = NULL;
2229
2230         while (*pos < NR_CPUS)
2231                 if (cpu_online(*pos)) {
2232                         rc = &per_cpu(netdev_rx_stat, *pos);
2233                         break;
2234                 } else
2235                         ++*pos;
2236         return rc;
2237 }
2238
2239 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2240 {
2241         return softnet_get_online(pos);
2242 }
2243
2244 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2245 {
2246         ++*pos;
2247         return softnet_get_online(pos);
2248 }
2249
/* seq_file stop operation for the per-CPU statistics; does nothing. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
        return;
}
2253
2254 static int softnet_seq_show(struct seq_file *seq, void *v)
2255 {
2256         struct netif_rx_stats *s = v;
2257
2258         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2259                    s->total, s->dropped, s->time_squeeze, 0,
2260                    0, 0, 0, 0, /* was fastroute */
2261                    s->cpu_collision );
2262         return 0;
2263 }
2264
2265 static struct seq_operations dev_seq_ops = {
2266         .start = dev_seq_start,
2267         .next  = dev_seq_next,
2268         .stop  = dev_seq_stop,
2269         .show  = dev_seq_show,
2270 };
2271
2272 static int dev_seq_open(struct inode *inode, struct file *file)
2273 {
2274         return seq_open(file, &dev_seq_ops);
2275 }
2276
2277 static struct file_operations dev_seq_fops = {
2278         .owner   = THIS_MODULE,
2279         .open    = dev_seq_open,
2280         .read    = seq_read,
2281         .llseek  = seq_lseek,
2282         .release = seq_release,
2283 };
2284
2285 static struct seq_operations softnet_seq_ops = {
2286         .start = softnet_seq_start,
2287         .next  = softnet_seq_next,
2288         .stop  = softnet_seq_stop,
2289         .show  = softnet_seq_show,
2290 };
2291
2292 static int softnet_seq_open(struct inode *inode, struct file *file)
2293 {
2294         return seq_open(file, &softnet_seq_ops);
2295 }
2296
2297 static struct file_operations softnet_seq_fops = {
2298         .owner   = THIS_MODULE,
2299         .open    = softnet_seq_open,
2300         .read    = seq_read,
2301         .llseek  = seq_lseek,
2302         .release = seq_release,
2303 };
2304
2305 #ifdef CONFIG_WIRELESS_EXT
2306 extern int wireless_proc_init(void);
2307 #else
2308 #define wireless_proc_init() 0
2309 #endif
2310
2311 static int __init dev_proc_init(void)
2312 {
2313         int rc = -ENOMEM;
2314
2315         if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2316                 goto out;
2317         if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2318                 goto out_dev;
2319         if (wireless_proc_init())
2320                 goto out_softnet;
2321         rc = 0;
2322 out:
2323         return rc;
2324 out_softnet:
2325         proc_net_remove("softnet_stat");
2326 out_dev:
2327         proc_net_remove("dev");
2328         goto out;
2329 }
2330 #else
2331 #define dev_proc_init() 0
2332 #endif  /* CONFIG_PROC_FS */
2333
2334
2335 /**
2336  *      netdev_set_master       -       set up master/slave pair
2337  *      @slave: slave device
2338  *      @master: new master device
2339  *
2340  *      Changes the master device of the slave. Pass %NULL to break the
2341  *      bonding. The caller must hold the RTNL semaphore. On a failure
2342  *      a negative errno code is returned. On success the reference counts
2343  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2344  *      function returns zero.
2345  */
2346 int netdev_set_master(struct net_device *slave, struct net_device *master)
2347 {
2348         struct net_device *old = slave->master;
2349
2350         ASSERT_RTNL();
2351
2352         if (master) {
2353                 if (old)
2354                         return -EBUSY;
2355                 dev_hold(master);
2356         }
2357
2358         slave->master = master;
2359
2360         synchronize_net();
2361
2362         if (old)
2363                 dev_put(old);
2364
2365         if (master)
2366                 slave->flags |= IFF_SLAVE;
2367         else
2368                 slave->flags &= ~IFF_SLAVE;
2369
2370         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2371         return 0;
2372 }
2373
2374 /**
2375  *      dev_set_promiscuity     - update promiscuity count on a device
2376  *      @dev: device
2377  *      @inc: modifier
2378  *
2379  *      Add or remove promiscuity from a device. While the count in the device
2380  *      remains above zero the interface remains promiscuous. Once it hits zero
2381  *      the device reverts back to normal filtering operation. A negative inc
2382  *      value is used to drop promiscuity on the device.
2383  */
2384 void dev_set_promiscuity(struct net_device *dev, int inc)
2385 {
2386         unsigned short old_flags = dev->flags;
2387
2388         if ((dev->promiscuity += inc) == 0)
2389                 dev->flags &= ~IFF_PROMISC;
2390         else
2391                 dev->flags |= IFF_PROMISC;
2392         if (dev->flags != old_flags) {
2393                 dev_mc_upload(dev);
2394                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2395                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2396                                                                "left");
2397                 audit_log(current->audit_context, GFP_ATOMIC,
2398                         AUDIT_ANOM_PROMISCUOUS,
2399                         "dev=%s prom=%d old_prom=%d auid=%u",
2400                         dev->name, (dev->flags & IFF_PROMISC),
2401                         (old_flags & IFF_PROMISC),
2402                         audit_get_loginuid(current->audit_context));
2403         }
2404 }
2405
2406 /**
2407  *      dev_set_allmulti        - update allmulti count on a device
2408  *      @dev: device
2409  *      @inc: modifier
2410  *
2411  *      Add or remove reception of all multicast frames to a device. While the
2412  *      count in the device remains above zero the interface remains listening
2413  *      to all interfaces. Once it hits zero the device reverts back to normal
2414  *      filtering operation. A negative @inc value is used to drop the counter
2415  *      when releasing a resource needing all multicasts.
2416  */
2417
2418 void dev_set_allmulti(struct net_device *dev, int inc)
2419 {
2420         unsigned short old_flags = dev->flags;
2421
2422         dev->flags |= IFF_ALLMULTI;
2423         if ((dev->allmulti += inc) == 0)
2424                 dev->flags &= ~IFF_ALLMULTI;
2425         if (dev->flags ^ old_flags)
2426                 dev_mc_upload(dev);
2427 }
2428
2429 unsigned dev_get_flags(const struct net_device *dev)
2430 {
2431         unsigned flags;
2432
2433         flags = (dev->flags & ~(IFF_PROMISC |
2434                                 IFF_ALLMULTI |
2435                                 IFF_RUNNING |
2436                                 IFF_LOWER_UP |
2437                                 IFF_DORMANT)) |
2438                 (dev->gflags & (IFF_PROMISC |
2439                                 IFF_ALLMULTI));
2440
2441         if (netif_running(dev)) {
2442                 if (netif_oper_up(dev))
2443                         flags |= IFF_RUNNING;
2444                 if (netif_carrier_ok(dev))
2445                         flags |= IFF_LOWER_UP;
2446                 if (netif_dormant(dev))
2447                         flags |= IFF_DORMANT;
2448         }
2449
2450         return flags;
2451 }
2452
2453 int dev_change_flags(struct net_device *dev, unsigned flags)
2454 {
2455         int ret;
2456         int old_flags = dev->flags;
2457
2458         /*
2459          *      Set the flags on our device.
2460          */
2461
2462         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2463                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2464                                IFF_AUTOMEDIA)) |
2465                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2466                                     IFF_ALLMULTI));
2467
2468         /*
2469          *      Load in the correct multicast list now the flags have changed.
2470          */
2471
2472         dev_mc_upload(dev);
2473
2474         /*
2475          *      Have we downed the interface. We handle IFF_UP ourselves
2476          *      according to user attempts to set it, rather than blindly
2477          *      setting it.
2478          */
2479
2480         ret = 0;
2481         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
2482                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2483
2484                 if (!ret)
2485                         dev_mc_upload(dev);
2486         }
2487
2488         if (dev->flags & IFF_UP &&
2489             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2490                                           IFF_VOLATILE)))
2491                 raw_notifier_call_chain(&netdev_chain,
2492                                 NETDEV_CHANGE, dev);
2493
2494         if ((flags ^ dev->gflags) & IFF_PROMISC) {
2495                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2496                 dev->gflags ^= IFF_PROMISC;
2497                 dev_set_promiscuity(dev, inc);
2498         }
2499
2500         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2501            is important. Some (broken) drivers set IFF_PROMISC, when
2502            IFF_ALLMULTI is requested not asking us and not reporting.
2503          */
2504         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2505                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2506                 dev->gflags ^= IFF_ALLMULTI;
2507                 dev_set_allmulti(dev, inc);
2508         }
2509
2510         if (old_flags ^ dev->flags)
2511                 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2512
2513         return ret;
2514 }
2515
2516 int dev_set_mtu(struct net_device *dev, int new_mtu)
2517 {
2518         int err;
2519
2520         if (new_mtu == dev->mtu)
2521                 return 0;
2522
2523         /*      MTU must be positive.    */
2524         if (new_mtu < 0)
2525                 return -EINVAL;
2526
2527         if (!netif_device_present(dev))
2528                 return -ENODEV;
2529
2530         err = 0;
2531         if (dev->change_mtu)
2532                 err = dev->change_mtu(dev, new_mtu);
2533         else
2534                 dev->mtu = new_mtu;
2535         if (!err && dev->flags & IFF_UP)
2536                 raw_notifier_call_chain(&netdev_chain,
2537                                 NETDEV_CHANGEMTU, dev);
2538         return err;
2539 }
2540
2541 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2542 {
2543         int err;
2544
2545         if (!dev->set_mac_address)
2546                 return -EOPNOTSUPP;
2547         if (sa->sa_family != dev->type)
2548                 return -EINVAL;
2549         if (!netif_device_present(dev))
2550                 return -ENODEV;
2551         err = dev->set_mac_address(dev, sa);
2552         if (!err)
2553                 raw_notifier_call_chain(&netdev_chain,
2554                                 NETDEV_CHANGEADDR, dev);
2555         return err;
2556 }
2557
2558 /*
2559  *      Perform the SIOCxIFxxx calls.
2560  */
2561 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2562 {
2563         int err;
2564         struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2565
2566         if (!dev)
2567                 return -ENODEV;
2568
2569         switch (cmd) {
2570                 case SIOCGIFFLAGS:      /* Get interface flags */
2571                         ifr->ifr_flags = dev_get_flags(dev);
2572                         return 0;
2573
2574                 case SIOCSIFFLAGS:      /* Set interface flags */
2575                         return dev_change_flags(dev, ifr->ifr_flags);
2576
2577                 case SIOCGIFMETRIC:     /* Get the metric on the interface
2578                                            (currently unused) */
2579                         ifr->ifr_metric = 0;
2580                         return 0;
2581
2582                 case SIOCSIFMETRIC:     /* Set the metric on the interface
2583                                            (currently unused) */
2584                         return -EOPNOTSUPP;
2585
2586                 case SIOCGIFMTU:        /* Get the MTU of a device */
2587                         ifr->ifr_mtu = dev->mtu;
2588                         return 0;
2589
2590                 case SIOCSIFMTU:        /* Set the MTU of a device */
2591                         return dev_set_mtu(dev, ifr->ifr_mtu);
2592
2593                 case SIOCGIFHWADDR:
2594                         if (!dev->addr_len)
2595                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2596                         else
2597                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2598                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2599                         ifr->ifr_hwaddr.sa_family = dev->type;
2600                         return 0;
2601
2602                 case SIOCSIFHWADDR:
2603                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2604
2605                 case SIOCSIFHWBROADCAST:
2606                         if (ifr->ifr_hwaddr.sa_family != dev->type)
2607                                 return -EINVAL;
2608                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2609                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2610                         raw_notifier_call_chain(&netdev_chain,
2611                                             NETDEV_CHANGEADDR, dev);
2612                         return 0;
2613
2614                 case SIOCGIFMAP:
2615                         ifr->ifr_map.mem_start = dev->mem_start;
2616                         ifr->ifr_map.mem_end   = dev->mem_end;
2617                         ifr->ifr_map.base_addr = dev->base_addr;
2618                         ifr->ifr_map.irq       = dev->irq;
2619                         ifr->ifr_map.dma       = dev->dma;
2620                         ifr->ifr_map.port      = dev->if_port;
2621                         return 0;
2622
2623                 case SIOCSIFMAP:
2624                         if (dev->set_config) {
2625                                 if (!netif_device_present(dev))
2626                                         return -ENODEV;
2627                                 return dev->set_config(dev, &ifr->ifr_map);
2628                         }
2629                         return -EOPNOTSUPP;
2630
2631                 case SIOCADDMULTI:
2632                         if (!dev->set_multicast_list ||
2633                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2634                                 return -EINVAL;
2635                         if (!netif_device_present(dev))
2636                                 return -ENODEV;
2637                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2638                                           dev->addr_len, 1);
2639
2640                 case SIOCDELMULTI:
2641                         if (!dev->set_multicast_list ||
2642                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2643                                 return -EINVAL;
2644                         if (!netif_device_present(dev))
2645                                 return -ENODEV;
2646                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2647                                              dev->addr_len, 1);
2648
2649                 case SIOCGIFINDEX:
2650                         ifr->ifr_ifindex = dev->ifindex;
2651                         return 0;
2652
2653                 case SIOCGIFTXQLEN:
2654                         ifr->ifr_qlen = dev->tx_queue_len;
2655                         return 0;
2656
2657                 case SIOCSIFTXQLEN:
2658                         if (ifr->ifr_qlen < 0)
2659                                 return -EINVAL;
2660                         dev->tx_queue_len = ifr->ifr_qlen;
2661                         return 0;
2662
2663                 case SIOCSIFNAME:
2664                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2665                         return dev_change_name(dev, ifr->ifr_newname);
2666
2667                 /*
2668                  *      Unknown or private ioctl
2669                  */
2670
2671                 default:
2672                         if ((cmd >= SIOCDEVPRIVATE &&
2673                             cmd <= SIOCDEVPRIVATE + 15) ||
2674                             cmd == SIOCBONDENSLAVE ||
2675                             cmd == SIOCBONDRELEASE ||
2676                             cmd == SIOCBONDSETHWADDR ||
2677                             cmd == SIOCBONDSLAVEINFOQUERY ||
2678                             cmd == SIOCBONDINFOQUERY ||
2679                             cmd == SIOCBONDCHANGEACTIVE ||
2680                             cmd == SIOCGMIIPHY ||
2681                             cmd == SIOCGMIIREG ||
2682                             cmd == SIOCSMIIREG ||
2683                             cmd == SIOCBRADDIF ||
2684                             cmd == SIOCBRDELIF ||
2685                             cmd == SIOCWANDEV) {
2686                                 err = -EOPNOTSUPP;
2687                                 if (dev->do_ioctl) {
2688                                         if (netif_device_present(dev))
2689                                                 err = dev->do_ioctl(dev, ifr,
2690                                                                     cmd);
2691                                         else
2692                                                 err = -ENODEV;
2693                                 }
2694                         } else
2695                                 err = -EINVAL;
2696
2697         }
2698         return err;
2699 }
2700
2701 /*
2702  *      This function handles all "interface"-type I/O control requests. The actual
2703  *      'doing' part of this is dev_ifsioc above.
2704  */
2705
2706 /**
2707  *      dev_ioctl       -       network device ioctl
2708  *      @cmd: command to issue
2709  *      @arg: pointer to a struct ifreq in user space
2710  *
2711  *      Issue ioctl functions to devices. This is normally called by the
2712  *      user space syscall interfaces but can sometimes be useful for
2713  *      other purposes. The return value is the return from the syscall if
2714  *      positive or a negative errno code on error.
2715  */
2716
2717 int dev_ioctl(unsigned int cmd, void __user *arg)
2718 {
2719         struct ifreq ifr;
2720         int ret;
2721         char *colon;
2722
2723         /* One special case: SIOCGIFCONF takes ifconf argument
2724            and requires shared lock, because it sleeps writing
2725            to user space.
2726          */
2727
2728         if (cmd == SIOCGIFCONF) {
2729                 rtnl_lock();
2730                 ret = dev_ifconf((char __user *) arg);
2731                 rtnl_unlock();
2732                 return ret;
2733         }
2734         if (cmd == SIOCGIFNAME)
2735                 return dev_ifname((struct ifreq __user *)arg);
2736
2737         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2738                 return -EFAULT;
2739
2740         ifr.ifr_name[IFNAMSIZ-1] = 0;
2741
2742         colon = strchr(ifr.ifr_name, ':');
2743         if (colon)
2744                 *colon = 0;
2745
2746         /*
2747          *      See which interface the caller is talking about.
2748          */
2749
2750         switch (cmd) {
2751                 /*
2752                  *      These ioctl calls:
2753                  *      - can be done by all.
2754                  *      - atomic and do not require locking.
2755                  *      - return a value
2756                  */
2757                 case SIOCGIFFLAGS:
2758                 case SIOCGIFMETRIC:
2759                 case SIOCGIFMTU:
2760                 case SIOCGIFHWADDR:
2761                 case SIOCGIFSLAVE:
2762                 case SIOCGIFMAP:
2763                 case SIOCGIFINDEX:
2764                 case SIOCGIFTXQLEN:
2765                         dev_load(ifr.ifr_name);
2766                         read_lock(&dev_base_lock);
2767                         ret = dev_ifsioc(&ifr, cmd);
2768                         read_unlock(&dev_base_lock);
2769                         if (!ret) {
2770                                 if (colon)
2771                                         *colon = ':';
2772                                 if (copy_to_user(arg, &ifr,
2773                                                  sizeof(struct ifreq)))
2774                                         ret = -EFAULT;
2775                         }
2776                         return ret;
2777
2778                 case SIOCETHTOOL:
2779                         dev_load(ifr.ifr_name);
2780                         rtnl_lock();
2781                         ret = dev_ethtool(&ifr);
2782                         rtnl_unlock();
2783                         if (!ret) {
2784                                 if (colon)
2785                                         *colon = ':';
2786                                 if (copy_to_user(arg, &ifr,
2787                                                  sizeof(struct ifreq)))
2788                                         ret = -EFAULT;
2789                         }
2790                         return ret;
2791
2792                 /*
2793                  *      These ioctl calls:
2794                  *      - require superuser power.
2795                  *      - require strict serialization.
2796                  *      - return a value
2797                  */
2798                 case SIOCGMIIPHY:
2799                 case SIOCGMIIREG:
2800                 case SIOCSIFNAME:
2801                         if (!capable(CAP_NET_ADMIN))
2802                                 return -EPERM;
2803                         dev_load(ifr.ifr_name);
2804                         rtnl_lock();
2805                         ret = dev_ifsioc(&ifr, cmd);
2806                         rtnl_unlock();
2807                         if (!ret) {
2808                                 if (colon)
2809                                         *colon = ':';
2810                                 if (copy_to_user(arg, &ifr,
2811                                                  sizeof(struct ifreq)))
2812                                         ret = -EFAULT;
2813                         }
2814                         return ret;
2815
2816                 /*
2817                  *      These ioctl calls:
2818                  *      - require superuser power.
2819                  *      - require strict serialization.
2820                  *      - do not return a value
2821                  */
2822                 case SIOCSIFFLAGS:
2823                 case SIOCSIFMETRIC:
2824                 case SIOCSIFMTU:
2825                 case SIOCSIFMAP:
2826                 case SIOCSIFHWADDR:
2827                 case SIOCSIFSLAVE:
2828                 case SIOCADDMULTI:
2829                 case SIOCDELMULTI:
2830                 case SIOCSIFHWBROADCAST:
2831                 case SIOCSIFTXQLEN:
2832                 case SIOCSMIIREG:
2833                 case SIOCBONDENSLAVE:
2834                 case SIOCBONDRELEASE:
2835                 case SIOCBONDSETHWADDR:
2836                 case SIOCBONDCHANGEACTIVE:
2837                 case SIOCBRADDIF:
2838                 case SIOCBRDELIF:
2839                         if (!capable(CAP_NET_ADMIN))
2840                                 return -EPERM;
2841                         /* fall through */
2842                 case SIOCBONDSLAVEINFOQUERY:
2843                 case SIOCBONDINFOQUERY:
2844                         dev_load(ifr.ifr_name);
2845                         rtnl_lock();
2846                         ret = dev_ifsioc(&ifr, cmd);
2847                         rtnl_unlock();
2848                         return ret;
2849
2850                 case SIOCGIFMEM:
2851                         /* Get the per device memory space. We can add this but
2852                          * currently do not support it */
2853                 case SIOCSIFMEM:
2854                         /* Set the per device memory buffer space.
2855                          * Not applicable in our case */
2856                 case SIOCSIFLINK:
2857                         return -EINVAL;
2858
2859                 /*
2860                  *      Unknown or private ioctl.
2861                  */
2862                 default:
2863                         if (cmd == SIOCWANDEV ||
2864                             (cmd >= SIOCDEVPRIVATE &&
2865                              cmd <= SIOCDEVPRIVATE + 15)) {
2866                                 dev_load(ifr.ifr_name);
2867                                 rtnl_lock();
2868                                 ret = dev_ifsioc(&ifr, cmd);
2869                                 rtnl_unlock();
2870                                 if (!ret && copy_to_user(arg, &ifr,
2871                                                          sizeof(struct ifreq)))
2872                                         ret = -EFAULT;
2873                                 return ret;
2874                         }
2875 #ifdef CONFIG_WIRELESS_EXT
2876                         /* Take care of Wireless Extensions */
2877                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2878                                 /* If command is `set a parameter', or
2879                                  * `get the encoding parameters', check if
2880                                  * the user has the right to do it */
2881                                 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE
2882                                     || cmd == SIOCGIWENCODEEXT) {
2883                                         if (!capable(CAP_NET_ADMIN))
2884                                                 return -EPERM;
2885                                 }
2886                                 dev_load(ifr.ifr_name);
2887                                 rtnl_lock();
2888                                 /* Follow me in net/core/wireless.c */
2889                                 ret = wireless_process_ioctl(&ifr, cmd);
2890                                 rtnl_unlock();
2891                                 if (IW_IS_GET(cmd) &&
2892                                     copy_to_user(arg, &ifr,
2893                                                  sizeof(struct ifreq)))
2894                                         ret = -EFAULT;
2895                                 return ret;
2896                         }
2897 #endif  /* CONFIG_WIRELESS_EXT */
2898                         return -EINVAL;
2899         }
2900 }
2901
2902
/**
 *      dev_new_index   -       allocate an ifindex
 *
 *      Advances a function-local counter until it names an index for
 *      which __dev_get_by_index() finds no device, and returns that
 *      index.  The counter restarts at 1 whenever incrementing it yields
 *      a non-positive value, so the result is always positive.  The
 *      caller must hold the rtnl semaphore or the dev_base_lock to be
 *      sure the value remains unique.
 */
static int dev_new_index(void)
{
        static int ifindex;

        do {
                if (++ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(ifindex));

        return ifindex;
}
2920
2921 static int dev_boot_phase = 1;
2922
2923 /* Delayed registration/unregisteration */
2924 static DEFINE_SPINLOCK(net_todo_list_lock);
2925 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2926
2927 static inline void net_set_todo(struct net_device *dev)
2928 {
2929         spin_lock(&net_todo_list_lock);
2930         list_add_tail(&dev->todo_list, &net_todo_list);
2931         spin_unlock(&net_todo_list_lock);
2932 }
2933
2934 /**
2935  *      register_netdevice      - register a network device
2936  *      @dev: device to register
2937  *
2938  *      Take a completed network device structure and add it to the kernel
2939  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2940  *      chain. 0 is returned on success. A negative errno code is returned
2941  *      on a failure to set up the device, or if the name is a duplicate.
2942  *
2943  *      Callers must hold the rtnl semaphore. You may want
2944  *      register_netdev() instead of this.
2945  *
2946  *      BUGS:
2947  *      The locking appears insufficient to guarantee two parallel registers
2948  *      will not get the same name.
2949  */
2950
2951 int register_netdevice(struct net_device *dev)
2952 {
2953         struct hlist_head *head;
2954         struct hlist_node *p;
2955         int ret;
2956
2957         BUG_ON(dev_boot_phase);
2958         ASSERT_RTNL();
2959
2960         might_sleep();
2961
2962         /* When net_device's are persistent, this will be fatal. */
2963         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2964
2965         spin_lock_init(&dev->queue_lock);
2966         spin_lock_init(&dev->_xmit_lock);
2967         dev->xmit_lock_owner = -1;
2968 #ifdef CONFIG_NET_CLS_ACT
2969         spin_lock_init(&dev->ingress_lock);
2970 #endif
2971
2972         ret = alloc_divert_blk(dev);
2973         if (ret)
2974                 goto out;
2975
2976         dev->iflink = -1;
2977
2978         /* Init, if this function is available */
2979         if (dev->init) {
2980                 ret = dev->init(dev);
2981                 if (ret) {
2982                         if (ret > 0)
2983                                 ret = -EIO;
2984                         goto out_err;
2985                 }
2986         }
2987
2988         if (!dev_valid_name(dev->name)) {
2989                 ret = -EINVAL;
2990                 goto out_err;
2991         }
2992
2993         dev->ifindex = dev_new_index();
2994         if (dev->iflink == -1)
2995                 dev->iflink = dev->ifindex;
2996
2997         /* Check for existence of name */
2998         head = dev_name_hash(dev->name);
2999         hlist_for_each(p, head) {
3000                 struct net_device *d
3001                         = hlist_entry(p, struct net_device, name_hlist);
3002                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3003                         ret = -EEXIST;
3004                         goto out_err;
3005                 }
3006         }
3007
3008         /* Fix illegal SG+CSUM combinations. */
3009         if ((dev->features & NETIF_F_SG) &&
3010             !(dev->features & NETIF_F_ALL_CSUM)) {
3011                 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3012                        dev->name);
3013                 dev->features &= ~NETIF_F_SG;
3014         }
3015
3016         /* TSO requires that SG is present as well. */
3017         if ((dev->features & NETIF_F_TSO) &&
3018             !(dev->features & NETIF_F_SG)) {
3019                 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3020                        dev->name);
3021                 dev->features &= ~NETIF_F_TSO;
3022         }
3023         if (dev->features & NETIF_F_UFO) {
3024                 if (!(dev->features & NETIF_F_HW_CSUM)) {
3025                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3026                                         "NETIF_F_HW_CSUM feature.\n",
3027                                                         dev->name);
3028                         dev->features &= ~NETIF_F_UFO;
3029                 }
3030                 if (!(dev->features & NETIF_F_SG)) {
3031                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3032                                         "NETIF_F_SG feature.\n",
3033                                         dev->name);
3034                         dev->features &= ~NETIF_F_UFO;
3035                 }
3036         }
3037
3038         /*
3039          *      nil rebuild_header routine,
3040          *      that should be never called and used as just bug trap.
3041          */
3042
3043         if (!dev->rebuild_header)
3044                 dev->rebuild_header = default_rebuild_header;
3045
3046         ret = netdev_register_sysfs(dev);
3047         if (ret)
3048                 goto out_err;
3049         dev->reg_state = NETREG_REGISTERED;
3050
3051         /*
3052          *      Default initial state at registry is that the
3053          *      device is present.
3054          */
3055
3056         set_bit(__LINK_STATE_PRESENT, &dev->state);
3057
3058         dev->next = NULL;
3059         dev_init_scheduler(dev);
3060         write_lock_bh(&dev_base_lock);
3061         *dev_tail = dev;
3062         dev_tail = &dev->next;
3063         hlist_add_head(&dev->name_hlist, head);
3064         hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
3065         dev_hold(dev);
3066         write_unlock_bh(&dev_base_lock);
3067
3068         /* Notify protocols, that a new device appeared. */
3069         raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
3070
3071         ret = 0;
3072
3073 out:
3074         return ret;
3075 out_err:
3076         free_divert_blk(dev);
3077         goto out;
3078 }
3079
3080 /**
3081  *      register_netdev - register a network device
3082  *      @dev: device to register
3083  *
3084  *      Take a completed network device structure and add it to the kernel
3085  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3086  *      chain. 0 is returned on success. A negative errno code is returned
3087  *      on a failure to set up the device, or if the name is a duplicate.
3088  *
3089  *      This is a wrapper around register_netdev that takes the rtnl semaphore
3090  *      and expands the device name if you passed a format string to
3091  *      alloc_netdev.
3092  */
3093 int register_netdev(struct net_device *dev)
3094 {
3095         int err;
3096
3097         rtnl_lock();
3098
3099         /*
3100          * If the name is a format string the caller wants us to do a
3101          * name allocation.
3102          */
3103         if (strchr(dev->name, '%')) {
3104                 err = dev_alloc_name(dev, dev->name);
3105                 if (err < 0)
3106                         goto out;
3107         }
3108
3109         /*
3110          * Back compatibility hook. Kill this one in 2.5
3111          */
3112         if (dev->name[0] == 0 || dev->name[0] == ' ') {
3113                 err = dev_alloc_name(dev, "eth%d");
3114                 if (err < 0)
3115                         goto out;
3116         }
3117
3118         err = register_netdevice(dev);
3119 out:
3120         rtnl_unlock();
3121         return err;
3122 }
3123 EXPORT_SYMBOL(register_netdev);
3124
3125 /*
3126  * netdev_wait_allrefs - wait until all references are gone.
3127  *
3128  * This is called when unregistering network devices.
3129  *
3130  * Any protocol or device that holds a reference should register
3131  * for netdevice notification, and cleanup and put back the
3132  * reference if they receive an UNREGISTER event.
3133  * We can get stuck here if buggy protocols don't correctly
3134  * call dev_put.
3135  */
3136 static void netdev_wait_allrefs(struct net_device *dev)
3137 {
3138         unsigned long rebroadcast_time, warning_time;
3139
3140         rebroadcast_time = warning_time = jiffies;
3141         while (atomic_read(&dev->refcnt) != 0) {
3142                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3143                         rtnl_lock();
3144
3145                         /* Rebroadcast unregister notification */
3146                         raw_notifier_call_chain(&netdev_chain,
3147                                             NETDEV_UNREGISTER, dev);
3148
3149                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3150                                      &dev->state)) {
3151                                 /* We must not have linkwatch events
3152                                  * pending on unregister. If this
3153                                  * happens, we simply run the queue
3154                                  * unscheduled, resulting in a noop
3155                                  * for this device.
3156                                  */
3157                                 linkwatch_run_queue();
3158                         }
3159
3160                         __rtnl_unlock();
3161
3162                         rebroadcast_time = jiffies;
3163                 }
3164
3165                 msleep(250);
3166
3167                 if (time_after(jiffies, warning_time + 10 * HZ)) {
3168                         printk(KERN_EMERG "unregister_netdevice: "
3169                                "waiting for %s to become free. Usage "
3170                                "count = %d\n",
3171                                dev->name, atomic_read(&dev->refcnt));
3172                         warning_time = jiffies;
3173                 }
3174         }
3175 }
3176
3177 /* The sequence is:
3178  *
3179  *      rtnl_lock();
3180  *      ...
3181  *      register_netdevice(x1);
3182  *      register_netdevice(x2);
3183  *      ...
3184  *      unregister_netdevice(y1);
3185  *      unregister_netdevice(y2);
3186  *      ...
3187  *      rtnl_unlock();
3188  *      free_netdev(y1);
3189  *      free_netdev(y2);
3190  *
3191  * We are invoked by rtnl_unlock() after it drops the semaphore.
3192  * This allows us to deal with problems:
3193  * 1) We can delete sysfs objects which invoke hotplug
3194  *    without deadlocking with linkwatch via keventd.
3195  * 2) Since we run with the RTNL semaphore not held, we can sleep
3196  *    safely in order to wait for the netdev refcnt to drop to zero.
3197  */
3198 static DEFINE_MUTEX(net_todo_run_mutex);
3199 void netdev_run_todo(void)
3200 {
3201         struct list_head list;
3202
3203         /* Need to guard against multiple cpu's getting out of order. */
3204         mutex_lock(&net_todo_run_mutex);
3205
3206         /* Not safe to do outside the semaphore.  We must not return
3207          * until all unregister events invoked by the local processor
3208          * have been completed (either by this todo run, or one on
3209          * another cpu).
3210          */
3211         if (list_empty(&net_todo_list))
3212                 goto out;
3213
3214         /* Snapshot list, allow later requests */
3215         spin_lock(&net_todo_list_lock);
3216         list_replace_init(&net_todo_list, &list);
3217         spin_unlock(&net_todo_list_lock);
3218
3219         while (!list_empty(&list)) {
3220                 struct net_device *dev
3221                         = list_entry(list.next, struct net_device, todo_list);
3222                 list_del(&dev->todo_list);
3223
3224                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3225                         printk(KERN_ERR "network todo '%s' but state %d\n",
3226                                dev->name, dev->reg_state);
3227                         dump_stack();
3228                         continue;
3229                 }
3230
3231                 netdev_unregister_sysfs(dev);
3232                 dev->reg_state = NETREG_UNREGISTERED;
3233
3234                 netdev_wait_allrefs(dev);
3235
3236                 /* paranoia */
3237                 BUG_ON(atomic_read(&dev->refcnt));
3238                 BUG_TRAP(!dev->ip_ptr);
3239                 BUG_TRAP(!dev->ip6_ptr);
3240                 BUG_TRAP(!dev->dn_ptr);
3241
3242                 /* It must be the very last action,
3243                  * after this 'dev' may point to freed up memory.
3244                  */
3245                 if (dev->destructor)
3246                         dev->destructor(dev);
3247         }
3248
3249 out:
3250         mutex_unlock(&net_todo_run_mutex);
3251 }
3252
3253 /**
3254  *      alloc_netdev - allocate network device
3255  *      @sizeof_priv:   size of private data to allocate space for
3256  *      @name:          device name format string
3257  *      @setup:         callback to initialize device
3258  *
3259  *      Allocates a struct net_device with private data area for driver use
3260  *      and performs basic initialization.
3261  */
3262 struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3263                 void (*setup)(struct net_device *))
3264 {
3265         void *p;
3266         struct net_device *dev;
3267         int alloc_size;
3268
3269         /* ensure 32-byte alignment of both the device and private area */
3270         alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3271         alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3272
3273         p = kzalloc(alloc_size, GFP_KERNEL);
3274         if (!p) {
3275                 printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
3276                 return NULL;
3277         }
3278
3279         dev = (struct net_device *)
3280                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3281         dev->padded = (char *)dev - (char *)p;
3282
3283         if (sizeof_priv)
3284                 dev->priv = netdev_priv(dev);
3285
3286         setup(dev);
3287         strcpy(dev->name, name);
3288         return dev;
3289 }
3290 EXPORT_SYMBOL(alloc_netdev);
3291
3292 /**
3293  *      free_netdev - free network device
3294  *      @dev: device
3295  *
3296  *      This function does the last stage of destroying an allocated device
3297  *      interface. The reference to the device object is released.
3298  *      If this is the last reference then it will be freed.
3299  */
3300 void free_netdev(struct net_device *dev)
3301 {
3302 #ifdef CONFIG_SYSFS
3303         /*  Compatibility with error handling in drivers */
3304         if (dev->reg_state == NETREG_UNINITIALIZED) {
3305                 kfree((char *)dev - dev->padded);
3306                 return;
3307         }
3308
3309         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3310         dev->reg_state = NETREG_RELEASED;
3311
3312         /* will free via class release */
3313         class_device_put(&dev->class_dev);
3314 #else
3315         kfree((char *)dev - dev->padded);
3316 #endif
3317 }
3318
/*
 * Synchronize with packet receive processing.
 *
 * Flags the call site as one that may sleep, then blocks in
 * synchronize_rcu().
 */
void synchronize_net(void)
{
        might_sleep();
        synchronize_rcu();
}
3325
3326 /**
3327  *      unregister_netdevice - remove device from the kernel
3328  *      @dev: device
3329  *
3330  *      This function shuts down a device interface and removes it
3331  *      from the kernel tables. On success 0 is returned, on a failure
3332  *      a negative errno code is returned.
3333  *
3334  *      Callers must hold the rtnl semaphore.  You may want
3335  *      unregister_netdev() instead of this.
3336  */
3337
3338 int unregister_netdevice(struct net_device *dev)
3339 {
3340         struct net_device *d, **dp;
3341
3342         BUG_ON(dev_boot_phase);
3343         ASSERT_RTNL();
3344
3345         /* Some devices call without registering for initialization unwind. */
3346         if (dev->reg_state == NETREG_UNINITIALIZED) {
3347                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3348                                   "was registered\n", dev->name, dev);
3349                 return -ENODEV;
3350         }
3351
3352         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3353
3354         /* If device is running, close it first. */
3355         if (dev->flags & IFF_UP)
3356                 dev_close(dev);
3357
3358         /* And unlink it from device chain. */
3359         for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3360                 if (d == dev) {
3361                         write_lock_bh(&dev_base_lock);
3362                         hlist_del(&dev->name_hlist);
3363                         hlist_del(&dev->index_hlist);
3364                         if (dev_tail == &dev->next)
3365                                 dev_tail = dp;
3366                         *dp = d->next;
3367                         write_unlock_bh(&dev_base_lock);
3368                         break;
3369                 }
3370         }
3371         if (!d) {
3372                 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3373                        dev->name);
3374                 return -ENODEV;
3375         }
3376
3377         dev->reg_state = NETREG_UNREGISTERING;
3378
3379         synchronize_net();
3380
3381         /* Shutdown queueing discipline. */
3382         dev_shutdown(dev);
3383
3384
3385         /* Notify protocols, that we are about to destroy
3386            this device. They should clean all the things.
3387         */
3388         raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3389
3390         /*
3391          *      Flush the multicast chain
3392          */
3393         dev_mc_discard(dev);
3394
3395         if (dev->uninit)
3396                 dev->uninit(dev);
3397
3398         /* Notifier chain MUST detach us from master device. */
3399         BUG_TRAP(!dev->master);
3400
3401         free_divert_blk(dev);
3402
3403         /* Finish processing unregister after unlock */
3404         net_set_todo(dev);
3405
3406         synchronize_net();
3407
3408         dev_put(dev);
3409         return 0;
3410 }
3411
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      This function shuts down a device interface and removes it
 *      from the kernel tables.  It returns nothing: the result of
 *      unregister_netdevice() is discarded.
 *
 *      This is just a wrapper for unregister_netdevice that takes
 *      the rtnl semaphore.  In general you want to use this and not
 *      unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
3432
3433 #ifdef CONFIG_HOTPLUG_CPU
3434 static int dev_cpu_callback(struct notifier_block *nfb,
3435                             unsigned long action,
3436                             void *ocpu)
3437 {
3438         struct sk_buff **list_skb;
3439         struct net_device **list_net;
3440         struct sk_buff *skb;
3441         unsigned int cpu, oldcpu = (unsigned long)ocpu;
3442         struct softnet_data *sd, *oldsd;
3443
3444         if (action != CPU_DEAD)
3445                 return NOTIFY_OK;
3446
3447         local_irq_disable();
3448         cpu = smp_processor_id();
3449         sd = &per_cpu(softnet_data, cpu);
3450         oldsd = &per_cpu(softnet_data, oldcpu);
3451
3452         /* Find end of our completion_queue. */
3453         list_skb = &sd->completion_queue;
3454         while (*list_skb)
3455                 list_skb = &(*list_skb)->next;
3456         /* Append completion queue from offline CPU. */
3457         *list_skb = oldsd->completion_queue;
3458         oldsd->completion_queue = NULL;
3459
3460         /* Find end of our output_queue. */
3461         list_net = &sd->output_queue;
3462         while (*list_net)
3463                 list_net = &(*list_net)->next_sched;
3464         /* Append output queue from offline CPU. */
3465         *list_net = oldsd->output_queue;
3466         oldsd->output_queue = NULL;
3467
3468         raise_softirq_irqoff(NET_TX_SOFTIRQ);
3469         local_irq_enable();
3470
3471         /* Process offline CPU's input_pkt_queue */
3472         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3473                 netif_rx(skb);
3474
3475         return NOTIFY_OK;
3476 }
3477 #endif /* CONFIG_HOTPLUG_CPU */
3478
3479 #ifdef CONFIG_NET_DMA
3480 /**
3481  * net_dma_rebalance -
3482  * This is called when the number of channels allocated to the net_dma_client
3483  * changes.  The net_dma_client tries to have one DMA channel per CPU.
3484  */
3485 static void net_dma_rebalance(void)
3486 {
3487         unsigned int cpu, i, n;
3488         struct dma_chan *chan;
3489
3490         if (net_dma_count == 0) {
3491                 for_each_online_cpu(cpu)
3492                         rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
3493                 return;
3494         }
3495
3496         i = 0;
3497         cpu = first_cpu(cpu_online_map);
3498
3499         rcu_read_lock();
3500         list_for_each_entry(chan, &net_dma_client->channels, client_node) {
3501                 n = ((num_online_cpus() / net_dma_count)
3502                    + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
3503
3504                 while(n) {
3505                         per_cpu(softnet_data, cpu).net_dma = chan;
3506                         cpu = next_cpu(cpu, cpu_online_map);
3507                         n--;
3508                 }
3509                 i++;
3510         }
3511         rcu_read_unlock();
3512 }
3513
3514 /**
3515  * netdev_dma_event - event callback for the net_dma_client
3516  * @client: should always be net_dma_client
3517  * @chan: DMA channel for the event
3518  * @event: event type
3519  */
3520 static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
3521         enum dma_event event)
3522 {
3523         spin_lock(&net_dma_event_lock);
3524         switch (event) {
3525         case DMA_RESOURCE_ADDED:
3526                 net_dma_count++;
3527                 net_dma_rebalance();
3528                 break;
3529         case DMA_RESOURCE_REMOVED:
3530                 net_dma_count--;
3531                 net_dma_rebalance();
3532                 break;
3533         default:
3534                 break;
3535         }
3536         spin_unlock(&net_dma_event_lock);
3537 }
3538
3539 /**
3540  * netdev_dma_regiser - register the networking subsystem as a DMA client
3541  */
3542 static int __init netdev_dma_register(void)
3543 {
3544         spin_lock_init(&net_dma_event_lock);
3545         net_dma_client = dma_async_client_register(netdev_dma_event);
3546         if (net_dma_client == NULL)
3547                 return -ENOMEM;
3548
3549         dma_async_client_chan_request(net_dma_client, num_online_cpus());
3550         return 0;
3551 }
3552
3553 #else
3554 static int __init netdev_dma_register(void) { return -ENODEV; }
3555 #endif /* CONFIG_NET_DMA */
3556
3557 /*
3558  *      Initialize the DEV module. At boot time this walks the device list and
3559  *      unhooks any devices that fail to initialise (normally hardware not
3560  *      present) and leaves us with a valid list of present and active devices.
3561  *
3562  */
3563
3564 /*
3565  *       This is called single threaded during boot, so no need
3566  *       to take the rtnl semaphore.
3567  */
3568 static int __init net_dev_init(void)
3569 {
3570         int i, rc = -ENOMEM;
3571
3572         BUG_ON(!dev_boot_phase);
3573
3574         net_random_init();
3575
3576         if (dev_proc_init())
3577                 goto out;
3578
3579         if (netdev_sysfs_init())
3580                 goto out;
3581
3582         INIT_LIST_HEAD(&ptype_all);
3583         for (i = 0; i < 16; i++)
3584                 INIT_LIST_HEAD(&ptype_base[i]);
3585
3586         for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3587                 INIT_HLIST_HEAD(&dev_name_head[i]);
3588
3589         for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3590                 INIT_HLIST_HEAD(&dev_index_head[i]);
3591
3592         /*
3593          *      Initialise the packet receive queues.
3594          */
3595
3596         for_each_possible_cpu(i) {
3597                 struct softnet_data *queue;
3598
3599                 queue = &per_cpu(softnet_data, i);
3600                 skb_queue_head_init(&queue->input_pkt_queue);
3601                 queue->completion_queue = NULL;
3602                 INIT_LIST_HEAD(&queue->poll_list);
3603                 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3604                 queue->backlog_dev.weight = weight_p;
3605                 queue->backlog_dev.poll = process_backlog;
3606                 atomic_set(&queue->backlog_dev.refcnt, 1);
3607         }
3608
3609         netdev_dma_register();
3610
3611         dev_boot_phase = 0;
3612
3613         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3614         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3615
3616         hotcpu_notifier(dev_cpu_callback, 0);
3617         dst_init();
3618         dev_mcast_init();
3619         rc = 0;
3620 out:
3621         return rc;
3622 }
3623
3624 subsys_initcall(net_dev_init);
3625
3626 EXPORT_SYMBOL(__dev_get_by_index);
3627 EXPORT_SYMBOL(__dev_get_by_name);
3628 EXPORT_SYMBOL(__dev_remove_pack);
3629 EXPORT_SYMBOL(dev_valid_name);
3630 EXPORT_SYMBOL(dev_add_pack);
3631 EXPORT_SYMBOL(dev_alloc_name);
3632 EXPORT_SYMBOL(dev_close);
3633 EXPORT_SYMBOL(dev_get_by_flags);
3634 EXPORT_SYMBOL(dev_get_by_index);
3635 EXPORT_SYMBOL(dev_get_by_name);
3636 EXPORT_SYMBOL(dev_open);
3637 EXPORT_SYMBOL(dev_queue_xmit);
3638 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
3639 EXPORT_SYMBOL(dev_queue_xmit_nit);
3640 #endif
3641 EXPORT_SYMBOL(dev_remove_pack);
3642 EXPORT_SYMBOL(dev_set_allmulti);
3643 EXPORT_SYMBOL(dev_set_promiscuity);
3644 EXPORT_SYMBOL(dev_change_flags);
3645 EXPORT_SYMBOL(dev_set_mtu);
3646 EXPORT_SYMBOL(dev_set_mac_address);
3647 EXPORT_SYMBOL(free_netdev);
3648 EXPORT_SYMBOL(netdev_boot_setup_check);
3649 EXPORT_SYMBOL(netdev_set_master);
3650 EXPORT_SYMBOL(netdev_state_change);
3651 EXPORT_SYMBOL(netif_receive_skb);
3652 EXPORT_SYMBOL(netif_rx);
3653 EXPORT_SYMBOL(register_gifconf);
3654 EXPORT_SYMBOL(register_netdevice);
3655 EXPORT_SYMBOL(register_netdevice_notifier);
3656 EXPORT_SYMBOL(skb_checksum_help);
3657 EXPORT_SYMBOL(synchronize_net);
3658 EXPORT_SYMBOL(unregister_netdevice);
3659 EXPORT_SYMBOL(unregister_netdevice_notifier);
3660 EXPORT_SYMBOL(net_enable_timestamp);
3661 EXPORT_SYMBOL(net_disable_timestamp);
3662 EXPORT_SYMBOL(dev_get_flags);
3663 EXPORT_SYMBOL(skb_checksum_setup);
3664
3665 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3666 EXPORT_SYMBOL(br_handle_frame_hook);
3667 EXPORT_SYMBOL(br_fdb_get_hook);
3668 EXPORT_SYMBOL(br_fdb_put_hook);
3669 #endif
3670
3671 #ifdef CONFIG_KMOD
3672 EXPORT_SYMBOL(dev_load);
3673 #endif
3674
3675 EXPORT_PER_CPU_SYMBOL(softnet_data);