net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro, <bir7@leland.Stanford.Edu>
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/config.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/string.h>
  84 #include <linux/mm.h>
  85 #include <linux/socket.h>
  86 #include <linux/sockios.h>
  87 #include <linux/errno.h>
  88 #include <linux/interrupt.h>
  89 #include <linux/if_ether.h>
  90 #include <linux/netdevice.h>
  91 #include <linux/etherdevice.h>
  92 #include <linux/notifier.h>
  93 #include <linux/skbuff.h>
  94 #include <net/sock.h>
  95 #include <linux/rtnetlink.h>
  96 #include <linux/proc_fs.h>
  97 #include <linux/seq_file.h>
  98 #include <linux/stat.h>
  99 #include <linux/if_bridge.h>
 100 #include <linux/divert.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <linux/highmem.h>
 105 #include <linux/init.h>
 106 #include <linux/kmod.h>
 107 #include <linux/module.h>
 108 #include <linux/kallsyms.h>
 109 #include <linux/netpoll.h>
 110 #include <linux/rcupdate.h>
 111 #ifdef CONFIG_NET_RADIO
 112 #include <linux/wireless.h>             /* Note : will define WIRELESS_EXT */
 113 #include <net/iw_handler.h>
 114 #endif  /* CONFIG_NET_RADIO */
 115 #include <asm/current.h>
 116
 117 /* This define, if set, will randomly drop a packet when congestion
 118  * is more than moderate.  It helps fairness in the multi-interface
 119  * case when one of them is a hog, but it kills performance for the
 120  * single interface case so it is off now by default.
 121  */
 122 #undef RAND_LIE
 123
 124 /* Setting this will sample the queue lengths and thus congestion
 125  * via a timer instead of as each packet is received.
 126  */
 127 #undef OFFLINE_SAMPLE
 128
 129 /*
 130  *      The list of packet types we will receive (as opposed to discard)
 131  *      and the routines to invoke.
 132  *
 133  *      Why 16. Because with 16 the only overlap we get on a hash of the
 134  *      low nibble of the protocol value is RARP/SNAP/X.25.
 135  *
 136  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 137  *             sure which should go first, but I bet it won't make much
 138  *             difference if we are running VLANs.  The good news is that
 139  *             this protocol won't be in the list unless compiled in, so
 140  *             the average user (w/out VLANs) will not be adversely affected.
 141  *             --BLG
 142  *
 143  *              0800    IP
 144  *              8100    802.1Q VLAN
 145  *              0001    802.3
 146  *              0002    AX.25
 147  *              0004    802.2
 148  *              8035    RARP
 149  *              0005    SNAP
 150  *              0805    X.25
 151  *              0806    ARP
 152  *              8137    IPX
 153  *              0009    Localtalk
 154  *              86DD    IPv6
 155  */
 156
 157 static spinlock_t ptype_lock = SPIN_LOCK_UNLOCKED; /* held by dev_add_pack() and __dev_remove_pack() while they edit the lists below */
 158 static struct list_head ptype_base[16]; /* 16 way hashed list, bucket is ntohs(type) & 15 */
 159 static struct list_head ptype_all;              /* Taps: handlers registered for ETH_P_ALL */
 160
 161 #ifdef OFFLINE_SAMPLE
 162 static void sample_queue(unsigned long dummy);
 163 static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
 164 #endif
 165
 166 /*
 167  * The @dev_base list is protected by @dev_base_lock and the rtnl
 168  * semaphore.
 169  *
 170  * Pure readers hold dev_base_lock for reading.
 171  *
 172  * Writers must hold the rtnl semaphore while they loop through the
 173  * dev_base list, and hold dev_base_lock for writing when they do the
 174  * actual updates.  This allows pure readers to access the list even
 175  * while a writer is preparing to update it.
 176  *
 177  * To put it another way, dev_base_lock is held for writing only to
 178  * protect against pure readers; the rtnl semaphore provides the
 179  * protection against other writers.
 180  *
 181  * See, for example usages, register_netdevice() and
 182  * unregister_netdevice(), which must be called with the rtnl
 183  * semaphore held.
 184  */
 185 struct net_device *dev_base;
 186 struct net_device **dev_tail = &dev_base;
 187 rwlock_t dev_base_lock = RW_LOCK_UNLOCKED;
 188
 189 EXPORT_SYMBOL(dev_base);
 190 EXPORT_SYMBOL(dev_base_lock);
 191
 192 #define NETDEV_HASHBITS 8 /* each table below has 1 << NETDEV_HASHBITS buckets */
 193 static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; /* buckets picked by dev_name_hash() */
 194 static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; /* buckets picked by dev_index_hash() */
 195
 196 static inline struct hlist_head *dev_name_hash(const char *name)
 197 {
 198         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 199         return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 200 }
 201
 202 static inline struct hlist_head *dev_index_hash(int ifindex)
 203 {
 204         return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 205 }
 206
 207 /*
 208  *      Our notifier list
 209  */
 210
 211 static struct notifier_block *netdev_chain;
 212
 213 /*
 214  *      Device drivers call our routines to queue packets here. We empty the
 215  *      queue in the local softnet handler.
 216  */
 217 DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
 218
 219 #ifdef CONFIG_SYSFS /* sysfs hooks are defined outside this file */
 220 extern int netdev_sysfs_init(void);
 221 extern int netdev_register_sysfs(struct net_device *);
 222 extern void netdev_unregister_sysfs(struct net_device *);
 223 #else /* !CONFIG_SYSFS: stubs so callers need no #ifdef of their own */
 224 #define netdev_sysfs_init()             (0) /* evaluates to 0 */
 225 #define netdev_register_sysfs(dev)      (0) /* evaluates to 0 */
 226 #define netdev_unregister_sysfs(dev)    do { } while(0) /* empty statement */
 227 #endif
 228
 229
 230 /*******************************************************************************
 231
 232                 Protocol management and registration routines
 233
 234 *******************************************************************************/
 235
 236 /*
 237  *      For efficiency
 238  */
 239
 240 int netdev_nit;
 241
 242 /*
 243  *      Add a protocol ID to the list. Now that the input handler is
 244  *      smarter we can dispense with all the messy stuff that used to be
 245  *      here.
 246  *
 247  *      BEWARE!!! Protocol handlers, mangling input packets,
 248  *      MUST BE last in hash buckets and checking protocol handlers
 249  *      MUST start from promiscuous ptype_all chain in net_bh.
 250  *      It is true now, do not change it.
 251  *      Explanation follows: if protocol handler, mangling packet, will
 252  *      be the first on list, it is not able to sense, that packet
 253  *      is cloned and should be copied-on-write, so that it will
 254  *      change it and subsequent readers will get broken packet.
 255  *                                                      --ANK (980803)
 256  */
 257
 258 /**
 259  *      dev_add_pack - add packet handler
 260  *      @pt: packet type declaration
 261  *
 262  *      Add a protocol handler to the networking stack. The passed &packet_type
 263  *      is linked into kernel lists and may not be freed until it has been
 264  *      removed from the kernel lists.
 265  *
 266  *      This call does not sleep therefore it can not
 267  *      guarantee all CPU's that are in middle of receiving packets
 268  *      will see the new packet type (until the next received packet).
 269  */
 270
 271 void dev_add_pack(struct packet_type *pt)
 272 {
 273         int hash;
 274
 275         spin_lock_bh(&ptype_lock);
 276         if (pt->type == htons(ETH_P_ALL)) {
 277                 netdev_nit++;
 278                 list_add_rcu(&pt->list, &ptype_all);
 279         } else {
 280                 hash = ntohs(pt->type) & 15;
 281                 list_add_rcu(&pt->list, &ptype_base[hash]);
 282         }
 283         spin_unlock_bh(&ptype_lock);
 284 }
 285
 286 extern void linkwatch_run_queue(void);
 287
 288
 289
 290 /**
 291  *      __dev_remove_pack        - remove packet handler
 292  *      @pt: packet type declaration
 293  *
 294  *      Remove a protocol handler that was previously added to the kernel
 295  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 296  *      from the kernel lists and can be freed or reused once this function
 297  *      returns.
 298  *
 299  *      The packet type might still be in use by receivers
 300  *      and must not be freed until after all the CPU's have gone
 301  *      through a quiescent state.
 302  */
 303 void __dev_remove_pack(struct packet_type *pt)
 304 {
 305         struct list_head *head;
 306         struct packet_type *pt1;
 307
 308         spin_lock_bh(&ptype_lock);
 309
 310         if (pt->type == htons(ETH_P_ALL)) {
 311                 netdev_nit--;
 312                 head = &ptype_all;
 313         } else
 314                 head = &ptype_base[ntohs(pt->type) & 15];
 315
 316         list_for_each_entry(pt1, head, list) {
 317                 if (pt == pt1) {
 318                         list_del_rcu(&pt->list);
 319                         goto out;
 320                 }
 321         }
 322
 323         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 324 out:
 325         spin_unlock_bh(&ptype_lock);
 326 }
 327 /**
 328  *      dev_remove_pack  - remove packet handler
 329  *      @pt: packet type declaration
 330  *
 331  *      Remove a protocol handler that was previously added to the kernel
 332  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 333  *      from the kernel lists and can be freed or reused once this function
 334  *      returns.
 335  *
 336  *      This call sleeps to guarantee that no CPU is looking at the packet
 337  *      type after return.
 338  */
 339 void dev_remove_pack(struct packet_type *pt)
 340 {
 341         __dev_remove_pack(pt);
 342
 343         synchronize_net();
 344 }
 345
 346 /******************************************************************************
 347
 348                       Device Boot-time Settings Routines
 349
 350 *******************************************************************************/
 351
 352 /* Boot time configuration table */
 353 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 354
 355 /**
 356  *      netdev_boot_setup_add   - add new setup entry
 357  *      @name: name of the device
 358  *      @map: configured settings for the device
 359  *
 360  *      Adds new setup entry to the dev_boot_setup list.  The function
 361  *      returns 0 on error and 1 on success.  This is a generic routine to
 362  *      all netdevices.
 363  */
 364 int netdev_boot_setup_add(char *name, struct ifmap *map)
 365 {
 366         struct netdev_boot_setup *s;
 367         int i;
 368
 369         s = dev_boot_setup;
 370         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 371                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 372                         memset(s[i].name, 0, sizeof(s[i].name));
 373                         strcpy(s[i].name, name);
 374                         memcpy(&s[i].map, map, sizeof(s[i].map));
 375                         break;
 376                 }
 377         }
 378
 379         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 380 }
 381
 382 /**
 383  *      netdev_boot_setup_check - check boot time settings
 384  *      @dev: the netdevice
 385  *
 386  *      Check boot time settings for the device.
 387  *      The found settings are set for the device to be used
 388  *      later in the device probing.
 389  *      Returns 0 if no settings found, 1 if they are.
 390  */
 391 int netdev_boot_setup_check(struct net_device *dev)
 392 {
 393         struct netdev_boot_setup *s = dev_boot_setup;
 394         int i;
 395
 396         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 397                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 398                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 399                         dev->irq        = s[i].map.irq;
 400                         dev->base_addr  = s[i].map.base_addr;
 401                         dev->mem_start  = s[i].map.mem_start;
 402                         dev->mem_end    = s[i].map.mem_end;
 403                         return 1;
 404                 }
 405         }
 406         return 0;
 407 }
 408
 409
 410 /**
 411  *      netdev_boot_base        - get address from boot time settings
 412  *      @prefix: prefix for network device
 413  *      @unit: id for network device
 414  *
 415  *      Check boot time settings for the base address of device.
 416  *      The found settings are set for the device to be used
 417  *      later in the device probing.
 418  *      Returns 0 if no settings found.
 419  */
 420 unsigned long netdev_boot_base(const char *prefix, int unit)
 421 {
 422         const struct netdev_boot_setup *s = dev_boot_setup;
 423         char name[IFNAMSIZ];
 424         int i;
 425
 426         sprintf(name, "%s%d", prefix, unit);
 427
 428         /*
 429          * If device already registered then return base of 1
 430          * to indicate not to probe for this interface
 431          */
 432         if (__dev_get_by_name(name))
 433                 return 1;
 434
 435         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 436                 if (!strcmp(name, s[i].name))
 437                         return s[i].map.base_addr;
 438         return 0;
 439 }
 440
 441 /*
 442  * Saves at boot time configured settings for any netdevice.
 443  */
 444 int __init netdev_boot_setup(char *str)
 445 {
 446         int ints[5];
 447         struct ifmap map;
 448
 449         str = get_options(str, ARRAY_SIZE(ints), ints);
 450         if (!str || !*str)
 451                 return 0;
 452
 453         /* Save settings */
 454         memset(&map, 0, sizeof(map));
 455         if (ints[0] > 0)
 456                 map.irq = ints[1];
 457         if (ints[0] > 1)
 458                 map.base_addr = ints[2];
 459         if (ints[0] > 2)
 460                 map.mem_start = ints[3];
 461         if (ints[0] > 3)
 462                 map.mem_end = ints[4];
 463
 464         /* Add new entry to the list */
 465         return netdev_boot_setup_add(str, &map);
 466 }
 467
 468 __setup("netdev=", netdev_boot_setup);
 469
 470 /*******************************************************************************
 471
 472                             Device Interface Subroutines
 473
 474 *******************************************************************************/
 475
 476 /**
 477  *      __dev_get_by_name       - find a device by its name
 478  *      @name: name to find
 479  *
 480  *      Find an interface by name. Must be called under RTNL semaphore
 481  *      or @dev_base_lock. If the name is found a pointer to the device
 482  *      is returned. If the name is not found then %NULL is returned. The
 483  *      reference counters are not incremented so the caller must be
 484  *      careful with locks.
 485  */
 486
 487 struct net_device *__dev_get_by_name(const char *name)
 488 {
 489         struct hlist_node *p;
 490
 491         hlist_for_each(p, dev_name_hash(name)) {
 492                 struct net_device *dev
 493                         = hlist_entry(p, struct net_device, name_hlist);
 494                 if (!strncmp(dev->name, name, IFNAMSIZ))
 495                         return dev;
 496         }
 497         return NULL;
 498 }
 499
 500 /**
 501  *      dev_get_by_name         - find a device by its name
 502  *      @name: name to find
 503  *
 504  *      Find an interface by name. This can be called from any
 505  *      context and does its own locking. The returned handle has
 506  *      the usage count incremented and the caller must use dev_put() to
 507  *      release it when it is no longer needed. %NULL is returned if no
 508  *      matching device is found.
 509  */
 510
 511 struct net_device *dev_get_by_name(const char *name)
 512 {
 513         struct net_device *dev;
 514
 515         read_lock(&dev_base_lock);
 516         dev = __dev_get_by_name(name);
 517         if (dev)
 518                 dev_hold(dev);
 519         read_unlock(&dev_base_lock);
 520         return dev;
 521 }
 522
 523 /**
 524  *      __dev_get_by_index - find a device by its ifindex
 525  *      @ifindex: index of device
 526  *
 527  *      Search for an interface by index. Returns %NULL if the device
 528  *      is not found or a pointer to the device. The device has not
 529  *      had its reference counter increased so the caller must be careful
 530  *      about locking. The caller must hold either the RTNL semaphore
 531  *      or @dev_base_lock.
 532  */
 533
 534 struct net_device *__dev_get_by_index(int ifindex)
 535 {
 536         struct hlist_node *p;
 537
 538         hlist_for_each(p, dev_index_hash(ifindex)) {
 539                 struct net_device *dev
 540                         = hlist_entry(p, struct net_device, index_hlist);
 541                 if (dev->ifindex == ifindex)
 542                         return dev;
 543         }
 544         return NULL;
 545 }
 546
 547
 548 /**
 549  *      dev_get_by_index - find a device by its ifindex
 550  *      @ifindex: index of device
 551  *
 552  *      Search for an interface by index. Returns NULL if the device
 553  *      is not found or a pointer to the device. The device returned has
 554  *      had a reference added and the pointer is safe until the user calls
 555  *      dev_put to indicate they have finished with it.
 556  */
 557
 558 struct net_device *dev_get_by_index(int ifindex)
 559 {
 560         struct net_device *dev;
 561
 562         read_lock(&dev_base_lock);
 563         dev = __dev_get_by_index(ifindex);
 564         if (dev)
 565                 dev_hold(dev);
 566         read_unlock(&dev_base_lock);
 567         return dev;
 568 }
 569
 570 /**
 571  *      dev_getbyhwaddr - find a device by its hardware address
 572  *      @type: media type of device
 573  *      @ha: hardware address
 574  *
 575  *      Search for an interface by MAC address. Returns NULL if the device
 576  *      is not found or a pointer to the device. The caller must hold the
 577  *      rtnl semaphore. The returned device has not had its ref count increased
 578  *      and the caller must therefore be careful about locking
 579  *
 580  *      BUGS:
 581  *      If the API was consistent this would be __dev_get_by_hwaddr
 582  */
 583
 584 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 585 {
 586         struct net_device *dev;
 587
 588         ASSERT_RTNL();
 589
 590         for (dev = dev_base; dev; dev = dev->next)
 591                 if (dev->type == type &&
 592                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 593                         break;
 594         return dev;
 595 }
 596
 597 struct net_device *dev_getfirstbyhwtype(unsigned short type)
 598 {
 599         struct net_device *dev;
 600
 601         rtnl_lock();
 602         for (dev = dev_base; dev; dev = dev->next) {
 603                 if (dev->type == type) {
 604                         dev_hold(dev);
 605                         break;
 606                 }
 607         }
 608         rtnl_unlock();
 609         return dev;
 610 }
 611
 612 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 613
 614 /**
 615  *      dev_get_by_flags - find any device with given flags
 616  *      @if_flags: IFF_* values
 617  *      @mask: bitmask of bits in if_flags to check
 618  *
 619  *      Search for any interface with the given flags. Returns NULL if a device
 620  *      is not found or a pointer to the device. The device returned has
 621  *      had a reference added and the pointer is safe until the user calls
 622  *      dev_put to indicate they have finished with it.
 623  */
 624
 625 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 626 {
 627         struct net_device *dev;
 628
 629         read_lock(&dev_base_lock);
 630         for (dev = dev_base; dev != NULL; dev = dev->next) {
 631                 if (((dev->flags ^ if_flags) & mask) == 0) {
 632                         dev_hold(dev);
 633                         break;
 634                 }
 635         }
 636         read_unlock(&dev_base_lock);
 637         return dev;
 638 }
 639
 640 /**
 641  *      dev_valid_name - check if name is okay for network device
 642  *      @name: name string
 643  *
 644  *      Network device names need to be valid file names to
 645  *      to allow sysfs to work
 646  */
 647 int dev_valid_name(const char *name)
 648 {
 649         return !(*name == '\0'
 650                  || !strcmp(name, ".")
 651                  || !strcmp(name, "..")
 652                  || strchr(name, '/'));
 653 }
 654
 655 /**
 656  *      dev_alloc_name - allocate a name for a device
 657  *      @dev: device
 658  *      @name: name format string
 659  *
 660  *      Passed a format string - eg "lt%d" it will try and find a suitable
 661  *      id. Not efficient for many devices, not called a lot. The caller
 662  *      must hold the dev_base or rtnl lock while allocating the name and
 663  *      adding the device in order to avoid duplicates. Returns the number
 664  *      of the unit assigned or a negative errno code.
 665  */
 666
 667 int dev_alloc_name(struct net_device *dev, const char *name)
 668 {
 669         int i = 0;
 670         char buf[IFNAMSIZ];
 671         const char *p;
 672         const int max_netdevices = 8*PAGE_SIZE;
 673         long *inuse;
 674         struct net_device *d;
 675
 676         p = strnchr(name, IFNAMSIZ-1, '%');
 677         if (p) {
 678                 /*
 679                  * Verify the string as this thing may have come from
 680                  * the user.  There must be either one "%d" and no other "%"
 681                  * characters.
 682                  */
 683                 if (p[1] != 'd' || strchr(p + 2, '%'))
 684                         return -EINVAL;
 685
 686                 /* Use one page as a bit array of possible slots */
 687                 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
 688                 if (!inuse)
 689                         return -ENOMEM;
 690
 691                 for (d = dev_base; d; d = d->next) {
 692                         if (!sscanf(d->name, name, &i))
 693                                 continue;
 694                         if (i < 0 || i >= max_netdevices)
 695                                 continue;
 696
 697                         /*  avoid cases where sscanf is not exact inverse of printf */
 698                         snprintf(buf, sizeof(buf), name, i);
 699                         if (!strncmp(buf, d->name, IFNAMSIZ))
 700                                 set_bit(i, inuse);
 701                 }
 702
 703                 i = find_first_zero_bit(inuse, max_netdevices);
 704                 free_page((unsigned long) inuse);
 705         }
 706
 707         snprintf(buf, sizeof(buf), name, i);
 708         if (!__dev_get_by_name(buf)) {
 709                 strlcpy(dev->name, buf, IFNAMSIZ);
 710                 return i;
 711         }
 712
 713         /* It is possible to run out of possible slots
 714          * when the name is long and there isn't enough space left
 715          * for the digits, or if all bits are used.
 716          */
 717         return -ENFILE;
 718 }
 719
 720
 721 /**
 722  *      dev_change_name - change name of a device
 723  *      @dev: device
 724  *      @newname: name (or format string) must be at least IFNAMSIZ
 725  *
 726  *      Change name of a device, can pass format strings "eth%d".
 727  *      for wildcarding.
 728  */
 729 int dev_change_name(struct net_device *dev, char *newname)
 730 {
 731         int err = 0;
 732
 733         ASSERT_RTNL();
 734
 735         if (dev->flags & IFF_UP)
 736                 return -EBUSY;
 737
 738         if (!dev_valid_name(newname))
 739                 return -EINVAL;
 740
 741         if (strchr(newname, '%')) {
 742                 err = dev_alloc_name(dev, newname);
 743                 if (err < 0)
 744                         return err;
 745                 strcpy(newname, dev->name);
 746         }
 747         else if (__dev_get_by_name(newname))
 748                 return -EEXIST;
 749         else
 750                 strlcpy(dev->name, newname, IFNAMSIZ);
 751
 752         err = class_device_rename(&dev->class_dev, dev->name);
 753         if (!err) {
 754                 hlist_del(&dev->name_hlist);
 755                 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 756                 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
 757         }
 758
 759         return err;
 760 }
 761
 762 /**
 763  *      netdev_state_change - device changes state
 764  *      @dev: device to cause notification
 765  *
 766  *      Called to indicate a device has changed state. This function calls
 767  *      the notifier chains for netdev_chain and sends a NEWLINK message
 768  *      to the routing socket.
 769  */
 770 void netdev_state_change(struct net_device *dev)
 771 {
 772         if (dev->flags & IFF_UP) {
 773                 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
 774                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 775         }
 776 }
 777
 778 /**
 779  *      dev_load        - load a network module
 780  *      @name: name of interface
 781  *
 782  *      If a network interface is not present and the process has suitable
 783  *      privileges this function loads the module. If module loading is not
 784  *      available in this kernel then it becomes a nop.
 785  */
 786
 787 void dev_load(const char *name)
 788 {
 789         struct net_device *dev;
 790
 791         read_lock(&dev_base_lock);
 792         dev = __dev_get_by_name(name);
 793         read_unlock(&dev_base_lock);
 794
 795         if (!dev && capable(CAP_SYS_MODULE))
 796                 request_module("%s", name);
 797 }
 798
 799 static int default_rebuild_header(struct sk_buff *skb)
 800 {
 801         printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 802                skb->dev ? skb->dev->name : "NULL!!!");
 803         kfree_skb(skb);
 804         return 1;
 805 }
 806
 807
 808 /**
 809  *      dev_open        - prepare an interface for use.
 810  *      @dev:   device to open
 811  *
 812  *      Takes a device from down to up state. The device's private open
 813  *      function is invoked and then the multicast lists are loaded. Finally
 814  *      the device is moved into the up state and a %NETDEV_UP message is
 815  *      sent to the netdev notifier chain.
 816  *
 817  *      Calling this function on an active interface is a nop. On a failure
 818  *      a negative errno code is returned.
 819  */
 820 int dev_open(struct net_device *dev)
 821 {
 822         int ret = 0;
 823
 824         /*
 825          *      Is it already up?
 826          */
 827
 828         if (dev->flags & IFF_UP)
 829                 return 0;
 830
 831         /*
 832          *      Is it even present?
 833          */
 834         if (!netif_device_present(dev))
 835                 return -ENODEV;
 836
 837         /*
 838          *      Call device private open method
 839          */
 840         set_bit(__LINK_STATE_START, &dev->state);
 841         if (dev->open) {
 842                 ret = dev->open(dev);
 843                 if (ret)
 844                         clear_bit(__LINK_STATE_START, &dev->state);
 845         }
 846
 847         /*
 848          *      If it went open OK then:
 849          */
 850
 851         if (!ret) {
 852                 /*
 853                  *      Set the flags.
 854                  */
 855                 dev->flags |= IFF_UP;
 856
 857                 /*
 858                  *      Initialize multicasting status
 859                  */
 860                 dev_mc_upload(dev);
 861
 862                 /*
 863                  *      Wakeup transmit queue engine
 864                  */
 865                 dev_activate(dev);
 866
 867                 /*
 868                  *      ... and announce new interface.
 869                  */
 870                 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 871         }
 872         return ret;
 873 }
 874
 875 /**
 876  *      dev_close - shutdown an interface.
 877  *      @dev: device to shutdown
 878  *
 879  *      This function moves an active device into down state. A
 880  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 881  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 882  *      chain.
 883  */
 884 int dev_close(struct net_device *dev)
 885 {
 886         if (!(dev->flags & IFF_UP))
 887                 return 0;
 888
 889         /*
 890          *      Tell people we are going down, so that they can
 891          *      prepare to death, when device is still operating.
 892          */
 893         notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 894
 895         dev_deactivate(dev);
 896
 897         clear_bit(__LINK_STATE_START, &dev->state);
 898
 899         /* Synchronize to scheduled poll. We cannot touch poll list,
 900          * it can be even on different cpu. So just clear netif_running(),
 901          * and wait when poll really will happen. Actually, the best place
 902          * for this is inside dev->stop() after device stopped its irq
 903          * engine, but this requires more changes in devices. */
 904
 905         smp_mb__after_clear_bit(); /* Commit netif_running(). */
 906         while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
 907                 /* No hurry. */
 908                 current->state = TASK_INTERRUPTIBLE;
 909                 schedule_timeout(1);
 910         }
 911
 912         /*
 913          *      Call the device specific close. This cannot fail.
 914          *      Only if device is UP
 915          *
 916          *      We allow it to be called even after a DETACH hot-plug
 917          *      event.
 918          */
 919         if (dev->stop)
 920                 dev->stop(dev);
 921
 922         /*
 923          *      Device is now down.
 924          */
 925
 926         dev->flags &= ~IFF_UP;
 927
 928         /*
 929          * Tell people we are down
 930          */
 931         notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 932
 933         return 0;
 934 }
 935
 936
 937 /*
 938  *      Device change register/unregister. These are not inline or static
 939  *      as we export them to the world.
 940  */
 941
 942 /**
 943  *      register_netdevice_notifier - register a network notifier block
 944  *      @nb: notifier
 945  *
 946  *      Register a notifier to be called when network device events occur.
 947  *      The notifier passed is linked into the kernel structures and must
 948  *      not be reused until it has been unregistered. A negative errno code
 949  *      is returned on a failure.
 950  *
 951  *      When registered all registration and up events are replayed
 952  *      to the new notifier to allow device to have a race free
 953  *      view of the network device list.
 954  */
 955
 956 int register_netdevice_notifier(struct notifier_block *nb)
 957 {
 958         struct net_device *dev;
 959         int err;
 960
 961         rtnl_lock();
 962         err = notifier_chain_register(&netdev_chain, nb);
 963         if (!err) {
 964                 for (dev = dev_base; dev; dev = dev->next) {
 965                         nb->notifier_call(nb, NETDEV_REGISTER, dev);
 966
 967                         if (dev->flags & IFF_UP)
 968                                 nb->notifier_call(nb, NETDEV_UP, dev);
 969                 }
 970         }
 971         rtnl_unlock();
 972         return err;
 973 }
 974
 975 /**
 976  *      unregister_netdevice_notifier - unregister a network notifier block
 977  *      @nb: notifier
 978  *
 979  *      Unregister a notifier previously registered by
 980  *      register_netdevice_notifier(). The notifier is unlinked into the
 981  *      kernel structures and may then be reused. A negative errno code
 982  *      is returned on a failure.
 983  */
 984
 985 int unregister_netdevice_notifier(struct notifier_block *nb)
 986 {
 987         return notifier_chain_unregister(&netdev_chain, nb);
 988 }
 989
 990 /**
 991  *      call_netdevice_notifiers - call all network notifier blocks
 992  *      @val: value passed unmodified to notifier function
 993  *      @v:   pointer passed unmodified to notifier function
 994  *
 995  *      Call all network notifier blocks.  Parameters and return value
 996  *      are as for notifier_call_chain().
 997  */
 998
 999 int call_netdevice_notifiers(unsigned long val, void *v)
1000 {
1001         return notifier_call_chain(&netdev_chain, val, v);
1002 }
1003
/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

/* Register one more consumer of skb time stamps (increments the counter). */
void net_enable_timestamp(void)
{
        atomic_inc(&netstamp_needed);
}

/* Drop one consumer of skb time stamps (decrements the counter). */
void net_disable_timestamp(void)
{
        atomic_dec(&netstamp_needed);
}
1016
1017 static inline void net_timestamp(struct timeval *stamp)
1018 {
1019         if (atomic_read(&netstamp_needed))
1020                 do_gettimeofday(stamp);
1021         else {
1022                 stamp->tv_sec = 0;
1023                 stamp->tv_usec = 0;
1024         }
1025 }
1026
1027 /*
1028  *      Support routine. Sends outgoing frames to any network
1029  *      taps currently in use.
1030  */
1031
1032 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1033 {
1034         struct packet_type *ptype;
1035         net_timestamp(&skb->stamp);
1036
1037         rcu_read_lock();
1038         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1039                 /* Never send packets back to the socket
1040                  * they originated from - MvS (miquels@drinkel.ow.org)
1041                  */
1042                 if ((ptype->dev == dev || !ptype->dev) &&
1043                     (ptype->af_packet_priv == NULL ||
1044                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1045                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1046                         if (!skb2)
1047                                 break;
1048
1049                         /* skb->nh should be correctly
1050                            set by sender, so that the second statement is
1051                            just protection against buggy protocols.
1052                          */
1053                         skb2->mac.raw = skb2->data;
1054
1055                         if (skb2->nh.raw < skb2->data ||
1056                             skb2->nh.raw > skb2->tail) {
1057                                 if (net_ratelimit())
1058                                         printk(KERN_CRIT "protocol %04x is "
1059                                                "buggy, dev %s\n",
1060                                                skb2->protocol, dev->name);
1061                                 skb2->nh.raw = skb2->data;
1062                         }
1063
1064                         skb2->h.raw = skb2->nh.raw;
1065                         skb2->pkt_type = PACKET_OUTGOING;
1066                         ptype->func(skb2, skb->dev, ptype);
1067                 }
1068         }
1069         rcu_read_unlock();
1070 }
1071
1072 /*
1073  * Invalidate hardware checksum when packet is to be mangled, and
1074  * complete checksum manually on outgoing path.
1075  */
1076 int skb_checksum_help(struct sk_buff *skb, int inward)
1077 {
1078         unsigned int csum;
1079         int ret = 0, offset = skb->h.raw - skb->data;
1080
1081         if (inward) {
1082                 skb->ip_summed = CHECKSUM_NONE;
1083                 goto out;
1084         }
1085
1086         if (skb_cloned(skb)) {
1087                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1088                 if (ret)
1089                         goto out;
1090         }
1091
1092         if (offset > (int)skb->len)
1093                 BUG();
1094         csum = skb_checksum(skb, offset, skb->len-offset, 0);
1095
1096         offset = skb->tail - skb->h.raw;
1097         if (offset <= 0)
1098                 BUG();
1099         if (skb->csum + 2 > offset)
1100                 BUG();
1101
1102         *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1103         skb->ip_summed = CHECKSUM_NONE;
1104 out:
1105         return ret;
1106 }
1107
#ifdef CONFIG_HIGHMEM
/* Actually, this check should go away as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when @dev lacks NETIF_F_HIGHDMA and at least one page
 * fragment of @skb lies at or above highmem_start_page, 0 otherwise.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
        int i = 0;

        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        while (i < skb_shinfo(skb)->nr_frags) {
                if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
                        return 1;
                i++;
        }

        return 0;
}
#else
#define illegal_highdma(dev, skb)       (0)
#endif
1130
1131 extern void skb_release_data(struct sk_buff *);
1132
1133 /* Keep head the same: replace data */
1134 int __skb_linearize(struct sk_buff *skb, int gfp_mask)
1135 {
1136         unsigned int size;
1137         u8 *data;
1138         long offset;
1139         struct skb_shared_info *ninfo;
1140         int headerlen = skb->data - skb->head;
1141         int expand = (skb->tail + skb->data_len) - skb->end;
1142
1143         if (skb_shared(skb))
1144                 BUG();
1145
1146         if (expand <= 0)
1147                 expand = 0;
1148
1149         size = skb->end - skb->head + expand;
1150         size = SKB_DATA_ALIGN(size);
1151         data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
1152         if (!data)
1153                 return -ENOMEM;
1154
1155         /* Copy entire thing */
1156         if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
1157                 BUG();
1158
1159         /* Set up shinfo */
1160         ninfo = (struct skb_shared_info*)(data + size);
1161         atomic_set(&ninfo->dataref, 1);
1162         ninfo->tso_size = skb_shinfo(skb)->tso_size;
1163         ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
1164         ninfo->nr_frags = 0;
1165         ninfo->frag_list = NULL;
1166
1167         /* Offset between the two in bytes */
1168         offset = data - skb->head;
1169
1170         /* Free old data. */
1171         skb_release_data(skb);
1172
1173         skb->head = data;
1174         skb->end  = data + size;
1175
1176         /* Set up new pointers */
1177         skb->h.raw   += offset;
1178         skb->nh.raw  += offset;
1179         skb->mac.raw += offset;
1180         skb->tail    += offset;
1181         skb->data    += offset;
1182
1183         /* We are no longer a clone, even if we were. */
1184         skb->cloned    = 0;
1185
1186         skb->tail     += skb->data_len;
1187         skb->data_len  = 0;
1188         return 0;
1189 }
1190
/*
 * Transmit lock helpers.  Unless the device advertises NETIF_F_LLTX,
 * HARD_TX_LOCK takes dev->xmit_lock and records @cpu as the lock owner;
 * HARD_TX_UNLOCK resets the owner to -1 and releases the lock.  For
 * NETIF_F_LLTX devices both macros do nothing.
 */
#define HARD_TX_LOCK(dev, cpu) {                        \
        if ((dev->features & NETIF_F_LLTX) == 0) {      \
                spin_lock(&dev->xmit_lock);             \
                dev->xmit_lock_owner = cpu;             \
        }                                               \
}

#define HARD_TX_UNLOCK(dev) {                           \
        if ((dev->features & NETIF_F_LLTX) == 0) {      \
                dev->xmit_lock_owner = -1;              \
                spin_unlock(&dev->xmit_lock);           \
        }                                               \
}
1204
1205 /**
1206  *      dev_queue_xmit - transmit a buffer
1207  *      @skb: buffer to transmit
1208  *
1209  *      Queue a buffer for transmission to a network device. The caller must
1210  *      have set the device and priority and built the buffer before calling
1211  *      this function. The function can be called from an interrupt.
1212  *
1213  *      A negative errno code is returned on a failure. A success does not
1214  *      guarantee the frame will be transmitted as it may be dropped due
1215  *      to congestion or traffic shaping.
1216  */
1217
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct Qdisc *q;
        /* Failures before the queueing stage are reported as -ENOMEM. */
        int rc = -ENOMEM;

        /* A chained (frag_list) skb is linearized if the device cannot
         * handle fragment lists.
         */
        if (skb_shinfo(skb)->frag_list &&
            !(dev->features & NETIF_F_FRAGLIST) &&
            __skb_linearize(skb, GFP_ATOMIC))
                goto out_kfree_skb;

        /* Fragmented skb is linearized if device does not support SG,
         * or if at least one of fragments is in highmem and device
         * does not support DMA from it.
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
            __skb_linearize(skb, GFP_ATOMIC))
                goto out_kfree_skb;

        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_HW &&
            (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
             (!(dev->features & NETIF_F_IP_CSUM) ||
              skb->protocol != htons(ETH_P_IP))))
                if (skb_checksum_help(skb, 0))
                        goto out_kfree_skb;

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        local_bh_disable();

        /* Updates of qdisc are serialized by queue_lock.
         * The struct Qdisc which is pointed to by qdisc is now a
         * rcu structure - it may be accessed without acquiring
         * a lock (but the structure may be stale.) The freeing of the
         * qdisc will be deferred until it's known that there are no
         * more references to it.
         *
         * If the qdisc has an enqueue function, we still need to
         * hold the queue_lock before calling it, since queue_lock
         * also serializes access to the device queue.
         */

        q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
        if (q->enqueue) {
                /* Grab device queue */
                spin_lock(&dev->queue_lock);

                rc = q->enqueue(skb, q);

                qdisc_run(dev);

                spin_unlock(&dev->queue_lock);
                /* A bypass verdict from the qdisc is reported as success. */
                rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that xmit_lock protection is necessary here.
           (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Owner equal to this cpu means we re-entered the same
                 * device's transmit path on this cpu.
                 */
                if (dev->xmit_lock_owner != cpu) {

                        HARD_TX_LOCK(dev, cpu);

                        if (!netif_queue_stopped(dev)) {
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);

                                rc = 0;
                                /* Zero from the driver: skb was consumed,
                                 * leave without freeing it.
                                 */
                                if (!dev->hard_start_xmit(skb, dev)) {
                                        HARD_TX_UNLOCK(dev);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately */
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /* Device down, queue stopped, driver refused or recursion:
         * the packet is dropped with -ENETDOWN.
         */
        rc = -ENETDOWN;
        local_bh_enable();

        /* Reached with BHs enabled: free the buffer and report rc. */
out_kfree_skb:
        kfree_skb(skb);
        return rc;
        /* Reached with BHs still disabled; the skb is not freed here. */
out:
        local_bh_enable();
        return rc;
}
1334
1335
1336 /*=======================================================================
1337                         Receiver routines
1338   =======================================================================*/
1339
/* Upper bound on the per-cpu input packet queue length checked by netif_rx() */
int netdev_max_backlog = 300;
int weight_p = 64;            /* old backlog weight */
/* These numbers are selected based on intuition and some
 * experimentation, if you have a more scientific way of doing this
 * please go ahead and fix things.
 *
 * get_sample_stats() compares the averaged backlog against no_cong,
 * lo_cong and mod_cong to pick the congestion level.
 * NOTE(review): no_cong_thresh is not referenced in this part of the
 * file -- confirm its users elsewhere.
 */
int no_cong_thresh = 10;
int no_cong = 20;
int lo_cong = 100;
int mod_cong = 290;

/* Per-cpu receive statistics (total, dropped, throttled counters) */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1352
1353
1354 static void get_sample_stats(int cpu)
1355 {
1356 #ifdef RAND_LIE
1357         unsigned long rd;
1358         int rq;
1359 #endif
1360         struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1361         int blog = sd->input_pkt_queue.qlen;
1362         int avg_blog = sd->avg_blog;
1363
1364         avg_blog = (avg_blog >> 1) + (blog >> 1);
1365
1366         if (avg_blog > mod_cong) {
1367                 /* Above moderate congestion levels. */
1368                 sd->cng_level = NET_RX_CN_HIGH;
1369 #ifdef RAND_LIE
1370                 rd = net_random();
1371                 rq = rd % netdev_max_backlog;
1372                 if (rq < avg_blog) /* unlucky bastard */
1373                         sd->cng_level = NET_RX_DROP;
1374 #endif
1375         } else if (avg_blog > lo_cong) {
1376                 sd->cng_level = NET_RX_CN_MOD;
1377 #ifdef RAND_LIE
1378                 rd = net_random();
1379                 rq = rd % netdev_max_backlog;
1380                         if (rq < avg_blog) /* unlucky bastard */
1381                                 sd->cng_level = NET_RX_CN_HIGH;
1382 #endif
1383         } else if (avg_blog > no_cong)
1384                 sd->cng_level = NET_RX_CN_LOW;
1385         else  /* no congestion */
1386                 sd->cng_level = NET_RX_SUCCESS;
1387
1388         sd->avg_blog = avg_blog;
1389 }
1390
#ifdef OFFLINE_SAMPLE
/*
 *      Timer handler used when backlog sampling is done off the receive
 *      path: refreshes the congestion statistics of the current cpu and
 *      re-arms samp_timer one tick ahead.  @dummy is unused.
 */
static void sample_queue(unsigned long dummy)
{
/* 10 ms or 1 ms -- i don't care -- JHS */
        int cpu = smp_processor_id();

        get_sample_stats(cpu);

        /*
         * The expiry must be computed in unsigned long: accumulating it
         * in an int truncates jiffies and yields a bogus expiry wherever
         * long is wider than int.
         */
        mod_timer(&samp_timer, jiffies + 1);
}
#endif
1403
1404
1405 /**
1406  *      netif_rx        -       post buffer to the network code
1407  *      @skb: buffer to post
1408  *
1409  *      This function receives a packet from a device driver and queues it for
1410  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1411  *      may be dropped during processing for congestion control or by the
1412  *      protocol layers.
1413  *
1414  *      return values:
1415  *      NET_RX_SUCCESS  (no congestion)
1416  *      NET_RX_CN_LOW   (low congestion)
1417  *      NET_RX_CN_MOD   (moderate congestion)
1418  *      NET_RX_CN_HIGH  (high congestion)
1419  *      NET_RX_DROP     (packet was dropped)
1420  *
1421  */
1422
1423 int netif_rx(struct sk_buff *skb)
1424 {
1425         int this_cpu;
1426         struct softnet_data *queue;
1427         unsigned long flags;
1428
1429 #ifdef CONFIG_NETPOLL
1430         if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
1431                 kfree_skb(skb);
1432                 return NET_RX_DROP;
1433         }
1434 #endif
1435
1436         if (!skb->stamp.tv_sec)
1437                 net_timestamp(&skb->stamp);
1438
1439         /*
1440          * The code is rearranged so that the path is the most
1441          * short when CPU is congested, but is still operating.
1442          */
1443         local_irq_save(flags);
1444         this_cpu = smp_processor_id();
1445         queue = &__get_cpu_var(softnet_data);
1446
1447         __get_cpu_var(netdev_rx_stat).total++;
1448         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1449                 if (queue->input_pkt_queue.qlen) {
1450                         if (queue->throttle)
1451                                 goto drop;
1452
1453 enqueue:
1454                         dev_hold(skb->dev);
1455                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1456 #ifndef OFFLINE_SAMPLE
1457                         get_sample_stats(this_cpu);
1458 #endif
1459                         local_irq_restore(flags);
1460                         return queue->cng_level;
1461                 }
1462
1463                 if (queue->throttle)
1464                         queue->throttle = 0;
1465
1466                 netif_rx_schedule(&queue->backlog_dev);
1467                 goto enqueue;
1468         }
1469
1470         if (!queue->throttle) {
1471                 queue->throttle = 1;
1472                 __get_cpu_var(netdev_rx_stat).throttled++;
1473         }
1474
1475 drop:
1476         __get_cpu_var(netdev_rx_stat).dropped++;
1477         local_irq_restore(flags);
1478
1479         kfree_skb(skb);
1480         return NET_RX_DROP;
1481 }
1482
/*
 *      Variant of netif_rx() that runs with preemption disabled and,
 *      once the packet has been posted, processes any softirq that is
 *      pending on the current cpu.  Returns the value of netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        preempt_disable();

        ret = netif_rx(skb);
        if (softirq_pending(smp_processor_id()))
                do_softirq();

        preempt_enable();

        return ret;
}

EXPORT_SYMBOL(netif_rx_ni);
1497
1498 static __inline__ void skb_bond(struct sk_buff *skb)
1499 {
1500         struct net_device *dev = skb->dev;
1501
1502         if (dev->master) {
1503                 skb->real_dev = skb->dev;
1504                 skb->dev = dev->master;
1505         }
1506 }
1507
1508 static void net_tx_action(struct softirq_action *h)
1509 {
1510         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1511
1512         if (sd->completion_queue) {
1513                 struct sk_buff *clist;
1514
1515                 local_irq_disable();
1516                 clist = sd->completion_queue;
1517                 sd->completion_queue = NULL;
1518                 local_irq_enable();
1519
1520                 while (clist) {
1521                         struct sk_buff *skb = clist;
1522                         clist = clist->next;
1523
1524                         BUG_TRAP(!atomic_read(&skb->users));
1525                         __kfree_skb(skb);
1526                 }
1527         }
1528
1529         if (sd->output_queue) {
1530                 struct net_device *head;
1531
1532                 local_irq_disable();
1533                 head = sd->output_queue;
1534                 sd->output_queue = NULL;
1535                 local_irq_enable();
1536
1537                 while (head) {
1538                         struct net_device *dev = head;
1539                         head = head->next_sched;
1540
1541                         smp_mb__before_clear_bit();
1542                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1543
1544                         if (spin_trylock(&dev->queue_lock)) {
1545                                 qdisc_run(dev);
1546                                 spin_unlock(&dev->queue_lock);
1547                         } else {
1548                                 netif_schedule(dev);
1549                         }
1550                 }
1551         }
1552 }
1553
1554 static __inline__ int deliver_skb(struct sk_buff *skb,
1555                                   struct packet_type *pt_prev)
1556 {
1557         atomic_inc(&skb->users);
1558         return pt_prev->func(skb, skb->dev, pt_prev);
1559 }
1560
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);

/*
 *      Pass the frame to the bridge hook when the receiving device is a
 *      bridge port and the frame is not a loopback one.  A handler still
 *      pending in *@pt_prev is served first.  Returns 0 when the frame is
 *      not bridged, otherwise the result of the bridge hook.
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
                                    struct packet_type **pt_prev, int *ret)
{
        struct net_bridge_port *port;

        if ((*pskb)->pkt_type == PACKET_LOOPBACK)
                return 0;

        port = rcu_dereference((*pskb)->dev->br_port);
        if (port == NULL)
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(*pskb, *pt_prev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, pskb);
}
#else
#define handle_bridge(skb, pt_prev, ret)        (0)
#endif
1583
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe sch_ingress should simply be forced to be compiled in
 * whenever CONFIG_NET_CLS_ACT is set?  As it stands, without it we pay
 * for some useless instructions (a compare and 2 extra stores).
 * NOTE: No functionality is lost; without the ingress scheduler you
 * just cannot add policies on ingress.
 *
 * Runs @skb through the ingress qdisc of its device, if there is one,
 * and returns the TC_ACT_* verdict (TC_ACT_OK when no ingress qdisc is
 * attached, TC_ACT_SHOT when a redirect loop is detected).
 */
int ing_filter(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        int result = TC_ACT_OK;
        struct Qdisc *q;
        __u32 ttl;

        if (!dev->qdisc_ingress)
                return result;

        /* Redirect loop protection based on the hop count in tc_verd. */
        ttl = (__u32) G_TC_RTTL(skb->tc_verd);
        if (ttl > MAX_RED_LOOP) {
                printk("Redir loop detected Dropping packet (%s->%s)\n",
                        skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
                return TC_ACT_SHOT;
        }
        ttl++;

        skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);

        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
        if (NULL == skb->input_dev) {
                skb->input_dev = skb->dev;
                printk("ing_filter:  fixed  %s out %s\n",skb->input_dev->name,skb->dev->name);
        }

        /* Re-read the qdisc pointer under the lock before using it. */
        spin_lock(&dev->ingress_lock);
        q = dev->qdisc_ingress;
        if (q != NULL)
                result = q->enqueue(skb, q);
        spin_unlock(&dev->ingress_lock);

        return result;
}
#endif
1624
/*
 *      Deliver a received buffer to the protocol handlers: first to every
 *      matching tap on ptype_all, then (after the optional ingress filter,
 *      diverter and bridge hooks) to the handlers registered for the
 *      buffer's protocol.  Returns the result of the last handler called,
 *      or NET_RX_DROP when nobody took the packet.
 */
int netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        int ret = NET_RX_DROP;
        unsigned short type;

#ifdef CONFIG_NETPOLL
        if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
                kfree_skb(skb);
                return NET_RX_DROP;
        }
#endif

        if (!skb->stamp.tv_sec)
                net_timestamp(&skb->stamp);

        skb_bond(skb);

        __get_cpu_var(netdev_rx_stat).total++;

        skb->h.raw = skb->nh.raw = skb->data;
        skb->mac_len = skb->nh.raw - skb->mac.raw;

        /*
         * pt_prev holds the handler found last; it is only called once
         * the next match (or the end of the search) is known, so that the
         * final handler can be called without an extra reference.
         */
        pt_prev = NULL;

        rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
        /* Flagged buffers skip the taps and the ingress filter once. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* Taps: handlers bound to this device or to no device. */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        if (pt_prev) {
                ret = deliver_skb(skb, pt_prev);
                pt_prev = NULL; /* noone else should process this after*/
        } else {
                skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
        }

        ret = ing_filter(skb);

        /* The filter's verdict is returned as-is when the packet ends here. */
        if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
                kfree_skb(skb);
                goto out;
        }

        skb->tc_verd = 0;
ncls:
#endif

        handle_diverter(skb);

        /* A non-zero result means the bridge code took the frame. */
        if (handle_bridge(&skb, &pt_prev, &ret))
                goto out;

        /* Protocol handlers, hashed on the low 4 bits of the protocol. */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
                if (ptype->type == type &&
                    (!ptype->dev || ptype->dev == skb->dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* Last handler: called without taking an extra reference. */
                ret = pt_prev->func(skb, skb->dev, pt_prev);
        } else {
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
1715
1716 static int process_backlog(struct net_device *backlog_dev, int *budget)
1717 {
1718         int work = 0;
1719         int quota = min(backlog_dev->quota, *budget);
1720         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1721         unsigned long start_time = jiffies;
1722
1723         for (;;) {
1724                 struct sk_buff *skb;
1725                 struct net_device *dev;
1726
1727                 local_irq_disable();
1728                 skb = __skb_dequeue(&queue->input_pkt_queue);
1729                 if (!skb)
1730                         goto job_done;
1731                 local_irq_enable();
1732
1733                 dev = skb->dev;
1734
1735                 netif_receive_skb(skb);
1736
1737                 dev_put(dev);
1738
1739                 work++;
1740
1741                 if (work >= quota || jiffies - start_time > 1)
1742                         break;
1743
1744         }
1745
1746         backlog_dev->quota -= work;
1747         *budget -= work;
1748         return -1;
1749
1750 job_done:
1751         backlog_dev->quota -= work;
1752         *budget -= work;
1753
1754         list_del(&backlog_dev->poll_list);
1755         smp_mb__before_clear_bit();
1756         netif_poll_enable(backlog_dev);
1757
1758         if (queue->throttle)
1759                 queue->throttle = 0;
1760         local_irq_enable();
1761         return 0;
1762 }
1763
1764 static void net_rx_action(struct softirq_action *h)
1765 {
1766         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1767         unsigned long start_time = jiffies;
1768         int budget = netdev_max_backlog;
1769
1770
1771         local_irq_disable();
1772
1773         while (!list_empty(&queue->poll_list)) {
1774                 struct net_device *dev;
1775
1776                 if (budget <= 0 || jiffies - start_time > 1)
1777                         goto softnet_break;
1778
1779                 local_irq_enable();
1780
1781                 dev = list_entry(queue->poll_list.next,
1782                                  struct net_device, poll_list);
1783
1784                 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1785                         local_irq_disable();
1786                         list_del(&dev->poll_list);
1787                         list_add_tail(&dev->poll_list, &queue->poll_list);
1788                         if (dev->quota < 0)
1789                                 dev->quota += dev->weight;
1790                         else
1791                                 dev->quota = dev->weight;
1792                 } else {
1793                         dev_put(dev);
1794                         local_irq_disable();
1795                 }
1796         }
1797 out:
1798         local_irq_enable();
1799         return;
1800
1801 softnet_break:
1802         __get_cpu_var(netdev_rx_stat).time_squeeze++;
1803         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1804         goto out;
1805 }
1806
1807 static gifconf_func_t * gifconf_list [NPROTO];
1808
1809 /**
1810  *      register_gifconf        -       register a SIOCGIF handler
1811  *      @family: Address family
1812  *      @gifconf: Function handler
1813  *
1814  *      Register protocol dependent address dumping routines. The handler
1815  *      that is passed must not be freed or reused until it has been replaced
1816  *      by another handler.
1817  */
1818 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1819 {
1820         if (family >= NPROTO)
1821                 return -EINVAL;
1822         gifconf_list[family] = gifconf;
1823         return 0;
1824 }
1825
1826
1827 /*
1828  *      Map an interface index to its name (SIOCGIFNAME)
1829  */
1830
1831 /*
1832  *      We need this ioctl for efficient implementation of the
1833  *      if_indextoname() function required by the IPv6 API.  Without
1834  *      it, we would have to search all the interfaces to find a
1835  *      match.  --pb
1836  */
1837
1838 static int dev_ifname(struct ifreq __user *arg)
1839 {
1840         struct net_device *dev;
1841         struct ifreq ifr;
1842
1843         /*
1844          *      Fetch the caller's info block.
1845          */
1846
1847         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1848                 return -EFAULT;
1849
1850         read_lock(&dev_base_lock);
1851         dev = __dev_get_by_index(ifr.ifr_ifindex);
1852         if (!dev) {
1853                 read_unlock(&dev_base_lock);
1854                 return -ENODEV;
1855         }
1856
1857         strcpy(ifr.ifr_name, dev->name);
1858         read_unlock(&dev_base_lock);
1859
1860         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1861                 return -EFAULT;
1862         return 0;
1863 }
1864
1865 /*
1866  *      Perform a SIOCGIFCONF call. This structure will change
1867  *      size eventually, and there is nothing I can do about it.
1868  *      Thus we will need a 'compatibility mode'.
1869  */
1870
1871 static int dev_ifconf(char __user *arg)
1872 {
1873         struct ifconf ifc;
1874         struct net_device *dev;
1875         char __user *pos;
1876         int len;
1877         int total;
1878         int i;
1879
1880         /*
1881          *      Fetch the caller's info block.
1882          */
1883
1884         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
1885                 return -EFAULT;
1886
1887         pos = ifc.ifc_buf;
1888         len = ifc.ifc_len;
1889
1890         /*
1891          *      Loop over the interfaces, and write an info block for each.
1892          */
1893
1894         total = 0;
1895         for (dev = dev_base; dev; dev = dev->next) {
1896                 for (i = 0; i < NPROTO; i++) {
1897                         if (gifconf_list[i]) {
1898                                 int done;
1899                                 if (!pos)
1900                                         done = gifconf_list[i](dev, NULL, 0);
1901                                 else
1902                                         done = gifconf_list[i](dev, pos + total,
1903                                                                len - total);
1904                                 if (done < 0)
1905                                         return -EFAULT;
1906                                 total += done;
1907                         }
1908                 }
1909         }
1910
1911         /*
1912          *      All done.  Write the updated control block back to the caller.
1913          */
1914         ifc.ifc_len = total;
1915
1916         /*
1917          *      Both BSD and Solaris return 0 here, so we do too.
1918          */
1919         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
1920 }
1921
1922 #ifdef CONFIG_PROC_FS
1923 /*
1924  *      This is invoked by the /proc filesystem handler to display a device
1925  *      in detail.
1926  */
1927 static __inline__ struct net_device *dev_get_idx(loff_t pos)
1928 {
1929         struct net_device *dev;
1930         loff_t i;
1931
1932         for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
1933
1934         return i == pos ? dev : NULL;
1935 }
1936
1937 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
1938 {
1939         read_lock(&dev_base_lock);
1940         return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
1941 }
1942
1943 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1944 {
1945         ++*pos;
1946         return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
1947 }
1948
1949 void dev_seq_stop(struct seq_file *seq, void *v)
1950 {
1951         read_unlock(&dev_base_lock);
1952 }
1953
1954 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
1955 {
1956         if (dev->get_stats) {
1957                 struct net_device_stats *stats = dev->get_stats(dev);
1958
1959                 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
1960                                 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
1961                            dev->name, stats->rx_bytes, stats->rx_packets,
1962                            stats->rx_errors,
1963                            stats->rx_dropped + stats->rx_missed_errors,
1964                            stats->rx_fifo_errors,
1965                            stats->rx_length_errors + stats->rx_over_errors +
1966                              stats->rx_crc_errors + stats->rx_frame_errors,
1967                            stats->rx_compressed, stats->multicast,
1968                            stats->tx_bytes, stats->tx_packets,
1969                            stats->tx_errors, stats->tx_dropped,
1970                            stats->tx_fifo_errors, stats->collisions,
1971                            stats->tx_carrier_errors +
1972                              stats->tx_aborted_errors +
1973                              stats->tx_window_errors +
1974                              stats->tx_heartbeat_errors,
1975                            stats->tx_compressed);
1976         } else
1977                 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
1978 }
1979
1980 /*
1981  *      Called from the PROCfs module. This now uses the new arbitrary sized
1982  *      /proc/net interface to create /proc/net/dev
1983  */
1984 static int dev_seq_show(struct seq_file *seq, void *v)
1985 {
1986         if (v == SEQ_START_TOKEN)
1987                 seq_puts(seq, "Inter-|   Receive                            "
1988                               "                    |  Transmit\n"
1989                               " face |bytes    packets errs drop fifo frame "
1990                               "compressed multicast|bytes    packets errs "
1991                               "drop fifo colls carrier compressed\n");
1992         else
1993                 dev_seq_printf_stats(seq, v);
1994         return 0;
1995 }
1996
1997 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
1998 {
1999         struct netif_rx_stats *rc = NULL;
2000
2001         while (*pos < NR_CPUS)
2002                 if (cpu_online(*pos)) {
2003                         rc = &per_cpu(netdev_rx_stat, *pos);
2004                         break;
2005                 } else
2006                         ++*pos;
2007         return rc;
2008 }
2009
2010 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2011 {
2012         return softnet_get_online(pos);
2013 }
2014
2015 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2016 {
2017         ++*pos;
2018         return softnet_get_online(pos);
2019 }
2020
2021 static void softnet_seq_stop(struct seq_file *seq, void *v)
2022 {
2023 }
2024
2025 static int softnet_seq_show(struct seq_file *seq, void *v)
2026 {
2027         struct netif_rx_stats *s = v;
2028
2029         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2030                    s->total, s->dropped, s->time_squeeze, s->throttled,
2031                    s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
2032                    s->fastroute_deferred_out,
2033 #if 0
2034                    s->fastroute_latency_reduction
2035 #else
2036                    s->cpu_collision
2037 #endif
2038                   );
2039         return 0;
2040 }
2041
2042 static struct seq_operations dev_seq_ops = {
2043         .start = dev_seq_start,
2044         .next  = dev_seq_next,
2045         .stop  = dev_seq_stop,
2046         .show  = dev_seq_show,
2047 };
2048
2049 static int dev_seq_open(struct inode *inode, struct file *file)
2050 {
2051         return seq_open(file, &dev_seq_ops);
2052 }
2053
2054 static struct file_operations dev_seq_fops = {
2055         .owner   = THIS_MODULE,
2056         .open    = dev_seq_open,
2057         .read    = seq_read,
2058         .llseek  = seq_lseek,
2059         .release = seq_release,
2060 };
2061
2062 static struct seq_operations softnet_seq_ops = {
2063         .start = softnet_seq_start,
2064         .next  = softnet_seq_next,
2065         .stop  = softnet_seq_stop,
2066         .show  = softnet_seq_show,
2067 };
2068
2069 static int softnet_seq_open(struct inode *inode, struct file *file)
2070 {
2071         return seq_open(file, &softnet_seq_ops);
2072 }
2073
2074 static struct file_operations softnet_seq_fops = {
2075         .owner   = THIS_MODULE,
2076         .open    = softnet_seq_open,
2077         .read    = seq_read,
2078         .llseek  = seq_lseek,
2079         .release = seq_release,
2080 };
2081
2082 #ifdef WIRELESS_EXT
2083 extern int wireless_proc_init(void);
2084 #else
2085 #define wireless_proc_init() 0
2086 #endif
2087
2088 static int __init dev_proc_init(void)
2089 {
2090         int rc = -ENOMEM;
2091
2092         if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2093                 goto out;
2094         if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2095                 goto out_dev;
2096         if (wireless_proc_init())
2097                 goto out_softnet;
2098         rc = 0;
2099 out:
2100         return rc;
2101 out_softnet:
2102         proc_net_remove("softnet_stat");
2103 out_dev:
2104         proc_net_remove("dev");
2105         goto out;
2106 }
2107 #else
2108 #define dev_proc_init() 0
2109 #endif  /* CONFIG_PROC_FS */
2110
2111
2112 /**
2113  *      netdev_set_master       -       set up master/slave pair
2114  *      @slave: slave device
2115  *      @master: new master device
2116  *
2117  *      Changes the master device of the slave. Pass %NULL to break the
2118  *      bonding. The caller must hold the RTNL semaphore. On a failure
2119  *      a negative errno code is returned. On success the reference counts
2120  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2121  *      function returns zero.
2122  */
2123 int netdev_set_master(struct net_device *slave, struct net_device *master)
2124 {
2125         struct net_device *old = slave->master;
2126
2127         ASSERT_RTNL();
2128
2129         if (master) {
2130                 if (old)
2131                         return -EBUSY;
2132                 dev_hold(master);
2133         }
2134
2135         slave->master = master;
2136
2137         synchronize_net();
2138
2139         if (old)
2140                 dev_put(old);
2141
2142         if (master)
2143                 slave->flags |= IFF_SLAVE;
2144         else
2145                 slave->flags &= ~IFF_SLAVE;
2146
2147         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2148         return 0;
2149 }
2150
2151 /**
2152  *      dev_set_promiscuity     - update promiscuity count on a device
2153  *      @dev: device
2154  *      @inc: modifier
2155  *
2156  *      Add or remove promsicuity from a device. While the count in the device
2157  *      remains above zero the interface remains promiscuous. Once it hits zero
2158  *      the device reverts back to normal filtering operation. A negative inc
2159  *      value is used to drop promiscuity on the device.
2160  */
2161 void dev_set_promiscuity(struct net_device *dev, int inc)
2162 {
2163         unsigned short old_flags = dev->flags;
2164
2165         dev->flags |= IFF_PROMISC;
2166         if ((dev->promiscuity += inc) == 0)
2167                 dev->flags &= ~IFF_PROMISC;
2168         if (dev->flags ^ old_flags) {
2169                 dev_mc_upload(dev);
2170                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2171                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2172                                                                "left");
2173         }
2174 }
2175
2176 /**
2177  *      dev_set_allmulti        - update allmulti count on a device
2178  *      @dev: device
2179  *      @inc: modifier
2180  *
2181  *      Add or remove reception of all multicast frames to a device. While the
2182  *      count in the device remains above zero the interface remains listening
2183  *      to all interfaces. Once it hits zero the device reverts back to normal
2184  *      filtering operation. A negative @inc value is used to drop the counter
2185  *      when releasing a resource needing all multicasts.
2186  */
2187
2188 void dev_set_allmulti(struct net_device *dev, int inc)
2189 {
2190         unsigned short old_flags = dev->flags;
2191
2192         dev->flags |= IFF_ALLMULTI;
2193         if ((dev->allmulti += inc) == 0)
2194                 dev->flags &= ~IFF_ALLMULTI;
2195         if (dev->flags ^ old_flags)
2196                 dev_mc_upload(dev);
2197 }
2198
2199 unsigned dev_get_flags(const struct net_device *dev)
2200 {
2201         unsigned flags;
2202
2203         flags = (dev->flags & ~(IFF_PROMISC |
2204                                 IFF_ALLMULTI |
2205                                 IFF_RUNNING)) |
2206                 (dev->gflags & (IFF_PROMISC |
2207                                 IFF_ALLMULTI));
2208
2209         if (netif_running(dev) && netif_carrier_ok(dev))
2210                 flags |= IFF_RUNNING;
2211
2212         return flags;
2213 }
2214
2215 int dev_change_flags(struct net_device *dev, unsigned flags)
2216 {
2217         int ret;
2218         int old_flags = dev->flags;
2219
2220         /*
2221          *      Set the flags on our device.
2222          */
2223
2224         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2225                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2226                                IFF_AUTOMEDIA)) |
2227                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2228                                     IFF_ALLMULTI));
2229
2230         /*
2231          *      Load in the correct multicast list now the flags have changed.
2232          */
2233
2234         dev_mc_upload(dev);
2235
2236         /*
2237          *      Have we downed the interface. We handle IFF_UP ourselves
2238          *      according to user attempts to set it, rather than blindly
2239          *      setting it.
2240          */
2241
2242         ret = 0;
2243         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
2244                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2245
2246                 if (!ret)
2247                         dev_mc_upload(dev);
2248         }
2249
2250         if (dev->flags & IFF_UP &&
2251             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2252                                           IFF_VOLATILE)))
2253                 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
2254
2255         if ((flags ^ dev->gflags) & IFF_PROMISC) {
2256                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2257                 dev->gflags ^= IFF_PROMISC;
2258                 dev_set_promiscuity(dev, inc);
2259         }
2260
2261         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2262            is important. Some (broken) drivers set IFF_PROMISC, when
2263            IFF_ALLMULTI is requested not asking us and not reporting.
2264          */
2265         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2266                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2267                 dev->gflags ^= IFF_ALLMULTI;
2268                 dev_set_allmulti(dev, inc);
2269         }
2270
2271         if (old_flags ^ dev->flags)
2272                 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2273
2274         return ret;
2275 }
2276
2277 int dev_set_mtu(struct net_device *dev, int new_mtu)
2278 {
2279         int err;
2280
2281         if (new_mtu == dev->mtu)
2282                 return 0;
2283
2284         /*      MTU must be positive.    */
2285         if (new_mtu < 0)
2286                 return -EINVAL;
2287
2288         if (!netif_device_present(dev))
2289                 return -ENODEV;
2290
2291         err = 0;
2292         if (dev->change_mtu)
2293                 err = dev->change_mtu(dev, new_mtu);
2294         else
2295                 dev->mtu = new_mtu;
2296         if (!err && dev->flags & IFF_UP)
2297                 notifier_call_chain(&netdev_chain,
2298                                     NETDEV_CHANGEMTU, dev);
2299         return err;
2300 }
2301
2302
2303 /*
2304  *      Perform the SIOCxIFxxx calls.
2305  */
2306 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2307 {
2308         int err;
2309         struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2310
2311         if (!dev)
2312                 return -ENODEV;
2313
2314         switch (cmd) {
2315                 case SIOCGIFFLAGS:      /* Get interface flags */
2316                         ifr->ifr_flags = dev_get_flags(dev);
2317                         return 0;
2318
2319                 case SIOCSIFFLAGS:      /* Set interface flags */
2320                         return dev_change_flags(dev, ifr->ifr_flags);
2321
2322                 case SIOCGIFMETRIC:     /* Get the metric on the interface
2323                                            (currently unused) */
2324                         ifr->ifr_metric = 0;
2325                         return 0;
2326
2327                 case SIOCSIFMETRIC:     /* Set the metric on the interface
2328                                            (currently unused) */
2329                         return -EOPNOTSUPP;
2330
2331                 case SIOCGIFMTU:        /* Get the MTU of a device */
2332                         ifr->ifr_mtu = dev->mtu;
2333                         return 0;
2334
2335                 case SIOCSIFMTU:        /* Set the MTU of a device */
2336                         return dev_set_mtu(dev, ifr->ifr_mtu);
2337
2338                 case SIOCGIFHWADDR:
2339                         if (!dev->addr_len)
2340                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2341                         else
2342                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2343                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2344                         ifr->ifr_hwaddr.sa_family = dev->type;
2345                         return 0;
2346
2347                 case SIOCSIFHWADDR:
2348                         if (!dev->set_mac_address)
2349                                 return -EOPNOTSUPP;
2350                         if (ifr->ifr_hwaddr.sa_family != dev->type)
2351                                 return -EINVAL;
2352                         if (!netif_device_present(dev))
2353                                 return -ENODEV;
2354                         err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
2355                         if (!err)
2356                                 notifier_call_chain(&netdev_chain,
2357                                                     NETDEV_CHANGEADDR, dev);
2358                         return err;
2359
2360                 case SIOCSIFHWBROADCAST:
2361                         if (ifr->ifr_hwaddr.sa_family != dev->type)
2362                                 return -EINVAL;
2363                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2364                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2365                         notifier_call_chain(&netdev_chain,
2366                                             NETDEV_CHANGEADDR, dev);
2367                         return 0;
2368
2369                 case SIOCGIFMAP:
2370                         ifr->ifr_map.mem_start = dev->mem_start;
2371                         ifr->ifr_map.mem_end   = dev->mem_end;
2372                         ifr->ifr_map.base_addr = dev->base_addr;
2373                         ifr->ifr_map.irq       = dev->irq;
2374                         ifr->ifr_map.dma       = dev->dma;
2375                         ifr->ifr_map.port      = dev->if_port;
2376                         return 0;
2377
2378                 case SIOCSIFMAP:
2379                         if (dev->set_config) {
2380                                 if (!netif_device_present(dev))
2381                                         return -ENODEV;
2382                                 return dev->set_config(dev, &ifr->ifr_map);
2383                         }
2384                         return -EOPNOTSUPP;
2385
2386                 case SIOCADDMULTI:
2387                         if (!dev->set_multicast_list ||
2388                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2389                                 return -EINVAL;
2390                         if (!netif_device_present(dev))
2391                                 return -ENODEV;
2392                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2393                                           dev->addr_len, 1);
2394
2395                 case SIOCDELMULTI:
2396                         if (!dev->set_multicast_list ||
2397                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2398                                 return -EINVAL;
2399                         if (!netif_device_present(dev))
2400                                 return -ENODEV;
2401                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2402                                              dev->addr_len, 1);
2403
2404                 case SIOCGIFINDEX:
2405                         ifr->ifr_ifindex = dev->ifindex;
2406                         return 0;
2407
2408                 case SIOCGIFTXQLEN:
2409                         ifr->ifr_qlen = dev->tx_queue_len;
2410                         return 0;
2411
2412                 case SIOCSIFTXQLEN:
2413                         if (ifr->ifr_qlen < 0)
2414                                 return -EINVAL;
2415                         dev->tx_queue_len = ifr->ifr_qlen;
2416                         return 0;
2417
2418                 case SIOCSIFNAME:
2419                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2420                         return dev_change_name(dev, ifr->ifr_newname);
2421
2422                 /*
2423                  *      Unknown or private ioctl
2424                  */
2425
2426                 default:
2427                         if ((cmd >= SIOCDEVPRIVATE &&
2428                             cmd <= SIOCDEVPRIVATE + 15) ||
2429                             cmd == SIOCBONDENSLAVE ||
2430                             cmd == SIOCBONDRELEASE ||
2431                             cmd == SIOCBONDSETHWADDR ||
2432                             cmd == SIOCBONDSLAVEINFOQUERY ||
2433                             cmd == SIOCBONDINFOQUERY ||
2434                             cmd == SIOCBONDCHANGEACTIVE ||
2435                             cmd == SIOCGMIIPHY ||
2436                             cmd == SIOCGMIIREG ||
2437                             cmd == SIOCSMIIREG ||
2438                             cmd == SIOCBRADDIF ||
2439                             cmd == SIOCBRDELIF ||
2440                             cmd == SIOCWANDEV) {
2441                                 err = -EOPNOTSUPP;
2442                                 if (dev->do_ioctl) {
2443                                         if (netif_device_present(dev))
2444                                                 err = dev->do_ioctl(dev, ifr,
2445                                                                     cmd);
2446                                         else
2447                                                 err = -ENODEV;
2448                                 }
2449                         } else
2450                                 err = -EINVAL;
2451
2452         }
2453         return err;
2454 }
2455
2456 /*
2457  *      This function handles all "interface"-type I/O control requests. The actual
2458  *      'doing' part of this is dev_ifsioc above.
2459  */
2460
2461 /**
2462  *      dev_ioctl       -       network device ioctl
2463  *      @cmd: command to issue
2464  *      @arg: pointer to a struct ifreq in user space
2465  *
2466  *      Issue ioctl functions to devices. This is normally called by the
2467  *      user space syscall interfaces but can sometimes be useful for
2468  *      other purposes. The return value is the return from the syscall if
2469  *      positive or a negative errno code on error.
2470  */
2471
2472 int dev_ioctl(unsigned int cmd, void __user *arg)
2473 {
2474         struct ifreq ifr;
2475         int ret;
2476         char *colon;
2477
2478         /* One special case: SIOCGIFCONF takes ifconf argument
2479            and requires shared lock, because it sleeps writing
2480            to user space.
2481          */
2482
2483         if (cmd == SIOCGIFCONF) {
2484                 rtnl_shlock();
2485                 ret = dev_ifconf((char __user *) arg);
2486                 rtnl_shunlock();
2487                 return ret;
2488         }
2489         if (cmd == SIOCGIFNAME)
2490                 return dev_ifname((struct ifreq __user *)arg);
2491
2492         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2493                 return -EFAULT;
2494
2495         ifr.ifr_name[IFNAMSIZ-1] = 0;
2496
2497         colon = strchr(ifr.ifr_name, ':');
2498         if (colon)
2499                 *colon = 0;
2500
2501         /*
2502          *      See which interface the caller is talking about.
2503          */
2504
2505         switch (cmd) {
2506                 /*
2507                  *      These ioctl calls:
2508                  *      - can be done by all.
2509                  *      - atomic and do not require locking.
2510                  *      - return a value
2511                  */
2512                 case SIOCGIFFLAGS:
2513                 case SIOCGIFMETRIC:
2514                 case SIOCGIFMTU:
2515                 case SIOCGIFHWADDR:
2516                 case SIOCGIFSLAVE:
2517                 case SIOCGIFMAP:
2518                 case SIOCGIFINDEX:
2519                 case SIOCGIFTXQLEN:
2520                         dev_load(ifr.ifr_name);
2521                         read_lock(&dev_base_lock);
2522                         ret = dev_ifsioc(&ifr, cmd);
2523                         read_unlock(&dev_base_lock);
2524                         if (!ret) {
2525                                 if (colon)
2526                                         *colon = ':';
2527                                 if (copy_to_user(arg, &ifr,
2528                                                  sizeof(struct ifreq)))
2529                                         ret = -EFAULT;
2530                         }
2531                         return ret;
2532
2533                 case SIOCETHTOOL:
2534                         dev_load(ifr.ifr_name);
2535                         rtnl_lock();
2536                         ret = dev_ethtool(&ifr);
2537                         rtnl_unlock();
2538                         if (!ret) {
2539                                 if (colon)
2540                                         *colon = ':';
2541                                 if (copy_to_user(arg, &ifr,
2542                                                  sizeof(struct ifreq)))
2543                                         ret = -EFAULT;
2544                         }
2545                         return ret;
2546
2547                 /*
2548                  *      These ioctl calls:
2549                  *      - require superuser power.
2550                  *      - require strict serialization.
2551                  *      - return a value
2552                  */
2553                 case SIOCGMIIPHY:
2554                 case SIOCGMIIREG:
2555                 case SIOCSIFNAME:
2556                         if (!capable(CAP_NET_ADMIN))
2557                                 return -EPERM;
2558                         dev_load(ifr.ifr_name);
2559                         rtnl_lock();
2560                         ret = dev_ifsioc(&ifr, cmd);
2561                         rtnl_unlock();
2562                         if (!ret) {
2563                                 if (colon)
2564                                         *colon = ':';
2565                                 if (copy_to_user(arg, &ifr,
2566                                                  sizeof(struct ifreq)))
2567                                         ret = -EFAULT;
2568                         }
2569                         return ret;
2570
2571                 /*
2572                  *      These ioctl calls:
2573                  *      - require superuser power.
2574                  *      - require strict serialization.
2575                  *      - do not return a value
2576                  */
2577                 case SIOCSIFFLAGS:
2578                 case SIOCSIFMETRIC:
2579                 case SIOCSIFMTU:
2580                 case SIOCSIFMAP:
2581                 case SIOCSIFHWADDR:
2582                 case SIOCSIFSLAVE:
2583                 case SIOCADDMULTI:
2584                 case SIOCDELMULTI:
2585                 case SIOCSIFHWBROADCAST:
2586                 case SIOCSIFTXQLEN:
2587                 case SIOCSMIIREG:
2588                 case SIOCBONDENSLAVE:
2589                 case SIOCBONDRELEASE:
2590                 case SIOCBONDSETHWADDR:
2591                 case SIOCBONDSLAVEINFOQUERY:
2592                 case SIOCBONDINFOQUERY:
2593                 case SIOCBONDCHANGEACTIVE:
2594                 case SIOCBRADDIF:
2595                 case SIOCBRDELIF:
2596                         if (!capable(CAP_NET_ADMIN))
2597                                 return -EPERM;
2598                         dev_load(ifr.ifr_name);
2599                         rtnl_lock();
2600                         ret = dev_ifsioc(&ifr, cmd);
2601                         rtnl_unlock();
2602                         return ret;
2603
2604                 case SIOCGIFMEM:
2605                         /* Get the per device memory space. We can add this but
2606                          * currently do not support it */
2607                 case SIOCSIFMEM:
2608                         /* Set the per device memory buffer space.
2609                          * Not applicable in our case */
2610                 case SIOCSIFLINK:
2611                         return -EINVAL;
2612
2613                 /*
2614                  *      Unknown or private ioctl.
2615                  */
2616                 default:
2617                         if (cmd == SIOCWANDEV ||
2618                             (cmd >= SIOCDEVPRIVATE &&
2619                              cmd <= SIOCDEVPRIVATE + 15)) {
2620                                 dev_load(ifr.ifr_name);
2621                                 rtnl_lock();
2622                                 ret = dev_ifsioc(&ifr, cmd);
2623                                 rtnl_unlock();
2624                                 if (!ret && copy_to_user(arg, &ifr,
2625                                                          sizeof(struct ifreq)))
2626                                         ret = -EFAULT;
2627                                 return ret;
2628                         }
2629 #ifdef WIRELESS_EXT
2630                         /* Take care of Wireless Extensions */
2631                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2632                                 /* If command is `set a parameter', or
2633                                  * `get the encoding parameters', check if
2634                                  * the user has the right to do it */
2635                                 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
2636                                         if (!capable(CAP_NET_ADMIN))
2637                                                 return -EPERM;
2638                                 }
2639                                 dev_load(ifr.ifr_name);
2640                                 rtnl_lock();
2641                                 /* Follow me in net/core/wireless.c */
2642                                 ret = wireless_process_ioctl(&ifr, cmd);
2643                                 rtnl_unlock();
2644                                 if (IW_IS_GET(cmd) &&
2645                                     copy_to_user(arg, &ifr,
2646                                                  sizeof(struct ifreq)))
2647                                         ret = -EFAULT;
2648                                 return ret;
2649                         }
2650 #endif  /* WIRELESS_EXT */
2651                         return -EINVAL;
2652         }
2653 }
2654
2655
/**
 *      dev_new_index   -       allocate an ifindex
 *
 *      Hands out the next interface index that is not currently in use.
 *      Candidates come from a static counter that restarts at 1 whenever
 *      incrementing it yields zero or a negative value; any candidate
 *      that __dev_get_by_index() already knows about is skipped.
 *
 *      The caller must hold the rtnl semaphore or the dev_base_lock
 *      to be sure the returned value remains unique.
 */
static int dev_new_index(void)
{
        static int ifindex;

        do {
                ifindex++;
                if (ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(ifindex));

        return ifindex;
}
2673
2674 static int dev_boot_phase = 1;
2675
2676 /* Delayed registration/unregisteration */
2677 static spinlock_t net_todo_list_lock = SPIN_LOCK_UNLOCKED;
2678 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2679
2680 static inline void net_set_todo(struct net_device *dev)
2681 {
2682         spin_lock(&net_todo_list_lock);
2683         list_add_tail(&dev->todo_list, &net_todo_list);
2684         spin_unlock(&net_todo_list_lock);
2685 }
2686
2687 /**
2688  *      register_netdevice      - register a network device
2689  *      @dev: device to register
2690  *
2691  *      Take a completed network device structure and add it to the kernel
2692  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2693  *      chain. 0 is returned on success. A negative errno code is returned
2694  *      on a failure to set up the device, or if the name is a duplicate.
2695  *
2696  *      Callers must hold the rtnl semaphore.  See the comment at the
2697  *      end of Space.c for details about the locking.  You may want
2698  *      register_netdev() instead of this.
2699  *
2700  *      BUGS:
2701  *      The locking appears insufficient to guarantee two parallel registers
2702  *      will not get the same name.
2703  */
2704
2705 int register_netdevice(struct net_device *dev)
2706 {
2707         struct hlist_head *head;
2708         struct hlist_node *p;
2709         int ret;
2710
2711         BUG_ON(dev_boot_phase);
2712         ASSERT_RTNL();
2713
2714         /* When net_device's are persistent, this will be fatal. */
2715         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2716
2717         spin_lock_init(&dev->queue_lock);
2718         spin_lock_init(&dev->xmit_lock);
2719         dev->xmit_lock_owner = -1;
2720 #ifdef CONFIG_NET_CLS_ACT
2721         spin_lock_init(&dev->ingress_lock);
2722 #endif
2723
2724         ret = alloc_divert_blk(dev);
2725         if (ret)
2726                 goto out;
2727
2728         dev->iflink = -1;
2729
2730         /* Init, if this function is available */
2731         if (dev->init) {
2732                 ret = dev->init(dev);
2733                 if (ret) {
2734                         if (ret > 0)
2735                                 ret = -EIO;
2736                         goto out_err;
2737                 }
2738         }
2739
2740         if (!dev_valid_name(dev->name)) {
2741                 ret = -EINVAL;
2742                 goto out_err;
2743         }
2744
2745         dev->ifindex = dev_new_index();
2746         if (dev->iflink == -1)
2747                 dev->iflink = dev->ifindex;
2748
2749         /* Check for existence of name */
2750         head = dev_name_hash(dev->name);
2751         hlist_for_each(p, head) {
2752                 struct net_device *d
2753                         = hlist_entry(p, struct net_device, name_hlist);
2754                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
2755                         ret = -EEXIST;
2756                         goto out_err;
2757                 }
2758         }
2759
2760         /* Fix illegal SG+CSUM combinations. */
2761         if ((dev->features & NETIF_F_SG) &&
2762             !(dev->features & (NETIF_F_IP_CSUM |
2763                                NETIF_F_NO_CSUM |
2764                                NETIF_F_HW_CSUM))) {
2765                 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2766                        dev->name);
2767                 dev->features &= ~NETIF_F_SG;
2768         }
2769
2770         /* TSO requires that SG is present as well. */
2771         if ((dev->features & NETIF_F_TSO) &&
2772             !(dev->features & NETIF_F_SG)) {
2773                 printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
2774                        dev->name);
2775                 dev->features &= ~NETIF_F_TSO;
2776         }
2777
2778         /*
2779          *      nil rebuild_header routine,
2780          *      that should be never called and used as just bug trap.
2781          */
2782
2783         if (!dev->rebuild_header)
2784                 dev->rebuild_header = default_rebuild_header;
2785
2786         /*
2787          *      Default initial state at registry is that the
2788          *      device is present.
2789          */
2790
2791         set_bit(__LINK_STATE_PRESENT, &dev->state);
2792
2793         dev->next = NULL;
2794         dev_init_scheduler(dev);
2795         write_lock_bh(&dev_base_lock);
2796         *dev_tail = dev;
2797         dev_tail = &dev->next;
2798         hlist_add_head(&dev->name_hlist, head);
2799         hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
2800         dev_hold(dev);
2801         dev->reg_state = NETREG_REGISTERING;
2802         write_unlock_bh(&dev_base_lock);
2803
2804         /* Notify protocols, that a new device appeared. */
2805         notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
2806
2807         /* Finish registration after unlock */
2808         net_set_todo(dev);
2809         ret = 0;
2810
2811 out:
2812         return ret;
2813 out_err:
2814         free_divert_blk(dev);
2815         goto out;
2816 }
2817
2818 /*
2819  * netdev_wait_allrefs - wait until all references are gone.
2820  *
2821  * This is called when unregistering network devices.
2822  *
2823  * Any protocol or device that holds a reference should register
2824  * for netdevice notification, and cleanup and put back the
2825  * reference if they receive an UNREGISTER event.
2826  * We can get stuck here if buggy protocols don't correctly
2827  * call dev_put.
2828  */
2829 static void netdev_wait_allrefs(struct net_device *dev)
2830 {
2831         unsigned long rebroadcast_time, warning_time;
2832
2833         rebroadcast_time = warning_time = jiffies;
2834         while (atomic_read(&dev->refcnt) != 0) {
2835                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
2836                         rtnl_shlock();
2837
2838                         /* Rebroadcast unregister notification */
2839                         notifier_call_chain(&netdev_chain,
2840                                             NETDEV_UNREGISTER, dev);
2841
2842                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
2843                                      &dev->state)) {
2844                                 /* We must not have linkwatch events
2845                                  * pending on unregister. If this
2846                                  * happens, we simply run the queue
2847                                  * unscheduled, resulting in a noop
2848                                  * for this device.
2849                                  */
2850                                 linkwatch_run_queue();
2851                         }
2852
2853                         rtnl_shunlock();
2854
2855                         rebroadcast_time = jiffies;
2856                 }
2857
2858                 current->state = TASK_INTERRUPTIBLE;
2859                 schedule_timeout(HZ / 4);
2860
2861                 if (time_after(jiffies, warning_time + 10 * HZ)) {
2862                         printk(KERN_EMERG "unregister_netdevice: "
2863                                "waiting for %s to become free. Usage "
2864                                "count = %d\n",
2865                                dev->name, atomic_read(&dev->refcnt));
2866                         warning_time = jiffies;
2867                 }
2868         }
2869 }
2870
2871 /* The sequence is:
2872  *
2873  *      rtnl_lock();
2874  *      ...
2875  *      register_netdevice(x1);
2876  *      register_netdevice(x2);
2877  *      ...
2878  *      unregister_netdevice(y1);
2879  *      unregister_netdevice(y2);
2880  *      ...
2881  *      rtnl_unlock();
2882  *      free_netdev(y1);
2883  *      free_netdev(y2);
2884  *
2885  * We are invoked by rtnl_unlock() after it drops the semaphore.
2886  * This allows us to deal with problems:
2887  * 1) We can create/delete sysfs objects which invoke hotplug
2888  *    without deadlocking with linkwatch via keventd.
2889  * 2) Since we run with the RTNL semaphore not held, we can sleep
2890  *    safely in order to wait for the netdev refcnt to drop to zero.
2891  */
2892 static DECLARE_MUTEX(net_todo_run_mutex);
2893 void netdev_run_todo(void)
2894 {
2895         struct list_head list = LIST_HEAD_INIT(list);
2896         int err;
2897
2898
2899         /* Need to guard against multiple cpu's getting out of order. */
2900         down(&net_todo_run_mutex);
2901
2902         /* Not safe to do outside the semaphore.  We must not return
2903          * until all unregister events invoked by the local processor
2904          * have been completed (either by this todo run, or one on
2905          * another cpu).
2906          */
2907         if (list_empty(&net_todo_list))
2908                 goto out;
2909
2910         /* Snapshot list, allow later requests */
2911         spin_lock(&net_todo_list_lock);
2912         list_splice_init(&net_todo_list, &list);
2913         spin_unlock(&net_todo_list_lock);
2914
2915         while (!list_empty(&list)) {
2916                 struct net_device *dev
2917                         = list_entry(list.next, struct net_device, todo_list);
2918                 list_del(&dev->todo_list);
2919
2920                 switch(dev->reg_state) {
2921                 case NETREG_REGISTERING:
2922                         err = netdev_register_sysfs(dev);
2923                         if (err)
2924                                 printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
2925                                        dev->name, err);
2926                         dev->reg_state = NETREG_REGISTERED;
2927                         break;
2928
2929                 case NETREG_UNREGISTERING:
2930                         netdev_unregister_sysfs(dev);
2931                         dev->reg_state = NETREG_UNREGISTERED;
2932
2933                         netdev_wait_allrefs(dev);
2934
2935                         /* paranoia */
2936                         BUG_ON(atomic_read(&dev->refcnt));
2937                         BUG_TRAP(!dev->ip_ptr);
2938                         BUG_TRAP(!dev->ip6_ptr);
2939                         BUG_TRAP(!dev->dn_ptr);
2940
2941
2942                         /* It must be the very last action,
2943                          * after this 'dev' may point to freed up memory.
2944                          */
2945                         if (dev->destructor)
2946                                 dev->destructor(dev);
2947                         break;
2948
2949                 default:
2950                         printk(KERN_ERR "network todo '%s' but state %d\n",
2951                                dev->name, dev->reg_state);
2952                         break;
2953                 }
2954         }
2955
2956 out:
2957         up(&net_todo_run_mutex);
2958 }
2959
2960 /**
2961  *      free_netdev - free network device
2962  *      @dev: device
2963  *
2964  *      This function does the last stage of destroying an allocated device
2965  *      interface. The reference to the device object is released.
2966  *      If this is the last reference then it will be freed.
2967  */
2968 void free_netdev(struct net_device *dev)
2969 {
2970 #ifdef CONFIG_SYSFS
2971         /*  Compatiablity with error handling in drivers */
2972         if (dev->reg_state == NETREG_UNINITIALIZED) {
2973                 kfree((char *)dev - dev->padded);
2974                 return;
2975         }
2976
2977         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
2978         dev->reg_state = NETREG_RELEASED;
2979
2980         /* will free via class release */
2981         class_device_put(&dev->class_dev);
2982 #else
2983         kfree((char *)dev - dev->padded);
2984 #endif
2985 }
2986
/*
 * synchronize_net - synchronize with packet receive processing.
 *
 * Marked with might_sleep(), then hands over to synchronize_kernel().
 */
void synchronize_net(void)
{
        might_sleep();
        synchronize_kernel();
}
2993
2994 /**
2995  *      unregister_netdevice - remove device from the kernel
2996  *      @dev: device
2997  *
2998  *      This function shuts down a device interface and removes it
2999  *      from the kernel tables. On success 0 is returned, on a failure
3000  *      a negative errno code is returned.
3001  *
3002  *      Callers must hold the rtnl semaphore.  See the comment at the
3003  *      end of Space.c for details about the locking.  You may want
3004  *      unregister_netdev() instead of this.
3005  */
3006
3007 int unregister_netdevice(struct net_device *dev)
3008 {
3009         struct net_device *d, **dp;
3010
3011         BUG_ON(dev_boot_phase);
3012         ASSERT_RTNL();
3013
3014         /* Some devices call without registering for initialization unwind. */
3015         if (dev->reg_state == NETREG_UNINITIALIZED) {
3016                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3017                                   "was registered\n", dev->name, dev);
3018                 return -ENODEV;
3019         }
3020
3021         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3022
3023         /* If device is running, close it first. */
3024         if (dev->flags & IFF_UP)
3025                 dev_close(dev);
3026
3027         /* And unlink it from device chain. */
3028         for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3029                 if (d == dev) {
3030                         write_lock_bh(&dev_base_lock);
3031                         hlist_del(&dev->name_hlist);
3032                         hlist_del(&dev->index_hlist);
3033                         if (dev_tail == &dev->next)
3034                                 dev_tail = dp;
3035                         *dp = d->next;
3036                         write_unlock_bh(&dev_base_lock);
3037                         break;
3038                 }
3039         }
3040         if (!d) {
3041                 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3042                        dev->name);
3043                 return -ENODEV;
3044         }
3045
3046         dev->reg_state = NETREG_UNREGISTERING;
3047
3048         synchronize_net();
3049
3050         /* Shutdown queueing discipline. */
3051         dev_shutdown(dev);
3052
3053
3054         /* Notify protocols, that we are about to destroy
3055            this device. They should clean all the things.
3056         */
3057         notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3058
3059         /*
3060          *      Flush the multicast chain
3061          */
3062         dev_mc_discard(dev);
3063
3064         if (dev->uninit)
3065                 dev->uninit(dev);
3066
3067         /* Notifier chain MUST detach us from master device. */
3068         BUG_TRAP(!dev->master);
3069
3070         free_divert_blk(dev);
3071
3072         /* Finish processing unregister after unlock */
3073         net_set_todo(dev);
3074
3075         synchronize_net();
3076
3077         dev_put(dev);
3078         return 0;
3079 }
3080
3081 #ifdef CONFIG_HOTPLUG_CPU
3082 static int dev_cpu_callback(struct notifier_block *nfb,
3083                             unsigned long action,
3084                             void *ocpu)
3085 {
3086         struct sk_buff **list_skb;
3087         struct net_device **list_net;
3088         struct sk_buff *skb;
3089         unsigned int cpu, oldcpu = (unsigned long)ocpu;
3090         struct softnet_data *sd, *oldsd;
3091
3092         if (action != CPU_DEAD)
3093                 return NOTIFY_OK;
3094
3095         local_irq_disable();
3096         cpu = smp_processor_id();
3097         sd = &per_cpu(softnet_data, cpu);
3098         oldsd = &per_cpu(softnet_data, oldcpu);
3099
3100         /* Find end of our completion_queue. */
3101         list_skb = &sd->completion_queue;
3102         while (*list_skb)
3103                 list_skb = &(*list_skb)->next;
3104         /* Append completion queue from offline CPU. */
3105         *list_skb = oldsd->completion_queue;
3106         oldsd->completion_queue = NULL;
3107
3108         /* Find end of our output_queue. */
3109         list_net = &sd->output_queue;
3110         while (*list_net)
3111                 list_net = &(*list_net)->next_sched;
3112         /* Append output queue from offline CPU. */
3113         *list_net = oldsd->output_queue;
3114         oldsd->output_queue = NULL;
3115
3116         raise_softirq_irqoff(NET_TX_SOFTIRQ);
3117         local_irq_enable();
3118
3119         /* Process offline CPU's input_pkt_queue */
3120         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3121                 netif_rx(skb);
3122
3123         return NOTIFY_OK;
3124 }
3125 #endif /* CONFIG_HOTPLUG_CPU */
3126
3127
3128 /*
3129  *      Initialize the DEV module. At boot time this walks the device list and
3130  *      unhooks any devices that fail to initialise (normally hardware not
3131  *      present) and leaves us with a valid list of present and active devices.
3132  *
3133  */
3134
3135 /*
3136  *       This is called single threaded during boot, so no need
3137  *       to take the rtnl semaphore.
3138  */
3139 static int __init net_dev_init(void)
3140 {
3141         int i, rc = -ENOMEM;
3142
3143         BUG_ON(!dev_boot_phase);
3144
3145         net_random_init();
3146
3147         if (dev_proc_init())
3148                 goto out;
3149
3150         if (netdev_sysfs_init())
3151                 goto out;
3152
3153         INIT_LIST_HEAD(&ptype_all);
3154         for (i = 0; i < 16; i++)
3155                 INIT_LIST_HEAD(&ptype_base[i]);
3156
3157         for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3158                 INIT_HLIST_HEAD(&dev_name_head[i]);
3159
3160         for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3161                 INIT_HLIST_HEAD(&dev_index_head[i]);
3162
3163         /*
3164          *      Initialise the packet receive queues.
3165          */
3166
3167         for (i = 0; i < NR_CPUS; i++) {
3168                 struct softnet_data *queue;
3169
3170                 queue = &per_cpu(softnet_data, i);
3171                 skb_queue_head_init(&queue->input_pkt_queue);
3172                 queue->throttle = 0;
3173                 queue->cng_level = 0;
3174                 queue->avg_blog = 10; /* arbitrary non-zero */
3175                 queue->completion_queue = NULL;
3176                 INIT_LIST_HEAD(&queue->poll_list);
3177                 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3178                 queue->backlog_dev.weight = weight_p;
3179                 queue->backlog_dev.poll = process_backlog;
3180                 atomic_set(&queue->backlog_dev.refcnt, 1);
3181         }
3182
3183 #ifdef OFFLINE_SAMPLE
3184         samp_timer.expires = jiffies + (10 * HZ);
3185         add_timer(&samp_timer);
3186 #endif
3187
3188         dev_boot_phase = 0;
3189
3190         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3191         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3192
3193         hotcpu_notifier(dev_cpu_callback, 0);
3194         dst_init();
3195         dev_mcast_init();
3196         rc = 0;
3197 out:
3198         return rc;
3199 }
3200
3201 subsys_initcall(net_dev_init);
3202
/* Symbols exported unconditionally, in alphabetical order. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(__skb_linearize);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_ioctl);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);

/* Symbols exported only with the matching configuration option. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
#endif

#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

#ifdef CONFIG_NET_CLS_ACT
EXPORT_SYMBOL(ing_filter);
#endif

/* Per-cpu data. */
EXPORT_PER_CPU_SYMBOL(softnet_data);