net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro, <bir7@leland.Stanford.Edu>
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/config.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/string.h>
  84 #include <linux/mm.h>
  85 #include <linux/socket.h>
  86 #include <linux/sockios.h>
  87 #include <linux/errno.h>
  88 #include <linux/interrupt.h>
  89 #include <linux/if_ether.h>
  90 #include <linux/netdevice.h>
  91 #include <linux/etherdevice.h>
  92 #include <linux/notifier.h>
  93 #include <linux/skbuff.h>
  94 #include <net/sock.h>
  95 #include <linux/rtnetlink.h>
  96 #include <linux/proc_fs.h>
  97 #include <linux/seq_file.h>
  98 #include <linux/stat.h>
  99 #include <linux/if_bridge.h>
 100 #include <linux/divert.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <linux/highmem.h>
 105 #include <linux/init.h>
 106 #include <linux/kmod.h>
 107 #include <linux/module.h>
 108 #include <linux/kallsyms.h>
 109 #include <linux/netpoll.h>
 110 #include <linux/rcupdate.h>
 111 #include <linux/delay.h>
 112 #ifdef CONFIG_NET_RADIO
 113 #include <linux/wireless.h>             /* Note : will define WIRELESS_EXT */
 114 #include <net/iw_handler.h>
 115 #endif  /* CONFIG_NET_RADIO */
 116 #include <linux/vs_network.h>
 117 #include <asm/current.h>
 118
 119 /* This define, if set, will randomly drop a packet when congestion
 120  * is more than moderate.  It helps fairness in the multi-interface
 121  * case when one of them is a hog, but it kills performance for the
 122  * single interface case so it is off now by default.
 123  */
 124 #undef RAND_LIE
 125
 126 /* Setting this will sample the queue lengths and thus congestion
 127  * via a timer instead of as each packet is received.
 128  */
 129 #undef OFFLINE_SAMPLE
 130
 131 /*
 132  *      The list of packet types we will receive (as opposed to discard)
 133  *      and the routines to invoke.
 134  *
 135  *      Why 16. Because with 16 the only overlap we get on a hash of the
 136  *      low nibble of the protocol value is RARP/SNAP/X.25.
 137  *
 138  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 139  *             sure which should go first, but I bet it won't make much
 140  *             difference if we are running VLANs.  The good news is that
 141  *             this protocol won't be in the list unless compiled in, so
 *             the average user (w/out VLANs) will not be adversely affected.
 143  *             --BLG
 144  *
 145  *              0800    IP
 146  *              8100    802.1Q VLAN
 147  *              0001    802.3
 148  *              0002    AX.25
 149  *              0004    802.2
 150  *              8035    RARP
 151  *              0005    SNAP
 152  *              0805    X.25
 153  *              0806    ARP
 154  *              8137    IPX
 155  *              0009    Localtalk
 156  *              86DD    IPv6
 157  */
 158
 159 static DEFINE_SPINLOCK(ptype_lock);
 160 static struct list_head ptype_base[16]; /* 16 way hashed list */
 161 static struct list_head ptype_all;              /* Taps */
 162
 163 #ifdef OFFLINE_SAMPLE
 164 static void sample_queue(unsigned long dummy);
 165 static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
 166 #endif
 167
 168 /*
 * The @dev_base list is protected by @dev_base_lock and the rtnl
 * semaphore.
 171  *
 172  * Pure readers hold dev_base_lock for reading.
 173  *
 174  * Writers must hold the rtnl semaphore while they loop through the
 175  * dev_base list, and hold dev_base_lock for writing when they do the
 176  * actual updates.  This allows pure readers to access the list even
 177  * while a writer is preparing to update it.
 178  *
 179  * To put it another way, dev_base_lock is held for writing only to
 180  * protect against pure readers; the rtnl semaphore provides the
 181  * protection against other writers.
 182  *
 183  * See, for example usages, register_netdevice() and
 184  * unregister_netdevice(), which must be called with the rtnl
 185  * semaphore held.
 186  */
 187 struct net_device *dev_base;
 188 static struct net_device **dev_tail = &dev_base;
 189 DEFINE_RWLOCK(dev_base_lock);
 190
 191 EXPORT_SYMBOL(dev_base);
 192 EXPORT_SYMBOL(dev_base_lock);
 193
 194 #define NETDEV_HASHBITS 8
 195 static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
 196 static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
 197
 198 static inline struct hlist_head *dev_name_hash(const char *name)
 199 {
 200         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 201         return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 202 }
 203
 204 static inline struct hlist_head *dev_index_hash(int ifindex)
 205 {
 206         return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 207 }
 208
 209 /*
 210  *      Our notifier list
 211  */
 212
 213 static struct notifier_block *netdev_chain;
 214
 215 /*
 216  *      Device drivers call our routines to queue packets here. We empty the
 217  *      queue in the local softnet handler.
 218  */
 219 DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
 220
 221 #ifdef CONFIG_SYSFS
 222 extern int netdev_sysfs_init(void);
 223 extern int netdev_register_sysfs(struct net_device *);
 224 extern void netdev_unregister_sysfs(struct net_device *);
 225 #else
 226 #define netdev_sysfs_init()             (0)
 227 #define netdev_register_sysfs(dev)      (0)
 228 #define netdev_unregister_sysfs(dev)    do { } while(0)
 229 #endif
 230
 231
 232 /*******************************************************************************
 233
 234                 Protocol management and registration routines
 235
 236 *******************************************************************************/
 237
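/*
 *      For efficiency: counts the ETH_P_ALL packet handlers currently
 *      registered (incremented in dev_add_pack(), decremented in
 *      __dev_remove_pack()).
 */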
 241
 242 int netdev_nit;
 243
 244 /*
 245  *      Add a protocol ID to the list. Now that the input handler is
 246  *      smarter we can dispense with all the messy stuff that used to be
 247  *      here.
 248  *
 249  *      BEWARE!!! Protocol handlers, mangling input packets,
 250  *      MUST BE last in hash buckets and checking protocol handlers
 251  *      MUST start from promiscuous ptype_all chain in net_bh.
 252  *      It is true now, do not change it.
 253  *      Explanation follows: if protocol handler, mangling packet, will
 254  *      be the first on list, it is not able to sense, that packet
 255  *      is cloned and should be copied-on-write, so that it will
 256  *      change it and subsequent readers will get broken packet.
 257  *                                                      --ANK (980803)
 258  */
 259
 260 /**
 261  *      dev_add_pack - add packet handler
 262  *      @pt: packet type declaration
 263  *
 264  *      Add a protocol handler to the networking stack. The passed &packet_type
 265  *      is linked into kernel lists and may not be freed until it has been
 266  *      removed from the kernel lists.
 267  *
 268  *      This call does not sleep therefore it can not
 269  *      guarantee all CPU's that are in middle of receiving packets
 270  *      will see the new packet type (until the next received packet).
 271  */
 272
 273 void dev_add_pack(struct packet_type *pt)
 274 {
 275         int hash;
 276
 277         spin_lock_bh(&ptype_lock);
 278         if (pt->type == htons(ETH_P_ALL)) {
 279                 netdev_nit++;
 280                 list_add_rcu(&pt->list, &ptype_all);
 281         } else {
 282                 hash = ntohs(pt->type) & 15;
 283                 list_add_rcu(&pt->list, &ptype_base[hash]);
 284         }
 285         spin_unlock_bh(&ptype_lock);
 286 }
 287
 288 extern void linkwatch_run_queue(void);
 289
 290
 291
 292 /**
 293  *      __dev_remove_pack        - remove packet handler
 294  *      @pt: packet type declaration
 295  *
 296  *      Remove a protocol handler that was previously added to the kernel
 297  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 298  *      from the kernel lists and can be freed or reused once this function
 299  *      returns.
 300  *
 301  *      The packet type might still be in use by receivers
 302  *      and must not be freed until after all the CPU's have gone
 303  *      through a quiescent state.
 304  */
 305 void __dev_remove_pack(struct packet_type *pt)
 306 {
 307         struct list_head *head;
 308         struct packet_type *pt1;
 309
 310         spin_lock_bh(&ptype_lock);
 311
 312         if (pt->type == htons(ETH_P_ALL)) {
 313                 netdev_nit--;
 314                 head = &ptype_all;
 315         } else
 316                 head = &ptype_base[ntohs(pt->type) & 15];
 317
 318         list_for_each_entry(pt1, head, list) {
 319                 if (pt == pt1) {
 320                         list_del_rcu(&pt->list);
 321                         goto out;
 322                 }
 323         }
 324
 325         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 326 out:
 327         spin_unlock_bh(&ptype_lock);
 328 }
 329 /**
 330  *      dev_remove_pack  - remove packet handler
 331  *      @pt: packet type declaration
 332  *
 333  *      Remove a protocol handler that was previously added to the kernel
 334  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 335  *      from the kernel lists and can be freed or reused once this function
 336  *      returns.
 337  *
 338  *      This call sleeps to guarantee that no CPU is looking at the packet
 339  *      type after return.
 340  */
 341 void dev_remove_pack(struct packet_type *pt)
 342 {
 343         __dev_remove_pack(pt);
 344
 345         synchronize_net();
 346 }
 347
 348 /******************************************************************************
 349
 350                       Device Boot-time Settings Routines
 351
 352 *******************************************************************************/
 353
 354 /* Boot time configuration table */
 355 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 356
 357 /**
 358  *      netdev_boot_setup_add   - add new setup entry
 359  *      @name: name of the device
 360  *      @map: configured settings for the device
 361  *
 362  *      Adds new setup entry to the dev_boot_setup list.  The function
 363  *      returns 0 on error and 1 on success.  This is a generic routine to
 364  *      all netdevices.
 365  */
 366 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 367 {
 368         struct netdev_boot_setup *s;
 369         int i;
 370
 371         s = dev_boot_setup;
 372         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 373                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 374                         memset(s[i].name, 0, sizeof(s[i].name));
 375                         strcpy(s[i].name, name);
 376                         memcpy(&s[i].map, map, sizeof(s[i].map));
 377                         break;
 378                 }
 379         }
 380
 381         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 382 }
 383
 384 /**
 385  *      netdev_boot_setup_check - check boot time settings
 386  *      @dev: the netdevice
 387  *
 388  *      Check boot time settings for the device.
 389  *      The found settings are set for the device to be used
 390  *      later in the device probing.
 391  *      Returns 0 if no settings found, 1 if they are.
 392  */
 393 int netdev_boot_setup_check(struct net_device *dev)
 394 {
 395         struct netdev_boot_setup *s = dev_boot_setup;
 396         int i;
 397
 398         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 399                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 400                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 401                         dev->irq        = s[i].map.irq;
 402                         dev->base_addr  = s[i].map.base_addr;
 403                         dev->mem_start  = s[i].map.mem_start;
 404                         dev->mem_end    = s[i].map.mem_end;
 405                         return 1;
 406                 }
 407         }
 408         return 0;
 409 }
 410
 411
 412 /**
 413  *      netdev_boot_base        - get address from boot time settings
 414  *      @prefix: prefix for network device
 415  *      @unit: id for network device
 416  *
 417  *      Check boot time settings for the base address of device.
 418  *      The found settings are set for the device to be used
 419  *      later in the device probing.
 420  *      Returns 0 if no settings found.
 421  */
 422 unsigned long netdev_boot_base(const char *prefix, int unit)
 423 {
 424         const struct netdev_boot_setup *s = dev_boot_setup;
 425         char name[IFNAMSIZ];
 426         int i;
 427
 428         sprintf(name, "%s%d", prefix, unit);
 429
 430         /*
 431          * If device already registered then return base of 1
 432          * to indicate not to probe for this interface
 433          */
 434         if (__dev_get_by_name(name))
 435                 return 1;
 436
 437         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 438                 if (!strcmp(name, s[i].name))
 439                         return s[i].map.base_addr;
 440         return 0;
 441 }
 442
 443 /*
 444  * Saves at boot time configured settings for any netdevice.
 445  */
 446 int __init netdev_boot_setup(char *str)
 447 {
 448         int ints[5];
 449         struct ifmap map;
 450
 451         str = get_options(str, ARRAY_SIZE(ints), ints);
 452         if (!str || !*str)
 453                 return 0;
 454
 455         /* Save settings */
 456         memset(&map, 0, sizeof(map));
 457         if (ints[0] > 0)
 458                 map.irq = ints[1];
 459         if (ints[0] > 1)
 460                 map.base_addr = ints[2];
 461         if (ints[0] > 2)
 462                 map.mem_start = ints[3];
 463         if (ints[0] > 3)
 464                 map.mem_end = ints[4];
 465
 466         /* Add new entry to the list */
 467         return netdev_boot_setup_add(str, &map);
 468 }
 469
 470 __setup("netdev=", netdev_boot_setup);
 471
 472 /*******************************************************************************
 473
 474                             Device Interface Subroutines
 475
 476 *******************************************************************************/
 477
 478 /**
 479  *      __dev_get_by_name       - find a device by its name
 480  *      @name: name to find
 481  *
 482  *      Find an interface by name. Must be called under RTNL semaphore
 483  *      or @dev_base_lock. If the name is found a pointer to the device
 484  *      is returned. If the name is not found then %NULL is returned. The
 485  *      reference counters are not incremented so the caller must be
 486  *      careful with locks.
 487  */
 488
 489 struct net_device *__dev_get_by_name(const char *name)
 490 {
 491         struct hlist_node *p;
 492
 493         hlist_for_each(p, dev_name_hash(name)) {
 494                 struct net_device *dev
 495                         = hlist_entry(p, struct net_device, name_hlist);
 496                 if (!strncmp(dev->name, name, IFNAMSIZ))
 497                         return dev;
 498         }
 499         return NULL;
 500 }
 501
 502 /**
 503  *      dev_get_by_name         - find a device by its name
 504  *      @name: name to find
 505  *
 506  *      Find an interface by name. This can be called from any
 507  *      context and does its own locking. The returned handle has
 508  *      the usage count incremented and the caller must use dev_put() to
 509  *      release it when it is no longer needed. %NULL is returned if no
 510  *      matching device is found.
 511  */
 512
 513 struct net_device *dev_get_by_name(const char *name)
 514 {
 515         struct net_device *dev;
 516
 517         read_lock(&dev_base_lock);
 518         dev = __dev_get_by_name(name);
 519         if (dev)
 520                 dev_hold(dev);
 521         read_unlock(&dev_base_lock);
 522         return dev;
 523 }
 524
 525 /**
 526  *      __dev_get_by_index - find a device by its ifindex
 527  *      @ifindex: index of device
 528  *
 529  *      Search for an interface by index. Returns %NULL if the device
 530  *      is not found or a pointer to the device. The device has not
 531  *      had its reference counter increased so the caller must be careful
 532  *      about locking. The caller must hold either the RTNL semaphore
 533  *      or @dev_base_lock.
 534  */
 535
 536 struct net_device *__dev_get_by_index(int ifindex)
 537 {
 538         struct hlist_node *p;
 539
 540         hlist_for_each(p, dev_index_hash(ifindex)) {
 541                 struct net_device *dev
 542                         = hlist_entry(p, struct net_device, index_hlist);
 543                 if (dev->ifindex == ifindex)
 544                         return dev;
 545         }
 546         return NULL;
 547 }
 548
 549
 550 /**
 551  *      dev_get_by_index - find a device by its ifindex
 552  *      @ifindex: index of device
 553  *
 554  *      Search for an interface by index. Returns NULL if the device
 555  *      is not found or a pointer to the device. The device returned has
 556  *      had a reference added and the pointer is safe until the user calls
 557  *      dev_put to indicate they have finished with it.
 558  */
 559
 560 struct net_device *dev_get_by_index(int ifindex)
 561 {
 562         struct net_device *dev;
 563
 564         read_lock(&dev_base_lock);
 565         dev = __dev_get_by_index(ifindex);
 566         if (dev)
 567                 dev_hold(dev);
 568         read_unlock(&dev_base_lock);
 569         return dev;
 570 }
 571
 572 /**
 573  *      dev_getbyhwaddr - find a device by its hardware address
 574  *      @type: media type of device
 575  *      @ha: hardware address
 576  *
 577  *      Search for an interface by MAC address. Returns NULL if the device
 578  *      is not found or a pointer to the device. The caller must hold the
 579  *      rtnl semaphore. The returned device has not had its ref count increased
 580  *      and the caller must therefore be careful about locking
 581  *
 582  *      BUGS:
 583  *      If the API was consistent this would be __dev_get_by_hwaddr
 584  */
 585
 586 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 587 {
 588         struct net_device *dev;
 589
 590         ASSERT_RTNL();
 591
 592         for (dev = dev_base; dev; dev = dev->next)
 593                 if (dev->type == type &&
 594                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 595                         break;
 596         return dev;
 597 }
 598
 599 struct net_device *dev_getfirstbyhwtype(unsigned short type)
 600 {
 601         struct net_device *dev;
 602
 603         rtnl_lock();
 604         for (dev = dev_base; dev; dev = dev->next) {
 605                 if (dev->type == type) {
 606                         dev_hold(dev);
 607                         break;
 608                 }
 609         }
 610         rtnl_unlock();
 611         return dev;
 612 }
 613
 614 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 615
 616 /**
 617  *      dev_get_by_flags - find any device with given flags
 618  *      @if_flags: IFF_* values
 619  *      @mask: bitmask of bits in if_flags to check
 620  *
 621  *      Search for any interface with the given flags. Returns NULL if a device
 622  *      is not found or a pointer to the device. The device returned has
 623  *      had a reference added and the pointer is safe until the user calls
 624  *      dev_put to indicate they have finished with it.
 625  */
 626
 627 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 628 {
 629         struct net_device *dev;
 630
 631         read_lock(&dev_base_lock);
 632         for (dev = dev_base; dev != NULL; dev = dev->next) {
 633                 if (((dev->flags ^ if_flags) & mask) == 0) {
 634                         dev_hold(dev);
 635                         break;
 636                 }
 637         }
 638         read_unlock(&dev_base_lock);
 639         return dev;
 640 }
 641
 642 /**
 643  *      dev_valid_name - check if name is okay for network device
 644  *      @name: name string
 645  *
 646  *      Network device names need to be valid file names to
 647  *      to allow sysfs to work
 648  */
 649 static int dev_valid_name(const char *name)
 650 {
 651         return !(*name == '\0'
 652                  || !strcmp(name, ".")
 653                  || !strcmp(name, "..")
 654                  || strchr(name, '/'));
 655 }
 656
 657 /**
 658  *      dev_alloc_name - allocate a name for a device
 659  *      @dev: device
 660  *      @name: name format string
 661  *
 662  *      Passed a format string - eg "lt%d" it will try and find a suitable
 663  *      id. Not efficient for many devices, not called a lot. The caller
 664  *      must hold the dev_base or rtnl lock while allocating the name and
 665  *      adding the device in order to avoid duplicates. Returns the number
 666  *      of the unit assigned or a negative errno code.
 667  */
 668
 669 int dev_alloc_name(struct net_device *dev, const char *name)
 670 {
 671         int i = 0;
 672         char buf[IFNAMSIZ];
 673         const char *p;
 674         const int max_netdevices = 8*PAGE_SIZE;
 675         long *inuse;
 676         struct net_device *d;
 677
 678         p = strnchr(name, IFNAMSIZ-1, '%');
 679         if (p) {
 680                 /*
 681                  * Verify the string as this thing may have come from
 682                  * the user.  There must be either one "%d" and no other "%"
 683                  * characters.
 684                  */
 685                 if (p[1] != 'd' || strchr(p + 2, '%'))
 686                         return -EINVAL;
 687
 688                 /* Use one page as a bit array of possible slots */
 689                 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
 690                 if (!inuse)
 691                         return -ENOMEM;
 692
 693                 for (d = dev_base; d; d = d->next) {
 694                         if (!sscanf(d->name, name, &i))
 695                                 continue;
 696                         if (i < 0 || i >= max_netdevices)
 697                                 continue;
 698
 699                         /*  avoid cases where sscanf is not exact inverse of printf */
 700                         snprintf(buf, sizeof(buf), name, i);
 701                         if (!strncmp(buf, d->name, IFNAMSIZ))
 702                                 set_bit(i, inuse);
 703                 }
 704
 705                 i = find_first_zero_bit(inuse, max_netdevices);
 706                 free_page((unsigned long) inuse);
 707         }
 708
 709         snprintf(buf, sizeof(buf), name, i);
 710         if (!__dev_get_by_name(buf)) {
 711                 strlcpy(dev->name, buf, IFNAMSIZ);
 712                 return i;
 713         }
 714
 715         /* It is possible to run out of possible slots
 716          * when the name is long and there isn't enough space left
 717          * for the digits, or if all bits are used.
 718          */
 719         return -ENFILE;
 720 }
 721
 722
 723 /**
 724  *      dev_change_name - change name of a device
 725  *      @dev: device
 726  *      @newname: name (or format string) must be at least IFNAMSIZ
 727  *
 728  *      Change name of a device, can pass format strings "eth%d".
 729  *      for wildcarding.
 730  */
 731 int dev_change_name(struct net_device *dev, char *newname)
 732 {
 733         int err = 0;
 734
 735         ASSERT_RTNL();
 736
 737         if (dev->flags & IFF_UP)
 738                 return -EBUSY;
 739
 740         if (!dev_valid_name(newname))
 741                 return -EINVAL;
 742
 743         if (strchr(newname, '%')) {
 744                 err = dev_alloc_name(dev, newname);
 745                 if (err < 0)
 746                         return err;
 747                 strcpy(newname, dev->name);
 748         }
 749         else if (__dev_get_by_name(newname))
 750                 return -EEXIST;
 751         else
 752                 strlcpy(dev->name, newname, IFNAMSIZ);
 753
 754         err = class_device_rename(&dev->class_dev, dev->name);
 755         if (!err) {
 756                 hlist_del(&dev->name_hlist);
 757                 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 758                 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
 759         }
 760
 761         return err;
 762 }
 763
 764 /**
 765  *      netdev_state_change - device changes state
 766  *      @dev: device to cause notification
 767  *
 768  *      Called to indicate a device has changed state. This function calls
 769  *      the notifier chains for netdev_chain and sends a NEWLINK message
 770  *      to the routing socket.
 771  */
 772 void netdev_state_change(struct net_device *dev)
 773 {
 774         if (dev->flags & IFF_UP) {
 775                 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
 776                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 777         }
 778 }
 779
 780 /**
 781  *      dev_load        - load a network module
 782  *      @name: name of interface
 783  *
 784  *      If a network interface is not present and the process has suitable
 785  *      privileges this function loads the module. If module loading is not
 786  *      available in this kernel then it becomes a nop.
 787  */
 788
 789 void dev_load(const char *name)
 790 {
 791         struct net_device *dev;
 792
 793         read_lock(&dev_base_lock);
 794         dev = __dev_get_by_name(name);
 795         read_unlock(&dev_base_lock);
 796
 797         if (!dev && capable(CAP_SYS_MODULE))
 798                 request_module("%s", name);
 799 }
 800
 801 static int default_rebuild_header(struct sk_buff *skb)
 802 {
 803         printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 804                skb->dev ? skb->dev->name : "NULL!!!");
 805         kfree_skb(skb);
 806         return 1;
 807 }
 808
 809
 810 /**
 811  *      dev_open        - prepare an interface for use.
 812  *      @dev:   device to open
 813  *
 814  *      Takes a device from down to up state. The device's private open
 815  *      function is invoked and then the multicast lists are loaded. Finally
 816  *      the device is moved into the up state and a %NETDEV_UP message is
 817  *      sent to the netdev notifier chain.
 818  *
 819  *      Calling this function on an active interface is a nop. On a failure
 820  *      a negative errno code is returned.
 821  */
 822 int dev_open(struct net_device *dev)
 823 {
 824         int ret = 0;
 825
 826         /*
 827          *      Is it already up?
 828          */
 829
 830         if (dev->flags & IFF_UP)
 831                 return 0;
 832
 833         /*
 834          *      Is it even present?
 835          */
 836         if (!netif_device_present(dev))
 837                 return -ENODEV;
 838
 839         /*
 840          *      Call device private open method
 841          */
 842         set_bit(__LINK_STATE_START, &dev->state);
 843         if (dev->open) {
 844                 ret = dev->open(dev);
 845                 if (ret)
 846                         clear_bit(__LINK_STATE_START, &dev->state);
 847         }
 848
 849         /*
 850          *      If it went open OK then:
 851          */
 852
 853         if (!ret) {
 854                 /*
 855                  *      Set the flags.
 856                  */
 857                 dev->flags |= IFF_UP;
 858
 859                 /*
 860                  *      Initialize multicasting status
 861                  */
 862                 dev_mc_upload(dev);
 863
 864                 /*
 865                  *      Wakeup transmit queue engine
 866                  */
 867                 dev_activate(dev);
 868
 869                 /*
 870                  *      ... and announce new interface.
 871                  */
 872                 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 873         }
 874         return ret;
 875 }
 876
 877 /**
 878  *      dev_close - shutdown an interface.
 879  *      @dev: device to shutdown
 880  *
 881  *      This function moves an active device into down state. A
 882  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 883  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 884  *      chain.
 885  */
 886 int dev_close(struct net_device *dev)
 887 {
 888         if (!(dev->flags & IFF_UP))
 889                 return 0;
 890
 891         /*
 892          *      Tell people we are going down, so that they can
 893          *      prepare to death, when device is still operating.
 894          */
 895         notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 896
 897         dev_deactivate(dev);
 898
 899         clear_bit(__LINK_STATE_START, &dev->state);
 900
 901         /* Synchronize to scheduled poll. We cannot touch poll list,
 902          * it can be even on different cpu. So just clear netif_running(),
 903          * and wait when poll really will happen. Actually, the best place
 904          * for this is inside dev->stop() after device stopped its irq
 905          * engine, but this requires more changes in devices. */
 906
 907         smp_mb__after_clear_bit(); /* Commit netif_running(). */
 908         while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
 909                 /* No hurry. */
 910                 current->state = TASK_INTERRUPTIBLE;
 911                 schedule_timeout(1);
 912         }
 913
 914         /*
 915          *      Call the device specific close. This cannot fail.
 916          *      Only if device is UP
 917          *
 918          *      We allow it to be called even after a DETACH hot-plug
 919          *      event.
 920          */
 921         if (dev->stop)
 922                 dev->stop(dev);
 923
 924         /*
 925          *      Device is now down.
 926          */
 927
 928         dev->flags &= ~IFF_UP;
 929
 930         /*
 931          * Tell people we are down
 932          */
 933         notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 934
 935         return 0;
 936 }
 937
 938
 939 /*
 940  *      Device change register/unregister. These are not inline or static
 941  *      as we export them to the world.
 942  */
 943
 944 /**
 945  *      register_netdevice_notifier - register a network notifier block
 946  *      @nb: notifier
 947  *
 948  *      Register a notifier to be called when network device events occur.
 949  *      The notifier passed is linked into the kernel structures and must
 950  *      not be reused until it has been unregistered. A negative errno code
 951  *      is returned on a failure.
 952  *
 953  *      When registered all registration and up events are replayed
 954  *      to the new notifier to allow device to have a race free
 955  *      view of the network device list.
 956  */
 957
 958 int register_netdevice_notifier(struct notifier_block *nb)
 959 {
 960         struct net_device *dev;
 961         int err;
 962
 963         rtnl_lock();
 964         err = notifier_chain_register(&netdev_chain, nb);
 965         if (!err) {
 966                 for (dev = dev_base; dev; dev = dev->next) {
 967                         nb->notifier_call(nb, NETDEV_REGISTER, dev);
 968
 969                         if (dev->flags & IFF_UP)
 970                                 nb->notifier_call(nb, NETDEV_UP, dev);
 971                 }
 972         }
 973         rtnl_unlock();
 974         return err;
 975 }
 976
 977 /**
 978  *      unregister_netdevice_notifier - unregister a network notifier block
 979  *      @nb: notifier
 980  *
 981  *      Unregister a notifier previously registered by
 982  *      register_netdevice_notifier(). The notifier is unlinked into the
 983  *      kernel structures and may then be reused. A negative errno code
 984  *      is returned on a failure.
 985  */
 986
 987 int unregister_netdevice_notifier(struct notifier_block *nb)
 988 {
 989         return notifier_chain_unregister(&netdev_chain, nb);
 990 }
 991
 992 /**
 993  *      call_netdevice_notifiers - call all network notifier blocks
 994  *      @val: value passed unmodified to notifier function
 995  *      @v:   pointer passed unmodified to notifier function
 996  *
 997  *      Call all network notifier blocks.  Parameters and return value
 998  *      are as for notifier_call_chain().
 999  */
1000
1001 int call_netdevice_notifiers(unsigned long val, void *v)
1002 {
1003         return notifier_call_chain(&netdev_chain, val, v);
1004 }
1005
/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

/*
 *      Register one more consumer of skb time stamps by incrementing
 *      netstamp_needed. While the counter is non-zero net_timestamp()
 *      fills in real time stamps.
 */
void net_enable_timestamp(void)
{
        atomic_inc(&netstamp_needed);
}
1013
/*
 *      Drop one consumer of skb time stamps by decrementing
 *      netstamp_needed. Nothing here guards against the counter going
 *      below zero.
 */
void net_disable_timestamp(void)
{
        atomic_dec(&netstamp_needed);
}
1018
1019 static inline void net_timestamp(struct timeval *stamp)
1020 {
1021         if (atomic_read(&netstamp_needed))
1022                 do_gettimeofday(stamp);
1023         else {
1024                 stamp->tv_sec = 0;
1025                 stamp->tv_usec = 0;
1026         }
1027 }
1028
1029 /*
1030  *      Support routine. Sends outgoing frames to any network
1031  *      taps currently in use.
1032  */
1033
1034 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1035 {
1036         struct packet_type *ptype;
1037         net_timestamp(&skb->stamp);
1038
1039         rcu_read_lock();
1040         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1041                 /* Never send packets back to the socket
1042                  * they originated from - MvS (miquels@drinkel.ow.org)
1043                  */
1044                 if ((ptype->dev == dev || !ptype->dev) &&
1045                     (ptype->af_packet_priv == NULL ||
1046                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1047                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1048                         if (!skb2)
1049                                 break;
1050
1051                         /* skb->nh should be correctly
1052                            set by sender, so that the second statement is
1053                            just protection against buggy protocols.
1054                          */
1055                         skb2->mac.raw = skb2->data;
1056
1057                         if (skb2->nh.raw < skb2->data ||
1058                             skb2->nh.raw > skb2->tail) {
1059                                 if (net_ratelimit())
1060                                         printk(KERN_CRIT "protocol %04x is "
1061                                                "buggy, dev %s\n",
1062                                                skb2->protocol, dev->name);
1063                                 skb2->nh.raw = skb2->data;
1064                         }
1065
1066                         skb2->h.raw = skb2->nh.raw;
1067                         skb2->pkt_type = PACKET_OUTGOING;
1068                         ptype->func(skb2, skb->dev, ptype);
1069                 }
1070         }
1071         rcu_read_unlock();
1072 }
1073
1074 /*
1075  * Invalidate hardware checksum when packet is to be mangled, and
1076  * complete checksum manually on outgoing path.
1077  */
1078 int skb_checksum_help(struct sk_buff *skb, int inward)
1079 {
1080         unsigned int csum;
1081         int ret = 0, offset = skb->h.raw - skb->data;
1082
1083         if (inward) {
1084                 skb->ip_summed = CHECKSUM_NONE;
1085                 goto out;
1086         }
1087
1088         if (skb_cloned(skb)) {
1089                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1090                 if (ret)
1091                         goto out;
1092         }
1093
1094         if (offset > (int)skb->len)
1095                 BUG();
1096         csum = skb_checksum(skb, offset, skb->len-offset, 0);
1097
1098         offset = skb->tail - skb->h.raw;
1099         if (offset <= 0)
1100                 BUG();
1101         if (skb->csum + 2 > offset)
1102                 BUG();
1103
1104         *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1105         skb->ip_summed = CHECKSUM_NONE;
1106 out:
1107         return ret;
1108 }
1109
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when @dev lacks NETIF_F_HIGHDMA and at least one page
 * fragment of @skb lives in high memory, 0 otherwise.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
        int frag = 0;

        /* Devices that can DMA from high memory accept any fragment. */
        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        while (frag < skb_shinfo(skb)->nr_frags) {
                if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
                        return 1;
                frag++;
        }

        return 0;
}
#else
#define illegal_highdma(dev, skb)       (0)
#endif
1132
1133 extern void skb_release_data(struct sk_buff *);
1134
/*
 * Keep head the same: replace data
 *
 * Copies the whole packet (headroom, linear part and all non-linear data)
 * into one freshly allocated buffer, releases the old data and re-points
 * the skb at the new buffer. The struct sk_buff itself is kept.
 *
 * @skb:      buffer to linearize; must not be shared (BUG() otherwise)
 * @gfp_mask: allocation flags passed to kmalloc()
 *
 * Returns 0 on success or -ENOMEM when the new buffer cannot be
 * allocated, in which case the skb is left untouched.
 */
int __skb_linearize(struct sk_buff *skb, int gfp_mask)
{
        unsigned int size;
        u8 *data;
        long offset;
        struct skb_shared_info *ninfo;
        int headerlen = skb->data - skb->head;
        /* Bytes by which the non-linear data overflows the current buffer */
        int expand = (skb->tail + skb->data_len) - skb->end;

        if (skb_shared(skb))
                BUG();

        /* The buffer is never shrunk: a negative overflow counts as zero */
        if (expand <= 0)
                expand = 0;

        size = skb->end - skb->head + expand;
        size = SKB_DATA_ALIGN(size);
        /* The shared info block is placed right behind the data area */
        data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
        if (!data)
                return -ENOMEM;

        /* Copy entire thing */
        /* The negative offset makes the copy start at skb->head (headroom) */
        if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
                BUG();

        /* Set up shinfo */
        ninfo = (struct skb_shared_info*)(data + size);
        atomic_set(&ninfo->dataref, 1);
        ninfo->tso_size = skb_shinfo(skb)->tso_size;
        ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
        /* Everything is linear now: no page fragments, no fragment list */
        ninfo->nr_frags = 0;
        ninfo->frag_list = NULL;

        /* Offset between the two in bytes */
        offset = data - skb->head;

        /* Free old data. */
        skb_release_data(skb);

        skb->head = data;
        skb->end  = data + size;

        /* Set up new pointers */
        skb->h.raw   += offset;
        skb->nh.raw  += offset;
        skb->mac.raw += offset;
        skb->tail    += offset;
        skb->data    += offset;

        /* We are no longer a clone, even if we were. */
        skb->cloned    = 0;

        /* The formerly non-linear bytes now sit directly behind the old tail */
        skb->tail     += skb->data_len;
        skb->data_len  = 0;
        return 0;
}
1192
/*
 * Take / release the device transmit lock and record which CPU owns it.
 * Both are no-ops for devices that set NETIF_F_LLTX.
 *
 * The bodies are wrapped in do { } while (0) so that each macro expands
 * to exactly one statement and stays safe inside an unbraced if/else;
 * arguments are parenthesized so that any expression may be passed.
 */
#define HARD_TX_LOCK(dev, cpu) do {                             \
        if (((dev)->features & NETIF_F_LLTX) == 0) {            \
                spin_lock(&(dev)->xmit_lock);                   \
                (dev)->xmit_lock_owner = (cpu);                 \
        }                                                       \
} while (0)

#define HARD_TX_UNLOCK(dev) do {                                \
        if (((dev)->features & NETIF_F_LLTX) == 0) {            \
                (dev)->xmit_lock_owner = -1;                    \
                spin_unlock(&(dev)->xmit_lock);                 \
        }                                                       \
} while (0)
1206
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 *      When the device's qdisc has an enqueue method, its return value is
 *      passed back, with NET_XMIT_BYPASS mapped to NET_XMIT_SUCCESS. On
 *      the failure paths visible here (-ENOMEM, -ENETDOWN) the buffer is
 *      freed before returning.
 */

int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct Qdisc *q;
        /* Default error for the linearize/checksum failures below */
        int rc = -ENOMEM;

        /* A fragment list is flattened if the device cannot take one */
        if (skb_shinfo(skb)->frag_list &&
            !(dev->features & NETIF_F_FRAGLIST) &&
            __skb_linearize(skb, GFP_ATOMIC))
                goto out_kfree_skb;

        /* Fragmented skb is linearized if device does not support SG,
         * or if at least one of fragments is in highmem and device
         * does not support DMA from it.
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
            __skb_linearize(skb, GFP_ATOMIC))
                goto out_kfree_skb;

        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_HW &&
            (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
             (!(dev->features & NETIF_F_IP_CSUM) ||
              skb->protocol != htons(ETH_P_IP))))
                if (skb_checksum_help(skb, 0))
                        goto out_kfree_skb;

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        local_bh_disable();

        /* Updates of qdisc are serialized by queue_lock.
         * The struct Qdisc which is pointed to by qdisc is now a
         * rcu structure - it may be accessed without acquiring
         * a lock (but the structure may be stale.) The freeing of the
         * qdisc will be deferred until it's known that there are no
         * more references to it.
         *
         * If the qdisc has an enqueue function, we still need to
         * hold the queue_lock before calling it, since queue_lock
         * also serializes access to the device queue.
         */

        q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
        if (q->enqueue) {
                /* Grab device queue */
                spin_lock(&dev->queue_lock);

                rc = q->enqueue(skb, q);

                qdisc_run(dev);

                spin_unlock(&dev->queue_lock);
                /* Callers never see NET_XMIT_BYPASS; it is reported as success */
                rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that xmit_lock protection is necessary here.
           (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Owner == this CPU means we re-entered while already transmitting */
                if (dev->xmit_lock_owner != cpu) {

                        HARD_TX_LOCK(dev, cpu);

                        if (!netif_queue_stopped(dev)) {
                                /* Give taps a copy before the driver gets the frame */
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);

                                rc = 0;
                                /* Zero from hard_start_xmit: frame accepted, skb not freed here */
                                if (!dev->hard_start_xmit(skb, dev)) {
                                        HARD_TX_UNLOCK(dev);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately */
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /* Every path that falls through to here drops the frame */
        rc = -ENETDOWN;
        local_bh_enable();

        /* Reached by goto only before local_bh_disable(), so BHs are on here */
out_kfree_skb:
        kfree_skb(skb);
        return rc;
        /* Success exits: BHs are still disabled and the skb is not freed */
out:
        local_bh_enable();
        return rc;
}
1336
1337
1338 /*=======================================================================
1339                         Receiver routines
1340   =======================================================================*/
1341
/* Per-CPU input queue length above which netif_rx() drops packets */
int netdev_max_backlog = 300;
int weight_p = 64;            /* old backlog weight */
/* These numbers are selected based on intuition and some
 * experimentation, if you have a more scientific way of doing this
 * please go ahead and fix things.
 *
 * no_cong, lo_cong and mod_cong are the average-backlog thresholds that
 * get_sample_stats() compares against to pick a congestion level.
 */
int no_cong_thresh = 10;
int no_cong = 20;
int lo_cong = 100;
int mod_cong = 290;

/* Per-CPU receive counters (total, dropped, throttled are bumped below) */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1354
1355
1356 static void get_sample_stats(int cpu)
1357 {
1358 #ifdef RAND_LIE
1359         unsigned long rd;
1360         int rq;
1361 #endif
1362         struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1363         int blog = sd->input_pkt_queue.qlen;
1364         int avg_blog = sd->avg_blog;
1365
1366         avg_blog = (avg_blog >> 1) + (blog >> 1);
1367
1368         if (avg_blog > mod_cong) {
1369                 /* Above moderate congestion levels. */
1370                 sd->cng_level = NET_RX_CN_HIGH;
1371 #ifdef RAND_LIE
1372                 rd = net_random();
1373                 rq = rd % netdev_max_backlog;
1374                 if (rq < avg_blog) /* unlucky bastard */
1375                         sd->cng_level = NET_RX_DROP;
1376 #endif
1377         } else if (avg_blog > lo_cong) {
1378                 sd->cng_level = NET_RX_CN_MOD;
1379 #ifdef RAND_LIE
1380                 rd = net_random();
1381                 rq = rd % netdev_max_backlog;
1382                         if (rq < avg_blog) /* unlucky bastard */
1383                                 sd->cng_level = NET_RX_CN_HIGH;
1384 #endif
1385         } else if (avg_blog > no_cong)
1386                 sd->cng_level = NET_RX_CN_LOW;
1387         else  /* no congestion */
1388                 sd->cng_level = NET_RX_SUCCESS;
1389
1390         sd->avg_blog = avg_blog;
1391 }
1392
#ifdef OFFLINE_SAMPLE
/*
 *      Timer callback: sample the congestion statistics of the CPU it
 *      runs on and re-arm samp_timer for the next tick.
 *
 *      @dummy: timer argument, unused.
 */
static void sample_queue(unsigned long dummy)
{
        int cpu = smp_processor_id();

        get_sample_stats(cpu);

        /* 10 ms or 1 ms -- i don't care -- JHS
         *
         * The expiry is computed as an unsigned long directly from
         * jiffies. Routing it through an int would truncate the value
         * wherever long is wider than int and hand mod_timer() a bogus
         * expiry.
         */
        mod_timer(&samp_timer, jiffies + 1);
}
#endif
1405
1406
1407 /**
1408  *      netif_rx        -       post buffer to the network code
1409  *      @skb: buffer to post
1410  *
1411  *      This function receives a packet from a device driver and queues it for
1412  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1413  *      may be dropped during processing for congestion control or by the
1414  *      protocol layers.
1415  *
1416  *      return values:
1417  *      NET_RX_SUCCESS  (no congestion)
1418  *      NET_RX_CN_LOW   (low congestion)
1419  *      NET_RX_CN_MOD   (moderate congestion)
1420  *      NET_RX_CN_HIGH  (high congestion)
1421  *      NET_RX_DROP     (packet was dropped)
1422  *
1423  */
1424
1425 int netif_rx(struct sk_buff *skb)
1426 {
1427         int this_cpu;
1428         struct softnet_data *queue;
1429         unsigned long flags;
1430
1431 #ifdef CONFIG_NETPOLL
1432         if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
1433                 kfree_skb(skb);
1434                 return NET_RX_DROP;
1435         }
1436 #endif
1437
1438         if (!skb->stamp.tv_sec)
1439                 net_timestamp(&skb->stamp);
1440
1441         /*
1442          * The code is rearranged so that the path is the most
1443          * short when CPU is congested, but is still operating.
1444          */
1445         local_irq_save(flags);
1446         this_cpu = smp_processor_id();
1447         queue = &__get_cpu_var(softnet_data);
1448
1449         __get_cpu_var(netdev_rx_stat).total++;
1450         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1451                 if (queue->input_pkt_queue.qlen) {
1452                         if (queue->throttle)
1453                                 goto drop;
1454
1455 enqueue:
1456                         dev_hold(skb->dev);
1457                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1458 #ifndef OFFLINE_SAMPLE
1459                         get_sample_stats(this_cpu);
1460 #endif
1461                         local_irq_restore(flags);
1462                         return queue->cng_level;
1463                 }
1464
1465                 if (queue->throttle)
1466                         queue->throttle = 0;
1467
1468                 netif_rx_schedule(&queue->backlog_dev);
1469                 goto enqueue;
1470         }
1471
1472         if (!queue->throttle) {
1473                 queue->throttle = 1;
1474                 __get_cpu_var(netdev_rx_stat).throttled++;
1475         }
1476
1477 drop:
1478         __get_cpu_var(netdev_rx_stat).dropped++;
1479         local_irq_restore(flags);
1480
1481         kfree_skb(skb);
1482         return NET_RX_DROP;
1483 }
1484
/*
 *      Post @skb through netif_rx() with preemption disabled and, if that
 *      left a softirq pending on this CPU, run the softirqs right away.
 *      Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        preempt_disable();

        ret = netif_rx(skb);

        /* Process pending softirqs now rather than waiting for them. */
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return ret;
}

EXPORT_SYMBOL(netif_rx_ni);
1499
1500 static __inline__ void skb_bond(struct sk_buff *skb)
1501 {
1502         struct net_device *dev = skb->dev;
1503
1504         if (dev->master) {
1505                 skb->real_dev = skb->dev;
1506                 skb->dev = dev->master;
1507         }
1508 }
1509
/*
 *      Transmit softirq handler for the current CPU. It does two things:
 *      frees every skb parked on the per-CPU completion queue, and runs
 *      the qdisc of every device on the per-CPU output queue.
 *
 *      @h: softirq action descriptor, unused here.
 */
static void net_tx_action(struct softirq_action *h)
{
        struct softnet_data *sd = &__get_cpu_var(softnet_data);

        if (sd->completion_queue) {
                struct sk_buff *clist;

                /* Detach the whole list with irqs off, then free it with irqs on */
                local_irq_disable();
                clist = sd->completion_queue;
                sd->completion_queue = NULL;
                local_irq_enable();

                while (clist) {
                        struct sk_buff *skb = clist;
                        clist = clist->next;

                        /* Buffers on this list are expected to have no users left */
                        BUG_TRAP(!atomic_read(&skb->users));
                        __kfree_skb(skb);
                }
        }

        if (sd->output_queue) {
                struct net_device *head;

                /* Same pattern: take the list privately, then walk it */
                local_irq_disable();
                head = sd->output_queue;
                sd->output_queue = NULL;
                local_irq_enable();

                while (head) {
                        struct net_device *dev = head;
                        /* Read the link before clearing SCHED below */
                        head = head->next_sched;

                        smp_mb__before_clear_bit();
                        clear_bit(__LINK_STATE_SCHED, &dev->state);

                        /* Never spin here: if the queue lock is busy, reschedule the device */
                        if (spin_trylock(&dev->queue_lock)) {
                                qdisc_run(dev);
                                spin_unlock(&dev->queue_lock);
                        } else {
                                netif_schedule(dev);
                        }
                }
        }
}
1555
1556 static __inline__ int deliver_skb(struct sk_buff *skb,
1557                                   struct packet_type *pt_prev)
1558 {
1559         atomic_inc(&skb->users);
1560         return pt_prev->func(skb, skb->dev, pt_prev);
1561 }
1562
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);

/*
 *      Pass a received frame to the bridge hook when its device is a
 *      bridge port. Loopback packets and devices without a bridge port
 *      are left alone (return 0). A handler still pending in *pt_prev is
 *      delivered first, with its result stored in *ret. Otherwise the
 *      return value is that of br_handle_frame_hook().
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
                                    struct packet_type **pt_prev, int *ret)
{
        struct net_bridge_port *port;

        if ((*pskb)->pkt_type == PACKET_LOOPBACK)
                return 0;

        port = rcu_dereference((*pskb)->dev->br_port);
        if (port == NULL)
                return 0;

        /* Flush the pending handler before the bridge sees the frame. */
        if (*pt_prev) {
                *ret = deliver_skb(*pskb, *pt_prev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, pskb);
}
#else
#define handle_bridge(skb, pt_prev, ret)        (0)
#endif
1585
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesnt stop any functionality; if you dont have
 * the ingress scheduler, you just cant add policies on ingress.
 *
 */

/*
 * Run @skb through the ingress qdisc of its device, if it has one.
 *
 * Returns TC_ACT_OK when the device has no ingress qdisc, TC_ACT_SHOT
 * when the redirect counter carried in skb->tc_verd already exceeds
 * MAX_RED_LOOP, and otherwise the result of the qdisc's enqueue method.
 */
static int ing_filter(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct Qdisc *q;
        int result = TC_ACT_OK;
        __u32 ttl;

        if (!dev->qdisc_ingress)
                return TC_ACT_OK;

        /* The counter is checked first and only then incremented. */
        ttl = (__u32) G_TC_RTTL(skb->tc_verd);
        if (MAX_RED_LOOP < ttl) {
                printk("Redir loop detected Dropping packet (%s->%s)\n",
                        skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
                return TC_ACT_SHOT;
        }
        ttl++;

        skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);

        /* Fall back to the receiving device when no input device is set. */
        if (NULL == skb->input_dev) {
                skb->input_dev = skb->dev;
                printk("ing_filter:  fixed  %s out %s\n",skb->input_dev->name,skb->dev->name);
        }

        /* Re-read the qdisc under the lock before calling into it. */
        spin_lock(&dev->ingress_lock);
        q = dev->qdisc_ingress;
        if (q != NULL)
                result = q->enqueue(skb, q);
        spin_unlock(&dev->ingress_lock);

        return result;
}
#endif
1626
/*
 *      Deliver a received buffer to the protocol handlers.
 *
 *      The skb is offered first to every handler on ptype_all and then to
 *      the handlers in the ptype_base bucket matching skb->protocol, in
 *      both cases restricted to handlers bound to skb->dev or to no device.
 *      In between, ingress classification (CONFIG_NET_CLS_ACT), the
 *      diverter and the bridge hook get a chance to consume the packet.
 *
 *      Delivery is one handler behind: each match is remembered in pt_prev
 *      and the previous match is served through deliver_skb(), which takes
 *      an extra reference. The final handler is called directly, without
 *      the extra reference.
 *
 *      Returns the last handler's return value, NET_RX_DROP when nothing
 *      took the packet, or the ing_filter() verdict when classification
 *      shot or stole it.
 */
int netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        int ret = NET_RX_DROP;
        unsigned short type;

#ifdef CONFIG_NETPOLL
        if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
                kfree_skb(skb);
                return NET_RX_DROP;
        }
#endif

        /* Stamp the packet unless it already carries a time stamp */
        if (!skb->stamp.tv_sec)
                net_timestamp(&skb->stamp);

        /* May redirect skb->dev to the master of the receiving device */
        skb_bond(skb);

        __get_cpu_var(netdev_rx_stat).total++;

        /* Network and transport headers both start at the current data */
        skb->h.raw = skb->nh.raw = skb->data;
        skb->mac_len = skb->nh.raw - skb->mac.raw;

        pt_prev = NULL;

        rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip both the taps and classification */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        /* Flush the pending tap so that it is served before classification */
        if (pt_prev) {
                ret = deliver_skb(skb, pt_prev);
                pt_prev = NULL; /* noone else should process this after*/
        } else {
                skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
        }

        ret = ing_filter(skb);

        /* Note: on this exit ret carries the TC_ACT_* verdict */
        if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
                kfree_skb(skb);
                goto out;
        }

        skb->tc_verd = 0;
ncls:
#endif

        handle_diverter(skb);

        /* Non-zero from the bridge path ends processing here */
        if (handle_bridge(&skb, &pt_prev, &ret))
                goto out;

        type = skb->protocol;
        /* Buckets are indexed by the low four bits of the protocol number */
        list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
                if (ptype->type == type &&
                    (!ptype->dev || ptype->dev == skb->dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev);
                        pt_prev = ptype;
                }
        }

        /* The last handler is called without taking an extra reference */
        if (pt_prev) {
                ret = pt_prev->func(skb, skb->dev, pt_prev);
        } else {
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
1717
/*
 *      Poll method that drains the current CPU's input packet queue.
 *
 *      @backlog_dev: device whose quota is charged for the work done
 *      @budget:      remaining budget of the caller, reduced by the
 *                    number of packets processed
 *
 *      Packets are dequeued and passed to netif_receive_skb() until the
 *      queue is empty, the quota (the smaller of backlog_dev->quota and
 *      *budget) is used up, or more than one jiffy has passed.
 *
 *      Returns 0 when the queue was emptied, in which case backlog_dev is
 *      taken off the poll list and the throttle is lifted; returns -1
 *      when it stopped early and packets may remain.
 */
static int process_backlog(struct net_device *backlog_dev, int *budget)
{
        int work = 0;
        int quota = min(backlog_dev->quota, *budget);
        struct softnet_data *queue = &__get_cpu_var(softnet_data);
        unsigned long start_time = jiffies;

        for (;;) {
                struct sk_buff *skb;
                struct net_device *dev;

                /* Dequeue with irqs off; job_done is entered with irqs still off */
                local_irq_disable();
                skb = __skb_dequeue(&queue->input_pkt_queue);
                if (!skb)
                        goto job_done;
                local_irq_enable();

                /* Saved now: skb must not be touched after netif_receive_skb() */
                dev = skb->dev;

                netif_receive_skb(skb);

                /* Drop the reference netif_rx() took when queueing the skb */
                dev_put(dev);

                work++;

                if (work >= quota || jiffies - start_time > 1)
                        break;

        }

        /* Stopped early: charge the work and report that more may be pending */
        backlog_dev->quota -= work;
        *budget -= work;
        return -1;

job_done:
        backlog_dev->quota -= work;
        *budget -= work;

        /* Queue is empty: leave the poll list and allow rescheduling */
        list_del(&backlog_dev->poll_list);
        smp_mb__before_clear_bit();
        netif_poll_enable(backlog_dev);

        if (queue->throttle)
                queue->throttle = 0;
        local_irq_enable();
        return 0;
}
1765
/*
 *      Softirq handler that services the devices on this CPU's
 *      softnet_data poll_list.
 *
 *      The overall budget starts at netdev_max_backlog.  The device at the
 *      head of the list is polled through dev->poll(dev, &budget).  A device
 *      whose quota is used up, or whose poll routine returns non-zero, is
 *      moved to the tail of the list with its quota refilled from
 *      dev->weight; otherwise dev_put() is called on it.
 *
 *      If the budget runs out or more than one jiffy has elapsed while the
 *      list is still non-empty, time_squeeze is incremented in this CPU's
 *      netdev_rx_stat and NET_RX_SOFTIRQ is raised again before returning.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *queue = &__get_cpu_var(softnet_data);
        unsigned long start_time = jiffies;
        int budget = netdev_max_backlog;


        local_irq_disable();

        /* The list is inspected and modified only with interrupts off. */
        while (!list_empty(&queue->poll_list)) {
                struct net_device *dev;

                if (budget <= 0 || jiffies - start_time > 1)
                        goto softnet_break;

                /* Interrupts are on while the device's poll routine runs. */
                local_irq_enable();

                dev = list_entry(queue->poll_list.next,
                                 struct net_device, poll_list);

                if (dev->quota <= 0 || dev->poll(dev, &budget)) {
                        /* Not finished: rotate to the tail, refill quota. */
                        local_irq_disable();
                        list_del(&dev->poll_list);
                        list_add_tail(&dev->poll_list, &queue->poll_list);
                        if (dev->quota < 0)
                                dev->quota += dev->weight;
                        else
                                dev->quota = dev->weight;
                } else {
                        /*
                         * poll returned 0.  The list is not touched here;
                         * process_backlog() removes itself from the list
                         * before returning 0.
                         */
                        dev_put(dev);
                        local_irq_disable();
                }
        }
out:
        local_irq_enable();
        return;

softnet_break:
        /* Reached with interrupts disabled; "out" re-enables them. */
        __get_cpu_var(netdev_rx_stat).time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
1808
/*
 *      Per-family SIOCGIFCONF handlers, indexed by address family.  Slots
 *      are filled by register_gifconf() and walked by dev_ifconf(); a NULL
 *      slot means no handler is registered for that family.
 */
static gifconf_func_t * gifconf_list [NPROTO];
1810
1811 /**
1812  *      register_gifconf        -       register a SIOCGIF handler
1813  *      @family: Address family
1814  *      @gifconf: Function handler
1815  *
1816  *      Register protocol dependent address dumping routines. The handler
1817  *      that is passed must not be freed or reused until it has been replaced
1818  *      by another handler.
1819  */
1820 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1821 {
1822         if (family >= NPROTO)
1823                 return -EINVAL;
1824         gifconf_list[family] = gifconf;
1825         return 0;
1826 }
1827
1828
1829 /*
1830  *      Map an interface index to its name (SIOCGIFNAME)
1831  */
1832
1833 /*
1834  *      We need this ioctl for efficient implementation of the
1835  *      if_indextoname() function required by the IPv6 API.  Without
1836  *      it, we would have to search all the interfaces to find a
1837  *      match.  --pb
1838  */
1839
1840 static int dev_ifname(struct ifreq __user *arg)
1841 {
1842         struct net_device *dev;
1843         struct ifreq ifr;
1844
1845         /*
1846          *      Fetch the caller's info block.
1847          */
1848
1849         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1850                 return -EFAULT;
1851
1852         read_lock(&dev_base_lock);
1853         dev = __dev_get_by_index(ifr.ifr_ifindex);
1854         if (!dev) {
1855                 read_unlock(&dev_base_lock);
1856                 return -ENODEV;
1857         }
1858
1859         strcpy(ifr.ifr_name, dev->name);
1860         read_unlock(&dev_base_lock);
1861
1862         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1863                 return -EFAULT;
1864         return 0;
1865 }
1866
1867 /*
1868  *      Perform a SIOCGIFCONF call. This structure will change
1869  *      size eventually, and there is nothing I can do about it.
1870  *      Thus we will need a 'compatibility mode'.
1871  */
1872
1873 static int dev_ifconf(char __user *arg)
1874 {
1875         struct ifconf ifc;
1876         struct net_device *dev;
1877         char __user *pos;
1878         int len;
1879         int total;
1880         int i;
1881
1882         /*
1883          *      Fetch the caller's info block.
1884          */
1885
1886         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
1887                 return -EFAULT;
1888
1889         pos = ifc.ifc_buf;
1890         len = ifc.ifc_len;
1891
1892         /*
1893          *      Loop over the interfaces, and write an info block for each.
1894          */
1895
1896         total = 0;
1897         for (dev = dev_base; dev; dev = dev->next) {
1898                 if (vx_flags(VXF_HIDE_NETIF, 0) &&
1899                         !dev_in_nx_info(dev, current->nx_info))
1900                         continue;
1901                 for (i = 0; i < NPROTO; i++) {
1902                         if (gifconf_list[i]) {
1903                                 int done;
1904                                 if (!pos)
1905                                         done = gifconf_list[i](dev, NULL, 0);
1906                                 else
1907                                         done = gifconf_list[i](dev, pos + total,
1908                                                                len - total);
1909                                 if (done < 0)
1910                                         return -EFAULT;
1911                                 total += done;
1912                         }
1913                 }
1914         }
1915
1916         /*
1917          *      All done.  Write the updated control block back to the caller.
1918          */
1919         ifc.ifc_len = total;
1920
1921         /*
1922          *      Both BSD and Solaris return 0 here, so we do too.
1923          */
1924         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
1925 }
1926
1927 #ifdef CONFIG_PROC_FS
1928 /*
1929  *      This is invoked by the /proc filesystem handler to display a device
1930  *      in detail.
1931  */
1932 static __inline__ struct net_device *dev_get_idx(loff_t pos)
1933 {
1934         struct net_device *dev;
1935         loff_t i;
1936
1937         for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
1938
1939         return i == pos ? dev : NULL;
1940 }
1941
1942 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
1943 {
1944         read_lock(&dev_base_lock);
1945         return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
1946 }
1947
1948 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1949 {
1950         ++*pos;
1951         return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
1952 }
1953
/*
 *      seq_file ->stop for the device listing: releases the dev_base_lock
 *      read lock taken in dev_seq_start().
 */
void dev_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&dev_base_lock);
}
1958
1959 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
1960 {
1961         struct nx_info *nxi = current->nx_info;
1962
1963         if (vx_flags(VXF_HIDE_NETIF, 0) && !dev_in_nx_info(dev, nxi))
1964                 return;
1965         if (dev->get_stats) {
1966                 struct net_device_stats *stats = dev->get_stats(dev);
1967
1968                 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
1969                                 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
1970                            dev->name, stats->rx_bytes, stats->rx_packets,
1971                            stats->rx_errors,
1972                            stats->rx_dropped + stats->rx_missed_errors,
1973                            stats->rx_fifo_errors,
1974                            stats->rx_length_errors + stats->rx_over_errors +
1975                              stats->rx_crc_errors + stats->rx_frame_errors,
1976                            stats->rx_compressed, stats->multicast,
1977                            stats->tx_bytes, stats->tx_packets,
1978                            stats->tx_errors, stats->tx_dropped,
1979                            stats->tx_fifo_errors, stats->collisions,
1980                            stats->tx_carrier_errors +
1981                              stats->tx_aborted_errors +
1982                              stats->tx_window_errors +
1983                              stats->tx_heartbeat_errors,
1984                            stats->tx_compressed);
1985         } else
1986                 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
1987 }
1988
1989 /*
1990  *      Called from the PROCfs module. This now uses the new arbitrary sized
1991  *      /proc/net interface to create /proc/net/dev
1992  */
1993 static int dev_seq_show(struct seq_file *seq, void *v)
1994 {
1995         if (v == SEQ_START_TOKEN)
1996                 seq_puts(seq, "Inter-|   Receive                            "
1997                               "                    |  Transmit\n"
1998                               " face |bytes    packets errs drop fifo frame "
1999                               "compressed multicast|bytes    packets errs "
2000                               "drop fifo colls carrier compressed\n");
2001         else
2002                 dev_seq_printf_stats(seq, v);
2003         return 0;
2004 }
2005
2006 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2007 {
2008         struct netif_rx_stats *rc = NULL;
2009
2010         while (*pos < NR_CPUS)
2011                 if (cpu_online(*pos)) {
2012                         rc = &per_cpu(netdev_rx_stat, *pos);
2013                         break;
2014                 } else
2015                         ++*pos;
2016         return rc;
2017 }
2018
/*
 *      seq_file ->start for softnet_stat: returns the statistics of the
 *      first online CPU at or after *pos (updating *pos), or NULL if none.
 */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
        return softnet_get_online(pos);
}
2023
2024 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2025 {
2026         ++*pos;
2027         return softnet_get_online(pos);
2028 }
2029
/*
 *      seq_file ->stop for softnet_stat.  Intentionally empty: the
 *      start/next callbacks take no lock, so there is nothing to release.
 */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2033
/*
 *      seq_file ->show for softnet_stat: prints one line for the
 *      netif_rx_stats record @v as nine 8-digit hexadecimal fields, in
 *      order: total, dropped, time_squeeze, throttled, fastroute_hit,
 *      fastroute_success, fastroute_defer, fastroute_deferred_out and
 *      cpu_collision.  Always returns 0.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
        struct netif_rx_stats *s = v;

        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
                   s->total, s->dropped, s->time_squeeze, s->throttled,
                   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
                   s->fastroute_deferred_out,
                   /* The ninth column is cpu_collision; the alternative
                    * below is compiled out.
                    */
#if 0
                   s->fastroute_latency_reduction
#else
                   s->cpu_collision
#endif
                  );
        return 0;
}
2050
/* seq_file iterator callbacks for the device listing (see dev_seq_open()). */
static struct seq_operations dev_seq_ops = {
        .start = dev_seq_start,
        .next  = dev_seq_next,
        .stop  = dev_seq_stop,
        .show  = dev_seq_show,
};
2057
/*
 *      ->open for the device listing: attaches the dev_seq_ops iterator to
 *      @file via seq_open() and returns its result.
 */
static int dev_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &dev_seq_ops);
}
2062
/*
 *      File operations registered for the "dev" proc entry in
 *      dev_proc_init(); everything except ->open is the generic seq_file
 *      implementation.
 */
static struct file_operations dev_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = dev_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
2070
/* seq_file iterator callbacks for softnet_stat (see softnet_seq_open()). */
static struct seq_operations softnet_seq_ops = {
        .start = softnet_seq_start,
        .next  = softnet_seq_next,
        .stop  = softnet_seq_stop,
        .show  = softnet_seq_show,
};
2077
/*
 *      ->open for softnet_stat: attaches the softnet_seq_ops iterator to
 *      @file via seq_open() and returns its result.
 */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &softnet_seq_ops);
}
2082
/*
 *      File operations registered for the "softnet_stat" proc entry in
 *      dev_proc_init(); everything except ->open is the generic seq_file
 *      implementation.
 */
static struct file_operations softnet_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = softnet_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
2090
/*
 *      wireless_proc_init() is defined elsewhere when WIRELESS_EXT is set;
 *      otherwise it collapses to 0, which dev_proc_init() treats as success.
 */
#ifdef WIRELESS_EXT
extern int wireless_proc_init(void);
#else
#define wireless_proc_init() 0
#endif
2096
2097 static int __init dev_proc_init(void)
2098 {
2099         int rc = -ENOMEM;
2100
2101         if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2102                 goto out;
2103         if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2104                 goto out_dev;
2105         if (wireless_proc_init())
2106                 goto out_softnet;
2107         rc = 0;
2108 out:
2109         return rc;
2110 out_softnet:
2111         proc_net_remove("softnet_stat");
2112 out_dev:
2113         proc_net_remove("dev");
2114         goto out;
2115 }
2116 #else
2117 #define dev_proc_init() 0
2118 #endif  /* CONFIG_PROC_FS */
2119
2120
2121 /**
2122  *      netdev_set_master       -       set up master/slave pair
2123  *      @slave: slave device
2124  *      @master: new master device
2125  *
2126  *      Changes the master device of the slave. Pass %NULL to break the
2127  *      bonding. The caller must hold the RTNL semaphore. On a failure
2128  *      a negative errno code is returned. On success the reference counts
2129  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2130  *      function returns zero.
2131  */
2132 int netdev_set_master(struct net_device *slave, struct net_device *master)
2133 {
2134         struct net_device *old = slave->master;
2135
2136         ASSERT_RTNL();
2137
2138         if (master) {
2139                 if (old)
2140                         return -EBUSY;
2141                 dev_hold(master);
2142         }
2143
2144         slave->master = master;
2145
2146         synchronize_net();
2147
2148         if (old)
2149                 dev_put(old);
2150
2151         if (master)
2152                 slave->flags |= IFF_SLAVE;
2153         else
2154                 slave->flags &= ~IFF_SLAVE;
2155
2156         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2157         return 0;
2158 }
2159
2160 /**
2161  *      dev_set_promiscuity     - update promiscuity count on a device
2162  *      @dev: device
2163  *      @inc: modifier
2164  *
2165  *      Add or remove promsicuity from a device. While the count in the device
2166  *      remains above zero the interface remains promiscuous. Once it hits zero
2167  *      the device reverts back to normal filtering operation. A negative inc
2168  *      value is used to drop promiscuity on the device.
2169  */
2170 void dev_set_promiscuity(struct net_device *dev, int inc)
2171 {
2172         unsigned short old_flags = dev->flags;
2173
2174         dev->flags |= IFF_PROMISC;
2175         if ((dev->promiscuity += inc) == 0)
2176                 dev->flags &= ~IFF_PROMISC;
2177         if (dev->flags ^ old_flags) {
2178                 dev_mc_upload(dev);
2179                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2180                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2181                                                                "left");
2182         }
2183 }
2184
2185 /**
2186  *      dev_set_allmulti        - update allmulti count on a device
2187  *      @dev: device
2188  *      @inc: modifier
2189  *
2190  *      Add or remove reception of all multicast frames to a device. While the
2191  *      count in the device remains above zero the interface remains listening
2192  *      to all interfaces. Once it hits zero the device reverts back to normal
2193  *      filtering operation. A negative @inc value is used to drop the counter
2194  *      when releasing a resource needing all multicasts.
2195  */
2196
2197 void dev_set_allmulti(struct net_device *dev, int inc)
2198 {
2199         unsigned short old_flags = dev->flags;
2200
2201         dev->flags |= IFF_ALLMULTI;
2202         if ((dev->allmulti += inc) == 0)
2203                 dev->flags &= ~IFF_ALLMULTI;
2204         if (dev->flags ^ old_flags)
2205                 dev_mc_upload(dev);
2206 }
2207
2208 unsigned dev_get_flags(const struct net_device *dev)
2209 {
2210         unsigned flags;
2211
2212         flags = (dev->flags & ~(IFF_PROMISC |
2213                                 IFF_ALLMULTI |
2214                                 IFF_RUNNING)) |
2215                 (dev->gflags & (IFF_PROMISC |
2216                                 IFF_ALLMULTI));
2217
2218         if (netif_running(dev) && netif_carrier_ok(dev))
2219                 flags |= IFF_RUNNING;
2220
2221         return flags;
2222 }
2223
/*
 *      Apply the requested flag word @flags to @dev.
 *
 *      Only IFF_DEBUG, IFF_NOTRAILERS, IFF_NOARP, IFF_DYNAMIC,
 *      IFF_MULTICAST, IFF_PORTSEL and IFF_AUTOMEDIA are copied directly
 *      from @flags.  A change of IFF_UP is carried out through dev_open()
 *      or dev_close().  IFF_PROMISC and IFF_ALLMULTI requests are tracked
 *      in dev->gflags and applied through dev_set_promiscuity() and
 *      dev_set_allmulti().
 *
 *      A NETDEV_CHANGE notification is sent when the device is up and any
 *      bit other than IFF_UP, IFF_PROMISC, IFF_ALLMULTI or IFF_VOLATILE
 *      changed; rtmsg_ifinfo(RTM_NEWLINK) is sent when any bit of
 *      dev->flags changed.
 *
 *      Returns the result of dev_open()/dev_close() when IFF_UP was
 *      toggled, otherwise 0.
 */
int dev_change_flags(struct net_device *dev, unsigned flags)
{
        int ret;
        int old_flags = dev->flags;

        /*
         *      Set the flags on our device.
         */

        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
                               IFF_AUTOMEDIA)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));

        /*
         *      Load in the correct multicast list now the flags have changed.
         */

        dev_mc_upload(dev);

        /*
         *      Have we downed the interface. We handle IFF_UP ourselves
         *      according to user attempts to set it, rather than blindly
         *      setting it.
         */

        ret = 0;
        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
                /* Was up: close it.  Was down: open it. */
                ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);

                if (!ret)
                        dev_mc_upload(dev);
        }

        /* Notify only when the device is up and a non-excluded bit changed. */
        if (dev->flags & IFF_UP &&
            ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
                                          IFF_VOLATILE)))
                notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);

        /* The request differs from the state recorded in gflags. */
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? +1 : -1;
                dev->gflags ^= IFF_PROMISC;
                dev_set_promiscuity(dev, inc);
        }

        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
           is important. Some (broken) drivers set IFF_PROMISC, when
           IFF_ALLMULTI is requested not asking us and not reporting.
         */
        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
                int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
                dev->gflags ^= IFF_ALLMULTI;
                dev_set_allmulti(dev, inc);
        }

        /* Report the set of changed bits over rtnetlink. */
        if (old_flags ^ dev->flags)
                rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);

        return ret;
}
2285
2286 int dev_set_mtu(struct net_device *dev, int new_mtu)
2287 {
2288         int err;
2289
2290         if (new_mtu == dev->mtu)
2291                 return 0;
2292
2293         /*      MTU must be positive.    */
2294         if (new_mtu < 0)
2295                 return -EINVAL;
2296
2297         if (!netif_device_present(dev))
2298                 return -ENODEV;
2299
2300         err = 0;
2301         if (dev->change_mtu)
2302                 err = dev->change_mtu(dev, new_mtu);
2303         else
2304                 dev->mtu = new_mtu;
2305         if (!err && dev->flags & IFF_UP)
2306                 notifier_call_chain(&netdev_chain,
2307                                     NETDEV_CHANGEMTU, dev);
2308         return err;
2309 }
2310
2311
/*
 *      Perform the SIOCxIFxxx calls.
 *
 *      Looks up the device named by ifr->ifr_name with __dev_get_by_name()
 *      and carries out @cmd on it, taking arguments from and storing
 *      results in @ifr.  @ifr is a kernel-side structure: this function
 *      performs no user-space copies and takes no locks itself; the
 *      visible caller, dev_ioctl(), wraps it in dev_base_lock or the RTNL
 *      lock.
 *
 *      Returns 0 or a negative errno (-ENODEV when no device has that
 *      name, -EINVAL for an unrecognised command), or whatever the driver
 *      hook invoked for the command returns.
 */
static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
{
        int err;
        struct net_device *dev = __dev_get_by_name(ifr->ifr_name);

        if (!dev)
                return -ENODEV;

        switch (cmd) {
                case SIOCGIFFLAGS:      /* Get interface flags */
                        ifr->ifr_flags = dev_get_flags(dev);
                        return 0;

                case SIOCSIFFLAGS:      /* Set interface flags */
                        return dev_change_flags(dev, ifr->ifr_flags);

                case SIOCGIFMETRIC:     /* Get the metric on the interface
                                           (currently unused) */
                        ifr->ifr_metric = 0;
                        return 0;

                case SIOCSIFMETRIC:     /* Set the metric on the interface
                                           (currently unused) */
                        return -EOPNOTSUPP;

                case SIOCGIFMTU:        /* Get the MTU of a device */
                        ifr->ifr_mtu = dev->mtu;
                        return 0;

                case SIOCSIFMTU:        /* Set the MTU of a device */
                        return dev_set_mtu(dev, ifr->ifr_mtu);

                case SIOCGIFHWADDR:     /* Get the hardware address */
                        /* Copy at most sizeof(sa_data) bytes of the address;
                         * a zero-length address is reported as all zeroes.
                         */
                        if (!dev->addr_len)
                                memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
                        else
                                memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
                                       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
                        ifr->ifr_hwaddr.sa_family = dev->type;
                        return 0;

                case SIOCSIFHWADDR:     /* Set the hardware address */
                        if (!dev->set_mac_address)
                                return -EOPNOTSUPP;
                        /* sa_family must match the device type. */
                        if (ifr->ifr_hwaddr.sa_family != dev->type)
                                return -EINVAL;
                        if (!netif_device_present(dev))
                                return -ENODEV;
                        err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
                        if (!err)
                                notifier_call_chain(&netdev_chain,
                                                    NETDEV_CHANGEADDR, dev);
                        return err;

                case SIOCSIFHWBROADCAST:        /* Set the broadcast address */
                        if (ifr->ifr_hwaddr.sa_family != dev->type)
                                return -EINVAL;
                        memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
                               min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
                        notifier_call_chain(&netdev_chain,
                                            NETDEV_CHANGEADDR, dev);
                        return 0;

                case SIOCGIFMAP:        /* Report the device's resource map */
                        ifr->ifr_map.mem_start = dev->mem_start;
                        ifr->ifr_map.mem_end   = dev->mem_end;
                        ifr->ifr_map.base_addr = dev->base_addr;
                        ifr->ifr_map.irq       = dev->irq;
                        ifr->ifr_map.dma       = dev->dma;
                        ifr->ifr_map.port      = dev->if_port;
                        return 0;

                case SIOCSIFMAP:        /* Delegated to the set_config hook */
                        if (dev->set_config) {
                                if (!netif_device_present(dev))
                                        return -ENODEV;
                                return dev->set_config(dev, &ifr->ifr_map);
                        }
                        return -EOPNOTSUPP;

                case SIOCADDMULTI:      /* Add a multicast address */
                        if (!dev->set_multicast_list ||
                            ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
                                return -EINVAL;
                        if (!netif_device_present(dev))
                                return -ENODEV;
                        return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
                                          dev->addr_len, 1);

                case SIOCDELMULTI:      /* Remove a multicast address */
                        if (!dev->set_multicast_list ||
                            ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
                                return -EINVAL;
                        if (!netif_device_present(dev))
                                return -ENODEV;
                        return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
                                             dev->addr_len, 1);

                case SIOCGIFINDEX:      /* Get the interface index */
                        ifr->ifr_ifindex = dev->ifindex;
                        return 0;

                case SIOCGIFTXQLEN:     /* Get the transmit queue length */
                        ifr->ifr_qlen = dev->tx_queue_len;
                        return 0;

                case SIOCSIFTXQLEN:     /* Set the transmit queue length */
                        if (ifr->ifr_qlen < 0)
                                return -EINVAL;
                        dev->tx_queue_len = ifr->ifr_qlen;
                        return 0;

                case SIOCSIFNAME:       /* Rename the interface */
                        /* Force termination before the name is used. */
                        ifr->ifr_newname[IFNAMSIZ-1] = '\0';
                        return dev_change_name(dev, ifr->ifr_newname);

                /*
                 *      Unknown or private ioctl
                 */

                default:
                        /* Only the commands listed here are passed on to the
                         * driver's do_ioctl hook; anything else is -EINVAL.
                         */
                        if ((cmd >= SIOCDEVPRIVATE &&
                            cmd <= SIOCDEVPRIVATE + 15) ||
                            cmd == SIOCBONDENSLAVE ||
                            cmd == SIOCBONDRELEASE ||
                            cmd == SIOCBONDSETHWADDR ||
                            cmd == SIOCBONDSLAVEINFOQUERY ||
                            cmd == SIOCBONDINFOQUERY ||
                            cmd == SIOCBONDCHANGEACTIVE ||
                            cmd == SIOCGMIIPHY ||
                            cmd == SIOCGMIIREG ||
                            cmd == SIOCSMIIREG ||
                            cmd == SIOCBRADDIF ||
                            cmd == SIOCBRDELIF ||
                            cmd == SIOCWANDEV) {
                                /* No do_ioctl hook: -EOPNOTSUPP. */
                                err = -EOPNOTSUPP;
                                if (dev->do_ioctl) {
                                        if (netif_device_present(dev))
                                                err = dev->do_ioctl(dev, ifr,
                                                                    cmd);
                                        else
                                                err = -ENODEV;
                                }
                        } else
                                err = -EINVAL;

        }
        return err;
}
2464
2465 /*
2466  *      This function handles all "interface"-type I/O control requests. The actual
2467  *      'doing' part of this is dev_ifsioc above.
2468  */
2469
2470 /**
2471  *      dev_ioctl       -       network device ioctl
2472  *      @cmd: command to issue
2473  *      @arg: pointer to a struct ifreq in user space
2474  *
2475  *      Issue ioctl functions to devices. This is normally called by the
2476  *      user space syscall interfaces but can sometimes be useful for
2477  *      other purposes. The return value is the return from the syscall if
2478  *      positive or a negative errno code on error.
2479  */
2480
2481 int dev_ioctl(unsigned int cmd, void __user *arg)
2482 {
2483         struct ifreq ifr;
2484         int ret;
2485         char *colon;
2486
2487         /* One special case: SIOCGIFCONF takes ifconf argument
2488            and requires shared lock, because it sleeps writing
2489            to user space.
2490          */
2491
2492         if (cmd == SIOCGIFCONF) {
2493                 rtnl_shlock();
2494                 ret = dev_ifconf((char __user *) arg);
2495                 rtnl_shunlock();
2496                 return ret;
2497         }
2498         if (cmd == SIOCGIFNAME)
2499                 return dev_ifname((struct ifreq __user *)arg);
2500
2501         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2502                 return -EFAULT;
2503
2504         ifr.ifr_name[IFNAMSIZ-1] = 0;
2505
2506         colon = strchr(ifr.ifr_name, ':');
2507         if (colon)
2508                 *colon = 0;
2509
2510         /*
2511          *      See which interface the caller is talking about.
2512          */
2513
2514         switch (cmd) {
2515                 /*
2516                  *      These ioctl calls:
2517                  *      - can be done by all.
2518                  *      - atomic and do not require locking.
2519                  *      - return a value
2520                  */
2521                 case SIOCGIFFLAGS:
2522                 case SIOCGIFMETRIC:
2523                 case SIOCGIFMTU:
2524                 case SIOCGIFHWADDR:
2525                 case SIOCGIFSLAVE:
2526                 case SIOCGIFMAP:
2527                 case SIOCGIFINDEX:
2528                 case SIOCGIFTXQLEN:
2529                         dev_load(ifr.ifr_name);
2530                         read_lock(&dev_base_lock);
2531                         ret = dev_ifsioc(&ifr, cmd);
2532                         read_unlock(&dev_base_lock);
2533                         if (!ret) {
2534                                 if (colon)
2535                                         *colon = ':';
2536                                 if (copy_to_user(arg, &ifr,
2537                                                  sizeof(struct ifreq)))
2538                                         ret = -EFAULT;
2539                         }
2540                         return ret;
2541
2542                 case SIOCETHTOOL:
2543                         dev_load(ifr.ifr_name);
2544                         rtnl_lock();
2545                         ret = dev_ethtool(&ifr);
2546                         rtnl_unlock();
2547                         if (!ret) {
2548                                 if (colon)
2549                                         *colon = ':';
2550                                 if (copy_to_user(arg, &ifr,
2551                                                  sizeof(struct ifreq)))
2552                                         ret = -EFAULT;
2553                         }
2554                         return ret;
2555
2556                 /*
2557                  *      These ioctl calls:
2558                  *      - require superuser power.
2559                  *      - require strict serialization.
2560                  *      - return a value
2561                  */
2562                 case SIOCGMIIPHY:
2563                 case SIOCGMIIREG:
2564                 case SIOCSIFNAME:
2565                         if (!capable(CAP_NET_ADMIN))
2566                                 return -EPERM;
2567                         dev_load(ifr.ifr_name);
2568                         rtnl_lock();
2569                         ret = dev_ifsioc(&ifr, cmd);
2570                         rtnl_unlock();
2571                         if (!ret) {
2572                                 if (colon)
2573                                         *colon = ':';
2574                                 if (copy_to_user(arg, &ifr,
2575                                                  sizeof(struct ifreq)))
2576                                         ret = -EFAULT;
2577                         }
2578                         return ret;
2579
2580                 /*
2581                  *      These ioctl calls:
2582                  *      - require superuser power.
2583                  *      - require strict serialization.
2584                  *      - do not return a value
2585                  */
2586                 case SIOCSIFFLAGS:
2587                 case SIOCSIFMETRIC:
2588                 case SIOCSIFMTU:
2589                 case SIOCSIFMAP:
2590                 case SIOCSIFHWADDR:
2591                 case SIOCSIFSLAVE:
2592                 case SIOCADDMULTI:
2593                 case SIOCDELMULTI:
2594                 case SIOCSIFHWBROADCAST:
2595                 case SIOCSIFTXQLEN:
2596                 case SIOCSMIIREG:
2597                 case SIOCBONDENSLAVE:
2598                 case SIOCBONDRELEASE:
2599                 case SIOCBONDSETHWADDR:
2600                 case SIOCBONDSLAVEINFOQUERY:
2601                 case SIOCBONDINFOQUERY:
2602                 case SIOCBONDCHANGEACTIVE:
2603                 case SIOCBRADDIF:
2604                 case SIOCBRDELIF:
2605                         if (!capable(CAP_NET_ADMIN))
2606                                 return -EPERM;
2607                         dev_load(ifr.ifr_name);
2608                         rtnl_lock();
2609                         ret = dev_ifsioc(&ifr, cmd);
2610                         rtnl_unlock();
2611                         return ret;
2612
2613                 case SIOCGIFMEM:
2614                         /* Get the per device memory space. We can add this but
2615                          * currently do not support it */
2616                 case SIOCSIFMEM:
2617                         /* Set the per device memory buffer space.
2618                          * Not applicable in our case */
2619                 case SIOCSIFLINK:
2620                         return -EINVAL;
2621
2622                 /*
2623                  *      Unknown or private ioctl.
2624                  */
2625                 default:
2626                         if (cmd == SIOCWANDEV ||
2627                             (cmd >= SIOCDEVPRIVATE &&
2628                              cmd <= SIOCDEVPRIVATE + 15)) {
2629                                 dev_load(ifr.ifr_name);
2630                                 rtnl_lock();
2631                                 ret = dev_ifsioc(&ifr, cmd);
2632                                 rtnl_unlock();
2633                                 if (!ret && copy_to_user(arg, &ifr,
2634                                                          sizeof(struct ifreq)))
2635                                         ret = -EFAULT;
2636                                 return ret;
2637                         }
2638 #ifdef WIRELESS_EXT
2639                         /* Take care of Wireless Extensions */
2640                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2641                                 /* If command is `set a parameter', or
2642                                  * `get the encoding parameters', check if
2643                                  * the user has the right to do it */
2644                                 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
2645                                         if (!capable(CAP_NET_ADMIN))
2646                                                 return -EPERM;
2647                                 }
2648                                 dev_load(ifr.ifr_name);
2649                                 rtnl_lock();
2650                                 /* Follow me in net/core/wireless.c */
2651                                 ret = wireless_process_ioctl(&ifr, cmd);
2652                                 rtnl_unlock();
2653                                 if (IW_IS_GET(cmd) &&
2654                                     copy_to_user(arg, &ifr,
2655                                                  sizeof(struct ifreq)))
2656                                         ret = -EFAULT;
2657                                 return ret;
2658                         }
2659 #endif  /* WIRELESS_EXT */
2660                         return -EINVAL;
2661         }
2662 }
2663
2664
/**
 *	dev_new_index	-	allocate an ifindex
 *
 *	Advances a function-local static counter until it lands on a value
 *	for which __dev_get_by_index() finds no device, and returns that
 *	value.  Whenever the incremented counter is not positive it is reset
 *	to 1, so the returned index is always greater than zero.
 *
 *	The caller must hold the rtnl semaphore or the dev_base_lock to be
 *	sure the returned index remains unique.
 */
static int dev_new_index(void)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(ifindex));

	return ifindex;
}
2682
2683 static int dev_boot_phase = 1;
2684
2685 /* Delayed registration/unregisteration */
2686 static DEFINE_SPINLOCK(net_todo_list_lock);
2687 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2688
2689 static inline void net_set_todo(struct net_device *dev)
2690 {
2691         spin_lock(&net_todo_list_lock);
2692         list_add_tail(&dev->todo_list, &net_todo_list);
2693         spin_unlock(&net_todo_list_lock);
2694 }
2695
2696 /**
2697  *      register_netdevice      - register a network device
2698  *      @dev: device to register
2699  *
2700  *      Take a completed network device structure and add it to the kernel
2701  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2702  *      chain. 0 is returned on success. A negative errno code is returned
2703  *      on a failure to set up the device, or if the name is a duplicate.
2704  *
2705  *      Callers must hold the rtnl semaphore. You may want
2706  *      register_netdev() instead of this.
2707  *
2708  *      BUGS:
2709  *      The locking appears insufficient to guarantee two parallel registers
2710  *      will not get the same name.
2711  */
2712
2713 int register_netdevice(struct net_device *dev)
2714 {
2715         struct hlist_head *head;
2716         struct hlist_node *p;
2717         int ret;
2718
2719         BUG_ON(dev_boot_phase);
2720         ASSERT_RTNL();
2721
2722         /* When net_device's are persistent, this will be fatal. */
2723         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2724
2725         spin_lock_init(&dev->queue_lock);
2726         spin_lock_init(&dev->xmit_lock);
2727         dev->xmit_lock_owner = -1;
2728 #ifdef CONFIG_NET_CLS_ACT
2729         spin_lock_init(&dev->ingress_lock);
2730 #endif
2731
2732         ret = alloc_divert_blk(dev);
2733         if (ret)
2734                 goto out;
2735
2736         dev->iflink = -1;
2737
2738         /* Init, if this function is available */
2739         if (dev->init) {
2740                 ret = dev->init(dev);
2741                 if (ret) {
2742                         if (ret > 0)
2743                                 ret = -EIO;
2744                         goto out_err;
2745                 }
2746         }
2747
2748         if (!dev_valid_name(dev->name)) {
2749                 ret = -EINVAL;
2750                 goto out_err;
2751         }
2752
2753         dev->ifindex = dev_new_index();
2754         if (dev->iflink == -1)
2755                 dev->iflink = dev->ifindex;
2756
2757         /* Check for existence of name */
2758         head = dev_name_hash(dev->name);
2759         hlist_for_each(p, head) {
2760                 struct net_device *d
2761                         = hlist_entry(p, struct net_device, name_hlist);
2762                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
2763                         ret = -EEXIST;
2764                         goto out_err;
2765                 }
2766         }
2767
2768         /* Fix illegal SG+CSUM combinations. */
2769         if ((dev->features & NETIF_F_SG) &&
2770             !(dev->features & (NETIF_F_IP_CSUM |
2771                                NETIF_F_NO_CSUM |
2772                                NETIF_F_HW_CSUM))) {
2773                 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2774                        dev->name);
2775                 dev->features &= ~NETIF_F_SG;
2776         }
2777
2778         /* TSO requires that SG is present as well. */
2779         if ((dev->features & NETIF_F_TSO) &&
2780             !(dev->features & NETIF_F_SG)) {
2781                 printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
2782                        dev->name);
2783                 dev->features &= ~NETIF_F_TSO;
2784         }
2785
2786         /*
2787          *      nil rebuild_header routine,
2788          *      that should be never called and used as just bug trap.
2789          */
2790
2791         if (!dev->rebuild_header)
2792                 dev->rebuild_header = default_rebuild_header;
2793
2794         /*
2795          *      Default initial state at registry is that the
2796          *      device is present.
2797          */
2798
2799         set_bit(__LINK_STATE_PRESENT, &dev->state);
2800
2801         dev->next = NULL;
2802         dev_init_scheduler(dev);
2803         write_lock_bh(&dev_base_lock);
2804         *dev_tail = dev;
2805         dev_tail = &dev->next;
2806         hlist_add_head(&dev->name_hlist, head);
2807         hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
2808         dev_hold(dev);
2809         dev->reg_state = NETREG_REGISTERING;
2810         write_unlock_bh(&dev_base_lock);
2811
2812         /* Notify protocols, that a new device appeared. */
2813         notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
2814
2815         /* Finish registration after unlock */
2816         net_set_todo(dev);
2817         ret = 0;
2818
2819 out:
2820         return ret;
2821 out_err:
2822         free_divert_blk(dev);
2823         goto out;
2824 }
2825
2826 /**
2827  *      register_netdev - register a network device
2828  *      @dev: device to register
2829  *
2830  *      Take a completed network device structure and add it to the kernel
2831  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2832  *      chain. 0 is returned on success. A negative errno code is returned
2833  *      on a failure to set up the device, or if the name is a duplicate.
2834  *
2835  *      This is a wrapper around register_netdev that takes the rtnl semaphore
2836  *      and expands the device name if you passed a format string to
2837  *      alloc_netdev.
2838  */
2839 int register_netdev(struct net_device *dev)
2840 {
2841         int err;
2842
2843         rtnl_lock();
2844
2845         /*
2846          * If the name is a format string the caller wants us to do a
2847          * name allocation.
2848          */
2849         if (strchr(dev->name, '%')) {
2850                 err = dev_alloc_name(dev, dev->name);
2851                 if (err < 0)
2852                         goto out;
2853         }
2854
2855         /*
2856          * Back compatibility hook. Kill this one in 2.5
2857          */
2858         if (dev->name[0] == 0 || dev->name[0] == ' ') {
2859                 err = dev_alloc_name(dev, "eth%d");
2860                 if (err < 0)
2861                         goto out;
2862         }
2863
2864         err = register_netdevice(dev);
2865 out:
2866         rtnl_unlock();
2867         return err;
2868 }
2869 EXPORT_SYMBOL(register_netdev);
2870
2871 /*
2872  * netdev_wait_allrefs - wait until all references are gone.
2873  *
2874  * This is called when unregistering network devices.
2875  *
2876  * Any protocol or device that holds a reference should register
2877  * for netdevice notification, and cleanup and put back the
2878  * reference if they receive an UNREGISTER event.
2879  * We can get stuck here if buggy protocols don't correctly
2880  * call dev_put.
2881  */
2882 static void netdev_wait_allrefs(struct net_device *dev)
2883 {
2884         unsigned long rebroadcast_time, warning_time;
2885
2886         rebroadcast_time = warning_time = jiffies;
2887         while (atomic_read(&dev->refcnt) != 0) {
2888                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
2889                         rtnl_shlock();
2890
2891                         /* Rebroadcast unregister notification */
2892                         notifier_call_chain(&netdev_chain,
2893                                             NETDEV_UNREGISTER, dev);
2894
2895                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
2896                                      &dev->state)) {
2897                                 /* We must not have linkwatch events
2898                                  * pending on unregister. If this
2899                                  * happens, we simply run the queue
2900                                  * unscheduled, resulting in a noop
2901                                  * for this device.
2902                                  */
2903                                 linkwatch_run_queue();
2904                         }
2905
2906                         rtnl_shunlock();
2907
2908                         rebroadcast_time = jiffies;
2909                 }
2910
2911                 msleep(250);
2912
2913                 if (time_after(jiffies, warning_time + 10 * HZ)) {
2914                         printk(KERN_EMERG "unregister_netdevice: "
2915                                "waiting for %s to become free. Usage "
2916                                "count = %d\n",
2917                                dev->name, atomic_read(&dev->refcnt));
2918                         warning_time = jiffies;
2919                 }
2920         }
2921 }
2922
2923 /* The sequence is:
2924  *
2925  *      rtnl_lock();
2926  *      ...
2927  *      register_netdevice(x1);
2928  *      register_netdevice(x2);
2929  *      ...
2930  *      unregister_netdevice(y1);
2931  *      unregister_netdevice(y2);
2932  *      ...
2933  *      rtnl_unlock();
2934  *      free_netdev(y1);
2935  *      free_netdev(y2);
2936  *
2937  * We are invoked by rtnl_unlock() after it drops the semaphore.
2938  * This allows us to deal with problems:
2939  * 1) We can create/delete sysfs objects which invoke hotplug
2940  *    without deadlocking with linkwatch via keventd.
2941  * 2) Since we run with the RTNL semaphore not held, we can sleep
2942  *    safely in order to wait for the netdev refcnt to drop to zero.
2943  */
2944 static DECLARE_MUTEX(net_todo_run_mutex);
2945 void netdev_run_todo(void)
2946 {
2947         struct list_head list = LIST_HEAD_INIT(list);
2948         int err;
2949
2950
2951         /* Need to guard against multiple cpu's getting out of order. */
2952         down(&net_todo_run_mutex);
2953
2954         /* Not safe to do outside the semaphore.  We must not return
2955          * until all unregister events invoked by the local processor
2956          * have been completed (either by this todo run, or one on
2957          * another cpu).
2958          */
2959         if (list_empty(&net_todo_list))
2960                 goto out;
2961
2962         /* Snapshot list, allow later requests */
2963         spin_lock(&net_todo_list_lock);
2964         list_splice_init(&net_todo_list, &list);
2965         spin_unlock(&net_todo_list_lock);
2966
2967         while (!list_empty(&list)) {
2968                 struct net_device *dev
2969                         = list_entry(list.next, struct net_device, todo_list);
2970                 list_del(&dev->todo_list);
2971
2972                 switch(dev->reg_state) {
2973                 case NETREG_REGISTERING:
2974                         err = netdev_register_sysfs(dev);
2975                         if (err)
2976                                 printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
2977                                        dev->name, err);
2978                         dev->reg_state = NETREG_REGISTERED;
2979                         break;
2980
2981                 case NETREG_UNREGISTERING:
2982                         netdev_unregister_sysfs(dev);
2983                         dev->reg_state = NETREG_UNREGISTERED;
2984
2985                         netdev_wait_allrefs(dev);
2986
2987                         /* paranoia */
2988                         BUG_ON(atomic_read(&dev->refcnt));
2989                         BUG_TRAP(!dev->ip_ptr);
2990                         BUG_TRAP(!dev->ip6_ptr);
2991                         BUG_TRAP(!dev->dn_ptr);
2992
2993
2994                         /* It must be the very last action,
2995                          * after this 'dev' may point to freed up memory.
2996                          */
2997                         if (dev->destructor)
2998                                 dev->destructor(dev);
2999                         break;
3000
3001                 default:
3002                         printk(KERN_ERR "network todo '%s' but state %d\n",
3003                                dev->name, dev->reg_state);
3004                         break;
3005                 }
3006         }
3007
3008 out:
3009         up(&net_todo_run_mutex);
3010 }
3011
3012 /**
3013  *      alloc_netdev - allocate network device
3014  *      @sizeof_priv:   size of private data to allocate space for
3015  *      @name:          device name format string
3016  *      @setup:         callback to initialize device
3017  *
3018  *      Allocates a struct net_device with private data area for driver use
3019  *      and performs basic initialization.
3020  */
3021 struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3022                 void (*setup)(struct net_device *))
3023 {
3024         void *p;
3025         struct net_device *dev;
3026         int alloc_size;
3027
3028         /* ensure 32-byte alignment of both the device and private area */
3029         alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3030         alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3031
3032         p = kmalloc(alloc_size, GFP_KERNEL);
3033         if (!p) {
3034                 printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
3035                 return NULL;
3036         }
3037         memset(p, 0, alloc_size);
3038
3039         dev = (struct net_device *)
3040                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3041         dev->padded = (char *)dev - (char *)p;
3042
3043         if (sizeof_priv)
3044                 dev->priv = netdev_priv(dev);
3045
3046         setup(dev);
3047         strcpy(dev->name, name);
3048         return dev;
3049 }
3050 EXPORT_SYMBOL(alloc_netdev);
3051
3052 /**
3053  *      free_netdev - free network device
3054  *      @dev: device
3055  *
3056  *      This function does the last stage of destroying an allocated device
3057  *      interface. The reference to the device object is released.
3058  *      If this is the last reference then it will be freed.
3059  */
3060 void free_netdev(struct net_device *dev)
3061 {
3062 #ifdef CONFIG_SYSFS
3063         /*  Compatiablity with error handling in drivers */
3064         if (dev->reg_state == NETREG_UNINITIALIZED) {
3065                 kfree((char *)dev - dev->padded);
3066                 return;
3067         }
3068
3069         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3070         dev->reg_state = NETREG_RELEASED;
3071
3072         /* will free via class release */
3073         class_device_put(&dev->class_dev);
3074 #else
3075         kfree((char *)dev - dev->padded);
3076 #endif
3077 }
3078
/*
 * synchronize_net - synchronize with packet receive processing.
 *
 * Flags itself as a potentially sleeping call via might_sleep() and
 * then hands off to synchronize_kernel().
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_kernel();
}
3085
3086 /**
3087  *      unregister_netdevice - remove device from the kernel
3088  *      @dev: device
3089  *
3090  *      This function shuts down a device interface and removes it
3091  *      from the kernel tables. On success 0 is returned, on a failure
3092  *      a negative errno code is returned.
3093  *
3094  *      Callers must hold the rtnl semaphore.  You may want
3095  *      unregister_netdev() instead of this.
3096  */
3097
3098 int unregister_netdevice(struct net_device *dev)
3099 {
3100         struct net_device *d, **dp;
3101
3102         BUG_ON(dev_boot_phase);
3103         ASSERT_RTNL();
3104
3105         /* Some devices call without registering for initialization unwind. */
3106         if (dev->reg_state == NETREG_UNINITIALIZED) {
3107                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3108                                   "was registered\n", dev->name, dev);
3109                 return -ENODEV;
3110         }
3111
3112         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3113
3114         /* If device is running, close it first. */
3115         if (dev->flags & IFF_UP)
3116                 dev_close(dev);
3117
3118         /* And unlink it from device chain. */
3119         for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3120                 if (d == dev) {
3121                         write_lock_bh(&dev_base_lock);
3122                         hlist_del(&dev->name_hlist);
3123                         hlist_del(&dev->index_hlist);
3124                         if (dev_tail == &dev->next)
3125                                 dev_tail = dp;
3126                         *dp = d->next;
3127                         write_unlock_bh(&dev_base_lock);
3128                         break;
3129                 }
3130         }
3131         if (!d) {
3132                 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3133                        dev->name);
3134                 return -ENODEV;
3135         }
3136
3137         dev->reg_state = NETREG_UNREGISTERING;
3138
3139         synchronize_net();
3140
3141         /* Shutdown queueing discipline. */
3142         dev_shutdown(dev);
3143
3144
3145         /* Notify protocols, that we are about to destroy
3146            this device. They should clean all the things.
3147         */
3148         notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3149
3150         /*
3151          *      Flush the multicast chain
3152          */
3153         dev_mc_discard(dev);
3154
3155         if (dev->uninit)
3156                 dev->uninit(dev);
3157
3158         /* Notifier chain MUST detach us from master device. */
3159         BUG_TRAP(!dev->master);
3160
3161         free_divert_blk(dev);
3162
3163         /* Finish processing unregister after unlock */
3164         net_set_todo(dev);
3165
3166         synchronize_net();
3167
3168         dev_put(dev);
3169         return 0;
3170 }
3171
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.  It returns nothing: the result of
 *	unregister_netdevice() is not passed back to the caller.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
3190
3191 EXPORT_SYMBOL(unregister_netdev);
3192
3193 #ifdef CONFIG_HOTPLUG_CPU
3194 static int dev_cpu_callback(struct notifier_block *nfb,
3195                             unsigned long action,
3196                             void *ocpu)
3197 {
3198         struct sk_buff **list_skb;
3199         struct net_device **list_net;
3200         struct sk_buff *skb;
3201         unsigned int cpu, oldcpu = (unsigned long)ocpu;
3202         struct softnet_data *sd, *oldsd;
3203
3204         if (action != CPU_DEAD)
3205                 return NOTIFY_OK;
3206
3207         local_irq_disable();
3208         cpu = smp_processor_id();
3209         sd = &per_cpu(softnet_data, cpu);
3210         oldsd = &per_cpu(softnet_data, oldcpu);
3211
3212         /* Find end of our completion_queue. */
3213         list_skb = &sd->completion_queue;
3214         while (*list_skb)
3215                 list_skb = &(*list_skb)->next;
3216         /* Append completion queue from offline CPU. */
3217         *list_skb = oldsd->completion_queue;
3218         oldsd->completion_queue = NULL;
3219
3220         /* Find end of our output_queue. */
3221         list_net = &sd->output_queue;
3222         while (*list_net)
3223                 list_net = &(*list_net)->next_sched;
3224         /* Append output queue from offline CPU. */
3225         *list_net = oldsd->output_queue;
3226         oldsd->output_queue = NULL;
3227
3228         raise_softirq_irqoff(NET_TX_SOFTIRQ);
3229         local_irq_enable();
3230
3231         /* Process offline CPU's input_pkt_queue */
3232         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3233                 netif_rx(skb);
3234
3235         return NOTIFY_OK;
3236 }
3237 #endif /* CONFIG_HOTPLUG_CPU */
3238
3239
3240 /*
3241  *      Initialize the DEV module. At boot time this walks the device list and
3242  *      unhooks any devices that fail to initialise (normally hardware not
3243  *      present) and leaves us with a valid list of present and active devices.
3244  *
3245  */
3246
3247 /*
3248  *       This is called single threaded during boot, so no need
3249  *       to take the rtnl semaphore.
3250  */
3251 static int __init net_dev_init(void)
3252 {
3253         int i, rc = -ENOMEM;
3254
3255         BUG_ON(!dev_boot_phase);
3256
3257         net_random_init();
3258
3259         if (dev_proc_init())
3260                 goto out;
3261
3262         if (netdev_sysfs_init())
3263                 goto out;
3264
3265         INIT_LIST_HEAD(&ptype_all);
3266         for (i = 0; i < 16; i++)
3267                 INIT_LIST_HEAD(&ptype_base[i]);
3268
3269         for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3270                 INIT_HLIST_HEAD(&dev_name_head[i]);
3271
3272         for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3273                 INIT_HLIST_HEAD(&dev_index_head[i]);
3274
3275         /*
3276          *      Initialise the packet receive queues.
3277          */
3278
3279         for (i = 0; i < NR_CPUS; i++) {
3280                 struct softnet_data *queue;
3281
3282                 queue = &per_cpu(softnet_data, i);
3283                 skb_queue_head_init(&queue->input_pkt_queue);
3284                 queue->throttle = 0;
3285                 queue->cng_level = 0;
3286                 queue->avg_blog = 10; /* arbitrary non-zero */
3287                 queue->completion_queue = NULL;
3288                 INIT_LIST_HEAD(&queue->poll_list);
3289                 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3290                 queue->backlog_dev.weight = weight_p;
3291                 queue->backlog_dev.poll = process_backlog;
3292                 atomic_set(&queue->backlog_dev.refcnt, 1);
3293         }
3294
3295 #ifdef OFFLINE_SAMPLE
3296         samp_timer.expires = jiffies + (10 * HZ);
3297         add_timer(&samp_timer);
3298 #endif
3299
3300         dev_boot_phase = 0;
3301
3302         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3303         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3304
3305         hotcpu_notifier(dev_cpu_callback, 0);
3306         dst_init();
3307         dev_mcast_init();
3308         rc = 0;
3309 out:
3310         return rc;
3311 }
3312
3313 subsys_initcall(net_dev_init);
3314
3315 EXPORT_SYMBOL(__dev_get_by_index);
3316 EXPORT_SYMBOL(__dev_get_by_name);
3317 EXPORT_SYMBOL(__dev_remove_pack);
3318 EXPORT_SYMBOL(__skb_linearize);
3319 EXPORT_SYMBOL(dev_add_pack);
3320 EXPORT_SYMBOL(dev_alloc_name);
3321 EXPORT_SYMBOL(dev_close);
3322 EXPORT_SYMBOL(dev_get_by_flags);
3323 EXPORT_SYMBOL(dev_get_by_index);
3324 EXPORT_SYMBOL(dev_get_by_name);
3325 EXPORT_SYMBOL(dev_ioctl);
3326 EXPORT_SYMBOL(dev_open);
3327 EXPORT_SYMBOL(dev_queue_xmit);
3328 EXPORT_SYMBOL(dev_remove_pack);
3329 EXPORT_SYMBOL(dev_set_allmulti);
3330 EXPORT_SYMBOL(dev_set_promiscuity);
3331 EXPORT_SYMBOL(dev_change_flags);
3332 EXPORT_SYMBOL(dev_set_mtu);
3333 EXPORT_SYMBOL(free_netdev);
3334 EXPORT_SYMBOL(netdev_boot_setup_check);
3335 EXPORT_SYMBOL(netdev_set_master);
3336 EXPORT_SYMBOL(netdev_state_change);
3337 EXPORT_SYMBOL(netif_receive_skb);
3338 EXPORT_SYMBOL(netif_rx);
3339 EXPORT_SYMBOL(register_gifconf);
3340 EXPORT_SYMBOL(register_netdevice);
3341 EXPORT_SYMBOL(register_netdevice_notifier);
3342 EXPORT_SYMBOL(skb_checksum_help);
3343 EXPORT_SYMBOL(synchronize_net);
3344 EXPORT_SYMBOL(unregister_netdevice);
3345 EXPORT_SYMBOL(unregister_netdevice_notifier);
3346 EXPORT_SYMBOL(net_enable_timestamp);
3347 EXPORT_SYMBOL(net_disable_timestamp);
3348
3349 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3350 EXPORT_SYMBOL(br_handle_frame_hook);
3351 #endif
3352
3353 #ifdef CONFIG_KMOD
3354 EXPORT_SYMBOL(dev_load);
3355 #endif
3356
3357 EXPORT_PER_CPU_SYMBOL(softnet_data);