net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/config.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/string.h>
  84 #include <linux/mm.h>
  85 #include <linux/socket.h>
  86 #include <linux/sockios.h>
  87 #include <linux/errno.h>
  88 #include <linux/interrupt.h>
  89 #include <linux/if_ether.h>
  90 #include <linux/netdevice.h>
  91 #include <linux/etherdevice.h>
  92 #include <linux/notifier.h>
  93 #include <linux/skbuff.h>
  94 #include <net/sock.h>
  95 #include <linux/rtnetlink.h>
  96 #include <linux/proc_fs.h>
  97 #include <linux/seq_file.h>
  98 #include <linux/stat.h>
  99 #include <linux/if_bridge.h>
 100 #include <linux/divert.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <linux/highmem.h>
 105 #include <linux/init.h>
 106 #include <linux/kmod.h>
 107 #include <linux/module.h>
 108 #include <linux/kallsyms.h>
 109 #include <linux/netpoll.h>
 110 #include <linux/rcupdate.h>
 111 #include <linux/delay.h>
 112 #ifdef CONFIG_NET_RADIO
 113 #include <linux/wireless.h>             /* Note : will define WIRELESS_EXT */
 114 #include <net/iw_handler.h>
 115 #endif  /* CONFIG_NET_RADIO */
 116 #include <linux/vs_network.h>
 117 #include <asm/current.h>
 118 #include <linux/vs_network.h>
 119
 120 /* This define, if set, will randomly drop a packet when congestion
 121  * is more than moderate.  It helps fairness in the multi-interface
 122  * case when one of them is a hog, but it kills performance for the
 123  * single interface case so it is off now by default.
 124  */
 125 #undef RAND_LIE
 126
 127 /* Setting this will sample the queue lengths and thus congestion
 128  * via a timer instead of as each packet is received.
 129  */
 130 #undef OFFLINE_SAMPLE
 131
 132 /*
 133  *      The list of packet types we will receive (as opposed to discard)
 134  *      and the routines to invoke.
 135  *
 136  *      Why 16. Because with 16 the only overlap we get on a hash of the
 137  *      low nibble of the protocol value is RARP/SNAP/X.25.
 138  *
 139  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 140  *             sure which should go first, but I bet it won't make much
 141  *             difference if we are running VLANs.  The good news is that
 142  *             this protocol won't be in the list unless compiled in, so
  143  *             the average user (w/out VLANs) will not be adversely affected.
 144  *             --BLG
 145  *
 146  *              0800    IP
 147  *              8100    802.1Q VLAN
 148  *              0001    802.3
 149  *              0002    AX.25
 150  *              0004    802.2
 151  *              8035    RARP
 152  *              0005    SNAP
 153  *              0805    X.25
 154  *              0806    ARP
 155  *              8137    IPX
 156  *              0009    Localtalk
 157  *              86DD    IPv6
 158  */
 159
 160 static DEFINE_SPINLOCK(ptype_lock);
 161 static struct list_head ptype_base[16]; /* 16 way hashed list */
 162 static struct list_head ptype_all;              /* Taps */
 163
 164 #ifdef OFFLINE_SAMPLE
 165 static void sample_queue(unsigned long dummy);
 166 static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
 167 #endif
 168
 169 /*
  170  * The @dev_base list is protected by @dev_base_lock and the rtnl
 171  * semaphore.
 172  *
 173  * Pure readers hold dev_base_lock for reading.
 174  *
 175  * Writers must hold the rtnl semaphore while they loop through the
 176  * dev_base list, and hold dev_base_lock for writing when they do the
 177  * actual updates.  This allows pure readers to access the list even
 178  * while a writer is preparing to update it.
 179  *
 180  * To put it another way, dev_base_lock is held for writing only to
 181  * protect against pure readers; the rtnl semaphore provides the
 182  * protection against other writers.
 183  *
 184  * See, for example usages, register_netdevice() and
 185  * unregister_netdevice(), which must be called with the rtnl
 186  * semaphore held.
 187  */
 188 struct net_device *dev_base;
 189 static struct net_device **dev_tail = &dev_base;
 190 DEFINE_RWLOCK(dev_base_lock);
 191
 192 EXPORT_SYMBOL(dev_base);
 193 EXPORT_SYMBOL(dev_base_lock);
 194
 195 #define NETDEV_HASHBITS 8
 196 static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
 197 static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
 198
 199 static inline struct hlist_head *dev_name_hash(const char *name)
 200 {
 201         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202         return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(int ifindex)
 206 {
 207         return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 208 }
 209
 210 /*
 211  *      Our notifier list
 212  */
 213
 214 static struct notifier_block *netdev_chain;
 215
 216 /*
 217  *      Device drivers call our routines to queue packets here. We empty the
 218  *      queue in the local softnet handler.
 219  */
 220 DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
 221
 222 #ifdef CONFIG_SYSFS
 223 extern int netdev_sysfs_init(void);
 224 extern int netdev_register_sysfs(struct net_device *);
 225 extern void netdev_unregister_sysfs(struct net_device *);
 226 #else
 227 #define netdev_sysfs_init()             (0)
 228 #define netdev_register_sysfs(dev)      (0)
 229 #define netdev_unregister_sysfs(dev)    do { } while(0)
 230 #endif
 231
 232
 233 /*******************************************************************************
 234
 235                 Protocol management and registration routines
 236
 237 *******************************************************************************/
 238
 239 /*
 240  *      For efficiency
 241  */
 242
 243 int netdev_nit;
 244
 245 /*
 246  *      Add a protocol ID to the list. Now that the input handler is
 247  *      smarter we can dispense with all the messy stuff that used to be
 248  *      here.
 249  *
 250  *      BEWARE!!! Protocol handlers, mangling input packets,
 251  *      MUST BE last in hash buckets and checking protocol handlers
 252  *      MUST start from promiscuous ptype_all chain in net_bh.
 253  *      It is true now, do not change it.
 254  *      Explanation follows: if protocol handler, mangling packet, will
 255  *      be the first on list, it is not able to sense, that packet
 256  *      is cloned and should be copied-on-write, so that it will
 257  *      change it and subsequent readers will get broken packet.
 258  *                                                      --ANK (980803)
 259  */
 260
 261 /**
 262  *      dev_add_pack - add packet handler
 263  *      @pt: packet type declaration
 264  *
 265  *      Add a protocol handler to the networking stack. The passed &packet_type
 266  *      is linked into kernel lists and may not be freed until it has been
 267  *      removed from the kernel lists.
 268  *
 269  *      This call does not sleep therefore it can not
 270  *      guarantee all CPU's that are in middle of receiving packets
 271  *      will see the new packet type (until the next received packet).
 272  */
 273
 274 void dev_add_pack(struct packet_type *pt)
 275 {
 276         int hash;
 277
 278         spin_lock_bh(&ptype_lock);
 279         if (pt->type == htons(ETH_P_ALL)) {
 280                 netdev_nit++;
 281                 list_add_rcu(&pt->list, &ptype_all);
 282         } else {
 283                 hash = ntohs(pt->type) & 15;
 284                 list_add_rcu(&pt->list, &ptype_base[hash]);
 285         }
 286         spin_unlock_bh(&ptype_lock);
 287 }
 288
 289 extern void linkwatch_run_queue(void);
 290
 291
 292
 293 /**
 294  *      __dev_remove_pack        - remove packet handler
 295  *      @pt: packet type declaration
 296  *
 297  *      Remove a protocol handler that was previously added to the kernel
 298  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 299  *      from the kernel lists and can be freed or reused once this function
 300  *      returns.
 301  *
 302  *      The packet type might still be in use by receivers
 303  *      and must not be freed until after all the CPU's have gone
 304  *      through a quiescent state.
 305  */
 306 void __dev_remove_pack(struct packet_type *pt)
 307 {
 308         struct list_head *head;
 309         struct packet_type *pt1;
 310
 311         spin_lock_bh(&ptype_lock);
 312
 313         if (pt->type == htons(ETH_P_ALL)) {
 314                 netdev_nit--;
 315                 head = &ptype_all;
 316         } else
 317                 head = &ptype_base[ntohs(pt->type) & 15];
 318
 319         list_for_each_entry(pt1, head, list) {
 320                 if (pt == pt1) {
 321                         list_del_rcu(&pt->list);
 322                         goto out;
 323                 }
 324         }
 325
 326         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 327 out:
 328         spin_unlock_bh(&ptype_lock);
 329 }
 330 /**
 331  *      dev_remove_pack  - remove packet handler
 332  *      @pt: packet type declaration
 333  *
 334  *      Remove a protocol handler that was previously added to the kernel
 335  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 336  *      from the kernel lists and can be freed or reused once this function
 337  *      returns.
 338  *
 339  *      This call sleeps to guarantee that no CPU is looking at the packet
 340  *      type after return.
 341  */
 342 void dev_remove_pack(struct packet_type *pt)
 343 {
 344         __dev_remove_pack(pt);
 345
 346         synchronize_net();
 347 }
 348
 349 /******************************************************************************
 350
 351                       Device Boot-time Settings Routines
 352
 353 *******************************************************************************/
 354
 355 /* Boot time configuration table */
 356 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 357
 358 /**
 359  *      netdev_boot_setup_add   - add new setup entry
 360  *      @name: name of the device
 361  *      @map: configured settings for the device
 362  *
 363  *      Adds new setup entry to the dev_boot_setup list.  The function
 364  *      returns 0 on error and 1 on success.  This is a generic routine to
 365  *      all netdevices.
 366  */
 367 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 368 {
 369         struct netdev_boot_setup *s;
 370         int i;
 371
 372         s = dev_boot_setup;
 373         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 374                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 375                         memset(s[i].name, 0, sizeof(s[i].name));
 376                         strcpy(s[i].name, name);
 377                         memcpy(&s[i].map, map, sizeof(s[i].map));
 378                         break;
 379                 }
 380         }
 381
 382         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 383 }
 384
 385 /**
 386  *      netdev_boot_setup_check - check boot time settings
 387  *      @dev: the netdevice
 388  *
 389  *      Check boot time settings for the device.
 390  *      The found settings are set for the device to be used
 391  *      later in the device probing.
 392  *      Returns 0 if no settings found, 1 if they are.
 393  */
 394 int netdev_boot_setup_check(struct net_device *dev)
 395 {
 396         struct netdev_boot_setup *s = dev_boot_setup;
 397         int i;
 398
 399         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 400                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 401                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 402                         dev->irq        = s[i].map.irq;
 403                         dev->base_addr  = s[i].map.base_addr;
 404                         dev->mem_start  = s[i].map.mem_start;
 405                         dev->mem_end    = s[i].map.mem_end;
 406                         return 1;
 407                 }
 408         }
 409         return 0;
 410 }
 411
 412
 413 /**
 414  *      netdev_boot_base        - get address from boot time settings
 415  *      @prefix: prefix for network device
 416  *      @unit: id for network device
 417  *
 418  *      Check boot time settings for the base address of device.
 419  *      The found settings are set for the device to be used
 420  *      later in the device probing.
 421  *      Returns 0 if no settings found.
 422  */
 423 unsigned long netdev_boot_base(const char *prefix, int unit)
 424 {
 425         const struct netdev_boot_setup *s = dev_boot_setup;
 426         char name[IFNAMSIZ];
 427         int i;
 428
 429         sprintf(name, "%s%d", prefix, unit);
 430
 431         /*
 432          * If device already registered then return base of 1
 433          * to indicate not to probe for this interface
 434          */
 435         if (__dev_get_by_name(name))
 436                 return 1;
 437
 438         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 439                 if (!strcmp(name, s[i].name))
 440                         return s[i].map.base_addr;
 441         return 0;
 442 }
 443
 444 /*
 445  * Saves at boot time configured settings for any netdevice.
 446  */
 447 int __init netdev_boot_setup(char *str)
 448 {
 449         int ints[5];
 450         struct ifmap map;
 451
 452         str = get_options(str, ARRAY_SIZE(ints), ints);
 453         if (!str || !*str)
 454                 return 0;
 455
 456         /* Save settings */
 457         memset(&map, 0, sizeof(map));
 458         if (ints[0] > 0)
 459                 map.irq = ints[1];
 460         if (ints[0] > 1)
 461                 map.base_addr = ints[2];
 462         if (ints[0] > 2)
 463                 map.mem_start = ints[3];
 464         if (ints[0] > 3)
 465                 map.mem_end = ints[4];
 466
 467         /* Add new entry to the list */
 468         return netdev_boot_setup_add(str, &map);
 469 }
 470
 471 __setup("netdev=", netdev_boot_setup);
 472
 473 /*******************************************************************************
 474
 475                             Device Interface Subroutines
 476
 477 *******************************************************************************/
 478
 479 /**
 480  *      __dev_get_by_name       - find a device by its name
 481  *      @name: name to find
 482  *
 483  *      Find an interface by name. Must be called under RTNL semaphore
 484  *      or @dev_base_lock. If the name is found a pointer to the device
 485  *      is returned. If the name is not found then %NULL is returned. The
 486  *      reference counters are not incremented so the caller must be
 487  *      careful with locks.
 488  */
 489
 490 struct net_device *__dev_get_by_name(const char *name)
 491 {
 492         struct hlist_node *p;
 493
 494         hlist_for_each(p, dev_name_hash(name)) {
 495                 struct net_device *dev
 496                         = hlist_entry(p, struct net_device, name_hlist);
 497                 if (!strncmp(dev->name, name, IFNAMSIZ))
 498                         return dev;
 499         }
 500         return NULL;
 501 }
 502
 503 /**
 504  *      dev_get_by_name         - find a device by its name
 505  *      @name: name to find
 506  *
 507  *      Find an interface by name. This can be called from any
 508  *      context and does its own locking. The returned handle has
 509  *      the usage count incremented and the caller must use dev_put() to
 510  *      release it when it is no longer needed. %NULL is returned if no
 511  *      matching device is found.
 512  */
 513
 514 struct net_device *dev_get_by_name(const char *name)
 515 {
 516         struct net_device *dev;
 517
 518         read_lock(&dev_base_lock);
 519         dev = __dev_get_by_name(name);
 520         if (dev)
 521                 dev_hold(dev);
 522         read_unlock(&dev_base_lock);
 523         return dev;
 524 }
 525
 526 /**
 527  *      __dev_get_by_index - find a device by its ifindex
 528  *      @ifindex: index of device
 529  *
 530  *      Search for an interface by index. Returns %NULL if the device
 531  *      is not found or a pointer to the device. The device has not
 532  *      had its reference counter increased so the caller must be careful
 533  *      about locking. The caller must hold either the RTNL semaphore
 534  *      or @dev_base_lock.
 535  */
 536
 537 struct net_device *__dev_get_by_index(int ifindex)
 538 {
 539         struct hlist_node *p;
 540
 541         hlist_for_each(p, dev_index_hash(ifindex)) {
 542                 struct net_device *dev
 543                         = hlist_entry(p, struct net_device, index_hlist);
 544                 if (dev->ifindex == ifindex)
 545                         return dev;
 546         }
 547         return NULL;
 548 }
 549
 550
 551 /**
 552  *      dev_get_by_index - find a device by its ifindex
 553  *      @ifindex: index of device
 554  *
 555  *      Search for an interface by index. Returns NULL if the device
 556  *      is not found or a pointer to the device. The device returned has
 557  *      had a reference added and the pointer is safe until the user calls
 558  *      dev_put to indicate they have finished with it.
 559  */
 560
 561 struct net_device *dev_get_by_index(int ifindex)
 562 {
 563         struct net_device *dev;
 564
 565         read_lock(&dev_base_lock);
 566         dev = __dev_get_by_index(ifindex);
 567         if (dev)
 568                 dev_hold(dev);
 569         read_unlock(&dev_base_lock);
 570         return dev;
 571 }
 572
 573 /**
 574  *      dev_getbyhwaddr - find a device by its hardware address
 575  *      @type: media type of device
 576  *      @ha: hardware address
 577  *
 578  *      Search for an interface by MAC address. Returns NULL if the device
 579  *      is not found or a pointer to the device. The caller must hold the
 580  *      rtnl semaphore. The returned device has not had its ref count increased
 581  *      and the caller must therefore be careful about locking
 582  *
 583  *      BUGS:
 584  *      If the API was consistent this would be __dev_get_by_hwaddr
 585  */
 586
 587 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 588 {
 589         struct net_device *dev;
 590
 591         ASSERT_RTNL();
 592
 593         for (dev = dev_base; dev; dev = dev->next)
 594                 if (dev->type == type &&
 595                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 596                         break;
 597         return dev;
 598 }
 599
 600 struct net_device *dev_getfirstbyhwtype(unsigned short type)
 601 {
 602         struct net_device *dev;
 603
 604         rtnl_lock();
 605         for (dev = dev_base; dev; dev = dev->next) {
 606                 if (dev->type == type) {
 607                         dev_hold(dev);
 608                         break;
 609                 }
 610         }
 611         rtnl_unlock();
 612         return dev;
 613 }
 614
 615 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 616
 617 /**
 618  *      dev_get_by_flags - find any device with given flags
 619  *      @if_flags: IFF_* values
 620  *      @mask: bitmask of bits in if_flags to check
 621  *
 622  *      Search for any interface with the given flags. Returns NULL if a device
 623  *      is not found or a pointer to the device. The device returned has
 624  *      had a reference added and the pointer is safe until the user calls
 625  *      dev_put to indicate they have finished with it.
 626  */
 627
 628 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 629 {
 630         struct net_device *dev;
 631
 632         read_lock(&dev_base_lock);
 633         for (dev = dev_base; dev != NULL; dev = dev->next) {
 634                 if (((dev->flags ^ if_flags) & mask) == 0) {
 635                         dev_hold(dev);
 636                         break;
 637                 }
 638         }
 639         read_unlock(&dev_base_lock);
 640         return dev;
 641 }
 642
 643 /**
 644  *      dev_valid_name - check if name is okay for network device
 645  *      @name: name string
 646  *
 647  *      Network device names need to be valid file names to
 648  *      to allow sysfs to work
 649  */
 650 static int dev_valid_name(const char *name)
 651 {
 652         return !(*name == '\0'
 653                  || !strcmp(name, ".")
 654                  || !strcmp(name, "..")
 655                  || strchr(name, '/'));
 656 }
 657
 658 /**
 659  *      dev_alloc_name - allocate a name for a device
 660  *      @dev: device
 661  *      @name: name format string
 662  *
 663  *      Passed a format string - eg "lt%d" it will try and find a suitable
 664  *      id. Not efficient for many devices, not called a lot. The caller
 665  *      must hold the dev_base or rtnl lock while allocating the name and
 666  *      adding the device in order to avoid duplicates. Returns the number
 667  *      of the unit assigned or a negative errno code.
 668  */
 669
 670 int dev_alloc_name(struct net_device *dev, const char *name)
 671 {
 672         int i = 0;
 673         char buf[IFNAMSIZ];
 674         const char *p;
 675         const int max_netdevices = 8*PAGE_SIZE;
 676         long *inuse;
 677         struct net_device *d;
 678
 679         p = strnchr(name, IFNAMSIZ-1, '%');
 680         if (p) {
 681                 /*
 682                  * Verify the string as this thing may have come from
 683                  * the user.  There must be either one "%d" and no other "%"
 684                  * characters.
 685                  */
 686                 if (p[1] != 'd' || strchr(p + 2, '%'))
 687                         return -EINVAL;
 688
 689                 /* Use one page as a bit array of possible slots */
 690                 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
 691                 if (!inuse)
 692                         return -ENOMEM;
 693
 694                 for (d = dev_base; d; d = d->next) {
 695                         if (!sscanf(d->name, name, &i))
 696                                 continue;
 697                         if (i < 0 || i >= max_netdevices)
 698                                 continue;
 699
 700                         /*  avoid cases where sscanf is not exact inverse of printf */
 701                         snprintf(buf, sizeof(buf), name, i);
 702                         if (!strncmp(buf, d->name, IFNAMSIZ))
 703                                 set_bit(i, inuse);
 704                 }
 705
 706                 i = find_first_zero_bit(inuse, max_netdevices);
 707                 free_page((unsigned long) inuse);
 708         }
 709
 710         snprintf(buf, sizeof(buf), name, i);
 711         if (!__dev_get_by_name(buf)) {
 712                 strlcpy(dev->name, buf, IFNAMSIZ);
 713                 return i;
 714         }
 715
 716         /* It is possible to run out of possible slots
 717          * when the name is long and there isn't enough space left
 718          * for the digits, or if all bits are used.
 719          */
 720         return -ENFILE;
 721 }
 722
 723
 724 /**
 725  *      dev_change_name - change name of a device
 726  *      @dev: device
 727  *      @newname: name (or format string) must be at least IFNAMSIZ
 728  *
 729  *      Change name of a device, can pass format strings "eth%d".
 730  *      for wildcarding.
 731  */
 732 int dev_change_name(struct net_device *dev, char *newname)
 733 {
 734         int err = 0;
 735
 736         ASSERT_RTNL();
 737
 738         if (dev->flags & IFF_UP)
 739                 return -EBUSY;
 740
 741         if (!dev_valid_name(newname))
 742                 return -EINVAL;
 743
 744         if (strchr(newname, '%')) {
 745                 err = dev_alloc_name(dev, newname);
 746                 if (err < 0)
 747                         return err;
 748                 strcpy(newname, dev->name);
 749         }
 750         else if (__dev_get_by_name(newname))
 751                 return -EEXIST;
 752         else
 753                 strlcpy(dev->name, newname, IFNAMSIZ);
 754
 755         err = class_device_rename(&dev->class_dev, dev->name);
 756         if (!err) {
 757                 hlist_del(&dev->name_hlist);
 758                 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 759                 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
 760         }
 761
 762         return err;
 763 }
 764
 765 /**
 766  *      netdev_features_change - device changes fatures
 767  *      @dev: device to cause notification
 768  *
 769  *      Called to indicate a device has changed features.
 770  */
 771 void netdev_features_change(struct net_device *dev)
 772 {
 773         notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
 774 }
 775 EXPORT_SYMBOL(netdev_features_change);
 776
 777 /**
 778  *      netdev_state_change - device changes state
 779  *      @dev: device to cause notification
 780  *
 781  *      Called to indicate a device has changed state. This function calls
 782  *      the notifier chains for netdev_chain and sends a NEWLINK message
 783  *      to the routing socket.
 784  */
 785 void netdev_state_change(struct net_device *dev)
 786 {
 787         if (dev->flags & IFF_UP) {
 788                 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
 789                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 790         }
 791 }
 792
 793 /**
 794  *      dev_load        - load a network module
 795  *      @name: name of interface
 796  *
 797  *      If a network interface is not present and the process has suitable
 798  *      privileges this function loads the module. If module loading is not
 799  *      available in this kernel then it becomes a nop.
 800  */
 801
 802 void dev_load(const char *name)
 803 {
 804         struct net_device *dev;
 805
 806         read_lock(&dev_base_lock);
 807         dev = __dev_get_by_name(name);
 808         read_unlock(&dev_base_lock);
 809
 810         if (!dev && capable(CAP_SYS_MODULE))
 811                 request_module("%s", name);
 812 }
 813
 814 static int default_rebuild_header(struct sk_buff *skb)
 815 {
 816         printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 817                skb->dev ? skb->dev->name : "NULL!!!");
 818         kfree_skb(skb);
 819         return 1;
 820 }
 821
 822
 823 /**
 824  *      dev_open        - prepare an interface for use.
 825  *      @dev:   device to open
 826  *
 827  *      Takes a device from down to up state. The device's private open
 828  *      function is invoked and then the multicast lists are loaded. Finally
 829  *      the device is moved into the up state and a %NETDEV_UP message is
 830  *      sent to the netdev notifier chain.
 831  *
 832  *      Calling this function on an active interface is a nop. On a failure
 833  *      a negative errno code is returned.
 834  */
 835 int dev_open(struct net_device *dev)
 836 {
 837         int ret = 0;
 838
 839         /*
 840          *      Is it already up?
 841          */
 842
 843         if (dev->flags & IFF_UP)
 844                 return 0;
 845
 846         /*
 847          *      Is it even present?
 848          */
 849         if (!netif_device_present(dev))
 850                 return -ENODEV;
 851
 852         /*
 853          *      Call device private open method
 854          */
 855         set_bit(__LINK_STATE_START, &dev->state);
 856         if (dev->open) {
 857                 ret = dev->open(dev);
 858                 if (ret)
 859                         clear_bit(__LINK_STATE_START, &dev->state);
 860         }
 861
 862         /*
 863          *      If it went open OK then:
 864          */
 865
 866         if (!ret) {
 867                 /*
 868                  *      Set the flags.
 869                  */
 870                 dev->flags |= IFF_UP;
 871
 872                 /*
 873                  *      Initialize multicasting status
 874                  */
 875                 dev_mc_upload(dev);
 876
 877                 /*
 878                  *      Wakeup transmit queue engine
 879                  */
 880                 dev_activate(dev);
 881
 882                 /*
 883                  *      ... and announce new interface.
 884                  */
 885                 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 886         }
 887         return ret;
 888 }
 889
 890 /**
 891  *      dev_close - shutdown an interface.
 892  *      @dev: device to shutdown
 893  *
 894  *      This function moves an active device into down state. A
 895  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 896  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 897  *      chain.
 898  */
 899 int dev_close(struct net_device *dev)
 900 {
 901         if (!(dev->flags & IFF_UP))
 902                 return 0;
 903
 904         /*
 905          *      Tell people we are going down, so that they can
 906          *      prepare to death, when device is still operating.
 907          */
 908         notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 909
 910         dev_deactivate(dev);
 911
 912         clear_bit(__LINK_STATE_START, &dev->state);
 913
 914         /* Synchronize to scheduled poll. We cannot touch poll list,
 915          * it can be even on different cpu. So just clear netif_running(),
 916          * and wait when poll really will happen. Actually, the best place
 917          * for this is inside dev->stop() after device stopped its irq
 918          * engine, but this requires more changes in devices. */
 919
 920         smp_mb__after_clear_bit(); /* Commit netif_running(). */
 921         while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
 922                 /* No hurry. */
 923                 current->state = TASK_INTERRUPTIBLE;
 924                 schedule_timeout(1);
 925         }
 926
 927         /*
 928          *      Call the device specific close. This cannot fail.
 929          *      Only if device is UP
 930          *
 931          *      We allow it to be called even after a DETACH hot-plug
 932          *      event.
 933          */
 934         if (dev->stop)
 935                 dev->stop(dev);
 936
 937         /*
 938          *      Device is now down.
 939          */
 940
 941         dev->flags &= ~IFF_UP;
 942
 943         /*
 944          * Tell people we are down
 945          */
 946         notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 947
 948         return 0;
 949 }
 950
 951
 952 /*
 953  *      Device change register/unregister. These are not inline or static
 954  *      as we export them to the world.
 955  */
 956
 957 /**
 958  *      register_netdevice_notifier - register a network notifier block
 959  *      @nb: notifier
 960  *
 961  *      Register a notifier to be called when network device events occur.
 962  *      The notifier passed is linked into the kernel structures and must
 963  *      not be reused until it has been unregistered. A negative errno code
 964  *      is returned on a failure.
 965  *
 966  *      When registered all registration and up events are replayed
 967  *      to the new notifier to allow device to have a race free
 968  *      view of the network device list.
 969  */
 970
 971 int register_netdevice_notifier(struct notifier_block *nb)
 972 {
 973         struct net_device *dev;
 974         int err;
 975
 976         rtnl_lock();
 977         err = notifier_chain_register(&netdev_chain, nb);
 978         if (!err) {
 979                 for (dev = dev_base; dev; dev = dev->next) {
 980                         nb->notifier_call(nb, NETDEV_REGISTER, dev);
 981
 982                         if (dev->flags & IFF_UP)
 983                                 nb->notifier_call(nb, NETDEV_UP, dev);
 984                 }
 985         }
 986         rtnl_unlock();
 987         return err;
 988 }
 989
 990 /**
 991  *      unregister_netdevice_notifier - unregister a network notifier block
 992  *      @nb: notifier
 993  *
 994  *      Unregister a notifier previously registered by
 995  *      register_netdevice_notifier(). The notifier is unlinked into the
 996  *      kernel structures and may then be reused. A negative errno code
 997  *      is returned on a failure.
 998  */
 999
1000 int unregister_netdevice_notifier(struct notifier_block *nb)
1001 {
1002         return notifier_chain_unregister(&netdev_chain, nb);
1003 }
1004
1005 /**
1006  *      call_netdevice_notifiers - call all network notifier blocks
1007  *      @val: value passed unmodified to notifier function
1008  *      @v:   pointer passed unmodified to notifier function
1009  *
1010  *      Call all network notifier blocks.  Parameters and return value
1011  *      are as for notifier_call_chain().
1012  */
1013
1014 int call_netdevice_notifiers(unsigned long val, void *v)
1015 {
1016         return notifier_call_chain(&netdev_chain, val, v);
1017 }
1018
1019 /* When > 0 there are consumers of rx skb time stamps */
1020 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1021
1022 void net_enable_timestamp(void)
1023 {
1024         atomic_inc(&netstamp_needed);
1025 }
1026
1027 void net_disable_timestamp(void)
1028 {
1029         atomic_dec(&netstamp_needed);
1030 }
1031
1032 static inline void net_timestamp(struct timeval *stamp)
1033 {
1034         if (atomic_read(&netstamp_needed))
1035                 do_gettimeofday(stamp);
1036         else {
1037                 stamp->tv_sec = 0;
1038                 stamp->tv_usec = 0;
1039         }
1040 }
1041
1042 /*
1043  *      Support routine. Sends outgoing frames to any network
1044  *      taps currently in use.
1045  */
1046
1047 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1048 {
1049         struct packet_type *ptype;
1050         net_timestamp(&skb->stamp);
1051
1052         rcu_read_lock();
1053         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1054                 /* Never send packets back to the socket
1055                  * they originated from - MvS (miquels@drinkel.ow.org)
1056                  */
1057                 if ((ptype->dev == dev || !ptype->dev) &&
1058                     (ptype->af_packet_priv == NULL ||
1059                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1060                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1061                         if (!skb2)
1062                                 break;
1063
1064                         /* skb->nh should be correctly
1065                            set by sender, so that the second statement is
1066                            just protection against buggy protocols.
1067                          */
1068                         skb2->mac.raw = skb2->data;
1069
1070                         if (skb2->nh.raw < skb2->data ||
1071                             skb2->nh.raw > skb2->tail) {
1072                                 if (net_ratelimit())
1073                                         printk(KERN_CRIT "protocol %04x is "
1074                                                "buggy, dev %s\n",
1075                                                skb2->protocol, dev->name);
1076                                 skb2->nh.raw = skb2->data;
1077                         }
1078
1079                         skb2->h.raw = skb2->nh.raw;
1080                         skb2->pkt_type = PACKET_OUTGOING;
1081                         ptype->func(skb2, skb->dev, ptype);
1082                 }
1083         }
1084         rcu_read_unlock();
1085 }
1086
1087 /*
1088  * Invalidate hardware checksum when packet is to be mangled, and
1089  * complete checksum manually on outgoing path.
1090  */
1091 int skb_checksum_help(struct sk_buff *skb, int inward)
1092 {
1093         unsigned int csum;
1094         int ret = 0, offset = skb->h.raw - skb->data;
1095
1096         if (inward) {
1097                 skb->ip_summed = CHECKSUM_NONE;
1098                 goto out;
1099         }
1100
1101         if (skb_cloned(skb)) {
1102                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1103                 if (ret)
1104                         goto out;
1105         }
1106
1107         if (offset > (int)skb->len)
1108                 BUG();
1109         csum = skb_checksum(skb, offset, skb->len-offset, 0);
1110
1111         offset = skb->tail - skb->h.raw;
1112         if (offset <= 0)
1113                 BUG();
1114         if (skb->csum + 2 > offset)
1115                 BUG();
1116
1117         *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1118         skb->ip_summed = CHECKSUM_NONE;
1119 out:
1120         return ret;
1121 }
1122
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when @dev lacks NETIF_F_HIGHDMA and at least one page
 * fragment of @skb is a highmem page, 0 otherwise.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
		}
	}

	return 0;
}
#else
#define illegal_highdma(dev, skb)	(0)
#endif
1145
1146 extern void skb_release_data(struct sk_buff *);
1147
1148 /* Keep head the same: replace data */
1149 int __skb_linearize(struct sk_buff *skb, int gfp_mask)
1150 {
1151         unsigned int size;
1152         u8 *data;
1153         long offset;
1154         struct skb_shared_info *ninfo;
1155         int headerlen = skb->data - skb->head;
1156         int expand = (skb->tail + skb->data_len) - skb->end;
1157
1158         if (skb_shared(skb))
1159                 BUG();
1160
1161         if (expand <= 0)
1162                 expand = 0;
1163
1164         size = skb->end - skb->head + expand;
1165         size = SKB_DATA_ALIGN(size);
1166         data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
1167         if (!data)
1168                 return -ENOMEM;
1169
1170         /* Copy entire thing */
1171         if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
1172                 BUG();
1173
1174         /* Set up shinfo */
1175         ninfo = (struct skb_shared_info*)(data + size);
1176         atomic_set(&ninfo->dataref, 1);
1177         ninfo->tso_size = skb_shinfo(skb)->tso_size;
1178         ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
1179         ninfo->nr_frags = 0;
1180         ninfo->frag_list = NULL;
1181
1182         /* Offset between the two in bytes */
1183         offset = data - skb->head;
1184
1185         /* Free old data. */
1186         skb_release_data(skb);
1187
1188         skb->head = data;
1189         skb->end  = data + size;
1190
1191         /* Set up new pointers */
1192         skb->h.raw   += offset;
1193         skb->nh.raw  += offset;
1194         skb->mac.raw += offset;
1195         skb->tail    += offset;
1196         skb->data    += offset;
1197
1198         /* We are no longer a clone, even if we were. */
1199         skb->cloned    = 0;
1200
1201         skb->tail     += skb->data_len;
1202         skb->data_len  = 0;
1203         return 0;
1204 }
1205
/*
 * Take the device's xmit_lock and record @cpu as its owner, unless the
 * device sets NETIF_F_LLTX, in which case nothing is done.
 *
 * Wrapped in do { } while (0) so that "HARD_TX_LOCK(dev, cpu);" is a
 * single statement and stays valid inside an unbraced if/else; macro
 * arguments are parenthesized.
 */
#define HARD_TX_LOCK(dev, cpu) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		spin_lock(&(dev)->xmit_lock);			\
		(dev)->xmit_lock_owner = (cpu);			\
	}							\
} while (0)

/*
 * Counterpart of HARD_TX_LOCK(): reset the owner to -1 and drop
 * xmit_lock, again only for devices without NETIF_F_LLTX.
 */
#define HARD_TX_UNLOCK(dev) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		(dev)->xmit_lock_owner = -1;			\
		spin_unlock(&(dev)->xmit_lock);			\
	}							\
} while (0)
1219
1220 /**
1221  *      dev_queue_xmit - transmit a buffer
1222  *      @skb: buffer to transmit
1223  *
1224  *      Queue a buffer for transmission to a network device. The caller must
1225  *      have set the device and priority and built the buffer before calling
1226  *      this function. The function can be called from an interrupt.
1227  *
1228  *      A negative errno code is returned on a failure. A success does not
1229  *      guarantee the frame will be transmitted as it may be dropped due
1230  *      to congestion or traffic shaping.
1231  *
1232  * -----------------------------------------------------------------------------------
1233  *      I notice this method can also return errors from the queue disciplines,
1234  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1235  *      be positive.
1236  *
1237  *      Regardless of the return value, the skb is consumed, so it is currently
1238  *      difficult to retry a send to this method.  (You can bump the ref count
1239  *      before sending to hold a reference for retry if you are careful.)
1240  *
1241  *      When calling this method, interrupts MUST be enabled.  This is because
1242  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1243  *          --BLG
     *
     *      Return values as visible in the body:
     *        -ENOMEM    linearizing the buffer or completing the checksum
     *                   failed (this is the initial value of rc); note that a
     *                   non-zero result of skb_checksum_help() is not
     *                   propagated, rc stays -ENOMEM
     *        <qdisc>    whatever the qdisc's enqueue method returned, with
     *                   NET_XMIT_BYPASS mapped to NET_XMIT_SUCCESS
     *        0          a queue-less device accepted the buffer
     *        -ENETDOWN  every other case on a queue-less device (device not
     *                   up, queue stopped, driver refused the buffer, or
     *                   recursion onto the same device detected)
1244  */
1245
1246 int dev_queue_xmit(struct sk_buff *skb)
1247 {
1248         struct net_device *dev = skb->dev;
1249         struct Qdisc *q;
             /* Returned by the early failure paths that jump to out_kfree_skb. */
1250         int rc = -ENOMEM;
1251
             /* A frag_list the device cannot handle has to be linearized. */
1252         if (skb_shinfo(skb)->frag_list &&
1253             !(dev->features & NETIF_F_FRAGLIST) &&
1254             __skb_linearize(skb, GFP_ATOMIC))
1255                 goto out_kfree_skb;
1256
1257         /* Fragmented skb is linearized if device does not support SG,
1258          * or if at least one of fragments is in highmem and device
1259          * does not support DMA from it.
1260          */
1261         if (skb_shinfo(skb)->nr_frags &&
1262             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1263             __skb_linearize(skb, GFP_ATOMIC))
1264                 goto out_kfree_skb;
1265
1266         /* If packet is not checksummed and device does not support
1267          * checksumming for this protocol, complete checksumming here.
1268          */
1269         if (skb->ip_summed == CHECKSUM_HW &&
1270             (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
1271              (!(dev->features & NETIF_F_IP_CSUM) ||
1272               skb->protocol != htons(ETH_P_IP))))
1273                 if (skb_checksum_help(skb, 0))
1274                         goto out_kfree_skb;
1275
1276         /* Disable soft irqs for various locks below. Also
1277          * stops preemption for RCU.
1278          */
1279         local_bh_disable();
1280
1281         /* Updates of qdisc are serialized by queue_lock.
1282          * The struct Qdisc which is pointed to by qdisc is now a
1283          * rcu structure - it may be accessed without acquiring
1284          * a lock (but the structure may be stale.) The freeing of the
1285          * qdisc will be deferred until it's known that there are no
1286          * more references to it.
1287          *
1288          * If the qdisc has an enqueue function, we still need to
1289          * hold the queue_lock before calling it, since queue_lock
1290          * also serializes access to the device queue.
1291          */
1292
1293         q = rcu_dereference(dev->qdisc);
1294 #ifdef CONFIG_NET_CLS_ACT
1295         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1296 #endif
1297         if (q->enqueue) {
1298                 /* Grab device queue */
1299                 spin_lock(&dev->queue_lock);
1300
1301                 rc = q->enqueue(skb, q);
1302
1303                 qdisc_run(dev);
1304
1305                 spin_unlock(&dev->queue_lock);
                     /* Callers never see NET_XMIT_BYPASS; it is reported as success. */
1306                 rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
                     /* The skb now belongs to the qdisc: leave without freeing it. */
1307                 goto out;
1308         }
1309
1310         /* The device has no queue. Common case for software devices:
1311            loopback, all the sorts of tunnels...
1312
1313            Really, it is unlikely that xmit_lock protection is necessary here.
1314            (f.e. loopback and IP tunnels are clean ignoring statistics
1315            counters.)
1316            However, it is possible, that they rely on protection
1317            made by us here.
1318
1319            Check this and shot the lock. It is not prone from deadlocks.
1320            Either shot noqueue qdisc, it is even simpler 8)
1321          */
1322         if (dev->flags & IFF_UP) {
1323                 int cpu = smp_processor_id(); /* ok because BHs are off */
1324
                     /*
                      * xmit_lock_owner equal to this cpu means we are already
                      * inside a transmit on this device, i.e. recursion.
                      */
1325                 if (dev->xmit_lock_owner != cpu) {
1326
1327                         HARD_TX_LOCK(dev, cpu);
1328
1329                         if (!netif_queue_stopped(dev)) {
1330                                 if (netdev_nit)
1331                                         dev_queue_xmit_nit(skb, dev);
1332
1333                                 rc = 0;
                                     /* A zero return means the driver took the skb. */
1334                                 if (!dev->hard_start_xmit(skb, dev)) {
1335                                         HARD_TX_UNLOCK(dev);
1336                                         goto out;
1337                                 }
1338                         }
1339                         HARD_TX_UNLOCK(dev);
1340                         if (net_ratelimit())
1341                                 printk(KERN_CRIT "Virtual device %s asks to "
1342                                        "queue packet!\n", dev->name);
1343                 } else {
1344                         /* Recursion is detected! It is possible,
1345                          * unfortunately */
1346                         if (net_ratelimit())
1347                                 printk(KERN_CRIT "Dead loop on virtual device "
1348                                        "%s, fix it urgently!\n", dev->name);
1349                 }
1350         }
1351
             /* Nobody took the buffer: report -ENETDOWN and free it below. */
1352         rc = -ENETDOWN;
1353         local_bh_enable();
1354
     /* Failure exit: BHs are already enabled here; the skb is freed. */
1355 out_kfree_skb:
1356         kfree_skb(skb);
1357         return rc;
     /* Exit for buffers handed to a qdisc or driver: only re-enable BHs. */
1358 out:
1359         local_bh_enable();
1360         return rc;
1361 }
1362
1363
1364 /*=======================================================================
1365                         Receiver routines
1366   =======================================================================*/
1367
     /* netif_rx() drops packets once a per-cpu input queue is longer than this. */
1368 int netdev_max_backlog = 300;
1369 int weight_p = 64;            /* old backlog weight; copied to backlog_dev->weight by process_backlog() */
1370 /* These numbers are selected based on intuition and some
1371  * experimentation, if you have a more scientific way of doing this
1372  * please go ahead and fix things.
     *
     * get_sample_stats() compares the averaged backlog against no_cong,
     * lo_cong and mod_cong to pick the reported congestion level.
1373  */
1374 int no_cong_thresh = 10;
1375 int no_cong = 20;
1376 int lo_cong = 100;
1377 int mod_cong = 290;
1378
     /* Per-cpu receive counters (total, dropped, throttled are updated in netif_rx()). */
1379 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1380
1381
1382 static void get_sample_stats(int cpu)
1383 {
1384 #ifdef RAND_LIE
1385         unsigned long rd;
1386         int rq;
1387 #endif
1388         struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1389         int blog = sd->input_pkt_queue.qlen;
1390         int avg_blog = sd->avg_blog;
1391
1392         avg_blog = (avg_blog >> 1) + (blog >> 1);
1393
1394         if (avg_blog > mod_cong) {
1395                 /* Above moderate congestion levels. */
1396                 sd->cng_level = NET_RX_CN_HIGH;
1397 #ifdef RAND_LIE
1398                 rd = net_random();
1399                 rq = rd % netdev_max_backlog;
1400                 if (rq < avg_blog) /* unlucky bastard */
1401                         sd->cng_level = NET_RX_DROP;
1402 #endif
1403         } else if (avg_blog > lo_cong) {
1404                 sd->cng_level = NET_RX_CN_MOD;
1405 #ifdef RAND_LIE
1406                 rd = net_random();
1407                 rq = rd % netdev_max_backlog;
1408                         if (rq < avg_blog) /* unlucky bastard */
1409                                 sd->cng_level = NET_RX_CN_HIGH;
1410 #endif
1411         } else if (avg_blog > no_cong)
1412                 sd->cng_level = NET_RX_CN_LOW;
1413         else  /* no congestion */
1414                 sd->cng_level = NET_RX_SUCCESS;
1415
1416         sd->avg_blog = avg_blog;
1417 }
1418
#ifdef OFFLINE_SAMPLE
/*
 * Timer callback: sample the congestion statistics of the current cpu
 * and re-arm samp_timer to fire again one jiffy from now.
 */
static void sample_queue(unsigned long dummy)
{
	/* 10 ms or 1 ms -- i don't care -- JHS */
	int cpu = smp_processor_id();

	get_sample_stats(cpu);

	/*
	 * The expiry is computed in unsigned long, the type of jiffies and
	 * of mod_timer()'s argument; going through an int would truncate
	 * it on 64-bit machines.
	 */
	mod_timer(&samp_timer, jiffies + 1);
}
#endif
1431
1432
1433 /**
1434  *      netif_rx        -       post buffer to the network code
1435  *      @skb: buffer to post
1436  *
1437  *      This function receives a packet from a device driver and queues it for
1438  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1439  *      may be dropped during processing for congestion control or by the
1440  *      protocol layers.
1441  *
1442  *      return values:
1443  *      NET_RX_SUCCESS  (no congestion)
1444  *      NET_RX_CN_LOW   (low congestion)
1445  *      NET_RX_CN_MOD   (moderate congestion)
1446  *      NET_RX_CN_HIGH  (high congestion)
1447  *      NET_RX_DROP     (packet was dropped)
1448  *
     *      The buffer is appended to the input_pkt_queue of the current
     *      cpu with local interrupts disabled, and a reference on skb->dev
     *      is taken for as long as it sits there.  NET_RX_DROP is also
     *      returned when netpoll_rx() claims the buffer.
1449  */
1450
1451 int netif_rx(struct sk_buff *skb)
1452 {
1453         int this_cpu;
1454         struct softnet_data *queue;
1455         unsigned long flags;
1456
1457         /* if netpoll wants it, pretend we never saw it */
1458         if (netpoll_rx(skb))
1459                 return NET_RX_DROP;
1460
             /* Only stamp buffers that do not carry a time stamp yet. */
1461         if (!skb->stamp.tv_sec)
1462                 net_timestamp(&skb->stamp);
1463
1464         /*
1465          * The code is rearranged so that the path is the most
1466          * short when CPU is congested, but is still operating.
1467          */
1468         local_irq_save(flags);
1469         this_cpu = smp_processor_id();
1470         queue = &__get_cpu_var(softnet_data);
1471
1472         __get_cpu_var(netdev_rx_stat).total++;
1473         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1474                 if (queue->input_pkt_queue.qlen) {
                             /* While throttled, drop until the queue has drained. */
1475                         if (queue->throttle)
1476                                 goto drop;
1477
     /* Also entered by the "goto enqueue" below, for a queue that was empty. */
1478 enqueue:
                             /* Device reference is held while the skb is queued. */
1479                         dev_hold(skb->dev);
1480                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1481 #ifndef OFFLINE_SAMPLE
1482                         get_sample_stats(this_cpu);
1483 #endif
1484                         local_irq_restore(flags);
1485                         return queue->cng_level;
1486                 }
1487
                     /* Queue is empty: leave throttled state and schedule the backlog device. */
1488                 if (queue->throttle)
1489                         queue->throttle = 0;
1490
1491                 netif_rx_schedule(&queue->backlog_dev);
1492                 goto enqueue;
1493         }
1494
             /* Queue is over netdev_max_backlog: enter throttled state (counted once). */
1495         if (!queue->throttle) {
1496                 queue->throttle = 1;
1497                 __get_cpu_var(netdev_rx_stat).throttled++;
1498         }
1499
1500 drop:
1501         __get_cpu_var(netdev_rx_stat).dropped++;
1502         local_irq_restore(flags);
1503
1504         kfree_skb(skb);
1505         return NET_RX_DROP;
1506 }
1507
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq left pending after the buffer was queued.  Returns what
 * netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();

	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}

EXPORT_SYMBOL(netif_rx_ni);
1522
1523 static __inline__ void skb_bond(struct sk_buff *skb)
1524 {
1525         struct net_device *dev = skb->dev;
1526
1527         if (dev->master) {
1528                 skb->real_dev = skb->dev;
1529                 skb->dev = dev->master;
1530         }
1531 }
1532
1533 static void net_tx_action(struct softirq_action *h)
1534 {
1535         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1536
1537         if (sd->completion_queue) {
1538                 struct sk_buff *clist;
1539
1540                 local_irq_disable();
1541                 clist = sd->completion_queue;
1542                 sd->completion_queue = NULL;
1543                 local_irq_enable();
1544
1545                 while (clist) {
1546                         struct sk_buff *skb = clist;
1547                         clist = clist->next;
1548
1549                         BUG_TRAP(!atomic_read(&skb->users));
1550                         __kfree_skb(skb);
1551                 }
1552         }
1553
1554         if (sd->output_queue) {
1555                 struct net_device *head;
1556
1557                 local_irq_disable();
1558                 head = sd->output_queue;
1559                 sd->output_queue = NULL;
1560                 local_irq_enable();
1561
1562                 while (head) {
1563                         struct net_device *dev = head;
1564                         head = head->next_sched;
1565
1566                         smp_mb__before_clear_bit();
1567                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1568
1569                         if (spin_trylock(&dev->queue_lock)) {
1570                                 qdisc_run(dev);
1571                                 spin_unlock(&dev->queue_lock);
1572                         } else {
1573                                 netif_schedule(dev);
1574                         }
1575                 }
1576         }
1577 }
1578
1579 static __inline__ int deliver_skb(struct sk_buff *skb,
1580                                   struct packet_type *pt_prev)
1581 {
1582         atomic_inc(&skb->users);
1583         return pt_prev->func(skb, skb->dev, pt_prev);
1584 }
1585
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* Function pointer hooks; nothing in this part of the file assigns them. */
int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);

/*
 * Pass a received buffer to br_handle_frame_hook when its device has a
 * bridge port.  Loopback packets and devices without a bridge port are
 * left alone (0 is returned).  A pending handler in *pt_prev is
 * delivered first and cleared.  Otherwise returns what the hook returned.
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
				    struct packet_type **pt_prev, int *ret)
{
	struct net_bridge_port *port;

	if ((*pskb)->pkt_type == PACKET_LOOPBACK)
		return 0;

	port = rcu_dereference((*pskb)->dev->br_port);
	if (port == NULL)
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(*pskb, *pt_prev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, pskb);
}
#else
#define handle_bridge(skb, pt_prev, ret)	(0)
#endif
1612
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesnt stop any functionality; if you dont have
 * the ingress scheduler, you just cant add policies on ingress.
 *
 */

/*
 * Run the ingress qdisc of the receiving device, if it has one, on
 * @skb.  Returns TC_ACT_OK when there is no ingress qdisc, TC_ACT_SHOT
 * when the redirect counter kept in tc_verd already exceeds
 * MAX_RED_LOOP, and otherwise the result of the qdisc's enqueue method.
 */
static int ing_filter(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct Qdisc *q;
	int result = TC_ACT_OK;
	__u32 ttl;

	if (!dev->qdisc_ingress)
		return result;

	ttl = (__u32) G_TC_RTTL(skb->tc_verd);
	if (ttl > MAX_RED_LOOP) {
		printk("Redir loop detected Dropping packet (%s->%s)\n",
		       skb->input_dev ? skb->input_dev->name : "??",
		       skb->dev->name);
		return TC_ACT_SHOT;
	}
	ttl++;

	/* Store the bumped redirect counter and mark the skb as at ingress */
	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	if (NULL == skb->input_dev) {
		skb->input_dev = skb->dev;
		printk("ing_filter:  fixed  %s out %s\n",
		       skb->input_dev->name, skb->dev->name);
	}

	/* The qdisc pointer is read again now that ingress_lock is held */
	spin_lock(&dev->ingress_lock);
	q = dev->qdisc_ingress;
	if (q != NULL)
		result = q->enqueue(skb, q);
	spin_unlock(&dev->ingress_lock);

	return result;
}
#endif
1653
     /*
      *      netif_receive_skb - deliver a received buffer to the protocol handlers
      *      @skb: buffer to process
      *
      *      Walks the ptype_all list and then the ptype_base bucket selected
      *      by the low four bits of the protocol number, all under
      *      rcu_read_lock().  Delivery is deferred by one handler (pt_prev):
      *      every handler but the last gets the buffer through deliver_skb(),
      *      which takes an extra reference, and the last one is called
      *      directly.  When no handler matches the buffer is freed.
      *
      *      Returns the result of the last handler called, NET_RX_DROP when
      *      netpoll took the buffer or nobody wanted it, or the return value
      *      of handle_bridge()'s last delivery when the bridge consumed it.
      *      NOTE(review): with CONFIG_NET_CLS_ACT a shot/stolen packet makes
      *      this return the TC_ACT_* verdict itself - confirm callers cope.
      */
1654 int netif_receive_skb(struct sk_buff *skb)
1655 {
1656         struct packet_type *ptype, *pt_prev;
1657         int ret = NET_RX_DROP;
1658         unsigned short type;
1659
1660         /* if we've gotten here through NAPI, check netpoll */
1661         if (skb->dev->poll && netpoll_rx(skb))
1662                 return NET_RX_DROP;
1663
1664         if (!skb->stamp.tv_sec)
1665                 net_timestamp(&skb->stamp);
1666
             /* May replace skb->dev by its master device. */
1667         skb_bond(skb);
1668
1669         __get_cpu_var(netdev_rx_stat).total++;
1670
             /* Both header pointers start at the data; mac_len is what precedes it. */
1671         skb->h.raw = skb->nh.raw = skb->data;
1672         skb->mac_len = skb->nh.raw - skb->mac.raw;
1673
1674         pt_prev = NULL;
1675
1676         rcu_read_lock();
1677
1678 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip both the taps and the ingress filter. */
1679         if (skb->tc_verd & TC_NCLS) {
1680                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
1681                 goto ncls;
1682         }
1683 #endif
1684
             /* Taps on ptype_all that are unbound or bound to this device. */
1685         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1686                 if (!ptype->dev || ptype->dev == skb->dev) {
1687                         if (pt_prev)
1688                                 ret = deliver_skb(skb, pt_prev);
1689                         pt_prev = ptype;
1690                 }
1691         }
1692
1693 #ifdef CONFIG_NET_CLS_ACT
             /* Flush the pending tap before the ingress filter sees the buffer. */
1694         if (pt_prev) {
1695                 ret = deliver_skb(skb, pt_prev);
1696                 pt_prev = NULL; /* noone else should process this after*/
1697         } else {
1698                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1699         }
1700
1701         ret = ing_filter(skb);
1702
1703         if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
1704                 kfree_skb(skb);
1705                 goto out;
1706         }
1707
1708         skb->tc_verd = 0;
1709 ncls:
1710 #endif
1711
1712         handle_diverter(skb);
1713
             /* A non-zero result means the bridge path took the buffer. */
1714         if (handle_bridge(&skb, &pt_prev, &ret))
1715                 goto out;
1716
             /* Protocol handlers: bucket is the low 4 bits of the host-order type. */
1717         type = skb->protocol;
1718         list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
1719                 if (ptype->type == type &&
1720                     (!ptype->dev || ptype->dev == skb->dev)) {
1721                         if (pt_prev)
1722                                 ret = deliver_skb(skb, pt_prev);
1723                         pt_prev = ptype;
1724                 }
1725         }
1726
             /* The last handler is called directly, without an extra reference. */
1727         if (pt_prev) {
1728                 ret = pt_prev->func(skb, skb->dev, pt_prev);
1729         } else {
1730                 kfree_skb(skb);
1731                 /* Jamal, now you will not able to escape explaining
1732                  * me how you were going to use this. :-)
1733                  */
1734                 ret = NET_RX_DROP;
1735         }
1736
1737 out:
1738         rcu_read_unlock();
1739         return ret;
1740 }
1741
1742 static int process_backlog(struct net_device *backlog_dev, int *budget)
1743 {
1744         int work = 0;
1745         int quota = min(backlog_dev->quota, *budget);
1746         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1747         unsigned long start_time = jiffies;
1748
1749         backlog_dev->weight = weight_p;
1750         for (;;) {
1751                 struct sk_buff *skb;
1752                 struct net_device *dev;
1753
1754                 local_irq_disable();
1755                 skb = __skb_dequeue(&queue->input_pkt_queue);
1756                 if (!skb)
1757                         goto job_done;
1758                 local_irq_enable();
1759
1760                 dev = skb->dev;
1761
1762                 netif_receive_skb(skb);
1763
1764                 dev_put(dev);
1765
1766                 work++;
1767
1768                 if (work >= quota || jiffies - start_time > 1)
1769                         break;
1770
1771         }
1772
1773         backlog_dev->quota -= work;
1774         *budget -= work;
1775         return -1;
1776
1777 job_done:
1778         backlog_dev->quota -= work;
1779         *budget -= work;
1780
1781         list_del(&backlog_dev->poll_list);
1782         smp_mb__before_clear_bit();
1783         netif_poll_enable(backlog_dev);
1784
1785         if (queue->throttle)
1786                 queue->throttle = 0;
1787         local_irq_enable();
1788         return 0;
1789 }
1790
1791 static void net_rx_action(struct softirq_action *h)
1792 {
1793         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1794         unsigned long start_time = jiffies;
1795         int budget = netdev_max_backlog;
1796
1797
1798         local_irq_disable();
1799
1800         while (!list_empty(&queue->poll_list)) {
1801                 struct net_device *dev;
1802
1803                 if (budget <= 0 || jiffies - start_time > 1)
1804                         goto softnet_break;
1805
1806                 local_irq_enable();
1807
1808                 dev = list_entry(queue->poll_list.next,
1809                                  struct net_device, poll_list);
1810                 netpoll_poll_lock(dev);
1811
1812                 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1813                         netpoll_poll_unlock(dev);
1814                         local_irq_disable();
1815                         list_del(&dev->poll_list);
1816                         list_add_tail(&dev->poll_list, &queue->poll_list);
1817                         if (dev->quota < 0)
1818                                 dev->quota += dev->weight;
1819                         else
1820                                 dev->quota = dev->weight;
1821                 } else {
1822                         netpoll_poll_unlock(dev);
1823                         dev_put(dev);
1824                         local_irq_disable();
1825                 }
1826         }
1827 out:
1828         local_irq_enable();
1829         return;
1830
1831 softnet_break:
1832         __get_cpu_var(netdev_rx_stat).time_squeeze++;
1833         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1834         goto out;
1835 }
1836
1837 static gifconf_func_t * gifconf_list [NPROTO];
1838
1839 /**
1840  *      register_gifconf        -       register a SIOCGIF handler
1841  *      @family: Address family
1842  *      @gifconf: Function handler
1843  *
1844  *      Register protocol dependent address dumping routines. The handler
1845  *      that is passed must not be freed or reused until it has been replaced
1846  *      by another handler.
1847  */
1848 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1849 {
1850         if (family >= NPROTO)
1851                 return -EINVAL;
1852         gifconf_list[family] = gifconf;
1853         return 0;
1854 }
1855
1856
1857 /*
1858  *      Map an interface index to its name (SIOCGIFNAME)
1859  */
1860
1861 /*
1862  *      We need this ioctl for efficient implementation of the
1863  *      if_indextoname() function required by the IPv6 API.  Without
1864  *      it, we would have to search all the interfaces to find a
1865  *      match.  --pb
1866  */
1867
1868 static int dev_ifname(struct ifreq __user *arg)
1869 {
1870         struct net_device *dev;
1871         struct ifreq ifr;
1872
1873         /*
1874          *      Fetch the caller's info block.
1875          */
1876
1877         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1878                 return -EFAULT;
1879
1880         read_lock(&dev_base_lock);
1881         dev = __dev_get_by_index(ifr.ifr_ifindex);
1882         if (!dev) {
1883                 read_unlock(&dev_base_lock);
1884                 return -ENODEV;
1885         }
1886
1887         strcpy(ifr.ifr_name, dev->name);
1888         read_unlock(&dev_base_lock);
1889
1890         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1891                 return -EFAULT;
1892         return 0;
1893 }
1894
1895 /*
1896  *      Perform a SIOCGIFCONF call. This structure will change
1897  *      size eventually, and there is nothing I can do about it.
1898  *      Thus we will need a 'compatibility mode'.
1899  */
1900
1901 static int dev_ifconf(char __user *arg)
1902 {
1903         struct ifconf ifc;
1904         struct net_device *dev;
1905         char __user *pos;
1906         int len;
1907         int total;
1908         int i;
1909
1910         /*
1911          *      Fetch the caller's info block.
1912          */
1913
1914         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
1915                 return -EFAULT;
1916
1917         pos = ifc.ifc_buf;
1918         len = ifc.ifc_len;
1919
1920         /*
1921          *      Loop over the interfaces, and write an info block for each.
1922          */
1923
1924         total = 0;
1925         for (dev = dev_base; dev; dev = dev->next) {
1926                 if (vx_flags(VXF_HIDE_NETIF, 0) &&
1927                         !dev_in_nx_info(dev, current->nx_info))
1928                         continue;
1929                 for (i = 0; i < NPROTO; i++) {
1930                         if (gifconf_list[i]) {
1931                                 int done;
1932                                 if (!pos)
1933                                         done = gifconf_list[i](dev, NULL, 0);
1934                                 else
1935                                         done = gifconf_list[i](dev, pos + total,
1936                                                                len - total);
1937                                 if (done < 0)
1938                                         return -EFAULT;
1939                                 total += done;
1940                         }
1941                 }
1942         }
1943
1944         /*
1945          *      All done.  Write the updated control block back to the caller.
1946          */
1947         ifc.ifc_len = total;
1948
1949         /*
1950          *      Both BSD and Solaris return 0 here, so we do too.
1951          */
1952         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
1953 }
1954
1955 #ifdef CONFIG_PROC_FS
1956 /*
1957  *      This is invoked by the /proc filesystem handler to display a device
1958  *      in detail.
1959  */
1960 static __inline__ struct net_device *dev_get_idx(loff_t pos)
1961 {
1962         struct net_device *dev;
1963         loff_t i;
1964
1965         for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
1966
1967         return i == pos ? dev : NULL;
1968 }
1969
1970 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
1971 {
1972         read_lock(&dev_base_lock);
1973         return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
1974 }
1975
1976 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1977 {
1978         ++*pos;
1979         return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
1980 }
1981
1982 void dev_seq_stop(struct seq_file *seq, void *v)
1983 {
1984         read_unlock(&dev_base_lock);
1985 }
1986
1987 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
1988 {
1989         struct nx_info *nxi = current->nx_info;
1990
1991         if (vx_flags(VXF_HIDE_NETIF, 0) && !dev_in_nx_info(dev, nxi))
1992                 return;
1993         if (dev->get_stats) {
1994                 struct net_device_stats *stats = dev->get_stats(dev);
1995
1996                 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
1997                                 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
1998                            dev->name, stats->rx_bytes, stats->rx_packets,
1999                            stats->rx_errors,
2000                            stats->rx_dropped + stats->rx_missed_errors,
2001                            stats->rx_fifo_errors,
2002                            stats->rx_length_errors + stats->rx_over_errors +
2003                              stats->rx_crc_errors + stats->rx_frame_errors,
2004                            stats->rx_compressed, stats->multicast,
2005                            stats->tx_bytes, stats->tx_packets,
2006                            stats->tx_errors, stats->tx_dropped,
2007                            stats->tx_fifo_errors, stats->collisions,
2008                            stats->tx_carrier_errors +
2009                              stats->tx_aborted_errors +
2010                              stats->tx_window_errors +
2011                              stats->tx_heartbeat_errors,
2012                            stats->tx_compressed);
2013         } else
2014                 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
2015 }
2016
2017 /*
2018  *      Called from the PROCfs module. This now uses the new arbitrary sized
2019  *      /proc/net interface to create /proc/net/dev
2020  */
2021 static int dev_seq_show(struct seq_file *seq, void *v)
2022 {
2023         if (v == SEQ_START_TOKEN)
2024                 seq_puts(seq, "Inter-|   Receive                            "
2025                               "                    |  Transmit\n"
2026                               " face |bytes    packets errs drop fifo frame "
2027                               "compressed multicast|bytes    packets errs "
2028                               "drop fifo colls carrier compressed\n");
2029         else
2030                 dev_seq_printf_stats(seq, v);
2031         return 0;
2032 }
2033
2034 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2035 {
2036         struct netif_rx_stats *rc = NULL;
2037
2038         while (*pos < NR_CPUS)
2039                 if (cpu_online(*pos)) {
2040                         rc = &per_cpu(netdev_rx_stat, *pos);
2041                         break;
2042                 } else
2043                         ++*pos;
2044         return rc;
2045 }
2046
2047 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2048 {
2049         return softnet_get_online(pos);
2050 }
2051
2052 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2053 {
2054         ++*pos;
2055         return softnet_get_online(pos);
2056 }
2057
/* seq_file ->stop: nothing to release, softnet_seq_start() takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2061
2062 static int softnet_seq_show(struct seq_file *seq, void *v)
2063 {
2064         struct netif_rx_stats *s = v;
2065
2066         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2067                    s->total, s->dropped, s->time_squeeze, s->throttled,
2068                    s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
2069                    s->fastroute_deferred_out,
2070 #if 0
2071                    s->fastroute_latency_reduction
2072 #else
2073                    s->cpu_collision
2074 #endif
2075                   );
2076         return 0;
2077 }
2078
2079 static struct seq_operations dev_seq_ops = {
2080         .start = dev_seq_start,
2081         .next  = dev_seq_next,
2082         .stop  = dev_seq_stop,
2083         .show  = dev_seq_show,
2084 };
2085
2086 static int dev_seq_open(struct inode *inode, struct file *file)
2087 {
2088         return seq_open(file, &dev_seq_ops);
2089 }
2090
2091 static struct file_operations dev_seq_fops = {
2092         .owner   = THIS_MODULE,
2093         .open    = dev_seq_open,
2094         .read    = seq_read,
2095         .llseek  = seq_lseek,
2096         .release = seq_release,
2097 };
2098
2099 static struct seq_operations softnet_seq_ops = {
2100         .start = softnet_seq_start,
2101         .next  = softnet_seq_next,
2102         .stop  = softnet_seq_stop,
2103         .show  = softnet_seq_show,
2104 };
2105
2106 static int softnet_seq_open(struct inode *inode, struct file *file)
2107 {
2108         return seq_open(file, &softnet_seq_ops);
2109 }
2110
2111 static struct file_operations softnet_seq_fops = {
2112         .owner   = THIS_MODULE,
2113         .open    = softnet_seq_open,
2114         .read    = seq_read,
2115         .llseek  = seq_lseek,
2116         .release = seq_release,
2117 };
2118
2119 #ifdef WIRELESS_EXT
2120 extern int wireless_proc_init(void);
2121 #else
2122 #define wireless_proc_init() 0
2123 #endif
2124
2125 static int __init dev_proc_init(void)
2126 {
2127         int rc = -ENOMEM;
2128
2129         if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2130                 goto out;
2131         if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2132                 goto out_dev;
2133         if (wireless_proc_init())
2134                 goto out_softnet;
2135         rc = 0;
2136 out:
2137         return rc;
2138 out_softnet:
2139         proc_net_remove("softnet_stat");
2140 out_dev:
2141         proc_net_remove("dev");
2142         goto out;
2143 }
2144 #else
2145 #define dev_proc_init() 0
2146 #endif  /* CONFIG_PROC_FS */
2147
2148
2149 /**
2150  *      netdev_set_master       -       set up master/slave pair
2151  *      @slave: slave device
2152  *      @master: new master device
2153  *
2154  *      Changes the master device of the slave. Pass %NULL to break the
2155  *      bonding. The caller must hold the RTNL semaphore. On a failure
2156  *      a negative errno code is returned. On success the reference counts
2157  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2158  *      function returns zero.
2159  */
2160 int netdev_set_master(struct net_device *slave, struct net_device *master)
2161 {
2162         struct net_device *old = slave->master;
2163
2164         ASSERT_RTNL();
2165
2166         if (master) {
2167                 if (old)
2168                         return -EBUSY;
2169                 dev_hold(master);
2170         }
2171
2172         slave->master = master;
2173
2174         synchronize_net();
2175
2176         if (old)
2177                 dev_put(old);
2178
2179         if (master)
2180                 slave->flags |= IFF_SLAVE;
2181         else
2182                 slave->flags &= ~IFF_SLAVE;
2183
2184         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2185         return 0;
2186 }
2187
2188 /**
2189  *      dev_set_promiscuity     - update promiscuity count on a device
2190  *      @dev: device
2191  *      @inc: modifier
2192  *
2193  *      Add or remove promsicuity from a device. While the count in the device
2194  *      remains above zero the interface remains promiscuous. Once it hits zero
2195  *      the device reverts back to normal filtering operation. A negative inc
2196  *      value is used to drop promiscuity on the device.
2197  */
2198 void dev_set_promiscuity(struct net_device *dev, int inc)
2199 {
2200         unsigned short old_flags = dev->flags;
2201
2202         dev->flags |= IFF_PROMISC;
2203         if ((dev->promiscuity += inc) == 0)
2204                 dev->flags &= ~IFF_PROMISC;
2205         if (dev->flags ^ old_flags) {
2206                 dev_mc_upload(dev);
2207                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2208                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2209                                                                "left");
2210         }
2211 }
2212
2213 /**
2214  *      dev_set_allmulti        - update allmulti count on a device
2215  *      @dev: device
2216  *      @inc: modifier
2217  *
2218  *      Add or remove reception of all multicast frames to a device. While the
2219  *      count in the device remains above zero the interface remains listening
2220  *      to all interfaces. Once it hits zero the device reverts back to normal
2221  *      filtering operation. A negative @inc value is used to drop the counter
2222  *      when releasing a resource needing all multicasts.
2223  */
2224
2225 void dev_set_allmulti(struct net_device *dev, int inc)
2226 {
2227         unsigned short old_flags = dev->flags;
2228
2229         dev->flags |= IFF_ALLMULTI;
2230         if ((dev->allmulti += inc) == 0)
2231                 dev->flags &= ~IFF_ALLMULTI;
2232         if (dev->flags ^ old_flags)
2233                 dev_mc_upload(dev);
2234 }
2235
2236 unsigned dev_get_flags(const struct net_device *dev)
2237 {
2238         unsigned flags;
2239
2240         flags = (dev->flags & ~(IFF_PROMISC |
2241                                 IFF_ALLMULTI |
2242                                 IFF_RUNNING)) |
2243                 (dev->gflags & (IFF_PROMISC |
2244                                 IFF_ALLMULTI));
2245
2246         if (netif_running(dev) && netif_carrier_ok(dev))
2247                 flags |= IFF_RUNNING;
2248
2249         return flags;
2250 }
2251
2252 int dev_change_flags(struct net_device *dev, unsigned flags)
2253 {
2254         int ret;
2255         int old_flags = dev->flags;
2256
2257         /*
2258          *      Set the flags on our device.
2259          */
2260
2261         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2262                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2263                                IFF_AUTOMEDIA)) |
2264                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2265                                     IFF_ALLMULTI));
2266
2267         /*
2268          *      Load in the correct multicast list now the flags have changed.
2269          */
2270
2271         dev_mc_upload(dev);
2272
2273         /*
2274          *      Have we downed the interface. We handle IFF_UP ourselves
2275          *      according to user attempts to set it, rather than blindly
2276          *      setting it.
2277          */
2278
2279         ret = 0;
2280         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
2281                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2282
2283                 if (!ret)
2284                         dev_mc_upload(dev);
2285         }
2286
2287         if (dev->flags & IFF_UP &&
2288             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2289                                           IFF_VOLATILE)))
2290                 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
2291
2292         if ((flags ^ dev->gflags) & IFF_PROMISC) {
2293                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2294                 dev->gflags ^= IFF_PROMISC;
2295                 dev_set_promiscuity(dev, inc);
2296         }
2297
2298         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2299            is important. Some (broken) drivers set IFF_PROMISC, when
2300            IFF_ALLMULTI is requested not asking us and not reporting.
2301          */
2302         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2303                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2304                 dev->gflags ^= IFF_ALLMULTI;
2305                 dev_set_allmulti(dev, inc);
2306         }
2307
2308         if (old_flags ^ dev->flags)
2309                 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2310
2311         return ret;
2312 }
2313
2314 int dev_set_mtu(struct net_device *dev, int new_mtu)
2315 {
2316         int err;
2317
2318         if (new_mtu == dev->mtu)
2319                 return 0;
2320
2321         /*      MTU must be positive.    */
2322         if (new_mtu < 0)
2323                 return -EINVAL;
2324
2325         if (!netif_device_present(dev))
2326                 return -ENODEV;
2327
2328         err = 0;
2329         if (dev->change_mtu)
2330                 err = dev->change_mtu(dev, new_mtu);
2331         else
2332                 dev->mtu = new_mtu;
2333         if (!err && dev->flags & IFF_UP)
2334                 notifier_call_chain(&netdev_chain,
2335                                     NETDEV_CHANGEMTU, dev);
2336         return err;
2337 }
2338
2339 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2340 {
2341         int err;
2342
2343         if (!dev->set_mac_address)
2344                 return -EOPNOTSUPP;
2345         if (sa->sa_family != dev->type)
2346                 return -EINVAL;
2347         if (!netif_device_present(dev))
2348                 return -ENODEV;
2349         err = dev->set_mac_address(dev, sa);
2350         if (!err)
2351                 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
2352         return err;
2353 }
2354
2355 /*
2356  *      Perform the SIOCxIFxxx calls.
2357  */
2358 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2359 {
2360         int err;
2361         struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2362
2363         if (!dev)
2364                 return -ENODEV;
2365
2366         switch (cmd) {
2367                 case SIOCGIFFLAGS:      /* Get interface flags */
2368                         ifr->ifr_flags = dev_get_flags(dev);
2369                         return 0;
2370
2371                 case SIOCSIFFLAGS:      /* Set interface flags */
2372                         return dev_change_flags(dev, ifr->ifr_flags);
2373
2374                 case SIOCGIFMETRIC:     /* Get the metric on the interface
2375                                            (currently unused) */
2376                         ifr->ifr_metric = 0;
2377                         return 0;
2378
2379                 case SIOCSIFMETRIC:     /* Set the metric on the interface
2380                                            (currently unused) */
2381                         return -EOPNOTSUPP;
2382
2383                 case SIOCGIFMTU:        /* Get the MTU of a device */
2384                         ifr->ifr_mtu = dev->mtu;
2385                         return 0;
2386
2387                 case SIOCSIFMTU:        /* Set the MTU of a device */
2388                         return dev_set_mtu(dev, ifr->ifr_mtu);
2389
2390                 case SIOCGIFHWADDR:
2391                         if (!dev->addr_len)
2392                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2393                         else
2394                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2395                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2396                         ifr->ifr_hwaddr.sa_family = dev->type;
2397                         return 0;
2398
2399                 case SIOCSIFHWADDR:
2400                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2401
2402                 case SIOCSIFHWBROADCAST:
2403                         if (ifr->ifr_hwaddr.sa_family != dev->type)
2404                                 return -EINVAL;
2405                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2406                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2407                         notifier_call_chain(&netdev_chain,
2408                                             NETDEV_CHANGEADDR, dev);
2409                         return 0;
2410
2411                 case SIOCGIFMAP:
2412                         ifr->ifr_map.mem_start = dev->mem_start;
2413                         ifr->ifr_map.mem_end   = dev->mem_end;
2414                         ifr->ifr_map.base_addr = dev->base_addr;
2415                         ifr->ifr_map.irq       = dev->irq;
2416                         ifr->ifr_map.dma       = dev->dma;
2417                         ifr->ifr_map.port      = dev->if_port;
2418                         return 0;
2419
2420                 case SIOCSIFMAP:
2421                         if (dev->set_config) {
2422                                 if (!netif_device_present(dev))
2423                                         return -ENODEV;
2424                                 return dev->set_config(dev, &ifr->ifr_map);
2425                         }
2426                         return -EOPNOTSUPP;
2427
2428                 case SIOCADDMULTI:
2429                         if (!dev->set_multicast_list ||
2430                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2431                                 return -EINVAL;
2432                         if (!netif_device_present(dev))
2433                                 return -ENODEV;
2434                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2435                                           dev->addr_len, 1);
2436
2437                 case SIOCDELMULTI:
2438                         if (!dev->set_multicast_list ||
2439                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2440                                 return -EINVAL;
2441                         if (!netif_device_present(dev))
2442                                 return -ENODEV;
2443                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2444                                              dev->addr_len, 1);
2445
2446                 case SIOCGIFINDEX:
2447                         ifr->ifr_ifindex = dev->ifindex;
2448                         return 0;
2449
2450                 case SIOCGIFTXQLEN:
2451                         ifr->ifr_qlen = dev->tx_queue_len;
2452                         return 0;
2453
2454                 case SIOCSIFTXQLEN:
2455                         if (ifr->ifr_qlen < 0)
2456                                 return -EINVAL;
2457                         dev->tx_queue_len = ifr->ifr_qlen;
2458                         return 0;
2459
2460                 case SIOCSIFNAME:
2461                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2462                         return dev_change_name(dev, ifr->ifr_newname);
2463
2464                 /*
2465                  *      Unknown or private ioctl
2466                  */
2467
2468                 default:
2469                         if ((cmd >= SIOCDEVPRIVATE &&
2470                             cmd <= SIOCDEVPRIVATE + 15) ||
2471                             cmd == SIOCBONDENSLAVE ||
2472                             cmd == SIOCBONDRELEASE ||
2473                             cmd == SIOCBONDSETHWADDR ||
2474                             cmd == SIOCBONDSLAVEINFOQUERY ||
2475                             cmd == SIOCBONDINFOQUERY ||
2476                             cmd == SIOCBONDCHANGEACTIVE ||
2477                             cmd == SIOCGMIIPHY ||
2478                             cmd == SIOCGMIIREG ||
2479                             cmd == SIOCSMIIREG ||
2480                             cmd == SIOCBRADDIF ||
2481                             cmd == SIOCBRDELIF ||
2482                             cmd == SIOCWANDEV) {
2483                                 err = -EOPNOTSUPP;
2484                                 if (dev->do_ioctl) {
2485                                         if (netif_device_present(dev))
2486                                                 err = dev->do_ioctl(dev, ifr,
2487                                                                     cmd);
2488                                         else
2489                                                 err = -ENODEV;
2490                                 }
2491                         } else
2492                                 err = -EINVAL;
2493
2494         }
2495         return err;
2496 }
2497
2498 /*
2499  *      This function handles all "interface"-type I/O control requests. The actual
2500  *      'doing' part of this is dev_ifsioc above.
2501  */
2502
2503 /**
2504  *      dev_ioctl       -       network device ioctl
2505  *      @cmd: command to issue
2506  *      @arg: pointer to a struct ifreq in user space
2507  *
2508  *      Issue ioctl functions to devices. This is normally called by the
2509  *      user space syscall interfaces but can sometimes be useful for
2510  *      other purposes. The return value is the return from the syscall if
2511  *      positive or a negative errno code on error.
2512  */
2513
2514 int dev_ioctl(unsigned int cmd, void __user *arg)
2515 {
2516         struct ifreq ifr;
2517         int ret;
2518         char *colon;
2519
2520         /* One special case: SIOCGIFCONF takes ifconf argument
2521            and requires shared lock, because it sleeps writing
2522            to user space.
2523          */
2524
2525         if (cmd == SIOCGIFCONF) {
2526                 rtnl_shlock();
2527                 ret = dev_ifconf((char __user *) arg);
2528                 rtnl_shunlock();
2529                 return ret;
2530         }
2531         if (cmd == SIOCGIFNAME)
2532                 return dev_ifname((struct ifreq __user *)arg);
2533
2534         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2535                 return -EFAULT;
2536
2537         ifr.ifr_name[IFNAMSIZ-1] = 0;
2538
2539         colon = strchr(ifr.ifr_name, ':');
2540         if (colon)
2541                 *colon = 0;
2542
2543         /*
2544          *      See which interface the caller is talking about.
2545          */
2546
2547         switch (cmd) {
2548                 /*
2549                  *      These ioctl calls:
2550                  *      - can be done by all.
2551                  *      - atomic and do not require locking.
2552                  *      - return a value
2553                  */
2554                 case SIOCGIFFLAGS:
2555                 case SIOCGIFMETRIC:
2556                 case SIOCGIFMTU:
2557                 case SIOCGIFHWADDR:
2558                 case SIOCGIFSLAVE:
2559                 case SIOCGIFMAP:
2560                 case SIOCGIFINDEX:
2561                 case SIOCGIFTXQLEN:
2562                         dev_load(ifr.ifr_name);
2563                         read_lock(&dev_base_lock);
2564                         ret = dev_ifsioc(&ifr, cmd);
2565                         read_unlock(&dev_base_lock);
2566                         if (!ret) {
2567                                 if (colon)
2568                                         *colon = ':';
2569                                 if (copy_to_user(arg, &ifr,
2570                                                  sizeof(struct ifreq)))
2571                                         ret = -EFAULT;
2572                         }
2573                         return ret;
2574
2575                 case SIOCETHTOOL:
2576                         dev_load(ifr.ifr_name);
2577                         rtnl_lock();
2578                         ret = dev_ethtool(&ifr);
2579                         rtnl_unlock();
2580                         if (!ret) {
2581                                 if (colon)
2582                                         *colon = ':';
2583                                 if (copy_to_user(arg, &ifr,
2584                                                  sizeof(struct ifreq)))
2585                                         ret = -EFAULT;
2586                         }
2587                         return ret;
2588
2589                 /*
2590                  *      These ioctl calls:
2591                  *      - require superuser power.
2592                  *      - require strict serialization.
2593                  *      - return a value
2594                  */
2595                 case SIOCGMIIPHY:
2596                 case SIOCGMIIREG:
2597                 case SIOCSIFNAME:
2598                         if (!capable(CAP_NET_ADMIN))
2599                                 return -EPERM;
2600                         dev_load(ifr.ifr_name);
2601                         rtnl_lock();
2602                         ret = dev_ifsioc(&ifr, cmd);
2603                         rtnl_unlock();
2604                         if (!ret) {
2605                                 if (colon)
2606                                         *colon = ':';
2607                                 if (copy_to_user(arg, &ifr,
2608                                                  sizeof(struct ifreq)))
2609                                         ret = -EFAULT;
2610                         }
2611                         return ret;
2612
2613                 /*
2614                  *      These ioctl calls:
2615                  *      - require superuser power.
2616                  *      - require strict serialization.
2617                  *      - do not return a value
2618                  */
2619                 case SIOCSIFFLAGS:
2620                 case SIOCSIFMETRIC:
2621                 case SIOCSIFMTU:
2622                 case SIOCSIFMAP:
2623                 case SIOCSIFHWADDR:
2624                 case SIOCSIFSLAVE:
2625                 case SIOCADDMULTI:
2626                 case SIOCDELMULTI:
2627                 case SIOCSIFHWBROADCAST:
2628                 case SIOCSIFTXQLEN:
2629                 case SIOCSMIIREG:
2630                 case SIOCBONDENSLAVE:
2631                 case SIOCBONDRELEASE:
2632                 case SIOCBONDSETHWADDR:
2633                 case SIOCBONDSLAVEINFOQUERY:
2634                 case SIOCBONDINFOQUERY:
2635                 case SIOCBONDCHANGEACTIVE:
2636                 case SIOCBRADDIF:
2637                 case SIOCBRDELIF:
2638                         if (!capable(CAP_NET_ADMIN))
2639                                 return -EPERM;
2640                         dev_load(ifr.ifr_name);
2641                         rtnl_lock();
2642                         ret = dev_ifsioc(&ifr, cmd);
2643                         rtnl_unlock();
2644                         return ret;
2645
2646                 case SIOCGIFMEM:
2647                         /* Get the per device memory space. We can add this but
2648                          * currently do not support it */
2649                 case SIOCSIFMEM:
2650                         /* Set the per device memory buffer space.
2651                          * Not applicable in our case */
2652                 case SIOCSIFLINK:
2653                         return -EINVAL;
2654
2655                 /*
2656                  *      Unknown or private ioctl.
2657                  */
2658                 default:
2659                         if (cmd == SIOCWANDEV ||
2660                             (cmd >= SIOCDEVPRIVATE &&
2661                              cmd <= SIOCDEVPRIVATE + 15)) {
2662                                 dev_load(ifr.ifr_name);
2663                                 rtnl_lock();
2664                                 ret = dev_ifsioc(&ifr, cmd);
2665                                 rtnl_unlock();
2666                                 if (!ret && copy_to_user(arg, &ifr,
2667                                                          sizeof(struct ifreq)))
2668                                         ret = -EFAULT;
2669                                 return ret;
2670                         }
2671 #ifdef WIRELESS_EXT
2672                         /* Take care of Wireless Extensions */
2673                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2674                                 /* If command is `set a parameter', or
2675                                  * `get the encoding parameters', check if
2676                                  * the user has the right to do it */
2677                                 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
2678                                         if (!capable(CAP_NET_ADMIN))
2679                                                 return -EPERM;
2680                                 }
2681                                 dev_load(ifr.ifr_name);
2682                                 rtnl_lock();
2683                                 /* Follow me in net/core/wireless.c */
2684                                 ret = wireless_process_ioctl(&ifr, cmd);
2685                                 rtnl_unlock();
2686                                 if (IW_IS_GET(cmd) &&
2687                                     copy_to_user(arg, &ifr,
2688                                                  sizeof(struct ifreq)))
2689                                         ret = -EFAULT;
2690                                 return ret;
2691                         }
2692 #endif  /* WIRELESS_EXT */
2693                         return -EINVAL;
2694         }
2695 }
2696
2697
/**
 *      dev_new_index   -       allocate an ifindex
 *
 *      Advances a function-local static counter until it reaches a value
 *      for which __dev_get_by_index() finds no device, and returns that
 *      value.  Whenever the counter becomes zero or negative it is reset
 *      to 1, so the returned index is always positive.
 *
 *      The caller must hold the rtnl semaphore or the dev_base_lock to be
 *      sure the returned number remains unique.
 */
static int dev_new_index(void)
{
        static int ifindex;

        do {
                ifindex++;
                if (ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(ifindex));

        return ifindex;
}
2715
/* Set until net_dev_init() has completed; that function clears it. */
static int dev_boot_phase = 1;

/*
 * Delayed registration/unregistration.
 *
 * Devices are appended to net_todo_list by net_set_todo() and picked up
 * by netdev_run_todo(); net_todo_list_lock is taken around both.
 */
static DEFINE_SPINLOCK(net_todo_list_lock);
static LIST_HEAD(net_todo_list);
2721
2722 static inline void net_set_todo(struct net_device *dev)
2723 {
2724         spin_lock(&net_todo_list_lock);
2725         list_add_tail(&dev->todo_list, &net_todo_list);
2726         spin_unlock(&net_todo_list_lock);
2727 }
2728
2729 /**
2730  *      register_netdevice      - register a network device
2731  *      @dev: device to register
2732  *
2733  *      Take a completed network device structure and add it to the kernel
2734  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2735  *      chain. 0 is returned on success. A negative errno code is returned
2736  *      on a failure to set up the device, or if the name is a duplicate.
2737  *
2738  *      Callers must hold the rtnl semaphore. You may want
2739  *      register_netdev() instead of this.
2740  *
2741  *      BUGS:
2742  *      The locking appears insufficient to guarantee two parallel registers
2743  *      will not get the same name.
2744  */
2745
2746 int register_netdevice(struct net_device *dev)
2747 {
2748         struct hlist_head *head;
2749         struct hlist_node *p;
2750         int ret;
2751
2752         BUG_ON(dev_boot_phase);
2753         ASSERT_RTNL();
2754
2755         /* When net_device's are persistent, this will be fatal. */
2756         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2757
2758         spin_lock_init(&dev->queue_lock);
2759         spin_lock_init(&dev->xmit_lock);
2760         dev->xmit_lock_owner = -1;
2761 #ifdef CONFIG_NET_CLS_ACT
2762         spin_lock_init(&dev->ingress_lock);
2763 #endif
2764
2765         ret = alloc_divert_blk(dev);
2766         if (ret)
2767                 goto out;
2768
2769         dev->iflink = -1;
2770
2771         /* Init, if this function is available */
2772         if (dev->init) {
2773                 ret = dev->init(dev);
2774                 if (ret) {
2775                         if (ret > 0)
2776                                 ret = -EIO;
2777                         goto out_err;
2778                 }
2779         }
2780
2781         if (!dev_valid_name(dev->name)) {
2782                 ret = -EINVAL;
2783                 goto out_err;
2784         }
2785
2786         dev->ifindex = dev_new_index();
2787         if (dev->iflink == -1)
2788                 dev->iflink = dev->ifindex;
2789
2790         /* Check for existence of name */
2791         head = dev_name_hash(dev->name);
2792         hlist_for_each(p, head) {
2793                 struct net_device *d
2794                         = hlist_entry(p, struct net_device, name_hlist);
2795                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
2796                         ret = -EEXIST;
2797                         goto out_err;
2798                 }
2799         }
2800
2801         /* Fix illegal SG+CSUM combinations. */
2802         if ((dev->features & NETIF_F_SG) &&
2803             !(dev->features & (NETIF_F_IP_CSUM |
2804                                NETIF_F_NO_CSUM |
2805                                NETIF_F_HW_CSUM))) {
2806                 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2807                        dev->name);
2808                 dev->features &= ~NETIF_F_SG;
2809         }
2810
2811         /* TSO requires that SG is present as well. */
2812         if ((dev->features & NETIF_F_TSO) &&
2813             !(dev->features & NETIF_F_SG)) {
2814                 printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
2815                        dev->name);
2816                 dev->features &= ~NETIF_F_TSO;
2817         }
2818
2819         /*
2820          *      nil rebuild_header routine,
2821          *      that should be never called and used as just bug trap.
2822          */
2823
2824         if (!dev->rebuild_header)
2825                 dev->rebuild_header = default_rebuild_header;
2826
2827         /*
2828          *      Default initial state at registry is that the
2829          *      device is present.
2830          */
2831
2832         set_bit(__LINK_STATE_PRESENT, &dev->state);
2833
2834         dev->next = NULL;
2835         dev_init_scheduler(dev);
2836         write_lock_bh(&dev_base_lock);
2837         *dev_tail = dev;
2838         dev_tail = &dev->next;
2839         hlist_add_head(&dev->name_hlist, head);
2840         hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
2841         dev_hold(dev);
2842         dev->reg_state = NETREG_REGISTERING;
2843         write_unlock_bh(&dev_base_lock);
2844
2845         /* Notify protocols, that a new device appeared. */
2846         notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
2847
2848         /* Finish registration after unlock */
2849         net_set_todo(dev);
2850         ret = 0;
2851
2852 out:
2853         return ret;
2854 out_err:
2855         free_divert_blk(dev);
2856         goto out;
2857 }
2858
2859 /**
2860  *      register_netdev - register a network device
2861  *      @dev: device to register
2862  *
2863  *      Take a completed network device structure and add it to the kernel
2864  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2865  *      chain. 0 is returned on success. A negative errno code is returned
2866  *      on a failure to set up the device, or if the name is a duplicate.
2867  *
2868  *      This is a wrapper around register_netdev that takes the rtnl semaphore
2869  *      and expands the device name if you passed a format string to
2870  *      alloc_netdev.
2871  */
2872 int register_netdev(struct net_device *dev)
2873 {
2874         int err;
2875
2876         rtnl_lock();
2877
2878         /*
2879          * If the name is a format string the caller wants us to do a
2880          * name allocation.
2881          */
2882         if (strchr(dev->name, '%')) {
2883                 err = dev_alloc_name(dev, dev->name);
2884                 if (err < 0)
2885                         goto out;
2886         }
2887
2888         /*
2889          * Back compatibility hook. Kill this one in 2.5
2890          */
2891         if (dev->name[0] == 0 || dev->name[0] == ' ') {
2892                 err = dev_alloc_name(dev, "eth%d");
2893                 if (err < 0)
2894                         goto out;
2895         }
2896
2897         err = register_netdevice(dev);
2898 out:
2899         rtnl_unlock();
2900         return err;
2901 }
2902 EXPORT_SYMBOL(register_netdev);
2903
2904 /*
2905  * netdev_wait_allrefs - wait until all references are gone.
2906  *
2907  * This is called when unregistering network devices.
2908  *
2909  * Any protocol or device that holds a reference should register
2910  * for netdevice notification, and cleanup and put back the
2911  * reference if they receive an UNREGISTER event.
2912  * We can get stuck here if buggy protocols don't correctly
2913  * call dev_put.
2914  */
2915 static void netdev_wait_allrefs(struct net_device *dev)
2916 {
2917         unsigned long rebroadcast_time, warning_time;
2918
2919         rebroadcast_time = warning_time = jiffies;
2920         while (atomic_read(&dev->refcnt) != 0) {
2921                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
2922                         rtnl_shlock();
2923
2924                         /* Rebroadcast unregister notification */
2925                         notifier_call_chain(&netdev_chain,
2926                                             NETDEV_UNREGISTER, dev);
2927
2928                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
2929                                      &dev->state)) {
2930                                 /* We must not have linkwatch events
2931                                  * pending on unregister. If this
2932                                  * happens, we simply run the queue
2933                                  * unscheduled, resulting in a noop
2934                                  * for this device.
2935                                  */
2936                                 linkwatch_run_queue();
2937                         }
2938
2939                         rtnl_shunlock();
2940
2941                         rebroadcast_time = jiffies;
2942                 }
2943
2944                 msleep(250);
2945
2946                 if (time_after(jiffies, warning_time + 10 * HZ)) {
2947                         printk(KERN_EMERG "unregister_netdevice: "
2948                                "waiting for %s to become free. Usage "
2949                                "count = %d\n",
2950                                dev->name, atomic_read(&dev->refcnt));
2951                         warning_time = jiffies;
2952                 }
2953         }
2954 }
2955
2956 /* The sequence is:
2957  *
2958  *      rtnl_lock();
2959  *      ...
2960  *      register_netdevice(x1);
2961  *      register_netdevice(x2);
2962  *      ...
2963  *      unregister_netdevice(y1);
2964  *      unregister_netdevice(y2);
2965  *      ...
2966  *      rtnl_unlock();
2967  *      free_netdev(y1);
2968  *      free_netdev(y2);
2969  *
2970  * We are invoked by rtnl_unlock() after it drops the semaphore.
2971  * This allows us to deal with problems:
2972  * 1) We can create/delete sysfs objects which invoke hotplug
2973  *    without deadlocking with linkwatch via keventd.
2974  * 2) Since we run with the RTNL semaphore not held, we can sleep
2975  *    safely in order to wait for the netdev refcnt to drop to zero.
2976  */
2977 static DECLARE_MUTEX(net_todo_run_mutex);
2978 void netdev_run_todo(void)
2979 {
2980         struct list_head list = LIST_HEAD_INIT(list);
2981         int err;
2982
2983
2984         /* Need to guard against multiple cpu's getting out of order. */
2985         down(&net_todo_run_mutex);
2986
2987         /* Not safe to do outside the semaphore.  We must not return
2988          * until all unregister events invoked by the local processor
2989          * have been completed (either by this todo run, or one on
2990          * another cpu).
2991          */
2992         if (list_empty(&net_todo_list))
2993                 goto out;
2994
2995         /* Snapshot list, allow later requests */
2996         spin_lock(&net_todo_list_lock);
2997         list_splice_init(&net_todo_list, &list);
2998         spin_unlock(&net_todo_list_lock);
2999
3000         while (!list_empty(&list)) {
3001                 struct net_device *dev
3002                         = list_entry(list.next, struct net_device, todo_list);
3003                 list_del(&dev->todo_list);
3004
3005                 switch(dev->reg_state) {
3006                 case NETREG_REGISTERING:
3007                         err = netdev_register_sysfs(dev);
3008                         if (err)
3009                                 printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
3010                                        dev->name, err);
3011                         dev->reg_state = NETREG_REGISTERED;
3012                         break;
3013
3014                 case NETREG_UNREGISTERING:
3015                         netdev_unregister_sysfs(dev);
3016                         dev->reg_state = NETREG_UNREGISTERED;
3017
3018                         netdev_wait_allrefs(dev);
3019
3020                         /* paranoia */
3021                         BUG_ON(atomic_read(&dev->refcnt));
3022                         BUG_TRAP(!dev->ip_ptr);
3023                         BUG_TRAP(!dev->ip6_ptr);
3024                         BUG_TRAP(!dev->dn_ptr);
3025
3026
3027                         /* It must be the very last action,
3028                          * after this 'dev' may point to freed up memory.
3029                          */
3030                         if (dev->destructor)
3031                                 dev->destructor(dev);
3032                         break;
3033
3034                 default:
3035                         printk(KERN_ERR "network todo '%s' but state %d\n",
3036                                dev->name, dev->reg_state);
3037                         break;
3038                 }
3039         }
3040
3041 out:
3042         up(&net_todo_run_mutex);
3043 }
3044
3045 /**
3046  *      alloc_netdev - allocate network device
3047  *      @sizeof_priv:   size of private data to allocate space for
3048  *      @name:          device name format string
3049  *      @setup:         callback to initialize device
3050  *
3051  *      Allocates a struct net_device with private data area for driver use
3052  *      and performs basic initialization.
3053  */
3054 struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3055                 void (*setup)(struct net_device *))
3056 {
3057         void *p;
3058         struct net_device *dev;
3059         int alloc_size;
3060
3061         /* ensure 32-byte alignment of both the device and private area */
3062         alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3063         alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3064
3065         p = kmalloc(alloc_size, GFP_KERNEL);
3066         if (!p) {
3067                 printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
3068                 return NULL;
3069         }
3070         memset(p, 0, alloc_size);
3071
3072         dev = (struct net_device *)
3073                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3074         dev->padded = (char *)dev - (char *)p;
3075
3076         if (sizeof_priv)
3077                 dev->priv = netdev_priv(dev);
3078
3079         setup(dev);
3080         strcpy(dev->name, name);
3081         return dev;
3082 }
3083 EXPORT_SYMBOL(alloc_netdev);
3084
3085 /**
3086  *      free_netdev - free network device
3087  *      @dev: device
3088  *
3089  *      This function does the last stage of destroying an allocated device
3090  *      interface. The reference to the device object is released.
3091  *      If this is the last reference then it will be freed.
3092  */
3093 void free_netdev(struct net_device *dev)
3094 {
3095 #ifdef CONFIG_SYSFS
3096         /*  Compatiablity with error handling in drivers */
3097         if (dev->reg_state == NETREG_UNINITIALIZED) {
3098                 kfree((char *)dev - dev->padded);
3099                 return;
3100         }
3101
3102         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3103         dev->reg_state = NETREG_RELEASED;
3104
3105         /* will free via class release */
3106         class_device_put(&dev->class_dev);
3107 #else
3108         kfree((char *)dev - dev->padded);
3109 #endif
3110 }
3111
/*
 * Synchronize with packet receive processing.
 *
 * Annotates the call site with might_sleep() and then waits in
 * synchronize_rcu().
 */
void synchronize_net(void)
{
        might_sleep();
        synchronize_rcu();
}
3118
3119 /**
3120  *      unregister_netdevice - remove device from the kernel
3121  *      @dev: device
3122  *
3123  *      This function shuts down a device interface and removes it
3124  *      from the kernel tables. On success 0 is returned, on a failure
3125  *      a negative errno code is returned.
3126  *
3127  *      Callers must hold the rtnl semaphore.  You may want
3128  *      unregister_netdev() instead of this.
3129  */
3130
3131 int unregister_netdevice(struct net_device *dev)
3132 {
3133         struct net_device *d, **dp;
3134
3135         BUG_ON(dev_boot_phase);
3136         ASSERT_RTNL();
3137
3138         /* Some devices call without registering for initialization unwind. */
3139         if (dev->reg_state == NETREG_UNINITIALIZED) {
3140                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3141                                   "was registered\n", dev->name, dev);
3142                 return -ENODEV;
3143         }
3144
3145         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3146
3147         /* If device is running, close it first. */
3148         if (dev->flags & IFF_UP)
3149                 dev_close(dev);
3150
3151         /* And unlink it from device chain. */
3152         for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3153                 if (d == dev) {
3154                         write_lock_bh(&dev_base_lock);
3155                         hlist_del(&dev->name_hlist);
3156                         hlist_del(&dev->index_hlist);
3157                         if (dev_tail == &dev->next)
3158                                 dev_tail = dp;
3159                         *dp = d->next;
3160                         write_unlock_bh(&dev_base_lock);
3161                         break;
3162                 }
3163         }
3164         if (!d) {
3165                 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3166                        dev->name);
3167                 return -ENODEV;
3168         }
3169
3170         dev->reg_state = NETREG_UNREGISTERING;
3171
3172         synchronize_net();
3173
3174         /* Shutdown queueing discipline. */
3175         dev_shutdown(dev);
3176
3177
3178         /* Notify protocols, that we are about to destroy
3179            this device. They should clean all the things.
3180         */
3181         notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3182
3183         /*
3184          *      Flush the multicast chain
3185          */
3186         dev_mc_discard(dev);
3187
3188         if (dev->uninit)
3189                 dev->uninit(dev);
3190
3191         /* Notifier chain MUST detach us from master device. */
3192         BUG_TRAP(!dev->master);
3193
3194         free_divert_blk(dev);
3195
3196         /* Finish processing unregister after unlock */
3197         net_set_todo(dev);
3198
3199         synchronize_net();
3200
3201         dev_put(dev);
3202         return 0;
3203 }
3204
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      This function shuts down a device interface and removes it
 *      from the kernel tables.  It returns nothing: the result of
 *      unregister_netdevice() is discarded.
 *
 *      This is just a wrapper for unregister_netdevice that takes
 *      the rtnl semaphore around the call.  In general you want to use
 *      this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
3225
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU notifier callback.  Only CPU_DEAD is acted upon; every other
 * action returns NOTIFY_OK immediately.
 *
 * For a dead CPU (@ocpu carries its number), with local interrupts
 * disabled, its completion_queue and output_queue are appended to the
 * tails of the current CPU's queues and NET_TX_SOFTIRQ is raised.  With
 * interrupts enabled again, every skb left on the dead CPU's
 * input_pkt_queue is fed through netif_rx().  Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        unsigned int dead_cpu = (unsigned long)ocpu;
        struct softnet_data *here, *gone;
        struct sk_buff **done_tail;
        struct net_device **xmit_tail;
        struct sk_buff *skb;

        if (action != CPU_DEAD)
                return NOTIFY_OK;

        local_irq_disable();
        here = &per_cpu(softnet_data, smp_processor_id());
        gone = &per_cpu(softnet_data, dead_cpu);

        /* Walk to the end of our completion_queue ... */
        for (done_tail = &here->completion_queue; *done_tail;
             done_tail = &(*done_tail)->next)
                ;
        /* ... and hang the offline CPU's completion queue there. */
        *done_tail = gone->completion_queue;
        gone->completion_queue = NULL;

        /* Same for the output_queue, linked through next_sched. */
        for (xmit_tail = &here->output_queue; *xmit_tail;
             xmit_tail = &(*xmit_tail)->next_sched)
                ;
        *xmit_tail = gone->output_queue;
        gone->output_queue = NULL;

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Process offline CPU's input_pkt_queue */
        for (;;) {
                skb = __skb_dequeue(&gone->input_pkt_queue);
                if (!skb)
                        break;
                netif_rx(skb);
        }

        return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
3271
3272
3273 /*
3274  *      Initialize the DEV module. At boot time this walks the device list and
3275  *      unhooks any devices that fail to initialise (normally hardware not
3276  *      present) and leaves us with a valid list of present and active devices.
3277  *
3278  */
3279
3280 /*
3281  *       This is called single threaded during boot, so no need
3282  *       to take the rtnl semaphore.
3283  */
3284 static int __init net_dev_init(void)
3285 {
3286         int i, rc = -ENOMEM;
3287
3288         BUG_ON(!dev_boot_phase);
3289
3290         net_random_init();
3291
3292         if (dev_proc_init())
3293                 goto out;
3294
3295         if (netdev_sysfs_init())
3296                 goto out;
3297
3298         INIT_LIST_HEAD(&ptype_all);
3299         for (i = 0; i < 16; i++)
3300                 INIT_LIST_HEAD(&ptype_base[i]);
3301
3302         for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3303                 INIT_HLIST_HEAD(&dev_name_head[i]);
3304
3305         for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3306                 INIT_HLIST_HEAD(&dev_index_head[i]);
3307
3308         /*
3309          *      Initialise the packet receive queues.
3310          */
3311
3312         for (i = 0; i < NR_CPUS; i++) {
3313                 struct softnet_data *queue;
3314
3315                 queue = &per_cpu(softnet_data, i);
3316                 skb_queue_head_init(&queue->input_pkt_queue);
3317                 queue->throttle = 0;
3318                 queue->cng_level = 0;
3319                 queue->avg_blog = 10; /* arbitrary non-zero */
3320                 queue->completion_queue = NULL;
3321                 INIT_LIST_HEAD(&queue->poll_list);
3322                 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3323                 queue->backlog_dev.weight = weight_p;
3324                 queue->backlog_dev.poll = process_backlog;
3325                 atomic_set(&queue->backlog_dev.refcnt, 1);
3326         }
3327
3328 #ifdef OFFLINE_SAMPLE
3329         samp_timer.expires = jiffies + (10 * HZ);
3330         add_timer(&samp_timer);
3331 #endif
3332
3333         dev_boot_phase = 0;
3334
3335         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3336         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3337
3338         hotcpu_notifier(dev_cpu_callback, 0);
3339         dst_init();
3340         dev_mcast_init();
3341         rc = 0;
3342 out:
3343         return rc;
3344 }
3345
3346 subsys_initcall(net_dev_init);
3347
/* Symbols exported unconditionally, in alphabetical order. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(__skb_linearize);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_change_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_get_flags);
EXPORT_SYMBOL(dev_ioctl);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);

/* Exported only when the matching configuration option is enabled. */
#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
EXPORT_SYMBOL(dev_queue_xmit_nit);
#endif

#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

/* Per-CPU data. */
EXPORT_PER_CPU_SYMBOL(softnet_data);