net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro, <bir7@leland.Stanford.Edu>
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <asm/bitops.h>
  78 #include <linux/config.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/string.h>
  84 #include <linux/mm.h>
  85 #include <linux/socket.h>
  86 #include <linux/sockios.h>
  87 #include <linux/errno.h>
  88 #include <linux/interrupt.h>
  89 #include <linux/if_ether.h>
  90 #include <linux/netdevice.h>
  91 #include <linux/etherdevice.h>
  92 #include <linux/notifier.h>
  93 #include <linux/skbuff.h>
  94 #include <net/sock.h>
  95 #include <linux/rtnetlink.h>
  96 #include <linux/proc_fs.h>
  97 #include <linux/seq_file.h>
  98 #include <linux/stat.h>
  99 #include <linux/if_bridge.h>
 100 #include <linux/divert.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <linux/highmem.h>
 105 #include <linux/init.h>
 106 #include <linux/kmod.h>
 107 #include <linux/module.h>
 108 #include <linux/kallsyms.h>
 109 #include <linux/netpoll.h>
 110 #include <linux/rcupdate.h>
 111 #ifdef CONFIG_NET_RADIO
 112 #include <linux/wireless.h>             /* Note : will define WIRELESS_EXT */
 113 #include <net/iw_handler.h>
 114 #endif  /* CONFIG_NET_RADIO */
 115 #include <asm/current.h>
 116 #include <linux/vs_network.h>
 117
 118 /* This define, if set, will randomly drop a packet when congestion
 119  * is more than moderate.  It helps fairness in the multi-interface
 120  * case when one of them is a hog, but it kills performance for the
 121  * single interface case so it is off now by default.
 122  */
 123 #undef RAND_LIE
 124
 125 /* Setting this will sample the queue lengths and thus congestion
 126  * via a timer instead of as each packet is received.
 127  */
 128 #undef OFFLINE_SAMPLE
 129
 130 /*
 131  *      The list of packet types we will receive (as opposed to discard)
 132  *      and the routines to invoke.
 133  *
 134  *      Why 16. Because with 16 the only overlap we get on a hash of the
 135  *      low nibble of the protocol value is RARP/SNAP/X.25.
 136  *
 137  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 138  *             sure which should go first, but I bet it won't make much
 139  *             difference if we are running VLANs.  The good news is that
 140  *             this protocol won't be in the list unless compiled in, so
  141  *             the average user (w/out VLANs) will not be adversely affected.
 142  *             --BLG
 143  *
 144  *              0800    IP
 145  *              8100    802.1Q VLAN
 146  *              0001    802.3
 147  *              0002    AX.25
 148  *              0004    802.2
 149  *              8035    RARP
 150  *              0005    SNAP
 151  *              0805    X.25
 152  *              0806    ARP
 153  *              8137    IPX
 154  *              0009    Localtalk
 155  *              86DD    IPv6
 156  */
 157
/*
 * ptype_lock is taken (with bottom halves disabled) around every insertion
 * into and removal from the two handler lists below; see dev_add_pack() and
 * __dev_remove_pack().
 */
static spinlock_t ptype_lock = SPIN_LOCK_UNLOCKED;
/* Bucket index is ntohs(type) & 15. */
static struct list_head ptype_base[16]; /* 16 way hashed list */
/* Handlers registered with type ETH_P_ALL. */
static struct list_head ptype_all;              /* Taps */
 161
#ifdef OFFLINE_SAMPLE
/*
 * Timer used for queue sampling.  Only built when OFFLINE_SAMPLE is defined;
 * this file #undef's it, so by default these declarations are compiled out.
 */
static void sample_queue(unsigned long dummy);
static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
#endif
 166
 167 /*
  168  * The @dev_base list is protected by @dev_base_lock and the rtnl
 169  * semaphore.
 170  *
 171  * Pure readers hold dev_base_lock for reading.
 172  *
 173  * Writers must hold the rtnl semaphore while they loop through the
 174  * dev_base list, and hold dev_base_lock for writing when they do the
 175  * actual updates.  This allows pure readers to access the list even
 176  * while a writer is preparing to update it.
 177  *
 178  * To put it another way, dev_base_lock is held for writing only to
 179  * protect against pure readers; the rtnl semaphore provides the
 180  * protection against other writers.
 181  *
 182  * See, for example usages, register_netdevice() and
 183  * unregister_netdevice(), which must be called with the rtnl
 184  * semaphore held.
 185  */
/* Head of the singly linked device list (walked through dev->next). */
struct net_device *dev_base;
/*
 * Starts out pointing at dev_base itself.
 * NOTE(review): presumably tracks the last ->next slot so new devices can be
 * appended without walking the list -- confirm against register_netdevice().
 */
struct net_device **dev_tail = &dev_base;
rwlock_t dev_base_lock = RW_LOCK_UNLOCKED;

EXPORT_SYMBOL(dev_base);
EXPORT_SYMBOL(dev_base_lock);
 192
/* Both lookup tables below have 1 << NETDEV_HASHBITS (256) buckets. */
#define NETDEV_HASHBITS 8
/* Devices hashed by name; bucket chosen by dev_name_hash(). */
static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
/* Devices hashed by ifindex; bucket chosen by dev_index_hash(). */
static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
 196
 197 static inline struct hlist_head *dev_name_hash(const char *name)
 198 {
 199         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200         return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 201 }
 202
 203 static inline struct hlist_head *dev_index_hash(int ifindex)
 204 {
 205         return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 206 }
 207
 208 /*
 209  *      Our notifier list
 210  */
 211
/*
 * Head of the chain that device events are published on with
 * notifier_call_chain(), e.g. NETDEV_UP from dev_open() and
 * NETDEV_CHANGENAME from dev_change_name().
 */
static struct notifier_block *netdev_chain;
 213
 214 /*
 215  *      Device drivers call our routines to queue packets here. We empty the
 216  *      queue in the local softnet handler.
 217  */
/* One zero-initialised struct softnet_data per CPU. */
DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
 219
/*
 * sysfs hooks.  With CONFIG_SYSFS these are real functions that are only
 * declared here; without it they collapse to the no-op macros below.
 */
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
extern int netdev_register_sysfs(struct net_device *);
extern void netdev_unregister_sysfs(struct net_device *);
#else
/* init/register evaluate to 0; unregister expands to an empty statement. */
#define netdev_sysfs_init()             (0)
#define netdev_register_sysfs(dev)      (0)
#define netdev_unregister_sysfs(dev)    do { } while(0)
#endif
 229
 230
 231 /*******************************************************************************
 232
 233                 Protocol management and registration routines
 234
 235 *******************************************************************************/
 236
 237 /*
 238  *      For efficiency
 239  */
 240
/*
 * Count of ETH_P_ALL handlers: incremented in dev_add_pack() and
 * decremented in __dev_remove_pack().
 */
int netdev_nit;
 242
 243 /*
 244  *      Add a protocol ID to the list. Now that the input handler is
 245  *      smarter we can dispense with all the messy stuff that used to be
 246  *      here.
 247  *
 248  *      BEWARE!!! Protocol handlers, mangling input packets,
 249  *      MUST BE last in hash buckets and checking protocol handlers
 250  *      MUST start from promiscuous ptype_all chain in net_bh.
 251  *      It is true now, do not change it.
 252  *      Explanation follows: if protocol handler, mangling packet, will
 253  *      be the first on list, it is not able to sense, that packet
 254  *      is cloned and should be copied-on-write, so that it will
 255  *      change it and subsequent readers will get broken packet.
 256  *                                                      --ANK (980803)
 257  */
 258
 259 /**
 260  *      dev_add_pack - add packet handler
 261  *      @pt: packet type declaration
 262  *
 263  *      Add a protocol handler to the networking stack. The passed &packet_type
 264  *      is linked into kernel lists and may not be freed until it has been
 265  *      removed from the kernel lists.
 266  *
 267  *      This call does not sleep therefore it can not
 268  *      guarantee all CPU's that are in middle of receiving packets
 269  *      will see the new packet type (until the next received packet).
 270  */
 271
 272 void dev_add_pack(struct packet_type *pt)
 273 {
 274         int hash;
 275
 276         spin_lock_bh(&ptype_lock);
 277         if (pt->type == htons(ETH_P_ALL)) {
 278                 netdev_nit++;
 279                 list_add_rcu(&pt->list, &ptype_all);
 280         } else {
 281                 hash = ntohs(pt->type) & 15;
 282                 list_add_rcu(&pt->list, &ptype_base[hash]);
 283         }
 284         spin_unlock_bh(&ptype_lock);
 285 }
 286
/* Prototype only; nothing in this part of the file shows the definition. */
extern void linkwatch_run_queue(void);
 288
 289
 290
 291 /**
 292  *      __dev_remove_pack        - remove packet handler
 293  *      @pt: packet type declaration
 294  *
 295  *      Remove a protocol handler that was previously added to the kernel
 296  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
  297  *      from the kernel lists, but unlike dev_remove_pack() this function does
  298  *      not wait for receivers that are still using it.
 299  *
 300  *      The packet type might still be in use by receivers
 301  *      and must not be freed until after all the CPU's have gone
 302  *      through a quiescent state.
 303  */
 304 void __dev_remove_pack(struct packet_type *pt)
 305 {
 306         struct list_head *head;
 307         struct packet_type *pt1;
 308
 309         spin_lock_bh(&ptype_lock);
 310
 311         if (pt->type == htons(ETH_P_ALL)) {
 312                 netdev_nit--;
 313                 head = &ptype_all;
 314         } else
 315                 head = &ptype_base[ntohs(pt->type) & 15];
 316
 317         list_for_each_entry(pt1, head, list) {
 318                 if (pt == pt1) {
 319                         list_del_rcu(&pt->list);
 320                         goto out;
 321                 }
 322         }
 323
 324         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 325 out:
 326         spin_unlock_bh(&ptype_lock);
 327 }
 328 /**
 329  *      dev_remove_pack  - remove packet handler
 330  *      @pt: packet type declaration
 331  *
 332  *      Remove a protocol handler that was previously added to the kernel
 333  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 334  *      from the kernel lists and can be freed or reused once this function
 335  *      returns.
 336  *
 337  *      This call sleeps to guarantee that no CPU is looking at the packet
 338  *      type after return.
 339  */
void dev_remove_pack(struct packet_type *pt)
{
        /* Unlink @pt from its handler list (takes ptype_lock internally). */
        __dev_remove_pack(pt);

        /* Do not return to the caller until synchronize_net() has completed. */
        synchronize_net();
}
 346
 347 /******************************************************************************
 348
 349                       Device Boot-time Settings Routines
 350
 351 *******************************************************************************/
 352
 353 /* Boot time configuration table */
/*
 * Fixed-size table of boot-time settings.  A slot counts as free while its
 * name is empty or starts with a space (see netdev_boot_setup_add()).
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 355
 356 /**
 357  *      netdev_boot_setup_add   - add new setup entry
 358  *      @name: name of the device
 359  *      @map: configured settings for the device
 360  *
 361  *      Adds new setup entry to the dev_boot_setup list.  The function
 362  *      returns 0 on error and 1 on success.  This is a generic routine to
 363  *      all netdevices.
 364  */
 365 int netdev_boot_setup_add(char *name, struct ifmap *map)
 366 {
 367         struct netdev_boot_setup *s;
 368         int i;
 369
 370         s = dev_boot_setup;
 371         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 372                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 373                         memset(s[i].name, 0, sizeof(s[i].name));
 374                         strcpy(s[i].name, name);
 375                         memcpy(&s[i].map, map, sizeof(s[i].map));
 376                         break;
 377                 }
 378         }
 379
 380         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 381 }
 382
 383 /**
 384  *      netdev_boot_setup_check - check boot time settings
 385  *      @dev: the netdevice
 386  *
 387  *      Check boot time settings for the device.
 388  *      The found settings are set for the device to be used
 389  *      later in the device probing.
 390  *      Returns 0 if no settings found, 1 if they are.
 391  */
 392 int netdev_boot_setup_check(struct net_device *dev)
 393 {
 394         struct netdev_boot_setup *s = dev_boot_setup;
 395         int i;
 396
 397         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 398                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 399                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 400                         dev->irq        = s[i].map.irq;
 401                         dev->base_addr  = s[i].map.base_addr;
 402                         dev->mem_start  = s[i].map.mem_start;
 403                         dev->mem_end    = s[i].map.mem_end;
 404                         return 1;
 405                 }
 406         }
 407         return 0;
 408 }
 409
 410
 411 /**
 412  *      netdev_boot_base        - get address from boot time settings
 413  *      @prefix: prefix for network device
 414  *      @unit: id for network device
 415  *
 416  *      Check boot time settings for the base address of device.
 417  *      The found settings are set for the device to be used
 418  *      later in the device probing.
 419  *      Returns 0 if no settings found.
 420  */
 421 unsigned long netdev_boot_base(const char *prefix, int unit)
 422 {
 423         const struct netdev_boot_setup *s = dev_boot_setup;
 424         char name[IFNAMSIZ];
 425         int i;
 426
 427         sprintf(name, "%s%d", prefix, unit);
 428
 429         /*
 430          * If device already registered then return base of 1
 431          * to indicate not to probe for this interface
 432          */
 433         if (__dev_get_by_name(name))
 434                 return 1;
 435
 436         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 437                 if (!strcmp(name, s[i].name))
 438                         return s[i].map.base_addr;
 439         return 0;
 440 }
 441
 442 /*
 443  * Saves at boot time configured settings for any netdevice.
 444  */
 445 int __init netdev_boot_setup(char *str)
 446 {
 447         int ints[5];
 448         struct ifmap map;
 449
 450         str = get_options(str, ARRAY_SIZE(ints), ints);
 451         if (!str || !*str)
 452                 return 0;
 453
 454         /* Save settings */
 455         memset(&map, 0, sizeof(map));
 456         if (ints[0] > 0)
 457                 map.irq = ints[1];
 458         if (ints[0] > 1)
 459                 map.base_addr = ints[2];
 460         if (ints[0] > 2)
 461                 map.mem_start = ints[3];
 462         if (ints[0] > 3)
 463                 map.mem_end = ints[4];
 464
 465         /* Add new entry to the list */
 466         return netdev_boot_setup_add(str, &map);
 467 }
 468
 469 __setup("netdev=", netdev_boot_setup);
 470
 471 /*******************************************************************************
 472
 473                             Device Interface Subroutines
 474
 475 *******************************************************************************/
 476
 477 /**
 478  *      __dev_get_by_name       - find a device by its name
 479  *      @name: name to find
 480  *
 481  *      Find an interface by name. Must be called under RTNL semaphore
 482  *      or @dev_base_lock. If the name is found a pointer to the device
 483  *      is returned. If the name is not found then %NULL is returned. The
 484  *      reference counters are not incremented so the caller must be
 485  *      careful with locks.
 486  */
 487
 488 struct net_device *__dev_get_by_name(const char *name)
 489 {
 490         struct hlist_node *p;
 491
 492         hlist_for_each(p, dev_name_hash(name)) {
 493                 struct net_device *dev
 494                         = hlist_entry(p, struct net_device, name_hlist);
 495                 if (!strncmp(dev->name, name, IFNAMSIZ))
 496                         return dev;
 497         }
 498         return NULL;
 499 }
 500
 501 /**
 502  *      dev_get_by_name         - find a device by its name
 503  *      @name: name to find
 504  *
 505  *      Find an interface by name. This can be called from any
 506  *      context and does its own locking. The returned handle has
 507  *      the usage count incremented and the caller must use dev_put() to
 508  *      release it when it is no longer needed. %NULL is returned if no
 509  *      matching device is found.
 510  */
 511
 512 struct net_device *dev_get_by_name(const char *name)
 513 {
 514         struct net_device *dev;
 515
 516         read_lock(&dev_base_lock);
 517         dev = __dev_get_by_name(name);
 518         if (dev)
 519                 dev_hold(dev);
 520         read_unlock(&dev_base_lock);
 521         return dev;
 522 }
 523
 524 /**
 525  *      __dev_get_by_index - find a device by its ifindex
 526  *      @ifindex: index of device
 527  *
 528  *      Search for an interface by index. Returns %NULL if the device
 529  *      is not found or a pointer to the device. The device has not
 530  *      had its reference counter increased so the caller must be careful
 531  *      about locking. The caller must hold either the RTNL semaphore
 532  *      or @dev_base_lock.
 533  */
 534
 535 struct net_device *__dev_get_by_index(int ifindex)
 536 {
 537         struct hlist_node *p;
 538
 539         hlist_for_each(p, dev_index_hash(ifindex)) {
 540                 struct net_device *dev
 541                         = hlist_entry(p, struct net_device, index_hlist);
 542                 if (dev->ifindex == ifindex)
 543                         return dev;
 544         }
 545         return NULL;
 546 }
 547
 548
 549 /**
 550  *      dev_get_by_index - find a device by its ifindex
 551  *      @ifindex: index of device
 552  *
 553  *      Search for an interface by index. Returns NULL if the device
 554  *      is not found or a pointer to the device. The device returned has
 555  *      had a reference added and the pointer is safe until the user calls
 556  *      dev_put to indicate they have finished with it.
 557  */
 558
 559 struct net_device *dev_get_by_index(int ifindex)
 560 {
 561         struct net_device *dev;
 562
 563         read_lock(&dev_base_lock);
 564         dev = __dev_get_by_index(ifindex);
 565         if (dev)
 566                 dev_hold(dev);
 567         read_unlock(&dev_base_lock);
 568         return dev;
 569 }
 570
 571 /**
 572  *      dev_getbyhwaddr - find a device by its hardware address
 573  *      @type: media type of device
 574  *      @ha: hardware address
 575  *
 576  *      Search for an interface by MAC address. Returns NULL if the device
 577  *      is not found or a pointer to the device. The caller must hold the
 578  *      rtnl semaphore. The returned device has not had its ref count increased
 579  *      and the caller must therefore be careful about locking
 580  *
 581  *      BUGS:
 582  *      If the API was consistent this would be __dev_get_by_hwaddr
 583  */
 584
 585 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 586 {
 587         struct net_device *dev;
 588
 589         ASSERT_RTNL();
 590
 591         for (dev = dev_base; dev; dev = dev->next)
 592                 if (dev->type == type &&
 593                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 594                         break;
 595         return dev;
 596 }
 597
 598 struct net_device *dev_getfirstbyhwtype(unsigned short type)
 599 {
 600         struct net_device *dev;
 601
 602         rtnl_lock();
 603         for (dev = dev_base; dev; dev = dev->next) {
 604                 if (dev->type == type) {
 605                         dev_hold(dev);
 606                         break;
 607                 }
 608         }
 609         rtnl_unlock();
 610         return dev;
 611 }
 612
 613 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 614
 615 /**
 616  *      dev_get_by_flags - find any device with given flags
 617  *      @if_flags: IFF_* values
 618  *      @mask: bitmask of bits in if_flags to check
 619  *
 620  *      Search for any interface with the given flags. Returns NULL if a device
 621  *      is not found or a pointer to the device. The device returned has
 622  *      had a reference added and the pointer is safe until the user calls
 623  *      dev_put to indicate they have finished with it.
 624  */
 625
 626 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 627 {
 628         struct net_device *dev;
 629
 630         read_lock(&dev_base_lock);
 631         for (dev = dev_base; dev != NULL; dev = dev->next) {
 632                 if (((dev->flags ^ if_flags) & mask) == 0) {
 633                         dev_hold(dev);
 634                         break;
 635                 }
 636         }
 637         read_unlock(&dev_base_lock);
 638         return dev;
 639 }
 640
 641 /**
 642  *      dev_valid_name - check if name is okay for network device
 643  *      @name: name string
 644  *
 645  *      Network device names need to be valid file names to
  646  *      allow sysfs to work.
 647  */
 648 int dev_valid_name(const char *name)
 649 {
 650         return !(*name == '\0'
 651                  || !strcmp(name, ".")
 652                  || !strcmp(name, "..")
 653                  || strchr(name, '/'));
 654 }
 655
 656 /**
 657  *      dev_alloc_name - allocate a name for a device
 658  *      @dev: device
 659  *      @name: name format string
 660  *
 661  *      Passed a format string - eg "lt%d" it will try and find a suitable
 662  *      id. Not efficient for many devices, not called a lot. The caller
 663  *      must hold the dev_base or rtnl lock while allocating the name and
 664  *      adding the device in order to avoid duplicates. Returns the number
 665  *      of the unit assigned or a negative errno code.
 666  */
 667
 668 int dev_alloc_name(struct net_device *dev, const char *name)
 669 {
 670         int i = 0;
 671         char buf[IFNAMSIZ];
 672         const char *p;
 673         const int max_netdevices = 8*PAGE_SIZE;
 674         long *inuse;
 675         struct net_device *d;
 676
 677         p = strnchr(name, IFNAMSIZ-1, '%');
 678         if (p) {
 679                 /*
 680                  * Verify the string as this thing may have come from
 681                  * the user.  There must be either one "%d" and no other "%"
 682                  * characters.
 683                  */
 684                 if (p[1] != 'd' || strchr(p + 2, '%'))
 685                         return -EINVAL;
 686
 687                 /* Use one page as a bit array of possible slots */
 688                 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
 689                 if (!inuse)
 690                         return -ENOMEM;
 691
 692                 for (d = dev_base; d; d = d->next) {
 693                         if (!sscanf(d->name, name, &i))
 694                                 continue;
 695                         if (i < 0 || i >= max_netdevices)
 696                                 continue;
 697
 698                         /*  avoid cases where sscanf is not exact inverse of printf */
 699                         snprintf(buf, sizeof(buf), name, i);
 700                         if (!strncmp(buf, d->name, IFNAMSIZ))
 701                                 set_bit(i, inuse);
 702                 }
 703
 704                 i = find_first_zero_bit(inuse, max_netdevices);
 705                 free_page((unsigned long) inuse);
 706         }
 707
 708         snprintf(buf, sizeof(buf), name, i);
 709         if (!__dev_get_by_name(buf)) {
 710                 strlcpy(dev->name, buf, IFNAMSIZ);
 711                 return i;
 712         }
 713
 714         /* It is possible to run out of possible slots
 715          * when the name is long and there isn't enough space left
 716          * for the digits, or if all bits are used.
 717          */
 718         return -ENFILE;
 719 }
 720
 721
 722 /**
 723  *      dev_change_name - change name of a device
 724  *      @dev: device
 725  *      @newname: name (or format string) must be at least IFNAMSIZ
 726  *
  727  *      Change the name of a device. A format string such as "eth%d" may be
  728  *      passed for wildcarding.
 729  */
 730 int dev_change_name(struct net_device *dev, char *newname)
 731 {
 732         int err = 0;
 733
 734         ASSERT_RTNL();
 735
 736         if (dev->flags & IFF_UP)
 737                 return -EBUSY;
 738
 739         if (!dev_valid_name(newname))
 740                 return -EINVAL;
 741
 742         if (strchr(newname, '%')) {
 743                 err = dev_alloc_name(dev, newname);
 744                 if (err < 0)
 745                         return err;
 746                 strcpy(newname, dev->name);
 747         }
 748         else if (__dev_get_by_name(newname))
 749                 return -EEXIST;
 750         else
 751                 strlcpy(dev->name, newname, IFNAMSIZ);
 752
 753         err = class_device_rename(&dev->class_dev, dev->name);
 754         if (!err) {
 755                 hlist_del(&dev->name_hlist);
 756                 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 757                 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
 758         }
 759
 760         return err;
 761 }
 762
 763 /**
 764  *      netdev_state_change - device changes state
 765  *      @dev: device to cause notification
 766  *
 767  *      Called to indicate a device has changed state. This function calls
 768  *      the notifier chains for netdev_chain and sends a NEWLINK message
 769  *      to the routing socket.
 770  */
 771 void netdev_state_change(struct net_device *dev)
 772 {
 773         if (dev->flags & IFF_UP) {
 774                 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
 775                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 776         }
 777 }
 778
 779 /**
 780  *      dev_load        - load a network module
 781  *      @name: name of interface
 782  *
 783  *      If a network interface is not present and the process has suitable
 784  *      privileges this function loads the module. If module loading is not
 785  *      available in this kernel then it becomes a nop.
 786  */
 787
 788 void dev_load(const char *name)
 789 {
 790         struct net_device *dev;
 791
 792         read_lock(&dev_base_lock);
 793         dev = __dev_get_by_name(name);
 794         read_unlock(&dev_base_lock);
 795
 796         if (!dev && capable(CAP_SYS_MODULE))
 797                 request_module("%s", name);
 798 }
 799
 800 static int default_rebuild_header(struct sk_buff *skb)
 801 {
 802         printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 803                skb->dev ? skb->dev->name : "NULL!!!");
 804         kfree_skb(skb);
 805         return 1;
 806 }
 807
 808
 809 /**
 810  *      dev_open        - prepare an interface for use.
 811  *      @dev:   device to open
 812  *
 813  *      Takes a device from down to up state. The device's private open
 814  *      function is invoked and then the multicast lists are loaded. Finally
 815  *      the device is moved into the up state and a %NETDEV_UP message is
 816  *      sent to the netdev notifier chain.
 817  *
 818  *      Calling this function on an active interface is a nop. On a failure
 819  *      a negative errno code is returned.
 820  */
 821 int dev_open(struct net_device *dev)
 822 {
 823         int ret = 0;
 824
 825         /*
 826          *      Is it already up?
 827          */
 828
 829         if (dev->flags & IFF_UP)
 830                 return 0;
 831
 832         /*
 833          *      Is it even present?
 834          */
 835         if (!netif_device_present(dev))
 836                 return -ENODEV;
 837
 838         /*
 839          *      Call device private open method
 840          */
 841         set_bit(__LINK_STATE_START, &dev->state);
 842         if (dev->open) {
 843                 ret = dev->open(dev);
 844                 if (ret)
 845                         clear_bit(__LINK_STATE_START, &dev->state);
 846         }
 847
 848         /*
 849          *      If it went open OK then:
 850          */
 851
 852         if (!ret) {
 853                 /*
 854                  *      Set the flags.
 855                  */
 856                 dev->flags |= IFF_UP;
 857
 858                 /*
 859                  *      Initialize multicasting status
 860                  */
 861                 dev_mc_upload(dev);
 862
 863                 /*
 864                  *      Wakeup transmit queue engine
 865                  */
 866                 dev_activate(dev);
 867
 868                 /*
 869                  *      ... and announce new interface.
 870                  */
 871                 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 872         }
 873         return ret;
 874 }
 875
/**
 *      dev_close - shutdown an interface.
 *      @dev: device to shutdown
 *
 *      This function moves an active device into down state. A
 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *      chain.
 *
 *      Calling this function on a device that is not up is a nop. The
 *      function always returns 0.
 */
int dev_close(struct net_device *dev)
{
        if (!(dev->flags & IFF_UP))
                return 0;

        /*
         *      Tell people we are going down, so that they can
         *      prepare for death while the device is still operating.
         */
        notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);

        dev_deactivate(dev);

        clear_bit(__LINK_STATE_START, &dev->state);

        /* Synchronize to scheduled poll. We cannot touch poll list,
         * it can be even on different cpu. So just clear netif_running(),
         * and wait when poll really will happen. Actually, the best place
         * for this is inside dev->stop() after device stopped its irq
         * engine, but this requires more changes in devices. */

        smp_mb__after_clear_bit(); /* Commit netif_running(). */
        while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
                /* No hurry. */
                /* NOTE(review): the sleep is interruptible; if the caller
                 * has a signal pending this presumably degrades into a
                 * busy loop until the poll finishes -- confirm. */
                current->state = TASK_INTERRUPTIBLE;
                schedule_timeout(1);
        }

        /*
         *      Call the device specific close. This cannot fail.
         *      Only if device is UP
         *
         *      We allow it to be called even after a DETACH hot-plug
         *      event.
         */
        if (dev->stop)
                dev->stop(dev);

        /*
         *      Device is now down.
         */

        dev->flags &= ~IFF_UP;

        /*
         * Tell people we are down
         */
        notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);

        return 0;
}
 936
 937
 938 /*
 939  *      Device change register/unregister. These are not inline or static
 940  *      as we export them to the world.
 941  */
 942
 943 /**
 944  *      register_netdevice_notifier - register a network notifier block
 945  *      @nb: notifier
 946  *
 947  *      Register a notifier to be called when network device events occur.
 948  *      The notifier passed is linked into the kernel structures and must
 949  *      not be reused until it has been unregistered. A negative errno code
 950  *      is returned on a failure.
 951  *
 952  *      When registered all registration and up events are replayed
 953  *      to the new notifier to allow device to have a race free
 954  *      view of the network device list.
 955  */
 956
 957 int register_netdevice_notifier(struct notifier_block *nb)
 958 {
 959         struct net_device *dev;
 960         int err;
 961
 962         rtnl_lock();
 963         err = notifier_chain_register(&netdev_chain, nb);
 964         if (!err) {
 965                 for (dev = dev_base; dev; dev = dev->next) {
 966                         nb->notifier_call(nb, NETDEV_REGISTER, dev);
 967
 968                         if (dev->flags & IFF_UP)
 969                                 nb->notifier_call(nb, NETDEV_UP, dev);
 970                 }
 971         }
 972         rtnl_unlock();
 973         return err;
 974 }
 975
 976 /**
 977  *      unregister_netdevice_notifier - unregister a network notifier block
 978  *      @nb: notifier
 979  *
 980  *      Unregister a notifier previously registered by
 981  *      register_netdevice_notifier(). The notifier is unlinked into the
 982  *      kernel structures and may then be reused. A negative errno code
 983  *      is returned on a failure.
 984  */
 985
 986 int unregister_netdevice_notifier(struct notifier_block *nb)
 987 {
 988         return notifier_chain_unregister(&netdev_chain, nb);
 989 }
 990
 991 /**
 992  *      call_netdevice_notifiers - call all network notifier blocks
 993  *      @val: value passed unmodified to notifier function
 994  *      @v:   pointer passed unmodified to notifier function
 995  *
 996  *      Call all network notifier blocks.  Parameters and return value
 997  *      are as for notifier_call_chain().
 998  */
 999
1000 int call_netdevice_notifiers(unsigned long val, void *v)
1001 {
1002         return notifier_call_chain(&netdev_chain, val, v);
1003 }
1004
1005 /*
1006  *      Support routine. Sends outgoing frames to any network
1007  *      taps currently in use.
1008  */
1009
1010 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1011 {
1012         struct packet_type *ptype;
1013         net_timestamp(&skb->stamp);
1014
1015         rcu_read_lock();
1016         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1017                 /* Never send packets back to the socket
1018                  * they originated from - MvS (miquels@drinkel.ow.org)
1019                  */
1020                 if ((ptype->dev == dev || !ptype->dev) &&
1021                     (ptype->af_packet_priv == NULL ||
1022                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1023                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1024                         if (!skb2)
1025                                 break;
1026
1027                         /* skb->nh should be correctly
1028                            set by sender, so that the second statement is
1029                            just protection against buggy protocols.
1030                          */
1031                         skb2->mac.raw = skb2->data;
1032
1033                         if (skb2->nh.raw < skb2->data ||
1034                             skb2->nh.raw > skb2->tail) {
1035                                 if (net_ratelimit())
1036                                         printk(KERN_CRIT "protocol %04x is "
1037                                                "buggy, dev %s\n",
1038                                                skb2->protocol, dev->name);
1039                                 skb2->nh.raw = skb2->data;
1040                         }
1041
1042                         skb2->h.raw = skb2->nh.raw;
1043                         skb2->pkt_type = PACKET_OUTGOING;
1044                         ptype->func(skb2, skb->dev, ptype);
1045                 }
1046         }
1047         rcu_read_unlock();
1048 }
1049
1050 /*
1051  * Invalidate hardware checksum when packet is to be mangled, and
1052  * complete checksum manually on outgoing path.
1053  */
1054 int skb_checksum_help(struct sk_buff **pskb, int inward)
1055 {
1056         unsigned int csum;
1057         int ret = 0, offset = (*pskb)->h.raw - (*pskb)->data;
1058
1059         if (inward) {
1060                 (*pskb)->ip_summed = CHECKSUM_NONE;
1061                 goto out;
1062         }
1063
1064         if (skb_cloned(*pskb)) {
1065                 ret = pskb_expand_head(*pskb, 0, 0, GFP_ATOMIC);
1066                 if (ret)
1067                         goto out;
1068         }
1069
1070         if (offset > (int)(*pskb)->len)
1071                 BUG();
1072         csum = skb_checksum(*pskb, offset, (*pskb)->len-offset, 0);
1073
1074         offset = (*pskb)->tail - (*pskb)->h.raw;
1075         if (offset <= 0)
1076                 BUG();
1077         if ((*pskb)->csum + 2 > offset)
1078                 BUG();
1079
1080         *(u16*)((*pskb)->h.raw + (*pskb)->csum) = csum_fold(csum);
1081         (*pskb)->ip_summed = CHECKSUM_NONE;
1082 out:
1083         return ret;
1084 }
1085
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when the device lacks NETIF_F_HIGHDMA and at least one page
 * fragment of the skb is at or above highmem_start_page, 0 otherwise.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
        int idx = 0;

        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        while (idx < skb_shinfo(skb)->nr_frags) {
                if (skb_shinfo(skb)->frags[idx].page >= highmem_start_page)
                        return 1;
                idx++;
        }

        return 0;
}
#else
#define illegal_highdma(dev, skb)       (0)
#endif
1108
extern void skb_release_data(struct sk_buff *);

/*
 * Keep head the same: replace data.
 *
 * Copies the headroom and the complete packet contents of @skb into one
 * freshly allocated buffer, releases the old data and rebases the skb's
 * pointers onto the new buffer. Afterwards data_len is 0.
 *
 * @gfp_mask is passed to kmalloc(). Returns 0 on success or -ENOMEM if
 * the allocation fails, in which case the skb has not been modified.
 * Calling this on a shared skb is a BUG().
 */
int __skb_linearize(struct sk_buff *skb, int gfp_mask)
{
        unsigned int size;
        u8 *data;
        long offset;
        struct skb_shared_info *ninfo;
        int headerlen = skb->data - skb->head;
        /* Bytes by which tail + data_len would overrun the current end. */
        int expand = (skb->tail + skb->data_len) - skb->end;

        if (skb_shared(skb))
                BUG();

        /* The existing buffer size is already sufficient: do not shrink. */
        if (expand <= 0)
                expand = 0;

        size = skb->end - skb->head + expand;
        size = SKB_DATA_ALIGN(size);
        /* The shared info block lives directly behind the data area. */
        data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
        if (!data)
                return -ENOMEM;

        /* Copy entire thing: the negative offset includes the headroom. */
        if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
                BUG();

        /* Set up shinfo */
        ninfo = (struct skb_shared_info*)(data + size);
        atomic_set(&ninfo->dataref, 1);
        ninfo->tso_size = skb_shinfo(skb)->tso_size;
        ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
        ninfo->nr_frags = 0;
        ninfo->frag_list = NULL;

        /* Offset between the two in bytes */
        offset = data - skb->head;

        /* Free old data. */
        skb_release_data(skb);

        skb->head = data;
        skb->end  = data + size;

        /* Set up new pointers */
        skb->h.raw   += offset;
        skb->nh.raw  += offset;
        skb->mac.raw += offset;
        skb->tail    += offset;
        skb->data    += offset;

        /* We are no longer a clone, even if we were. */
        skb->cloned    = 0;

        /* The formerly paged data now sits in the linear area. */
        skb->tail     += skb->data_len;
        skb->data_len  = 0;
        return 0;
}
1168
/*
 * Take the device transmit lock and record @cpu as its owner, unless the
 * device sets NETIF_F_LLTX, in which case nothing is done.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement
 * (safe in an unbraced if/else); arguments are parenthesized so that
 * arbitrary expressions may be passed.
 */
#define HARD_TX_LOCK(dev, cpu) do {                             \
        if (((dev)->features & NETIF_F_LLTX) == 0) {            \
                spin_lock(&(dev)->xmit_lock);                   \
                (dev)->xmit_lock_owner = (cpu);                 \
        }                                                       \
} while (0)
1175
/*
 * Reset the transmit lock owner to -1 and drop the device transmit lock,
 * unless the device sets NETIF_F_LLTX, in which case nothing is done.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement
 * (safe in an unbraced if/else); the argument is parenthesized so that
 * an arbitrary expression may be passed.
 */
#define HARD_TX_UNLOCK(dev) do {                                \
        if (((dev)->features & NETIF_F_LLTX) == 0) {            \
                (dev)->xmit_lock_owner = -1;                    \
                spin_unlock(&(dev)->xmit_lock);                 \
        }                                                       \
} while (0)
1182
/*
 * Keep calling qdisc_restart() for as long as the device queue is not
 * stopped and qdisc_restart() returns a negative value.
 */
static inline void qdisc_run(struct net_device *dev)
{
        for (;;) {
                if (netif_queue_stopped(dev))
                        break;
                if (qdisc_restart(dev) >= 0)
                        break;
        }
}
1189
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 *      On the error paths taken inside this function the buffer is freed
 *      here: -ENOMEM is returned when linearizing or software checksumming
 *      fails, -ENETDOWN when a queueless device is down, stopped, refuses
 *      the packet or is being re-entered on the same CPU.
 */

int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct Qdisc *q;
        /* Default result for the early "goto out_kfree_skb" exits below. */
        int rc = -ENOMEM;

        /* A fragment list is flattened if the device cannot handle it. */
        if (skb_shinfo(skb)->frag_list &&
            !(dev->features & NETIF_F_FRAGLIST) &&
            __skb_linearize(skb, GFP_ATOMIC))
                goto out_kfree_skb;

        /* Fragmented skb is linearized if device does not support SG,
         * or if at least one of fragments is in highmem and device
         * does not support DMA from it.
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
            __skb_linearize(skb, GFP_ATOMIC))
                goto out_kfree_skb;

        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_HW &&
            (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
             (!(dev->features & NETIF_F_IP_CSUM) ||
              skb->protocol != htons(ETH_P_IP))))
                if (skb_checksum_help(&skb, 0))
                        goto out_kfree_skb;


        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        local_bh_disable();

        /* Updates of qdisc are serialized by queue_lock.
         * The struct Qdisc which is pointed to by qdisc is now a
         * rcu structure - it may be accessed without acquiring
         * a lock (but the structure may be stale.) The freeing of the
         * qdisc will be deferred until it's known that there are no
         * more references to it.
         *
         * If the qdisc has an enqueue function, we still need to
         * hold the queue_lock before calling it, since queue_lock
         * also serializes access to the device queue.
         */

        q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
        if (q->enqueue) {
                /* Grab device queue */
                spin_lock(&dev->queue_lock);

                rc = q->enqueue(skb, q);

                qdisc_run(dev);

                spin_unlock(&dev->queue_lock);
                /* NET_XMIT_BYPASS is reported to the caller as success. */
                rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that xmit_lock protection is necessary here.
           (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and take the lock. It is not prone to deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* The owner check catches re-entry on the same CPU. */
                if (dev->xmit_lock_owner != cpu) {

                        HARD_TX_LOCK(dev, cpu);

                        if (!netif_queue_stopped(dev)) {
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);

                                rc = 0;
                                /* Zero from the driver: skb is not freed here. */
                                if (!dev->hard_start_xmit(skb, dev)) {
                                        HARD_TX_UNLOCK(dev);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                        goto out_enetdown;
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately */
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }
        /* Reached by goto and by falling through when the device is down
         * or recursion was detected. */
out_enetdown:
        rc = -ENETDOWN;
out_kfree_skb:
        kfree_skb(skb);
out:
        /* NOTE(review): the early failures jump here without a preceding
         * local_bh_disable() -- confirm this imbalance is intended. */
        local_bh_enable();
        return rc;
}
1318
1319
1320 /*=======================================================================
1321                         Receiver routines
1322   =======================================================================*/
1323
/* Longest per-CPU input queue netif_rx() accepts before dropping; also
 * the initial budget of net_rx_action(). */
int netdev_max_backlog = 300;
int weight_p = 64;            /* old backlog weight */
/* These numbers are selected based on intuition and some
 * experimentation; if you have a more scientific way of doing this
 * please go ahead and fix things.
 *
 * no_cong, lo_cong and mod_cong are the average-backlog thresholds that
 * get_sample_stats() compares against to pick the congestion level.
 */
int no_cong_thresh = 10;
int no_cong = 20;
int lo_cong = 100;
int mod_cong = 290;

/* Per-CPU receive counters (total, dropped, throttled, time_squeeze). */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1336
1337
1338 static void get_sample_stats(int cpu)
1339 {
1340 #ifdef RAND_LIE
1341         unsigned long rd;
1342         int rq;
1343 #endif
1344         struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1345         int blog = sd->input_pkt_queue.qlen;
1346         int avg_blog = sd->avg_blog;
1347
1348         avg_blog = (avg_blog >> 1) + (blog >> 1);
1349
1350         if (avg_blog > mod_cong) {
1351                 /* Above moderate congestion levels. */
1352                 sd->cng_level = NET_RX_CN_HIGH;
1353 #ifdef RAND_LIE
1354                 rd = net_random();
1355                 rq = rd % netdev_max_backlog;
1356                 if (rq < avg_blog) /* unlucky bastard */
1357                         sd->cng_level = NET_RX_DROP;
1358 #endif
1359         } else if (avg_blog > lo_cong) {
1360                 sd->cng_level = NET_RX_CN_MOD;
1361 #ifdef RAND_LIE
1362                 rd = net_random();
1363                 rq = rd % netdev_max_backlog;
1364                         if (rq < avg_blog) /* unlucky bastard */
1365                                 sd->cng_level = NET_RX_CN_HIGH;
1366 #endif
1367         } else if (avg_blog > no_cong)
1368                 sd->cng_level = NET_RX_CN_LOW;
1369         else  /* no congestion */
1370                 sd->cng_level = NET_RX_SUCCESS;
1371
1372         sd->avg_blog = avg_blog;
1373 }
1374
#ifdef OFFLINE_SAMPLE
/*
 * Timer callback used when sampling is done offline: refreshes the
 * congestion statistics of the current CPU and re-arms samp_timer to
 * fire again one jiffy later. @dummy is unused.
 */
static void sample_queue(unsigned long dummy)
{
        int cpu = smp_processor_id();

        get_sample_stats(cpu);

        /* 10 ms or 1 ms -- i don't care -- JHS
         *
         * The expiry is computed directly in unsigned long: keeping it
         * in an int would truncate jiffies on 64-bit machines.
         */
        mod_timer(&samp_timer, jiffies + 1);
}
#endif
1387
1388
/**
 *      netif_rx        -       post buffer to the network code
 *      @skb: buffer to post
 *
 *      This function receives a packet from a device driver and queues it for
 *      the upper (protocol) levels to process.  It always succeeds. The buffer
 *      may be dropped during processing for congestion control or by the
 *      protocol layers.
 *
 *      The skb is appended to the input queue of the current CPU. A
 *      reference on skb->dev is taken for every queued buffer;
 *      process_backlog() drops it again after delivery.
 *
 *      return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_CN_LOW   (low congestion)
 *      NET_RX_CN_MOD   (moderate congestion)
 *      NET_RX_CN_HIGH  (high congestion)
 *      NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
        int this_cpu;
        struct softnet_data *queue;
        unsigned long flags;

#ifdef CONFIG_NETPOLL
        /* A non-zero return from netpoll_rx() means the skb is dropped. */
        if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
                kfree_skb(skb);
                return NET_RX_DROP;
        }
#endif

        /* Stamp the packet unless it already carries a timestamp. */
        if (!skb->stamp.tv_sec)
                net_timestamp(&skb->stamp);

        /*
         * The code is rearranged so that the path is the most
         * short when CPU is congested, but is still operating.
         */
        local_irq_save(flags);
        this_cpu = smp_processor_id();
        queue = &__get_cpu_var(softnet_data);

        __get_cpu_var(netdev_rx_stat).total++;
        if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
                if (queue->input_pkt_queue.qlen) {
                        /* Queue is non-empty: only a throttle stops us. */
                        if (queue->throttle)
                                goto drop;

                        /* Also entered by goto from the empty-queue case. */
enqueue:
                        dev_hold(skb->dev);
                        __skb_queue_tail(&queue->input_pkt_queue, skb);
#ifndef OFFLINE_SAMPLE
                        get_sample_stats(this_cpu);
#endif
                        local_irq_restore(flags);
                        return queue->cng_level;
                }

                /* Queue is empty: lift any throttle, schedule the backlog
                 * device for polling, then enqueue. */
                if (queue->throttle)
                        queue->throttle = 0;

                netif_rx_schedule(&queue->backlog_dev);
                goto enqueue;
        }

        /* Backlog limit exceeded: start throttling and count it once. */
        if (!queue->throttle) {
                queue->throttle = 1;
                __get_cpu_var(netdev_rx_stat).throttled++;
        }

drop:
        __get_cpu_var(netdev_rx_stat).dropped++;
        local_irq_restore(flags);

        kfree_skb(skb);
        return NET_RX_DROP;
}
1466
1467 static __inline__ void skb_bond(struct sk_buff *skb)
1468 {
1469         struct net_device *dev = skb->dev;
1470
1471         if (dev->master) {
1472                 skb->real_dev = skb->dev;
1473                 skb->dev = dev->master;
1474         }
1475 }
1476
1477 static void net_tx_action(struct softirq_action *h)
1478 {
1479         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1480
1481         if (sd->completion_queue) {
1482                 struct sk_buff *clist;
1483
1484                 local_irq_disable();
1485                 clist = sd->completion_queue;
1486                 sd->completion_queue = NULL;
1487                 local_irq_enable();
1488
1489                 while (clist) {
1490                         struct sk_buff *skb = clist;
1491                         clist = clist->next;
1492
1493                         BUG_TRAP(!atomic_read(&skb->users));
1494                         __kfree_skb(skb);
1495                 }
1496         }
1497
1498         if (sd->output_queue) {
1499                 struct net_device *head;
1500
1501                 local_irq_disable();
1502                 head = sd->output_queue;
1503                 sd->output_queue = NULL;
1504                 local_irq_enable();
1505
1506                 while (head) {
1507                         struct net_device *dev = head;
1508                         head = head->next_sched;
1509
1510                         smp_mb__before_clear_bit();
1511                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1512
1513                         if (spin_trylock(&dev->queue_lock)) {
1514                                 qdisc_run(dev);
1515                                 spin_unlock(&dev->queue_lock);
1516                         } else {
1517                                 netif_schedule(dev);
1518                         }
1519                 }
1520         }
1521 }
1522
1523 static __inline__ int deliver_skb(struct sk_buff *skb,
1524                                   struct packet_type *pt_prev)
1525 {
1526         atomic_inc(&skb->users);
1527         return pt_prev->func(skb, skb->dev, pt_prev);
1528 }
1529
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);

/*
 * Give the frame to the bridge hook when the receiving device has a
 * bridge port and the packet is not PACKET_LOOPBACK. A handler pending
 * in *pt_prev is delivered first and its result is stored in *ret.
 * Returns 0 when the bridge is not involved, otherwise the return
 * value of br_handle_frame_hook().
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
                                    struct packet_type **pt_prev, int *ret)
{
        struct net_bridge_port *port;

        if ((*pskb)->pkt_type == PACKET_LOOPBACK)
                return 0;

        port = rcu_dereference((*pskb)->dev->br_port);
        if (port == NULL)
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(*pskb, *pt_prev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, pskb);
}
#else
#define handle_bridge(skb, pt_prev, ret)        (0)
#endif
1552
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise there are some useless
 * instructions (a compare and 2 extra stores right now) if we don't
 * have it on but have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 * Runs @skb through the ingress qdisc of its device, if there is one.
 * Returns TC_ACT_OK when no ingress qdisc is attached, TC_ACT_SHOT when
 * the redirect counter of the skb exceeds MAX_RED_LOOP, and otherwise
 * the result of the qdisc's enqueue function.
 */
int ing_filter(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct Qdisc *q;
        int result = TC_ACT_OK;
        __u32 ttl;

        if (!dev->qdisc_ingress)
                return result;

        ttl = (__u32) G_TC_RTTL(skb->tc_verd);
        if (MAX_RED_LOOP < ttl) {
                printk("Redir loop detected Dropping packet (%s->%s)\n",
                        skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
                return TC_ACT_SHOT;
        }
        ttl++;

        skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);

        if (NULL == skb->input_dev) {
                skb->input_dev = skb->dev;
                printk("ing_filter:  fixed  %s out %s\n",skb->input_dev->name,skb->dev->name);
        }

        /* Re-read the qdisc pointer under the ingress lock. */
        spin_lock(&dev->ingress_lock);
        q = dev->qdisc_ingress;
        if (q != NULL)
                result = q->enqueue(skb, q);
        spin_unlock(&dev->ingress_lock);

        return result;
}
#endif
1593
/*
 *      Deliver one received buffer to the packet handlers.
 *
 *      The skb is first offered to every matching tap on ptype_all, then
 *      (with CONFIG_NET_CLS_ACT) to the ingress filter, the diverter and
 *      the bridge hook, and finally to the handlers registered for
 *      skb->protocol. All handlers but the last receive the skb through
 *      deliver_skb(), which takes an extra reference; the last handler is
 *      called directly. If nobody wants the skb it is freed.
 *
 *      Returns the result of the last handler that ran, NET_RX_DROP when
 *      the packet was dropped here, or the ingress filter verdict when
 *      that verdict is TC_ACT_SHOT or TC_ACT_STOLEN.
 */
int netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        int ret = NET_RX_DROP;
        unsigned short type;

#ifdef CONFIG_NETPOLL
        /* A non-zero return from netpoll_rx() means the skb is dropped. */
        if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
                kfree_skb(skb);
                return NET_RX_DROP;
        }
#endif

        /* Stamp the packet unless it already carries a timestamp. */
        if (!skb->stamp.tv_sec)
                net_timestamp(&skb->stamp);

        /* May replace skb->dev with the master of the receiving device. */
        skb_bond(skb);

        __get_cpu_var(netdev_rx_stat).total++;

        skb->h.raw = skb->nh.raw = skb->data;
        skb->mac_len = skb->nh.raw - skb->mac.raw;

        /* pt_prev holds a matched handler back by one step, so that the
         * final one can be called without an extra reference. */
        pt_prev = NULL;

        rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip the taps and the ingress filter. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* Taps: either unbound or bound to the receiving device. */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        if (pt_prev) {
                ret = deliver_skb(skb, pt_prev);
                pt_prev = NULL; /* no one else should process this after */
        } else {
                skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
        }

        ret = ing_filter(skb);

        if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
                kfree_skb(skb);
                goto out;
        }

        skb->tc_verd = 0;
ncls:
#endif

        handle_diverter(skb);

        /* Non-zero: stop processing here; ret keeps its current value. */
        if (handle_bridge(&skb, &pt_prev, &ret))
                goto out;

        /* Protocol handlers are hashed on the low 4 bits of the type. */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
                if (ptype->type == type &&
                    (!ptype->dev || ptype->dev == skb->dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* Last handler: called without taking another reference. */
                ret = pt_prev->func(skb, skb->dev, pt_prev);
        } else {
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
1684
/*
 *      Drain the input packet queue of the current CPU.
 *
 *      Has the (device, budget) signature that net_rx_action() invokes
 *      through dev->poll. Packets are dequeued one at a time and fed to
 *      netif_receive_skb() until the queue is empty, the quota (the smaller
 *      of backlog_dev->quota and *budget) is used up, or more than one
 *      jiffy has elapsed. Both backlog_dev->quota and *budget are reduced
 *      by the number of packets handled.
 *
 *      Returns 0 when the queue was emptied (the backlog device is then
 *      taken off the poll list) and -1 when it stopped early.
 */
static int process_backlog(struct net_device *backlog_dev, int *budget)
{
        int work = 0;
        int quota = min(backlog_dev->quota, *budget);
        struct softnet_data *queue = &__get_cpu_var(softnet_data);
        unsigned long start_time = jiffies;

        for (;;) {
                struct sk_buff *skb;
                struct net_device *dev;

                /* The queue is only touched with interrupts off. */
                local_irq_disable();
                skb = __skb_dequeue(&queue->input_pkt_queue);
                if (!skb)
                        goto job_done;
                local_irq_enable();

                /* Remember the device now; skb->dev is not read again
                 * after netif_receive_skb() has been given the skb. */
                dev = skb->dev;

                netif_receive_skb(skb);

                /* Drop the reference netif_rx() took when queueing. */
                dev_put(dev);

                work++;

                if (work >= quota || jiffies - start_time > 1)
                        break;

        }

        backlog_dev->quota -= work;
        *budget -= work;
        return -1;

        /* Reached with interrupts still disabled. */
job_done:
        backlog_dev->quota -= work;
        *budget -= work;

        list_del(&backlog_dev->poll_list);
        smp_mb__before_clear_bit();
        netif_poll_enable(backlog_dev);

        if (queue->throttle)
                queue->throttle = 0;
        local_irq_enable();
        return 0;
}
1732
/*
 *      Receive softirq work for the current CPU: polls the devices on the
 *      per-CPU poll list until the list is empty, the budget (initially
 *      netdev_max_backlog) is exhausted, or more than one jiffy has
 *      elapsed. In the latter two cases the time_squeeze counter is
 *      bumped and NET_RX_SOFTIRQ is raised again. @h is unused.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *queue = &__get_cpu_var(softnet_data);
        unsigned long start_time = jiffies;
        int budget = netdev_max_backlog;


        /* The poll list is examined and modified with interrupts off. */
        local_irq_disable();

        while (!list_empty(&queue->poll_list)) {
                struct net_device *dev;

                if (budget <= 0 || jiffies - start_time > 1)
                        goto softnet_break;

                /* Interrupts are enabled while the device is polled. */
                local_irq_enable();

                dev = list_entry(queue->poll_list.next,
                                 struct net_device, poll_list);

                if (dev->quota <= 0 || dev->poll(dev, &budget)) {
                        /* Out of quota, or poll returned non-zero: move
                         * the device to the tail and refill its quota. */
                        local_irq_disable();
                        list_del(&dev->poll_list);
                        list_add_tail(&dev->poll_list, &queue->poll_list);
                        if (dev->quota < 0)
                                dev->quota += dev->weight;
                        else
                                dev->quota = dev->weight;
                } else {
                        /* poll returned 0.
                         * NOTE(review): presumably this drops the reference
                         * taken when the device was scheduled for polling;
                         * confirm against netif_rx_schedule(). */
                        dev_put(dev);
                        local_irq_disable();
                }
        }
out:
        local_irq_enable();
        return;

        /* Reached with interrupts disabled. */
softnet_break:
        __get_cpu_var(netdev_rx_stat).time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
1775
1776 static gifconf_func_t * gifconf_list [NPROTO];
1777
1778 /**
1779  *      register_gifconf        -       register a SIOCGIF handler
1780  *      @family: Address family
1781  *      @gifconf: Function handler
1782  *
1783  *      Register protocol dependent address dumping routines. The handler
1784  *      that is passed must not be freed or reused until it has been replaced
1785  *      by another handler.
1786  */
1787 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1788 {
1789         if (family >= NPROTO)
1790                 return -EINVAL;
1791         gifconf_list[family] = gifconf;
1792         return 0;
1793 }
1794
1795
1796 /*
1797  *      Map an interface index to its name (SIOCGIFNAME)
1798  */
1799
1800 /*
1801  *      We need this ioctl for efficient implementation of the
1802  *      if_indextoname() function required by the IPv6 API.  Without
1803  *      it, we would have to search all the interfaces to find a
1804  *      match.  --pb
1805  */
1806
1807 static int dev_ifname(struct ifreq __user *arg)
1808 {
1809         struct net_device *dev;
1810         struct ifreq ifr;
1811
1812         /*
1813          *      Fetch the caller's info block.
1814          */
1815
1816         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1817                 return -EFAULT;
1818
1819         read_lock(&dev_base_lock);
1820         dev = __dev_get_by_index(ifr.ifr_ifindex);
1821         if (!dev) {
1822                 read_unlock(&dev_base_lock);
1823                 return -ENODEV;
1824         }
1825
1826         strcpy(ifr.ifr_name, dev->name);
1827         read_unlock(&dev_base_lock);
1828
1829         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1830                 return -EFAULT;
1831         return 0;
1832 }
1833
1834 /*
1835  *      Perform a SIOCGIFCONF call. This structure will change
1836  *      size eventually, and there is nothing I can do about it.
1837  *      Thus we will need a 'compatibility mode'.
1838  */
1839
1840 static int dev_ifconf(char __user *arg)
1841 {
1842         struct ifconf ifc;
1843         struct net_device *dev;
1844         char __user *pos;
1845         int len;
1846         int total;
1847         int i;
1848
1849         /*
1850          *      Fetch the caller's info block.
1851          */
1852
1853         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
1854                 return -EFAULT;
1855
1856         pos = ifc.ifc_buf;
1857         len = ifc.ifc_len;
1858
1859         /*
1860          *      Loop over the interfaces, and write an info block for each.
1861          */
1862
1863         total = 0;
1864         for (dev = dev_base; dev; dev = dev->next) {
1865                 for (i = 0; i < NPROTO; i++) {
1866                         if (gifconf_list[i]) {
1867                                 int done;
1868                                 if (!pos)
1869                                         done = gifconf_list[i](dev, NULL, 0);
1870                                 else
1871                                         done = gifconf_list[i](dev, pos + total,
1872                                                                len - total);
1873                                 if (done < 0)
1874                                         return -EFAULT;
1875                                 total += done;
1876                         }
1877                 }
1878         }
1879
1880         /*
1881          *      All done.  Write the updated control block back to the caller.
1882          */
1883         ifc.ifc_len = total;
1884
1885         /*
1886          *      Both BSD and Solaris return 0 here, so we do too.
1887          */
1888         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
1889 }
1890
1891 #ifdef CONFIG_PROC_FS
1892 /*
1893  *      This is invoked by the /proc filesystem handler to display a device
1894  *      in detail.
1895  */
1896 static __inline__ struct net_device *dev_get_idx(loff_t pos)
1897 {
1898         struct net_device *dev;
1899         loff_t i;
1900
1901         for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
1902
1903         return i == pos ? dev : NULL;
1904 }
1905
1906 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
1907 {
1908         read_lock(&dev_base_lock);
1909         return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
1910 }
1911
1912 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1913 {
1914         ++*pos;
1915         return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
1916 }
1917
1918 void dev_seq_stop(struct seq_file *seq, void *v)
1919 {
1920         read_unlock(&dev_base_lock);
1921 }
1922
1923 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
1924 {
1925         if (dev->get_stats) {
1926                 struct net_device_stats *stats = dev->get_stats(dev);
1927
1928                 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
1929                                 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
1930                            dev->name, stats->rx_bytes, stats->rx_packets,
1931                            stats->rx_errors,
1932                            stats->rx_dropped + stats->rx_missed_errors,
1933                            stats->rx_fifo_errors,
1934                            stats->rx_length_errors + stats->rx_over_errors +
1935                              stats->rx_crc_errors + stats->rx_frame_errors,
1936                            stats->rx_compressed, stats->multicast,
1937                            stats->tx_bytes, stats->tx_packets,
1938                            stats->tx_errors, stats->tx_dropped,
1939                            stats->tx_fifo_errors, stats->collisions,
1940                            stats->tx_carrier_errors +
1941                              stats->tx_aborted_errors +
1942                              stats->tx_window_errors +
1943                              stats->tx_heartbeat_errors,
1944                            stats->tx_compressed);
1945         } else
1946                 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
1947 }
1948
1949 /*
1950  *      Called from the PROCfs module. This now uses the new arbitrary sized
1951  *      /proc/net interface to create /proc/net/dev
1952  */
1953 static int dev_seq_show(struct seq_file *seq, void *v)
1954 {
1955         if (v == SEQ_START_TOKEN)
1956                 seq_puts(seq, "Inter-|   Receive                            "
1957                               "                    |  Transmit\n"
1958                               " face |bytes    packets errs drop fifo frame "
1959                               "compressed multicast|bytes    packets errs "
1960                               "drop fifo colls carrier compressed\n");
1961         else
1962                 dev_seq_printf_stats(seq, v);
1963         return 0;
1964 }
1965
1966 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
1967 {
1968         struct netif_rx_stats *rc = NULL;
1969
1970         while (*pos < NR_CPUS)
1971                 if (cpu_online(*pos)) {
1972                         rc = &per_cpu(netdev_rx_stat, *pos);
1973                         break;
1974                 } else
1975                         ++*pos;
1976         return rc;
1977 }
1978
1979 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
1980 {
1981         return softnet_get_online(pos);
1982 }
1983
1984 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1985 {
1986         ++*pos;
1987         return softnet_get_online(pos);
1988 }
1989
/*
 *	seq_file ->stop for /proc/net/softnet_stat.  The start/next callbacks
 *	take no lock, so there is nothing to release.
 */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
1993
1994 static int softnet_seq_show(struct seq_file *seq, void *v)
1995 {
1996         struct netif_rx_stats *s = v;
1997
1998         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
1999                    s->total, s->dropped, s->time_squeeze, s->throttled,
2000                    s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
2001                    s->fastroute_deferred_out,
2002 #if 0
2003                    s->fastroute_latency_reduction
2004 #else
2005                    s->cpu_collision
2006 #endif
2007                   );
2008         return 0;
2009 }
2010
2011 static struct seq_operations dev_seq_ops = {
2012         .start = dev_seq_start,
2013         .next  = dev_seq_next,
2014         .stop  = dev_seq_stop,
2015         .show  = dev_seq_show,
2016 };
2017
2018 static int dev_seq_open(struct inode *inode, struct file *file)
2019 {
2020         return seq_open(file, &dev_seq_ops);
2021 }
2022
2023 static struct file_operations dev_seq_fops = {
2024         .owner   = THIS_MODULE,
2025         .open    = dev_seq_open,
2026         .read    = seq_read,
2027         .llseek  = seq_lseek,
2028         .release = seq_release,
2029 };
2030
2031 static struct seq_operations softnet_seq_ops = {
2032         .start = softnet_seq_start,
2033         .next  = softnet_seq_next,
2034         .stop  = softnet_seq_stop,
2035         .show  = softnet_seq_show,
2036 };
2037
2038 static int softnet_seq_open(struct inode *inode, struct file *file)
2039 {
2040         return seq_open(file, &softnet_seq_ops);
2041 }
2042
2043 static struct file_operations softnet_seq_fops = {
2044         .owner   = THIS_MODULE,
2045         .open    = softnet_seq_open,
2046         .read    = seq_read,
2047         .llseek  = seq_lseek,
2048         .release = seq_release,
2049 };
2050
/*
 * Without wireless extensions there is nothing to register, so the call
 * in dev_proc_init() collapses to a constant "success" (0).
 */
#ifndef WIRELESS_EXT
#define wireless_proc_init() 0
#else
extern int wireless_proc_init(void);
#endif
2056
2057 static int __init dev_proc_init(void)
2058 {
2059         int rc = -ENOMEM;
2060
2061         if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2062                 goto out;
2063         if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2064                 goto out_dev;
2065         if (wireless_proc_init())
2066                 goto out_softnet;
2067         rc = 0;
2068 out:
2069         return rc;
2070 out_softnet:
2071         proc_net_remove("softnet_stat");
2072 out_dev:
2073         proc_net_remove("dev");
2074         goto out;
2075 }
2076 #else
2077 #define dev_proc_init() 0
2078 #endif  /* CONFIG_PROC_FS */
2079
2080
2081 /**
2082  *      netdev_set_master       -       set up master/slave pair
2083  *      @slave: slave device
2084  *      @master: new master device
2085  *
2086  *      Changes the master device of the slave. Pass %NULL to break the
2087  *      bonding. The caller must hold the RTNL semaphore. On a failure
2088  *      a negative errno code is returned. On success the reference counts
2089  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2090  *      function returns zero.
2091  */
2092 int netdev_set_master(struct net_device *slave, struct net_device *master)
2093 {
2094         struct net_device *old = slave->master;
2095
2096         ASSERT_RTNL();
2097
2098         if (master) {
2099                 if (old)
2100                         return -EBUSY;
2101                 dev_hold(master);
2102         }
2103
2104         slave->master = master;
2105
2106         synchronize_net();
2107
2108         if (old)
2109                 dev_put(old);
2110
2111         if (master)
2112                 slave->flags |= IFF_SLAVE;
2113         else
2114                 slave->flags &= ~IFF_SLAVE;
2115
2116         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2117         return 0;
2118 }
2119
2120 /**
2121  *      dev_set_promiscuity     - update promiscuity count on a device
2122  *      @dev: device
2123  *      @inc: modifier
2124  *
2125  *      Add or remove promsicuity from a device. While the count in the device
2126  *      remains above zero the interface remains promiscuous. Once it hits zero
2127  *      the device reverts back to normal filtering operation. A negative inc
2128  *      value is used to drop promiscuity on the device.
2129  */
2130 void dev_set_promiscuity(struct net_device *dev, int inc)
2131 {
2132         unsigned short old_flags = dev->flags;
2133
2134         dev->flags |= IFF_PROMISC;
2135         if ((dev->promiscuity += inc) == 0)
2136                 dev->flags &= ~IFF_PROMISC;
2137         if (dev->flags ^ old_flags) {
2138                 dev_mc_upload(dev);
2139                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2140                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2141                                                                "left");
2142         }
2143 }
2144
2145 /**
2146  *      dev_set_allmulti        - update allmulti count on a device
2147  *      @dev: device
2148  *      @inc: modifier
2149  *
2150  *      Add or remove reception of all multicast frames to a device. While the
2151  *      count in the device remains above zero the interface remains listening
2152  *      to all interfaces. Once it hits zero the device reverts back to normal
2153  *      filtering operation. A negative @inc value is used to drop the counter
2154  *      when releasing a resource needing all multicasts.
2155  */
2156
2157 void dev_set_allmulti(struct net_device *dev, int inc)
2158 {
2159         unsigned short old_flags = dev->flags;
2160
2161         dev->flags |= IFF_ALLMULTI;
2162         if ((dev->allmulti += inc) == 0)
2163                 dev->flags &= ~IFF_ALLMULTI;
2164         if (dev->flags ^ old_flags)
2165                 dev_mc_upload(dev);
2166 }
2167
2168 unsigned dev_get_flags(const struct net_device *dev)
2169 {
2170         unsigned flags;
2171
2172         flags = (dev->flags & ~(IFF_PROMISC |
2173                                 IFF_ALLMULTI |
2174                                 IFF_RUNNING)) |
2175                 (dev->gflags & (IFF_PROMISC |
2176                                 IFF_ALLMULTI));
2177
2178         if (netif_running(dev) && netif_carrier_ok(dev))
2179                 flags |= IFF_RUNNING;
2180
2181         return flags;
2182 }
2183
2184 int dev_change_flags(struct net_device *dev, unsigned flags)
2185 {
2186         int ret;
2187         int old_flags = dev->flags;
2188
2189         /*
2190          *      Set the flags on our device.
2191          */
2192
2193         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2194                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2195                                IFF_AUTOMEDIA)) |
2196                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2197                                     IFF_ALLMULTI));
2198
2199         /*
2200          *      Load in the correct multicast list now the flags have changed.
2201          */
2202
2203         dev_mc_upload(dev);
2204
2205         /*
2206          *      Have we downed the interface. We handle IFF_UP ourselves
2207          *      according to user attempts to set it, rather than blindly
2208          *      setting it.
2209          */
2210
2211         ret = 0;
2212         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
2213                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2214
2215                 if (!ret)
2216                         dev_mc_upload(dev);
2217         }
2218
2219         if (dev->flags & IFF_UP &&
2220             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2221                                           IFF_VOLATILE)))
2222                 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
2223
2224         if ((flags ^ dev->gflags) & IFF_PROMISC) {
2225                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2226                 dev->gflags ^= IFF_PROMISC;
2227                 dev_set_promiscuity(dev, inc);
2228         }
2229
2230         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2231            is important. Some (broken) drivers set IFF_PROMISC, when
2232            IFF_ALLMULTI is requested not asking us and not reporting.
2233          */
2234         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2235                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2236                 dev->gflags ^= IFF_ALLMULTI;
2237                 dev_set_allmulti(dev, inc);
2238         }
2239
2240         if (old_flags ^ dev->flags)
2241                 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2242
2243         return ret;
2244 }
2245
2246 int dev_set_mtu(struct net_device *dev, int new_mtu)
2247 {
2248         int err;
2249
2250         if (new_mtu == dev->mtu)
2251                 return 0;
2252
2253         /*      MTU must be positive.    */
2254         if (new_mtu < 0)
2255                 return -EINVAL;
2256
2257         if (!netif_device_present(dev))
2258                 return -ENODEV;
2259
2260         err = 0;
2261         if (dev->change_mtu)
2262                 err = dev->change_mtu(dev, new_mtu);
2263         else
2264                 dev->mtu = new_mtu;
2265         if (!err && dev->flags & IFF_UP)
2266                 notifier_call_chain(&netdev_chain,
2267                                     NETDEV_CHANGEMTU, dev);
2268         return err;
2269 }
2270
2271
2272 /*
2273  *      Perform the SIOCxIFxxx calls.
2274  */
2275 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2276 {
2277         int err;
2278         struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2279
2280         if (!dev)
2281                 return -ENODEV;
2282
2283         switch (cmd) {
2284                 case SIOCGIFFLAGS:      /* Get interface flags */
2285                         ifr->ifr_flags = dev_get_flags(dev);
2286                         return 0;
2287
2288                 case SIOCSIFFLAGS:      /* Set interface flags */
2289                         return dev_change_flags(dev, ifr->ifr_flags);
2290
2291                 case SIOCGIFMETRIC:     /* Get the metric on the interface
2292                                            (currently unused) */
2293                         ifr->ifr_metric = 0;
2294                         return 0;
2295
2296                 case SIOCSIFMETRIC:     /* Set the metric on the interface
2297                                            (currently unused) */
2298                         return -EOPNOTSUPP;
2299
2300                 case SIOCGIFMTU:        /* Get the MTU of a device */
2301                         ifr->ifr_mtu = dev->mtu;
2302                         return 0;
2303
2304                 case SIOCSIFMTU:        /* Set the MTU of a device */
2305                         return dev_set_mtu(dev, ifr->ifr_mtu);
2306
2307                 case SIOCGIFHWADDR:
2308                         if ((size_t) dev->addr_len > sizeof ifr->ifr_hwaddr.sa_data)
2309                                 return -EOVERFLOW;
2310                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2311                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, dev->addr_len);
2312                         ifr->ifr_hwaddr.sa_family = dev->type;
2313                         return 0;
2314
2315                 case SIOCSIFHWADDR:
2316                         if (!dev->set_mac_address)
2317                                 return -EOPNOTSUPP;
2318                         if (ifr->ifr_hwaddr.sa_family != dev->type)
2319                                 return -EINVAL;
2320                         if (!netif_device_present(dev))
2321                                 return -ENODEV;
2322                         err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
2323                         if (!err)
2324                                 notifier_call_chain(&netdev_chain,
2325                                                     NETDEV_CHANGEADDR, dev);
2326                         return err;
2327
2328                 case SIOCSIFHWBROADCAST:
2329                         if (ifr->ifr_hwaddr.sa_family != dev->type)
2330                                 return -EINVAL;
2331                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2332                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2333                         notifier_call_chain(&netdev_chain,
2334                                             NETDEV_CHANGEADDR, dev);
2335                         return 0;
2336
2337                 case SIOCGIFMAP:
2338                         ifr->ifr_map.mem_start = dev->mem_start;
2339                         ifr->ifr_map.mem_end   = dev->mem_end;
2340                         ifr->ifr_map.base_addr = dev->base_addr;
2341                         ifr->ifr_map.irq       = dev->irq;
2342                         ifr->ifr_map.dma       = dev->dma;
2343                         ifr->ifr_map.port      = dev->if_port;
2344                         return 0;
2345
2346                 case SIOCSIFMAP:
2347                         if (dev->set_config) {
2348                                 if (!netif_device_present(dev))
2349                                         return -ENODEV;
2350                                 return dev->set_config(dev, &ifr->ifr_map);
2351                         }
2352                         return -EOPNOTSUPP;
2353
2354                 case SIOCADDMULTI:
2355                         if (!dev->set_multicast_list ||
2356                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2357                                 return -EINVAL;
2358                         if (!netif_device_present(dev))
2359                                 return -ENODEV;
2360                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2361                                           dev->addr_len, 1);
2362
2363                 case SIOCDELMULTI:
2364                         if (!dev->set_multicast_list ||
2365                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2366                                 return -EINVAL;
2367                         if (!netif_device_present(dev))
2368                                 return -ENODEV;
2369                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2370                                              dev->addr_len, 1);
2371
2372                 case SIOCGIFINDEX:
2373                         ifr->ifr_ifindex = dev->ifindex;
2374                         return 0;
2375
2376                 case SIOCGIFTXQLEN:
2377                         ifr->ifr_qlen = dev->tx_queue_len;
2378                         return 0;
2379
2380                 case SIOCSIFTXQLEN:
2381                         if (ifr->ifr_qlen < 0)
2382                                 return -EINVAL;
2383                         dev->tx_queue_len = ifr->ifr_qlen;
2384                         return 0;
2385
2386                 case SIOCSIFNAME:
2387                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2388                         return dev_change_name(dev, ifr->ifr_newname);
2389
2390                 /*
2391                  *      Unknown or private ioctl
2392                  */
2393
2394                 default:
2395                         if ((cmd >= SIOCDEVPRIVATE &&
2396                             cmd <= SIOCDEVPRIVATE + 15) ||
2397                             cmd == SIOCBONDENSLAVE ||
2398                             cmd == SIOCBONDRELEASE ||
2399                             cmd == SIOCBONDSETHWADDR ||
2400                             cmd == SIOCBONDSLAVEINFOQUERY ||
2401                             cmd == SIOCBONDINFOQUERY ||
2402                             cmd == SIOCBONDCHANGEACTIVE ||
2403                             cmd == SIOCGMIIPHY ||
2404                             cmd == SIOCGMIIREG ||
2405                             cmd == SIOCSMIIREG ||
2406                             cmd == SIOCBRADDIF ||
2407                             cmd == SIOCBRDELIF ||
2408                             cmd == SIOCWANDEV) {
2409                                 err = -EOPNOTSUPP;
2410                                 if (dev->do_ioctl) {
2411                                         if (netif_device_present(dev))
2412                                                 err = dev->do_ioctl(dev, ifr,
2413                                                                     cmd);
2414                                         else
2415                                                 err = -ENODEV;
2416                                 }
2417                         } else
2418                                 err = -EINVAL;
2419
2420         }
2421         return err;
2422 }
2423
2424 /*
2425  *      This function handles all "interface"-type I/O control requests. The actual
2426  *      'doing' part of this is dev_ifsioc above.
2427  */
2428
2429 /**
2430  *      dev_ioctl       -       network device ioctl
2431  *      @cmd: command to issue
2432  *      @arg: pointer to a struct ifreq in user space
2433  *
2434  *      Issue ioctl functions to devices. This is normally called by the
2435  *      user space syscall interfaces but can sometimes be useful for
2436  *      other purposes. The return value is the return from the syscall if
2437  *      positive or a negative errno code on error.
2438  */
2439
2440 int dev_ioctl(unsigned int cmd, void __user *arg)
2441 {
2442         struct ifreq ifr;
2443         int ret;
2444         char *colon;
2445
2446         /* One special case: SIOCGIFCONF takes ifconf argument
2447            and requires shared lock, because it sleeps writing
2448            to user space.
2449          */
2450
2451         if (cmd == SIOCGIFCONF) {
2452                 rtnl_shlock();
2453                 ret = dev_ifconf((char __user *) arg);
2454                 rtnl_shunlock();
2455                 return ret;
2456         }
2457         if (cmd == SIOCGIFNAME)
2458                 return dev_ifname((struct ifreq __user *)arg);
2459
2460         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2461                 return -EFAULT;
2462
2463         ifr.ifr_name[IFNAMSIZ-1] = 0;
2464
2465         colon = strchr(ifr.ifr_name, ':');
2466         if (colon)
2467                 *colon = 0;
2468
2469         /*
2470          *      See which interface the caller is talking about.
2471          */
2472
2473         switch (cmd) {
2474                 /*
2475                  *      These ioctl calls:
2476                  *      - can be done by all.
2477                  *      - atomic and do not require locking.
2478                  *      - return a value
2479                  */
2480                 case SIOCGIFFLAGS:
2481                 case SIOCGIFMETRIC:
2482                 case SIOCGIFMTU:
2483                 case SIOCGIFHWADDR:
2484                 case SIOCGIFSLAVE:
2485                 case SIOCGIFMAP:
2486                 case SIOCGIFINDEX:
2487                 case SIOCGIFTXQLEN:
2488                         dev_load(ifr.ifr_name);
2489                         read_lock(&dev_base_lock);
2490                         ret = dev_ifsioc(&ifr, cmd);
2491                         read_unlock(&dev_base_lock);
2492                         if (!ret) {
2493                                 if (colon)
2494                                         *colon = ':';
2495                                 if (copy_to_user(arg, &ifr,
2496                                                  sizeof(struct ifreq)))
2497                                         ret = -EFAULT;
2498                         }
2499                         return ret;
2500
2501                 case SIOCETHTOOL:
2502                         dev_load(ifr.ifr_name);
2503                         rtnl_lock();
2504                         ret = dev_ethtool(&ifr);
2505                         rtnl_unlock();
2506                         if (!ret) {
2507                                 if (colon)
2508                                         *colon = ':';
2509                                 if (copy_to_user(arg, &ifr,
2510                                                  sizeof(struct ifreq)))
2511                                         ret = -EFAULT;
2512                         }
2513                         return ret;
2514
2515                 /*
2516                  *      These ioctl calls:
2517                  *      - require superuser power.
2518                  *      - require strict serialization.
2519                  *      - return a value
2520                  */
2521                 case SIOCGMIIPHY:
2522                 case SIOCGMIIREG:
2523                 case SIOCSIFNAME:
2524                         if (!capable(CAP_NET_ADMIN))
2525                                 return -EPERM;
2526                         dev_load(ifr.ifr_name);
2527                         rtnl_lock();
2528                         ret = dev_ifsioc(&ifr, cmd);
2529                         rtnl_unlock();
2530                         if (!ret) {
2531                                 if (colon)
2532                                         *colon = ':';
2533                                 if (copy_to_user(arg, &ifr,
2534                                                  sizeof(struct ifreq)))
2535                                         ret = -EFAULT;
2536                         }
2537                         return ret;
2538
2539                 /*
2540                  *      These ioctl calls:
2541                  *      - require superuser power.
2542                  *      - require strict serialization.
2543                  *      - do not return a value
2544                  */
2545                 case SIOCSIFFLAGS:
2546                 case SIOCSIFMETRIC:
2547                 case SIOCSIFMTU:
2548                 case SIOCSIFMAP:
2549                 case SIOCSIFHWADDR:
2550                 case SIOCSIFSLAVE:
2551                 case SIOCADDMULTI:
2552                 case SIOCDELMULTI:
2553                 case SIOCSIFHWBROADCAST:
2554                 case SIOCSIFTXQLEN:
2555                 case SIOCSMIIREG:
2556                 case SIOCBONDENSLAVE:
2557                 case SIOCBONDRELEASE:
2558                 case SIOCBONDSETHWADDR:
2559                 case SIOCBONDSLAVEINFOQUERY:
2560                 case SIOCBONDINFOQUERY:
2561                 case SIOCBONDCHANGEACTIVE:
2562                 case SIOCBRADDIF:
2563                 case SIOCBRDELIF:
2564                         if (!capable(CAP_NET_ADMIN))
2565                                 return -EPERM;
2566                         dev_load(ifr.ifr_name);
2567                         rtnl_lock();
2568                         ret = dev_ifsioc(&ifr, cmd);
2569                         rtnl_unlock();
2570                         return ret;
2571
2572                 case SIOCGIFMEM:
2573                         /* Get the per device memory space. We can add this but
2574                          * currently do not support it */
2575                 case SIOCSIFMEM:
2576                         /* Set the per device memory buffer space.
2577                          * Not applicable in our case */
2578                 case SIOCSIFLINK:
2579                         return -EINVAL;
2580
2581                 /*
2582                  *      Unknown or private ioctl.
2583                  */
2584                 default:
2585                         if (cmd == SIOCWANDEV ||
2586                             (cmd >= SIOCDEVPRIVATE &&
2587                              cmd <= SIOCDEVPRIVATE + 15)) {
2588                                 dev_load(ifr.ifr_name);
2589                                 rtnl_lock();
2590                                 ret = dev_ifsioc(&ifr, cmd);
2591                                 rtnl_unlock();
2592                                 if (!ret && copy_to_user(arg, &ifr,
2593                                                          sizeof(struct ifreq)))
2594                                         ret = -EFAULT;
2595                                 return ret;
2596                         }
2597 #ifdef WIRELESS_EXT
2598                         /* Take care of Wireless Extensions */
2599                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2600                                 /* If command is `set a parameter', or
2601                                  * `get the encoding parameters', check if
2602                                  * the user has the right to do it */
2603                                 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
2604                                         if (!capable(CAP_NET_ADMIN))
2605                                                 return -EPERM;
2606                                 }
2607                                 dev_load(ifr.ifr_name);
2608                                 rtnl_lock();
2609                                 /* Follow me in net/core/wireless.c */
2610                                 ret = wireless_process_ioctl(&ifr, cmd);
2611                                 rtnl_unlock();
2612                                 if (!ret && IW_IS_GET(cmd) &&
2613                                     copy_to_user(arg, &ifr,
2614                                                  sizeof(struct ifreq)))
2615                                         ret = -EFAULT;
2616                                 return ret;
2617                         }
2618 #endif  /* WIRELESS_EXT */
2619                         return -EINVAL;
2620         }
2621 }
2622
2623
/**
 *      dev_new_index   -       allocate an ifindex
 *
 *      Advances a function-local counter until it reaches a value for
 *      which __dev_get_by_index() finds no device, and returns that value.
 *      The caller must hold the rtnl semaphore or the dev_base_lock to be
 *      sure the returned index remains unique.
 */
static int dev_new_index(void)
{
        static int ifindex;

        do {
                ifindex++;
                /* Indexes are strictly positive; restart at 1 otherwise. */
                if (ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(ifindex));

        return ifindex;
}
2641
2642 static int dev_boot_phase = 1;
2643
2644 /* Delayed registration/unregisteration */
2645 static spinlock_t net_todo_list_lock = SPIN_LOCK_UNLOCKED;
2646 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2647
2648 static inline void net_set_todo(struct net_device *dev)
2649 {
2650         spin_lock(&net_todo_list_lock);
2651         list_add_tail(&dev->todo_list, &net_todo_list);
2652         spin_unlock(&net_todo_list_lock);
2653 }
2654
2655 /**
2656  *      register_netdevice      - register a network device
2657  *      @dev: device to register
2658  *
2659  *      Take a completed network device structure and add it to the kernel
2660  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2661  *      chain. 0 is returned on success. A negative errno code is returned
2662  *      on a failure to set up the device, or if the name is a duplicate.
2663  *
2664  *      Callers must hold the rtnl semaphore.  See the comment at the
2665  *      end of Space.c for details about the locking.  You may want
2666  *      register_netdev() instead of this.
2667  *
2668  *      BUGS:
2669  *      The locking appears insufficient to guarantee two parallel registers
2670  *      will not get the same name.
2671  */
2672
2673 int register_netdevice(struct net_device *dev)
2674 {
2675         struct hlist_head *head;
2676         struct hlist_node *p;
2677         int ret;
2678
2679         BUG_ON(dev_boot_phase);
2680         ASSERT_RTNL();
2681
2682         /* When net_device's are persistent, this will be fatal. */
2683         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2684
2685         spin_lock_init(&dev->queue_lock);
2686         spin_lock_init(&dev->xmit_lock);
2687         dev->xmit_lock_owner = -1;
2688 #ifdef CONFIG_NET_CLS_ACT
2689         spin_lock_init(&dev->ingress_lock);
2690 #endif
2691
2692         ret = alloc_divert_blk(dev);
2693         if (ret)
2694                 goto out;
2695
2696         dev->iflink = -1;
2697
2698         /* Init, if this function is available */
2699         if (dev->init) {
2700                 ret = dev->init(dev);
2701                 if (ret) {
2702                         if (ret > 0)
2703                                 ret = -EIO;
2704                         goto out_err;
2705                 }
2706         }
2707
2708         if (!dev_valid_name(dev->name)) {
2709                 ret = -EINVAL;
2710                 goto out_err;
2711         }
2712
2713         dev->ifindex = dev_new_index();
2714         if (dev->iflink == -1)
2715                 dev->iflink = dev->ifindex;
2716
2717         /* Check for existence of name */
2718         head = dev_name_hash(dev->name);
2719         hlist_for_each(p, head) {
2720                 struct net_device *d
2721                         = hlist_entry(p, struct net_device, name_hlist);
2722                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
2723                         ret = -EEXIST;
2724                         goto out_err;
2725                 }
2726         }
2727
2728         /* Fix illegal SG+CSUM combinations. */
2729         if ((dev->features & NETIF_F_SG) &&
2730             !(dev->features & (NETIF_F_IP_CSUM |
2731                                NETIF_F_NO_CSUM |
2732                                NETIF_F_HW_CSUM))) {
2733                 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2734                        dev->name);
2735                 dev->features &= ~NETIF_F_SG;
2736         }
2737
2738         /*
2739          *      nil rebuild_header routine,
2740          *      that should be never called and used as just bug trap.
2741          */
2742
2743         if (!dev->rebuild_header)
2744                 dev->rebuild_header = default_rebuild_header;
2745
2746         /*
2747          *      Default initial state at registry is that the
2748          *      device is present.
2749          */
2750
2751         set_bit(__LINK_STATE_PRESENT, &dev->state);
2752
2753         dev->next = NULL;
2754         dev_init_scheduler(dev);
2755         write_lock_bh(&dev_base_lock);
2756         *dev_tail = dev;
2757         dev_tail = &dev->next;
2758         hlist_add_head(&dev->name_hlist, head);
2759         hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
2760         dev_hold(dev);
2761         dev->reg_state = NETREG_REGISTERING;
2762         write_unlock_bh(&dev_base_lock);
2763
2764         /* Notify protocols, that a new device appeared. */
2765         notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
2766
2767         /* Finish registration after unlock */
2768         net_set_todo(dev);
2769         ret = 0;
2770
2771 out:
2772         return ret;
2773 out_err:
2774         free_divert_blk(dev);
2775         goto out;
2776 }
2777
2778 /*
2779  * netdev_wait_allrefs - wait until all references are gone.
2780  *
2781  * This is called when unregistering network devices.
2782  *
2783  * Any protocol or device that holds a reference should register
2784  * for netdevice notification, and cleanup and put back the
2785  * reference if they receive an UNREGISTER event.
2786  * We can get stuck here if buggy protocols don't correctly
2787  * call dev_put.
2788  */
2789 static void netdev_wait_allrefs(struct net_device *dev)
2790 {
2791         unsigned long rebroadcast_time, warning_time;
2792
2793         rebroadcast_time = warning_time = jiffies;
2794         while (atomic_read(&dev->refcnt) != 0) {
2795                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
2796                         rtnl_shlock();
2797
2798                         /* Rebroadcast unregister notification */
2799                         notifier_call_chain(&netdev_chain,
2800                                             NETDEV_UNREGISTER, dev);
2801
2802                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
2803                                      &dev->state)) {
2804                                 /* We must not have linkwatch events
2805                                  * pending on unregister. If this
2806                                  * happens, we simply run the queue
2807                                  * unscheduled, resulting in a noop
2808                                  * for this device.
2809                                  */
2810                                 linkwatch_run_queue();
2811                         }
2812
2813                         rtnl_shunlock();
2814
2815                         rebroadcast_time = jiffies;
2816                 }
2817
2818                 current->state = TASK_INTERRUPTIBLE;
2819                 schedule_timeout(HZ / 4);
2820
2821                 if (time_after(jiffies, warning_time + 10 * HZ)) {
2822                         printk(KERN_EMERG "unregister_netdevice: "
2823                                "waiting for %s to become free. Usage "
2824                                "count = %d\n",
2825                                dev->name, atomic_read(&dev->refcnt));
2826                         warning_time = jiffies;
2827                 }
2828         }
2829 }
2830
2831 /* The sequence is:
2832  *
2833  *      rtnl_lock();
2834  *      ...
2835  *      register_netdevice(x1);
2836  *      register_netdevice(x2);
2837  *      ...
2838  *      unregister_netdevice(y1);
2839  *      unregister_netdevice(y2);
2840  *      ...
2841  *      rtnl_unlock();
2842  *      free_netdev(y1);
2843  *      free_netdev(y2);
2844  *
2845  * We are invoked by rtnl_unlock() after it drops the semaphore.
2846  * This allows us to deal with problems:
2847  * 1) We can create/delete sysfs objects which invoke hotplug
2848  *    without deadlocking with linkwatch via keventd.
2849  * 2) Since we run with the RTNL semaphore not held, we can sleep
2850  *    safely in order to wait for the netdev refcnt to drop to zero.
2851  */
2852 static DECLARE_MUTEX(net_todo_run_mutex);
2853 void netdev_run_todo(void)
2854 {
2855         struct list_head list = LIST_HEAD_INIT(list);
2856         int err;
2857
2858
2859         /* Need to guard against multiple cpu's getting out of order. */
2860         down(&net_todo_run_mutex);
2861
2862         /* Not safe to do outside the semaphore.  We must not return
2863          * until all unregister events invoked by the local processor
2864          * have been completed (either by this todo run, or one on
2865          * another cpu).
2866          */
2867         if (list_empty(&net_todo_list))
2868                 goto out;
2869
2870         /* Snapshot list, allow later requests */
2871         spin_lock(&net_todo_list_lock);
2872         list_splice_init(&net_todo_list, &list);
2873         spin_unlock(&net_todo_list_lock);
2874
2875         while (!list_empty(&list)) {
2876                 struct net_device *dev
2877                         = list_entry(list.next, struct net_device, todo_list);
2878                 list_del(&dev->todo_list);
2879
2880                 switch(dev->reg_state) {
2881                 case NETREG_REGISTERING:
2882                         err = netdev_register_sysfs(dev);
2883                         if (err)
2884                                 printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
2885                                        dev->name, err);
2886                         dev->reg_state = NETREG_REGISTERED;
2887                         break;
2888
2889                 case NETREG_UNREGISTERING:
2890                         netdev_unregister_sysfs(dev);
2891                         dev->reg_state = NETREG_UNREGISTERED;
2892
2893                         netdev_wait_allrefs(dev);
2894
2895                         /* paranoia */
2896                         BUG_ON(atomic_read(&dev->refcnt));
2897                         BUG_TRAP(!dev->ip_ptr);
2898                         BUG_TRAP(!dev->ip6_ptr);
2899                         BUG_TRAP(!dev->dn_ptr);
2900
2901
2902                         /* It must be the very last action,
2903                          * after this 'dev' may point to freed up memory.
2904                          */
2905                         if (dev->destructor)
2906                                 dev->destructor(dev);
2907                         break;
2908
2909                 default:
2910                         printk(KERN_ERR "network todo '%s' but state %d\n",
2911                                dev->name, dev->reg_state);
2912                         break;
2913                 }
2914         }
2915
2916 out:
2917         up(&net_todo_run_mutex);
2918 }
2919
2920 /**
2921  *      free_netdev - free network device
2922  *      @dev: device
2923  *
2924  *      This function does the last stage of destroying an allocated device
2925  *      interface. The reference to the device object is released.
2926  *      If this is the last reference then it will be freed.
2927  */
2928 void free_netdev(struct net_device *dev)
2929 {
2930 #ifdef CONFIG_SYSFS
2931         /*  Compatiablity with error handling in drivers */
2932         if (dev->reg_state == NETREG_UNINITIALIZED) {
2933                 kfree((char *)dev - dev->padded);
2934                 return;
2935         }
2936
2937         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
2938         dev->reg_state = NETREG_RELEASED;
2939
2940         /* will free via class release */
2941         class_device_put(&dev->class_dev);
2942 #else
2943         kfree((char *)dev - dev->padded);
2944 #endif
2945 }
2946
/**
 *      synchronize_net - synchronize with packet receive processing
 *
 *      Calls synchronize_kernel() and returns when it does.  The
 *      might_sleep() annotation marks this as a function that must only
 *      be called from a context that is allowed to sleep.
 */
void synchronize_net(void)
{
        might_sleep();
        synchronize_kernel();
}
2953
2954 /**
2955  *      unregister_netdevice - remove device from the kernel
2956  *      @dev: device
2957  *
2958  *      This function shuts down a device interface and removes it
2959  *      from the kernel tables. On success 0 is returned, on a failure
2960  *      a negative errno code is returned.
2961  *
2962  *      Callers must hold the rtnl semaphore.  See the comment at the
2963  *      end of Space.c for details about the locking.  You may want
2964  *      unregister_netdev() instead of this.
2965  */
2966
2967 int unregister_netdevice(struct net_device *dev)
2968 {
2969         struct net_device *d, **dp;
2970
2971         BUG_ON(dev_boot_phase);
2972         ASSERT_RTNL();
2973
2974         /* Some devices call without registering for initialization unwind. */
2975         if (dev->reg_state == NETREG_UNINITIALIZED) {
2976                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
2977                                   "was registered\n", dev->name, dev);
2978                 return -ENODEV;
2979         }
2980
2981         BUG_ON(dev->reg_state != NETREG_REGISTERED);
2982
2983         /* If device is running, close it first. */
2984         if (dev->flags & IFF_UP)
2985                 dev_close(dev);
2986
2987         /* And unlink it from device chain. */
2988         for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
2989                 if (d == dev) {
2990                         write_lock_bh(&dev_base_lock);
2991                         hlist_del(&dev->name_hlist);
2992                         hlist_del(&dev->index_hlist);
2993                         if (dev_tail == &dev->next)
2994                                 dev_tail = dp;
2995                         *dp = d->next;
2996                         write_unlock_bh(&dev_base_lock);
2997                         break;
2998                 }
2999         }
3000         if (!d) {
3001                 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3002                        dev->name);
3003                 return -ENODEV;
3004         }
3005
3006         dev->reg_state = NETREG_UNREGISTERING;
3007
3008         synchronize_net();
3009
3010         /* Shutdown queueing discipline. */
3011         dev_shutdown(dev);
3012
3013
3014         /* Notify protocols, that we are about to destroy
3015            this device. They should clean all the things.
3016         */
3017         notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3018
3019         /*
3020          *      Flush the multicast chain
3021          */
3022         dev_mc_discard(dev);
3023
3024         if (dev->uninit)
3025                 dev->uninit(dev);
3026
3027         /* Notifier chain MUST detach us from master device. */
3028         BUG_TRAP(!dev->master);
3029
3030         free_divert_blk(dev);
3031
3032         /* Finish processing unregister after unlock */
3033         net_set_todo(dev);
3034
3035         synchronize_net();
3036
3037         dev_put(dev);
3038         return 0;
3039 }
3040
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU notifier callback.  On CPU_DEAD the softnet queues of the cpu
 * passed in @ocpu are handed over to the cpu running this callback:
 * its completion and output queues are appended to ours, and its
 * pending input packets are fed back through netif_rx().  Every other
 * action is ignored.  Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        unsigned int oldcpu = (unsigned long)ocpu;
        unsigned int cpu;
        struct softnet_data *cur_sd, *dead_sd;
        struct sk_buff **skb_link;
        struct net_device **out_link;
        struct sk_buff *skb;

        if (action != CPU_DEAD)
                return NOTIFY_OK;

        local_irq_disable();
        cpu = smp_processor_id();
        cur_sd = &per_cpu(softnet_data, cpu);
        dead_sd = &per_cpu(softnet_data, oldcpu);

        /* Walk to the end of our completion_queue ... */
        for (skb_link = &cur_sd->completion_queue; *skb_link;
             skb_link = &(*skb_link)->next)
                ;
        /* ... and hang the offline CPU's completion queue off it. */
        *skb_link = dead_sd->completion_queue;
        dead_sd->completion_queue = NULL;

        /* Same treatment for the output_queue. */
        for (out_link = &cur_sd->output_queue; *out_link;
             out_link = &(*out_link)->next_sched)
                ;
        *out_link = dead_sd->output_queue;
        dead_sd->output_queue = NULL;

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Process offline CPU's input_pkt_queue */
        for (;;) {
                skb = __skb_dequeue(&dead_sd->input_pkt_queue);
                if (!skb)
                        break;
                netif_rx(skb);
        }

        return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
3086
3087
3088 /*
3089  *      Initialize the DEV module. At boot time this walks the device list and
3090  *      unhooks any devices that fail to initialise (normally hardware not
3091  *      present) and leaves us with a valid list of present and active devices.
3092  *
3093  */
3094
3095 /*
3096  *       This is called single threaded during boot, so no need
3097  *       to take the rtnl semaphore.
3098  */
3099 static int __init net_dev_init(void)
3100 {
3101         int i, rc = -ENOMEM;
3102
3103         BUG_ON(!dev_boot_phase);
3104
3105         net_random_init();
3106
3107         if (dev_proc_init())
3108                 goto out;
3109
3110         if (netdev_sysfs_init())
3111                 goto out;
3112
3113         INIT_LIST_HEAD(&ptype_all);
3114         for (i = 0; i < 16; i++)
3115                 INIT_LIST_HEAD(&ptype_base[i]);
3116
3117         for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3118                 INIT_HLIST_HEAD(&dev_name_head[i]);
3119
3120         for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3121                 INIT_HLIST_HEAD(&dev_index_head[i]);
3122
3123         /*
3124          *      Initialise the packet receive queues.
3125          */
3126
3127         for (i = 0; i < NR_CPUS; i++) {
3128                 struct softnet_data *queue;
3129
3130                 queue = &per_cpu(softnet_data, i);
3131                 skb_queue_head_init(&queue->input_pkt_queue);
3132                 queue->throttle = 0;
3133                 queue->cng_level = 0;
3134                 queue->avg_blog = 10; /* arbitrary non-zero */
3135                 queue->completion_queue = NULL;
3136                 INIT_LIST_HEAD(&queue->poll_list);
3137                 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3138                 queue->backlog_dev.weight = weight_p;
3139                 queue->backlog_dev.poll = process_backlog;
3140                 atomic_set(&queue->backlog_dev.refcnt, 1);
3141         }
3142
3143 #ifdef OFFLINE_SAMPLE
3144         samp_timer.expires = jiffies + (10 * HZ);
3145         add_timer(&samp_timer);
3146 #endif
3147
3148         dev_boot_phase = 0;
3149
3150         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3151         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3152
3153         hotcpu_notifier(dev_cpu_callback, 0);
3154         dst_init();
3155         dev_mcast_init();
3156         rc = 0;
3157 out:
3158         return rc;
3159 }
3160
3161 subsys_initcall(net_dev_init);
3162
3163 EXPORT_SYMBOL(__dev_get_by_index);
3164 EXPORT_SYMBOL(__dev_get_by_name);
3165 EXPORT_SYMBOL(__dev_remove_pack);
3166 EXPORT_SYMBOL(__skb_linearize);
3167 EXPORT_SYMBOL(call_netdevice_notifiers);
3168 EXPORT_SYMBOL(dev_add_pack);
3169 EXPORT_SYMBOL(dev_alloc_name);
3170 EXPORT_SYMBOL(dev_close);
3171 EXPORT_SYMBOL(dev_get_by_flags);
3172 EXPORT_SYMBOL(dev_get_by_index);
3173 EXPORT_SYMBOL(dev_get_by_name);
3174 EXPORT_SYMBOL(dev_ioctl);
3175 EXPORT_SYMBOL(dev_open);
3176 EXPORT_SYMBOL(dev_queue_xmit);
3177 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
3178 EXPORT_SYMBOL(dev_queue_xmit_nit);
3179 #endif
3180 EXPORT_SYMBOL(dev_remove_pack);
3181 EXPORT_SYMBOL(dev_set_allmulti);
3182 EXPORT_SYMBOL(dev_set_promiscuity);
3183 EXPORT_SYMBOL(dev_change_flags);
3184 EXPORT_SYMBOL(dev_change_name);
3185 EXPORT_SYMBOL(dev_set_mtu);
3186 EXPORT_SYMBOL(free_netdev);
3187 EXPORT_SYMBOL(netdev_boot_setup_check);
3188 EXPORT_SYMBOL(netdev_set_master);
3189 EXPORT_SYMBOL(netdev_state_change);
3190 EXPORT_SYMBOL(netif_receive_skb);
3191 EXPORT_SYMBOL(netif_rx);
3192 EXPORT_SYMBOL(register_gifconf);
3193 EXPORT_SYMBOL(register_netdevice);
3194 EXPORT_SYMBOL(register_netdevice_notifier);
3195 EXPORT_SYMBOL(skb_checksum_help);
3196 EXPORT_SYMBOL(synchronize_net);
3197 EXPORT_SYMBOL(unregister_netdevice);
3198 EXPORT_SYMBOL(unregister_netdevice_notifier);
3199
3200 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3201 EXPORT_SYMBOL(br_handle_frame_hook);
3202 #endif
3203
3204 #ifdef CONFIG_KMOD
3205 EXPORT_SYMBOL(dev_load);
3206 #endif
3207
3208 #ifdef CONFIG_NET_CLS_ACT
3209 EXPORT_SYMBOL(ing_filter);
3210 #endif
3211
3212
3213 EXPORT_PER_CPU_SYMBOL(softnet_data);