Merge to Fedora kernel-2.6.17-1.2187_FC5 patched with stable patch-2.6.17.13-vs2...
[linux-2.6.git] / net / core / dev.c
index eb965a7..d8bab16 100644 (file)
 #include <net/iw_handler.h>
 #include <asm/current.h>
 #include <linux/audit.h>
+#include <linux/err.h>
 #include <linux/vs_network.h>
 
+#ifdef CONFIG_XEN
+#include <net/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#endif
+
 /*
  *     The list of packet types we will receive (as opposed to discard)
  *     and the routines to invoke.
@@ -1042,7 +1049,7 @@ static inline void net_timestamp(struct sk_buff *skb)
  *     taps currently in use.
  */
 
-void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
        struct packet_type *ptype;
 
@@ -1180,6 +1187,45 @@ out:
        return ret;
 }
 
+/**
+ *     skb_gso_segment - Perform segmentation on skb.
+ *     @skb: buffer to segment
+ *     @features: features for the output path (see dev->features)
+ *
+ *     This function segments the given skb and returns a list of segments.
+ *
+ *     It may return NULL if the skb requires no segmentation.  This is
+ *     only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+{
+       struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+       struct packet_type *ptype;
+       int type = skb->protocol;
+
+       BUG_ON(skb_shinfo(skb)->frag_list);
+       BUG_ON(skb->ip_summed != CHECKSUM_HW);
+
+       skb->mac.raw = skb->data;
+       skb->mac_len = skb->nh.raw - skb->data;
+       __skb_pull(skb, skb->mac_len);
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
+               if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+                       segs = ptype->gso_segment(skb, features);
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       __skb_push(skb, skb->data - skb->mac.raw);
+
+       return segs;
+}
+
+EXPORT_SYMBOL(skb_gso_segment);
+
 /* Take action when hardware reception checksum errors are detected. */
 #ifdef CONFIG_BUG
 void netdev_rx_csum_fault(struct net_device *dev)
@@ -1216,78 +1262,148 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #define illegal_highdma(dev, skb)      (0)
 #endif
 
-/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
-{
-       unsigned int size;
-       u8 *data;
-       long offset;
-       struct skb_shared_info *ninfo;
-       int headerlen = skb->data - skb->head;
-       int expand = (skb->tail + skb->data_len) - skb->end;
-
-       if (skb_shared(skb))
-               BUG();
-
-       if (expand <= 0)
-               expand = 0;
-
-       size = skb->end - skb->head + expand;
-       size = SKB_DATA_ALIGN(size);
-       data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
-       if (!data)
-               return -ENOMEM;
-
-       /* Copy entire thing */
-       if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
-               BUG();
-
-       /* Set up shinfo */
-       ninfo = (struct skb_shared_info*)(data + size);
-       atomic_set(&ninfo->dataref, 1);
-       ninfo->tso_size = skb_shinfo(skb)->tso_size;
-       ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
-       ninfo->nr_frags = 0;
-       ninfo->frag_list = NULL;
-
-       /* Offset between the two in bytes */
-       offset = data - skb->head;
-
-       /* Free old data. */
-       skb_release_data(skb);
-
-       skb->head = data;
-       skb->end  = data + size;
-
-       /* Set up new pointers */
-       skb->h.raw   += offset;
-       skb->nh.raw  += offset;
-       skb->mac.raw += offset;
-       skb->tail    += offset;
-       skb->data    += offset;
-
-       /* We are no longer a clone, even if we were. */
-       skb->cloned    = 0;
-
-       skb->tail     += skb->data_len;
-       skb->data_len  = 0;
+struct dev_gso_cb {
+       void (*destructor)(struct sk_buff *skb);
+};
+
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+       struct dev_gso_cb *cb;
+
+       do {
+               struct sk_buff *nskb = skb->next;
+
+               skb->next = nskb->next;
+               nskb->next = NULL;
+               kfree_skb(nskb);
+       } while (skb->next);
+
+       cb = DEV_GSO_CB(skb);
+       if (cb->destructor)
+               cb->destructor(skb);
+}
+
+/**
+ *     dev_gso_segment - Perform emulated hardware segmentation on skb.
+ *     @skb: buffer to segment
+ *
+ *     This function segments the given skb and stores the list of segments
+ *     in skb->next.
+ */
+static int dev_gso_segment(struct sk_buff *skb)
+{
+       struct net_device *dev = skb->dev;
+       struct sk_buff *segs;
+       int features = dev->features & ~(illegal_highdma(dev, skb) ?
+                                        NETIF_F_SG : 0);
+
+       segs = skb_gso_segment(skb, features);
+
+       /* Verifying header integrity only. */
+       if (!segs)
+               return 0;
+
+       if (unlikely(IS_ERR(segs)))
+               return PTR_ERR(segs);
+
+       skb->next = segs;
+       DEV_GSO_CB(skb)->destructor = skb->destructor;
+       skb->destructor = dev_gso_skb_destructor;
+
+       return 0;
+}
+
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+       if (likely(!skb->next)) {
+               if (netdev_nit)
+                       dev_queue_xmit_nit(skb, dev);
+
+               if (netif_needs_gso(dev, skb)) {
+                       if (unlikely(dev_gso_segment(skb)))
+                               goto out_kfree_skb;
+                       if (skb->next)
+                               goto gso;
+               }
+
+               return dev->hard_start_xmit(skb, dev);
+       }
+
+gso:
+       do {
+               struct sk_buff *nskb = skb->next;
+               int rc;
+
+               skb->next = nskb->next;
+               nskb->next = NULL;
+               rc = dev->hard_start_xmit(nskb, dev);
+               if (unlikely(rc)) {
+                       nskb->next = skb->next;
+                       skb->next = nskb;
+                       return rc;
+               }
+               if (unlikely(netif_queue_stopped(dev) && skb->next))
+                       return NETDEV_TX_BUSY;
+       } while (skb->next);
+       
+       skb->destructor = DEV_GSO_CB(skb)->destructor;
+
+out_kfree_skb:
+       kfree_skb(skb);
        return 0;
 }
 
 #define HARD_TX_LOCK(dev, cpu) {                       \
        if ((dev->features & NETIF_F_LLTX) == 0) {      \
-               spin_lock(&dev->xmit_lock);             \
-               dev->xmit_lock_owner = cpu;             \
+               netif_tx_lock(dev);                     \
        }                                               \
 }
 
 #define HARD_TX_UNLOCK(dev) {                          \
        if ((dev->features & NETIF_F_LLTX) == 0) {      \
-               dev->xmit_lock_owner = -1;              \
-               spin_unlock(&dev->xmit_lock);           \
+               netif_tx_unlock(dev);                   \
        }                                               \
 }
 
+#ifdef CONFIG_XEN
+inline int skb_checksum_setup(struct sk_buff *skb)
+{
+       if (skb->proto_csum_blank) {
+               if (skb->protocol != htons(ETH_P_IP))
+                       goto out;
+               skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
+               if (skb->h.raw >= skb->tail)
+                       goto out;
+               switch (skb->nh.iph->protocol) {
+               case IPPROTO_TCP:
+                       skb->csum = offsetof(struct tcphdr, check);
+                       break;
+               case IPPROTO_UDP:
+                       skb->csum = offsetof(struct udphdr, check);
+                       break;
+               default:
+                       if (net_ratelimit())
+                               printk(KERN_ERR "Attempting to checksum a non-"
+                                      "TCP/UDP packet, dropping a protocol"
+                                      " %d packet", skb->nh.iph->protocol);
+                       goto out;
+               }
+               if ((skb->h.raw + skb->csum + 2) > skb->tail)
+                       goto out;
+               skb->ip_summed = CHECKSUM_HW;
+               skb->proto_csum_blank = 0;
+       }
+       return 0;
+out:
+       return -EPROTO;
+}
+#else
+inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
+#endif
+
+
 /**
  *     dev_queue_xmit - transmit a buffer
  *     @skb: buffer to transmit
@@ -1320,9 +1436,19 @@ int dev_queue_xmit(struct sk_buff *skb)
        struct Qdisc *q;
        int rc = -ENOMEM;
 
+       /* If a checksum-deferred packet is forwarded to a device that needs a
+        * checksum, correct the pointers and force checksumming.
+        */
+       if (skb_checksum_setup(skb))
+               goto out_kfree_skb;
+
+       /* GSO will handle the following emulations directly. */
+       if (netif_needs_gso(dev, skb))
+               goto gso;
+
        if (skb_shinfo(skb)->frag_list &&
            !(dev->features & NETIF_F_FRAGLIST) &&
-           __skb_linearize(skb, GFP_ATOMIC))
+           __skb_linearize(skb))
                goto out_kfree_skb;
 
        /* Fragmented skb is linearized if device does not support SG,
@@ -1331,25 +1457,26 @@ int dev_queue_xmit(struct sk_buff *skb)
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
-           __skb_linearize(skb, GFP_ATOMIC))
+           __skb_linearize(skb))
                goto out_kfree_skb;
 
        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_HW &&
-           (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
+           (!(dev->features & NETIF_F_GEN_CSUM) &&
             (!(dev->features & NETIF_F_IP_CSUM) ||
              skb->protocol != htons(ETH_P_IP))))
                if (skb_checksum_help(skb, 0))
                        goto out_kfree_skb;
 
+gso:
        spin_lock_prefetch(&dev->queue_lock);
 
        /* Disable soft irqs for various locks below. Also 
         * stops preemption for RCU. 
         */
-       local_bh_disable(); 
+       rcu_read_lock_bh(); 
 
        /* Updates of qdisc are serialized by queue_lock. 
         * The struct Qdisc which is pointed to by qdisc is now a 
@@ -1383,8 +1510,8 @@ int dev_queue_xmit(struct sk_buff *skb)
        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...
 
-          Really, it is unlikely that xmit_lock protection is necessary here.
-          (f.e. loopback and IP tunnels are clean ignoring statistics
+          Really, it is unlikely that netif_tx_lock protection is necessary
+          here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.
@@ -1400,11 +1527,8 @@ int dev_queue_xmit(struct sk_buff *skb)
                        HARD_TX_LOCK(dev, cpu);
 
                        if (!netif_queue_stopped(dev)) {
-                               if (netdev_nit)
-                                       dev_queue_xmit_nit(skb, dev);
-
                                rc = 0;
-                               if (!dev->hard_start_xmit(skb, dev)) {
+                               if (!dev_hard_start_xmit(skb, dev)) {
                                        HARD_TX_UNLOCK(dev);
                                        goto out;
                                }
@@ -1423,13 +1547,13 @@ int dev_queue_xmit(struct sk_buff *skb)
        }
 
        rc = -ENETDOWN;
-       local_bh_enable();
+       rcu_read_unlock_bh();
 
 out_kfree_skb:
        kfree_skb(skb);
        return rc;
 out:
-       local_bh_enable();
+       rcu_read_unlock_bh();
        return rc;
 }
 
@@ -1707,6 +1831,19 @@ int netif_receive_skb(struct sk_buff *skb)
        }
 #endif
 
+#ifdef CONFIG_XEN
+       switch (skb->ip_summed) {
+       case CHECKSUM_UNNECESSARY:
+               skb->proto_data_valid = 1;
+               break;
+       case CHECKSUM_HW:
+               /* XXX Implement me. */
+       default:
+               skb->proto_data_valid = 0;
+               break;
+       }
+#endif
+
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev) 
@@ -2793,7 +2930,7 @@ int register_netdevice(struct net_device *dev)
        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 
        spin_lock_init(&dev->queue_lock);
-       spin_lock_init(&dev->xmit_lock);
+       spin_lock_init(&dev->_xmit_lock);
        dev->xmit_lock_owner = -1;
 #ifdef CONFIG_NET_CLS_ACT
        spin_lock_init(&dev->ingress_lock);
@@ -2837,9 +2974,7 @@ int register_netdevice(struct net_device *dev)
 
        /* Fix illegal SG+CSUM combinations. */
        if ((dev->features & NETIF_F_SG) &&
-           !(dev->features & (NETIF_F_IP_CSUM |
-                              NETIF_F_NO_CSUM |
-                              NETIF_F_HW_CSUM))) {
+           !(dev->features & NETIF_F_ALL_CSUM)) {
                printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
                       dev->name);
                dev->features &= ~NETIF_F_SG;
@@ -3379,7 +3514,6 @@ subsys_initcall(net_dev_init);
 EXPORT_SYMBOL(__dev_get_by_index);
 EXPORT_SYMBOL(__dev_get_by_name);
 EXPORT_SYMBOL(__dev_remove_pack);
-EXPORT_SYMBOL(__skb_linearize);
 EXPORT_SYMBOL(dev_valid_name);
 EXPORT_SYMBOL(dev_add_pack);
 EXPORT_SYMBOL(dev_alloc_name);
@@ -3411,6 +3545,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
 EXPORT_SYMBOL(net_enable_timestamp);
 EXPORT_SYMBOL(net_disable_timestamp);
 EXPORT_SYMBOL(dev_get_flags);
+EXPORT_SYMBOL(skb_checksum_setup);
 
 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
 EXPORT_SYMBOL(br_handle_frame_hook);