fedora core 6 1.2949 + vserver 2.2.0
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a26b234..830b607 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -38,7 +38,6 @@
  *     The functions in this file will not compile correctly with gcc 2.4.x
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -57,7 +56,6 @@
 #include <linux/cache.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
-#include <linux/highmem.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
-static kmem_cache_t *skbuff_head_cache __read_mostly;
-static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+#include "kmap_skb.h"
+
+static struct kmem_cache *skbuff_head_cache __read_mostly;
+static struct kmem_cache *skbuff_fclone_cache __read_mostly;
 
 /*
  *     Keep out-of-line to prevent kernel bloat.
@@ -112,6 +112,14 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
        BUG();
 }
 
+void skb_truesize_bug(struct sk_buff *skb)
+{
+       printk(KERN_ERR "SKB BUG: Invalid truesize (%u) "
+              "len=%u, sizeof(sk_buff)=%Zd\n",
+              skb->truesize, skb->len, sizeof(struct sk_buff));
+}
+EXPORT_SYMBOL(skb_truesize_bug);
+
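
For context, skb_truesize_bug() is not called from this file; the same
upstream change added a companion inline to the skbuff headers that does
the actual sanity check. From memory, it looks roughly like:

        static inline void skb_truesize_check(struct sk_buff *skb)
        {
                if (unlikely((int)skb->truesize < sizeof(struct sk_buff) + skb->len))
                        skb_truesize_bug(skb);
        }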
 /*     Allocate a new skbuff. We do this ourselves so we can fill in a few
  *     'private' fields and also do memory statistics to find all the
  *     [BEEP] leaks.
@@ -124,6 +132,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *     @gfp_mask: allocation mask
  *     @fclone: allocate from fclone cache instead of head cache
  *             and allocate a cloned (child) skb
+ *     @node: numa node to allocate memory on
  *
  *     Allocate a new &sk_buff. The returned buffer has no headroom and a
  *     tail room of size bytes. The object has a reference count of one.
@@ -132,10 +141,11 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *     Buffers may only be allocated from interrupts using a @gfp_mask of
  *     %GFP_ATOMIC.
  */
+#ifndef CONFIG_HAVE_ARCH_ALLOC_SKB
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-                           int fclone)
+                           int fclone, int node)
 {
-       kmem_cache_t *cache;
+       struct kmem_cache *cache;
        struct skb_shared_info *shinfo;
        struct sk_buff *skb;
        u8 *data;
@@ -143,13 +153,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
        cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
        /* Get the HEAD */
-       skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
+       skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
        if (!skb)
                goto out;
 
        /* Get the DATA. Size must match skb_add_mtu(). */
        size = SKB_DATA_ALIGN(size);
-       data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+       data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
+                       gfp_mask, node);
        if (!data)
                goto nodata;
 
@@ -164,9 +175,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
        shinfo = skb_shinfo(skb);
        atomic_set(&shinfo->dataref, 1);
        shinfo->nr_frags  = 0;
-       shinfo->tso_size = 0;
-       shinfo->tso_segs = 0;
-       shinfo->ufo_size = 0;
+       shinfo->gso_size = 0;
+       shinfo->gso_segs = 0;
+       shinfo->gso_type = 0;
        shinfo->ip6_frag_id = 0;
        shinfo->frag_list = NULL;
 
@@ -186,6 +197,7 @@ nodata:
        skb = NULL;
        goto out;
 }
+#endif /* !CONFIG_HAVE_ARCH_ALLOC_SKB */
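
The new @node argument lets NUMA-aware callers place both the sk_buff
head and its data on a chosen node; the plain alloc_skb() wrapper simply
passes -1 ("any node"). A minimal caller sketch, assuming the helper name
is hypothetical and that dev_to_node() is available in this tree:

        /* Hypothetical: allocate an rx skb near a device's memory node. */
        static struct sk_buff *rx_alloc_near(struct device *d, unsigned int size)
        {
                int node = d ? dev_to_node(d) : -1;     /* -1: no preference */

                /* third argument 0: head cache, not the fclone cache */
                return __alloc_skb(size, GFP_ATOMIC, 0, node);
        }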
 
 /**
  *     alloc_skb_from_cache    -       allocate a network buffer
@@ -201,16 +213,19 @@ nodata:
  *     Buffers may only be allocated from interrupts using a @gfp_mask of
  *     %GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
+struct sk_buff *alloc_skb_from_cache(struct kmem_cache *cp,
                                     unsigned int size,
-                                    gfp_t gfp_mask)
+                                    gfp_t gfp_mask,
+                                    int fclone)
 {
+       struct kmem_cache *cache;
        struct sk_buff *skb;
        u8 *data;
 
+       cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+
        /* Get the HEAD */
-       skb = kmem_cache_alloc(skbuff_head_cache,
-                              gfp_mask & ~__GFP_DMA);
+       skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
        if (!skb)
                goto out;
 
@@ -230,18 +245,54 @@ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
 
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags  = 0;
-       skb_shinfo(skb)->tso_size = 0;
-       skb_shinfo(skb)->tso_segs = 0;
-       skb_shinfo(skb)->ufo_size = 0;
+       skb_shinfo(skb)->gso_size = 0;
+       skb_shinfo(skb)->gso_segs = 0;
+       skb_shinfo(skb)->gso_type = 0;
        skb_shinfo(skb)->frag_list = NULL;
+
+       if (fclone) {
+               struct sk_buff *child = skb + 1;
+               atomic_t *fclone_ref = (atomic_t *) (child + 1);
+
+               skb->fclone = SKB_FCLONE_ORIG;
+               atomic_set(fclone_ref, 1);
+
+               child->fclone = SKB_FCLONE_UNAVAILABLE;
+       }
 out:
        return skb;
 nodata:
-       kmem_cache_free(skbuff_head_cache, skb);
+       kmem_cache_free(cache, skb);
        skb = NULL;
        goto out;
 }
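
The fclone case relies on the layout of the skbuff_fclone_cache objects:
a parent sk_buff, its child clone, and a shared atomic_t refcount sit
back to back (hence the 2*sizeof(struct sk_buff) + sizeof(atomic_t) cache
size in skb_init() below). The consumer is the fast path in skb_clone(),
which, roughly from memory of the upstream code, picks up the child like
this:

        n = skb + 1;
        if (skb->fclone == SKB_FCLONE_ORIG &&
            n->fclone == SKB_FCLONE_UNAVAILABLE) {
                atomic_t *fclone_ref = (atomic_t *) (n + 1);
                n->fclone = SKB_FCLONE_CLONE;
                atomic_inc(fclone_ref);
        }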
 
+/**
+ *     __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *     @dev: network device to receive on
+ *     @length: length to allocate
+ *     @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ *     Allocate a new &sk_buff and assign it a usage count of one. The
+ *     buffer has unspecified headroom built in. Users should allocate
+ *     the headroom they think they need without accounting for the
+ *     built in space. The built in space is used for optimisations.
+ *
+ *     %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+               unsigned int length, gfp_t gfp_mask)
+{
+       int node = dev->class_dev.dev ? dev_to_node(dev->class_dev.dev) : -1;
+       struct sk_buff *skb;
+
+       skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+       if (likely(skb)) {
+               skb_reserve(skb, NET_SKB_PAD);
+               skb->dev = dev;
+       }
+       return skb;
+}
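
Because __netdev_alloc_skb() already reserves NET_SKB_PAD bytes of
headroom, drivers only add whatever extra alignment they need. A typical
receive-refill sketch (driver names hypothetical; netdev_alloc_skb() is
the GFP_ATOMIC wrapper added alongside this function):

        static struct sk_buff *my_rx_refill(struct net_device *dev,
                                            unsigned int len)
        {
                struct sk_buff *skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);

                if (skb)
                        skb_reserve(skb, NET_IP_ALIGN); /* align the IP header */
                return skb;
        }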
 
 static void skb_drop_list(struct sk_buff **listp)
 {
@@ -269,7 +320,7 @@ static void skb_clone_fraglist(struct sk_buff *skb)
                skb_get(list);
 }
 
-void skb_release_data(struct sk_buff *skb)
+static void skb_release_data(struct sk_buff *skb)
 {
        if (!skb->cloned ||
            !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
@@ -361,6 +412,24 @@ void __kfree_skb(struct sk_buff *skb)
        kfree_skbmem(skb);
 }
 
+/**
+ *     kfree_skb - free an sk_buff
+ *     @skb: buffer to free
+ *
+ *     Drop a reference to the buffer and free it if the usage count has
+ *     hit zero.
+ */
+void kfree_skb(struct sk_buff *skb)
+{
+       if (unlikely(!skb))
+               return;
+       if (likely(atomic_read(&skb->users) == 1))
+               smp_rmb();
+       else if (likely(!atomic_dec_and_test(&skb->users)))
+               return;
+       __kfree_skb(skb);
+}
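
The branch structure above is a fast-path optimisation: when users == 1
(the common, unshared case) the atomic decrement is skipped entirely, and
smp_rmb() keeps later reads of skb fields from being hoisted above that
check on weakly ordered CPUs. It is semantically equivalent to the plain
form, which always pays for an atomic operation:

        if (!atomic_dec_and_test(&skb->users))
                return;
        __kfree_skb(skb);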
+
 /**
  *     skb_clone       -       duplicate an sk_buff
  *     @skb: buffer to clone
@@ -410,10 +479,15 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
        memcpy(n->cb, skb->cb, sizeof(skb->cb));
        C(len);
        C(data_len);
+       C(mac_len);
        C(csum);
        C(local_df);
        n->cloned = 1;
        n->nohdr = 0;
+#ifdef CONFIG_XEN
+       C(proto_data_valid);
+       C(proto_csum_blank);
+#endif
        C(pkt_type);
        C(ip_summed);
        C(priority);
@@ -422,8 +496,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 #endif
        C(protocol);
        n->destructor = NULL;
+       C(mark);
 #ifdef CONFIG_NETFILTER
-       C(nfmark);
        C(nfct);
        nf_conntrack_get(skb->nfct);
        C(nfctinfo);
@@ -442,9 +516,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
        n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
        n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
        n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
-       C(input_dev);
+       C(iif);
 #endif
-
+       skb_copy_secmark(n, skb);
 #endif
        C(truesize);
        atomic_set(&n->users, 1);
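
For readers following the C(x) calls in this hunk: earlier in skb_clone()
(outside the diff context) the helper is defined as a one-field copy from
the original skb to the clone, i.e.:

        #define C(x) n->x = skb->x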
@@ -483,8 +557,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
        new->pkt_type   = old->pkt_type;
        new->tstamp     = old->tstamp;
        new->destructor = NULL;
+       new->mark       = old->mark;
 #ifdef CONFIG_NETFILTER
-       new->nfmark     = old->nfmark;
        new->nfct       = old->nfct;
        nf_conntrack_get(old->nfct);
        new->nfctinfo   = old->nfctinfo;
@@ -506,10 +580,11 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
        new->tc_index   = old->tc_index;
 #endif
+       skb_copy_secmark(new, old);
        atomic_set(&new->users, 1);
-       skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
-       skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
-       skb_shinfo(new)->ufo_size = skb_shinfo(old)->ufo_size;
+       skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+       skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+       skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
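
The tso_size/tso_segs/ufo_size triple is replaced by the generic GSO
fields: gso_size and gso_segs describe the segmentation, while gso_type
carries flags such as SKB_GSO_TCPV4 or SKB_GSO_UDP, so one set of fields
covers both TSO and UFO. A sketch of how a sender marks an skb for GSO,
modeled loosely on the TCP code of this era (mss_now is assumed to hold
the current MSS):

        skb_shinfo(skb)->gso_size = mss_now;
        skb_shinfo(skb)->gso_segs = (skb->len + mss_now - 1) / mss_now;
        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;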
 
 /**
@@ -762,24 +837,40 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
  *     filled. Used by network drivers which may DMA or transfer data
  *     beyond the buffer end onto the wire.
  *
- *     May return NULL in out of memory cases.
+ *     May return error in out of memory cases. The skb is freed on error.
  */
  
-struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
+int skb_pad(struct sk_buff *skb, int pad)
 {
-       struct sk_buff *nskb;
+       int err;
+       int ntail;
        
        /* If the skbuff is non-linear, tailroom is always zero. */
-       if (skb_tailroom(skb) >= pad) {
+       if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
                memset(skb->data+skb->len, 0, pad);
-               return skb;
+               return 0;
        }
-       
-       nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
+
+       ntail = skb->data_len + pad - (skb->end - skb->tail);
+       if (likely(skb_cloned(skb) || ntail > 0)) {
+               err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+               if (unlikely(err))
+                       goto free_skb;
+       }
+
+       /* FIXME: The use of this function with non-linear skb's really needs
+        * to be audited.
+        */
+       err = skb_linearize(skb);
+       if (unlikely(err))
+               goto free_skb;
+
+       memset(skb->data + skb->len, 0, pad);
+       return 0;
+
+free_skb:
        kfree_skb(skb);
-       if (nskb)
-               memset(nskb->data+nskb->len, 0, pad);
-       return nskb;
+       return err;
 }      
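
Note the signature change: skb_pad() now returns 0 or a negative errno
instead of a new skb, and it consumes the skb on failure. Callers change
from reassigning the pointer to checking the return value, roughly:

        /* old style */
        skb = skb_pad(skb, pad);
        if (!skb)
                return -ENOMEM;

        /* new style: skb has already been freed if skb_pad() fails */
        err = skb_pad(skb, pad);
        if (err)
                return err;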
  
 /* Trims skb to length len. It can change skb pointers.
@@ -1172,8 +1263,8 @@ EXPORT_SYMBOL(skb_store_bits);
 
 /* Checksum skb data. */
 
-unsigned int skb_checksum(const struct sk_buff *skb, int offset,
-                         int len, unsigned int csum)
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+                         int len, __wsum csum)
 {
        int start = skb_headlen(skb);
        int i, copy = start - offset;
@@ -1197,7 +1288,7 @@ unsigned int skb_checksum(const struct sk_buff *skb, int offset,
 
                end = start + skb_shinfo(skb)->frags[i].size;
                if ((copy = end - offset) > 0) {
-                       unsigned int csum2;
+                       __wsum csum2;
                        u8 *vaddr;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
@@ -1226,7 +1317,7 @@ unsigned int skb_checksum(const struct sk_buff *skb, int offset,
 
                        end = start + list->len;
                        if ((copy = end - offset) > 0) {
-                               unsigned int csum2;
+                               __wsum csum2;
                                if (copy > len)
                                        copy = len;
                                csum2 = skb_checksum(list, offset - start,
@@ -1247,8 +1338,8 @@ unsigned int skb_checksum(const struct sk_buff *skb, int offset,
 
 /* Both of above in one bottle. */
 
-unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
-                                   u8 *to, int len, unsigned int csum)
+__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
+                                   u8 *to, int len, __wsum csum)
 {
        int start = skb_headlen(skb);
        int i, copy = start - offset;
@@ -1274,7 +1365,7 @@ unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
 
                end = start + skb_shinfo(skb)->frags[i].size;
                if ((copy = end - offset) > 0) {
-                       unsigned int csum2;
+                       __wsum csum2;
                        u8 *vaddr;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
@@ -1300,7 +1391,7 @@ unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
                struct sk_buff *list = skb_shinfo(skb)->frag_list;
 
                for (; list; list = list->next) {
-                       unsigned int csum2;
+                       __wsum csum2;
                        int end;
 
                        BUG_TRAP(start <= offset + len);
@@ -1328,10 +1419,10 @@ unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
 
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
 {
-       unsigned int csum;
+       __wsum csum;
        long csstart;
 
-       if (skb->ip_summed == CHECKSUM_HW)
+       if (skb->ip_summed == CHECKSUM_PARTIAL)
                csstart = skb->h.raw - skb->data;
        else
                csstart = skb_headlen(skb);
@@ -1345,10 +1436,10 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
                csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
                                              skb->len - csstart, 0);
 
-       if (skb->ip_summed == CHECKSUM_HW) {
-               long csstuff = csstart + skb->csum;
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               long csstuff = csstart + skb->csum_offset;
 
-               *((unsigned short *)(to + csstuff)) = csum_fold(csum);
+               *((__sum16 *)(to + csstuff)) = csum_fold(csum);
        }
 }
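
The switch from unsigned int to __wsum/__sum16 is part of the sparse
checksum-annotation work: __wsum is the 32-bit running sum, __sum16 the
folded 16-bit one's-complement result, and conversions between them go
through csum_fold() so sparse can flag mixed-up checksum types. A short
sketch of the typed API:

        __wsum sum = csum_partial(skb->data, skb_headlen(skb), 0);
        __sum16 folded = csum_fold(sum);        /* final 16-bit checksum */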
 
@@ -1742,12 +1833,15 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config,
                           struct ts_state *state)
 {
+       unsigned int ret;
+
        config->get_next_block = skb_ts_get_next_block;
        config->finish = skb_ts_finish;
 
        skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
 
-       return textsearch_find(config, state);
+       ret = textsearch_find(config, state);
+       return (ret <= to - from ? ret : UINT_MAX);
 }
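
With this change a failed search is reported as UINT_MAX rather than an
out-of-range offset leaking out of textsearch_find(). A caller sketch
(conf and state assumed to be set up via textsearch_prepare()):

        unsigned int pos = skb_find_text(skb, 0, skb->len, conf, &state);

        if (pos != UINT_MAX)
                handle_match(pos);      /* hypothetical: match at offset pos */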
 
 /**
@@ -1822,30 +1916,176 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
        return 0;
 }
 
+/**
+ *     skb_pull_rcsum - pull skb and update receive checksum
+ *     @skb: buffer to update
+ *     @len: length of data pulled
+ *
+ *     This function performs an skb_pull on the packet and updates
+ *     the CHECKSUM_COMPLETE checksum.  It should be used on
+ *     receive path processing instead of skb_pull unless you know
+ *     that the checksum difference is zero (e.g., a valid IP header)
+ *     or you are setting ip_summed to CHECKSUM_NONE.
+ */
+unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+{
+       BUG_ON(len > skb->len);
+       skb->len -= len;
+       BUG_ON(skb->len < skb->data_len);
+       skb_postpull_rcsum(skb, skb->data, len);
+       return skb->data += len;
+}
+
+EXPORT_SYMBOL_GPL(skb_pull_rcsum);
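
A typical use is stripping a header on the receive path without
invalidating a CHECKSUM_COMPLETE value, since the pulled bytes are
subtracted from skb->csum by skb_postpull_rcsum(). For example, removing
a 4-byte VLAN header might look like:

        skb_pull_rcsum(skb, VLAN_HLEN); /* skb->csum stays consistent */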
+
+/**
+ *     skb_segment - Perform protocol segmentation on skb.
+ *     @skb: buffer to segment
+ *     @features: features for the output path (see dev->features)
+ *
+ *     This function performs segmentation on the given skb.  It returns
+ *     a list of the new segment skbs, or an ERR_PTR() value when an
+ *     error is encountered.
+ */
+struct sk_buff *skb_segment(struct sk_buff *skb, int features)
+{
+       struct sk_buff *segs = NULL;
+       struct sk_buff *tail = NULL;
+       unsigned int mss = skb_shinfo(skb)->gso_size;
+       unsigned int doffset = skb->data - skb->mac.raw;
+       unsigned int offset = doffset;
+       unsigned int headroom;
+       unsigned int len;
+       int sg = features & NETIF_F_SG;
+       int nfrags = skb_shinfo(skb)->nr_frags;
+       int err = -ENOMEM;
+       int i = 0;
+       int pos;
+
+       __skb_push(skb, doffset);
+       headroom = skb_headroom(skb);
+       pos = skb_headlen(skb);
+
+       do {
+               struct sk_buff *nskb;
+               skb_frag_t *frag;
+               int hsize;
+               int k;
+               int size;
+
+               len = skb->len - offset;
+               if (len > mss)
+                       len = mss;
+
+               hsize = skb_headlen(skb) - offset;
+               if (hsize < 0)
+                       hsize = 0;
+               if (hsize > len || !sg)
+                       hsize = len;
+
+               nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
+               if (unlikely(!nskb))
+                       goto err;
+
+               if (segs)
+                       tail->next = nskb;
+               else
+                       segs = nskb;
+               tail = nskb;
+
+               nskb->dev = skb->dev;
+               nskb->priority = skb->priority;
+               nskb->protocol = skb->protocol;
+               nskb->dst = dst_clone(skb->dst);
+               memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+               nskb->pkt_type = skb->pkt_type;
+               nskb->mac_len = skb->mac_len;
+
+               skb_reserve(nskb, headroom);
+               nskb->mac.raw = nskb->data;
+               nskb->nh.raw = nskb->data + skb->mac_len;
+               nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
+               memcpy(skb_put(nskb, doffset), skb->data, doffset);
+
+               if (!sg) {
+                       nskb->csum = skb_copy_and_csum_bits(skb, offset,
+                                                           skb_put(nskb, len),
+                                                           len, 0);
+                       continue;
+               }
+
+               frag = skb_shinfo(nskb)->frags;
+               k = 0;
+
+               nskb->ip_summed = CHECKSUM_PARTIAL;
+               nskb->csum = skb->csum;
+               memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
+
+               while (pos < offset + len) {
+                       BUG_ON(i >= nfrags);
+
+                       *frag = skb_shinfo(skb)->frags[i];
+                       get_page(frag->page);
+                       size = frag->size;
+
+                       if (pos < offset) {
+                               frag->page_offset += offset - pos;
+                               frag->size -= offset - pos;
+                       }
+
+                       k++;
+
+                       if (pos + size <= offset + len) {
+                               i++;
+                               pos += size;
+                       } else {
+                               frag->size -= pos + size - (offset + len);
+                               break;
+                       }
+
+                       frag++;
+               }
+
+               skb_shinfo(nskb)->nr_frags = k;
+               nskb->data_len = len - hsize;
+               nskb->len += nskb->data_len;
+               nskb->truesize += nskb->data_len;
+       } while ((offset += len) < skb->len);
+
+       return segs;
+
+err:
+       while ((skb = segs)) {
+               segs = skb->next;
+               kfree_skb(skb);
+       }
+       return ERR_PTR(err);
+}
+
+EXPORT_SYMBOL_GPL(skb_segment);
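
A caller receives either a linked list of segments or an ERR_PTR() value,
never NULL on error. Sketched loosely on the dev.c GSO transmit path
(my_xmit_one() is a hypothetical per-segment transmit routine):

        struct sk_buff *segs = skb_segment(skb, dev->features);

        if (IS_ERR(segs))
                return PTR_ERR(segs);
        while (segs) {
                struct sk_buff *nskb = segs;

                segs = nskb->next;
                nskb->next = NULL;
                my_xmit_one(dev, nskb);
        }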
+
 void __init skb_init(void)
 {
        skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
                                              sizeof(struct sk_buff),
                                              0,
-                                             SLAB_HWCACHE_ALIGN,
+                                             SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                              NULL, NULL);
-       if (!skbuff_head_cache)
-               panic("cannot create skbuff cache");
-
        skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
                                                (2*sizeof(struct sk_buff)) +
                                                sizeof(atomic_t),
                                                0,
-                                               SLAB_HWCACHE_ALIGN,
+                                               SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                                NULL, NULL);
-       if (!skbuff_fclone_cache)
-               panic("cannot create skbuff cache");
 }
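
SLAB_PANIC makes kmem_cache_create() panic on allocation failure itself,
which is why the explicit NULL checks and panic() calls could be dropped
above. The resulting idiom for boot-time caches needs no error handling:

        cache = kmem_cache_create("name", size, 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                  NULL, NULL);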
 
 EXPORT_SYMBOL(___pskb_trim);
 EXPORT_SYMBOL(__kfree_skb);
+EXPORT_SYMBOL(kfree_skb);
 EXPORT_SYMBOL(__pskb_pull_tail);
 EXPORT_SYMBOL(__alloc_skb);
+EXPORT_SYMBOL(__netdev_alloc_skb);
 EXPORT_SYMBOL(pskb_copy);
 EXPORT_SYMBOL(pskb_expand_head);
 EXPORT_SYMBOL(skb_checksum);