1 diff -Nurb linux-2.6.27-524/include/linux/netdevice.h linux-2.6.27-525/include/linux/netdevice.h
2 --- linux-2.6.27-524/include/linux/netdevice.h 2008-10-09 18:13:53.000000000 -0400
3 +++ linux-2.6.27-525/include/linux/netdevice.h 2009-12-04 16:03:56.000000000 -0500
6 __be16 type; /* This is really htons(ether_type). */
7 struct net_device *dev; /* NULL is wildcarded here */
8 + unsigned char sknid_elevator; /* set to receive the tagged re-delivery pass */
9 int (*func) (struct sk_buff *,
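
The new packet_type flag lets a handler opt in to the tagged re-delivery pass that the dev.c hunk below performs. A minimal sketch of such a registration, assuming a hypothetical handler my_elevator_rcv() (the name and the use of ETH_P_ALL are illustrative, not part of this patch):

	static int my_elevator_rcv(struct sk_buff *skb, struct net_device *dev,
				   struct packet_type *pt,
				   struct net_device *orig_dev)
	{
		/* On this pass skb->skb_tag already carries the mark
		 * stored by a first-pass handler. */
		kfree_skb(skb);
		return 0;
	}

	static struct packet_type my_elevator_pt = {
		.type		= __constant_htons(ETH_P_ALL),
		.func		= my_elevator_rcv,
		.sknid_elevator	= 1,	/* see the netif_receive_skb() hunk below */
	};

	/* registered once at init time with dev_add_pack(&my_elevator_pt); */
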
12 diff -Nurb linux-2.6.27-524/net/core/dev.c linux-2.6.27-525/net/core/dev.c
13 --- linux-2.6.27-524/net/core/dev.c 2009-12-04 16:03:48.000000000 -0500
14 +++ linux-2.6.27-525/net/core/dev.c 2009-12-04 16:05:48.000000000 -0500
16 #include <linux/proc_fs.h>
17 #include <linux/seq_file.h>
18 #include <linux/stat.h>
19 +#include <linux/ip.h>
20 +#include <linux/tcp.h>
21 #include <linux/if_bridge.h>
22 #include <linux/if_macvlan.h>
25 if ((ptype->dev == dev || !ptype->dev) &&
26 (ptype->af_packet_priv == NULL ||
27 (struct sock *)ptype->af_packet_priv != skb->sk)) {
28 - struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
29 + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
33 @@ -2170,6 +2172,10 @@
37 +/* The code already assumes that packet handlers run sequentially on the
38 + * same CPU, so this per-CPU variable needs no locking. -Sapan */
39 +DEFINE_PER_CPU(int, sknid_elevator) = 0;
42 * netif_receive_skb - process receive buffer from network
43 * @skb: buffer to process
44 @@ -2191,8 +2197,11 @@
45 struct net_device *orig_dev;
46 struct net_device *null_or_orig;
47 int ret = NET_RX_DROP;
48 + int *cur_elevator = &__get_cpu_var(sknid_elevator);
53 if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
54 return NET_RX_SUCCESS;
56 @@ -2272,7 +2281,27 @@
60 + /* At this point, *cur_elevator is -2 or a positive value if a
61 + * previous protocol handler marked the packet, and 0 otherwise. */
62 + if (*cur_elevator) {
63 + atomic_inc(&skb->users);
66 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
68 + if ((*cur_elevator) > 0) {
69 + skb->skb_tag = *cur_elevator;
70 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
71 + if ((!ptype->dev || ptype->dev == skb->dev) && ptype->sknid_elevator) {
72 + ret = deliver_skb(skb, ptype, orig_dev);
77 + if (*cur_elevator) {
78 + /* A handler marked this packet */
83 /* Jamal, now you will not able to escape explaining
85 EXPORT_SYMBOL(net_enable_timestamp);
86 EXPORT_SYMBOL(net_disable_timestamp);
87 EXPORT_SYMBOL(dev_get_flags);
88 +EXPORT_PER_CPU_SYMBOL(sknid_elevator);
90 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
91 EXPORT_SYMBOL(br_handle_frame_hook);
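
The per-CPU variable is the channel between the two delivery passes: while netif_receive_skb() walks the handler list, a first-pass handler (a PF_PACKET tap, say) stores a positive tag, or -2, in it; netif_receive_skb() then copies the tag into skb->skb_tag and re-delivers the packet to the handlers that set sknid_elevator. A hedged sketch of the marking side, assuming a hypothetical classifier lookup_owner_nid() (everything here except the exported per-CPU variable is illustrative):

	DECLARE_PER_CPU(int, sknid_elevator);	/* defined and exported above */

	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			      struct packet_type *pt,
			      struct net_device *orig_dev)
	{
		int nid = lookup_owner_nid(skb);	/* hypothetical classifier */

		/* No locking needed: the handlers for one skb run
		 * sequentially on the same CPU (see the comment above
		 * the definition). */
		__get_cpu_var(sknid_elevator) = nid > 0 ? nid : -2;

		kfree_skb(skb);
		return 0;
	}
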
92 diff -Nurb linux-2.6.27-524/net/core/skbuff.c.orig linux-2.6.27-525/net/core/skbuff.c.orig
93 --- linux-2.6.27-524/net/core/skbuff.c.orig 2009-12-04 16:03:47.000000000 -0500
94 +++ linux-2.6.27-525/net/core/skbuff.c.orig 1969-12-31 19:00:00.000000000 -0500
97 - * Routines having to do with the 'struct sk_buff' memory handlers.
99 - * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
100 - * Florian La Roche <rzsfl@rz.uni-sb.de>
103 - * Alan Cox : Fixed the worst of the load
105 - * Dave Platt : Interrupt stacking fix.
106 - * Richard Kooijman : Timestamp fixes.
107 - * Alan Cox : Changed buffer format.
108 - * Alan Cox : destructor hook for AF_UNIX etc.
109 - * Linus Torvalds : Better skb_clone.
110 - * Alan Cox : Added skb_copy.
111 - * Alan Cox : Added all the changed routines Linus
112 - * only put in the headers
113 - * Ray VanTassle : Fixed --skb->lock in free
114 - * Alan Cox : skb_copy copy arp field
115 - * Andi Kleen : slabified it.
116 - * Robert Olsson : Removed skb_head_pool
119 - * The __skb_ routines should be called with interrupts
120 - * disabled, or you better be *real* sure that the operation is atomic
121 - * with respect to whatever list is being frobbed (e.g. via lock_sock()
122 - * or via disabling bottom half handlers, etc).
124 - * This program is free software; you can redistribute it and/or
125 - * modify it under the terms of the GNU General Public License
126 - * as published by the Free Software Foundation; either version
127 - * 2 of the License, or (at your option) any later version.
131 - * The functions in this file will not compile correctly with gcc 2.4.x
134 -#include <linux/module.h>
135 -#include <linux/types.h>
136 -#include <linux/kernel.h>
137 -#include <linux/mm.h>
138 -#include <linux/interrupt.h>
139 -#include <linux/in.h>
140 -#include <linux/inet.h>
141 -#include <linux/slab.h>
142 -#include <linux/netdevice.h>
143 -#ifdef CONFIG_NET_CLS_ACT
144 -#include <net/pkt_sched.h>
146 -#include <linux/string.h>
147 -#include <linux/skbuff.h>
148 -#include <linux/splice.h>
149 -#include <linux/cache.h>
150 -#include <linux/rtnetlink.h>
151 -#include <linux/init.h>
152 -#include <linux/scatterlist.h>
154 -#include <net/protocol.h>
155 -#include <net/dst.h>
156 -#include <net/sock.h>
157 -#include <net/checksum.h>
158 -#include <net/xfrm.h>
160 -#include <asm/uaccess.h>
161 -#include <asm/system.h>
163 -#include "kmap_skb.h"
165 -static struct kmem_cache *skbuff_head_cache __read_mostly;
166 -static struct kmem_cache *skbuff_fclone_cache __read_mostly;
168 -static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
169 - struct pipe_buffer *buf)
171 - put_page(buf->page);
174 -static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
175 - struct pipe_buffer *buf)
177 - get_page(buf->page);
180 -static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
181 - struct pipe_buffer *buf)
187 -/* Pipe buffer operations for a socket. */
188 -static struct pipe_buf_operations sock_pipe_buf_ops = {
190 - .map = generic_pipe_buf_map,
191 - .unmap = generic_pipe_buf_unmap,
192 - .confirm = generic_pipe_buf_confirm,
193 - .release = sock_pipe_buf_release,
194 - .steal = sock_pipe_buf_steal,
195 - .get = sock_pipe_buf_get,
199 - * Keep out-of-line to prevent kernel bloat.
200 - * __builtin_return_address is not used because it is not always
205 - * skb_over_panic - private function
210 - * Out of line support code for skb_put(). Not user callable.
212 -void skb_over_panic(struct sk_buff *skb, int sz, void *here)
214 - printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
215 - "data:%p tail:%#lx end:%#lx dev:%s\n",
216 - here, skb->len, sz, skb->head, skb->data,
217 - (unsigned long)skb->tail, (unsigned long)skb->end,
218 - skb->dev ? skb->dev->name : "<NULL>");
223 - * skb_under_panic - private function
228 - * Out of line support code for skb_push(). Not user callable.
231 -void skb_under_panic(struct sk_buff *skb, int sz, void *here)
233 - printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
234 - "data:%p tail:%#lx end:%#lx dev:%s\n",
235 - here, skb->len, sz, skb->head, skb->data,
236 - (unsigned long)skb->tail, (unsigned long)skb->end,
237 - skb->dev ? skb->dev->name : "<NULL>");
241 -/* Allocate a new skbuff. We do this ourselves so we can fill in a few
242 - * 'private' fields and also do memory statistics to find all the
248 - * __alloc_skb - allocate a network buffer
249 - * @size: size to allocate
250 - * @gfp_mask: allocation mask
251 - * @fclone: allocate from fclone cache instead of head cache
252 - * and allocate a cloned (child) skb
253 - * @node: numa node to allocate memory on
255 - * Allocate a new &sk_buff. The returned buffer has no headroom and a
256 - * tail room of size bytes. The object has a reference count of one.
257 - * The return is the buffer. On a failure the return is %NULL.
259 - * Buffers may only be allocated from interrupts using a @gfp_mask of
262 -struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
263 - int fclone, int node)
265 - struct kmem_cache *cache;
266 - struct skb_shared_info *shinfo;
267 - struct sk_buff *skb;
270 - cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
273 - skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
277 - size = SKB_DATA_ALIGN(size);
278 - data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
284 - * Only clear those fields we need to clear, not those that we will
285 - * actually initialise below. Hence, don't put any more fields after
286 - * the tail pointer in struct sk_buff!
288 - memset(skb, 0, offsetof(struct sk_buff, tail));
289 - skb->truesize = size + sizeof(struct sk_buff);
290 - atomic_set(&skb->users, 1);
293 - skb_reset_tail_pointer(skb);
294 - skb->end = skb->tail + size;
295 - /* make sure we initialize shinfo sequentially */
296 - shinfo = skb_shinfo(skb);
297 - atomic_set(&shinfo->dataref, 1);
298 - shinfo->nr_frags = 0;
299 - shinfo->gso_size = 0;
300 - shinfo->gso_segs = 0;
301 - shinfo->gso_type = 0;
302 - shinfo->ip6_frag_id = 0;
303 - shinfo->frag_list = NULL;
306 - struct sk_buff *child = skb + 1;
307 - atomic_t *fclone_ref = (atomic_t *) (child + 1);
309 - skb->fclone = SKB_FCLONE_ORIG;
310 - atomic_set(fclone_ref, 1);
312 - child->fclone = SKB_FCLONE_UNAVAILABLE;
317 - kmem_cache_free(cache, skb);
323 - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
324 - * @dev: network device to receive on
325 - * @length: length to allocate
326 - * @gfp_mask: get_free_pages mask, passed to alloc_skb
328 - * Allocate a new &sk_buff and assign it a usage count of one. The
329 - * buffer has unspecified headroom built in. Users should allocate
330 - * the headroom they think they need without accounting for the
331 - * built in space. The built in space is used for optimisations.
333 - * %NULL is returned if there is no free memory.
335 -struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
336 - unsigned int length, gfp_t gfp_mask)
338 - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
339 - struct sk_buff *skb;
341 - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
343 - skb_reserve(skb, NET_SKB_PAD);
350 - * dev_alloc_skb - allocate an skbuff for receiving
351 - * @length: length to allocate
353 - * Allocate a new &sk_buff and assign it a usage count of one. The
354 - * buffer has unspecified headroom built in. Users should allocate
355 - * the headroom they think they need without accounting for the
356 - * built in space. The built in space is used for optimisations.
358 - * %NULL is returned if there is no free memory. Although this function
359 - * allocates memory it can be called from an interrupt.
361 -struct sk_buff *dev_alloc_skb(unsigned int length)
364 - * There is more code here than it seems:
365 - * __dev_alloc_skb is an inline
367 - return __dev_alloc_skb(length, GFP_ATOMIC);
369 -EXPORT_SYMBOL(dev_alloc_skb);
371 -static void skb_drop_list(struct sk_buff **listp)
373 - struct sk_buff *list = *listp;
378 - struct sk_buff *this = list;
384 -static inline void skb_drop_fraglist(struct sk_buff *skb)
386 - skb_drop_list(&skb_shinfo(skb)->frag_list);
389 -static void skb_clone_fraglist(struct sk_buff *skb)
391 - struct sk_buff *list;
393 - for (list = skb_shinfo(skb)->frag_list; list; list = list->next)
397 -static void skb_release_data(struct sk_buff *skb)
399 - if (!skb->cloned ||
400 - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
401 - &skb_shinfo(skb)->dataref)) {
402 - if (skb_shinfo(skb)->nr_frags) {
404 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
405 - put_page(skb_shinfo(skb)->frags[i].page);
408 - if (skb_shinfo(skb)->frag_list)
409 - skb_drop_fraglist(skb);
416 - * Free an skbuff by memory without cleaning the state.
418 -static void kfree_skbmem(struct sk_buff *skb)
420 - struct sk_buff *other;
421 - atomic_t *fclone_ref;
423 - switch (skb->fclone) {
424 - case SKB_FCLONE_UNAVAILABLE:
425 - kmem_cache_free(skbuff_head_cache, skb);
428 - case SKB_FCLONE_ORIG:
429 - fclone_ref = (atomic_t *) (skb + 2);
430 - if (atomic_dec_and_test(fclone_ref))
431 - kmem_cache_free(skbuff_fclone_cache, skb);
434 - case SKB_FCLONE_CLONE:
435 - fclone_ref = (atomic_t *) (skb + 1);
438 - /* The clone portion is available for
439 - * fast-cloning again.
441 - skb->fclone = SKB_FCLONE_UNAVAILABLE;
443 - if (atomic_dec_and_test(fclone_ref))
444 - kmem_cache_free(skbuff_fclone_cache, other);
449 -/* Free everything but the sk_buff shell. */
450 -static void skb_release_all(struct sk_buff *skb)
452 - dst_release(skb->dst);
454 - secpath_put(skb->sp);
456 - if (skb->destructor) {
458 - skb->destructor(skb);
460 -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
461 - nf_conntrack_put(skb->nfct);
462 - nf_conntrack_put_reasm(skb->nfct_reasm);
464 -#ifdef CONFIG_BRIDGE_NETFILTER
465 - nf_bridge_put(skb->nf_bridge);
467 -/* XXX: IS this still necessary? - JHS */
468 -#ifdef CONFIG_NET_SCHED
470 -#ifdef CONFIG_NET_CLS_ACT
474 - skb_release_data(skb);
478 - * __kfree_skb - private function
481 - * Free an sk_buff. Release anything attached to the buffer.
482 - * Clean the state. This is an internal helper function. Users should
483 - * always call kfree_skb
486 -void __kfree_skb(struct sk_buff *skb)
488 - skb_release_all(skb);
493 - * kfree_skb - free an sk_buff
494 - * @skb: buffer to free
496 - * Drop a reference to the buffer and free it if the usage count has
499 -void kfree_skb(struct sk_buff *skb)
501 - if (unlikely(!skb))
503 - if (likely(atomic_read(&skb->users) == 1))
505 - else if (likely(!atomic_dec_and_test(&skb->users)))
510 -static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
512 - new->tstamp = old->tstamp;
513 - new->dev = old->dev;
514 - new->transport_header = old->transport_header;
515 - new->network_header = old->network_header;
516 - new->mac_header = old->mac_header;
517 - new->dst = dst_clone(old->dst);
519 - new->sp = secpath_get(old->sp);
521 - memcpy(new->cb, old->cb, sizeof(old->cb));
522 - new->csum_start = old->csum_start;
523 - new->csum_offset = old->csum_offset;
524 - new->local_df = old->local_df;
525 - new->pkt_type = old->pkt_type;
526 - new->ip_summed = old->ip_summed;
527 - skb_copy_queue_mapping(new, old);
528 - new->priority = old->priority;
529 -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
530 - new->ipvs_property = old->ipvs_property;
532 - new->protocol = old->protocol;
533 - new->mark = old->mark;
534 - __nf_copy(new, old);
535 -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
536 - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
537 - new->nf_trace = old->nf_trace;
539 -#ifdef CONFIG_NET_SCHED
540 - new->tc_index = old->tc_index;
541 -#ifdef CONFIG_NET_CLS_ACT
542 - new->tc_verd = old->tc_verd;
545 - new->vlan_tci = old->vlan_tci;
547 - skb_copy_secmark(new, old);
550 -static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
552 -#define C(x) n->x = skb->x
554 - n->next = n->prev = NULL;
556 - __copy_skb_header(n, skb);
561 - n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
564 - n->destructor = NULL;
571 -#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
574 - atomic_set(&n->users, 1);
576 - atomic_inc(&(skb_shinfo(skb)->dataref));
584 - * skb_morph - morph one skb into another
585 - * @dst: the skb to receive the contents
586 - * @src: the skb to supply the contents
588 - * This is identical to skb_clone except that the target skb is
589 - * supplied by the user.
591 - * The target skb is returned upon exit.
593 -struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
595 - skb_release_all(dst);
596 - return __skb_clone(dst, src);
598 -EXPORT_SYMBOL_GPL(skb_morph);
601 - * skb_clone - duplicate an sk_buff
602 - * @skb: buffer to clone
603 - * @gfp_mask: allocation priority
605 - * Duplicate an &sk_buff. The new one is not owned by a socket. Both
606 - * copies share the same packet data but not structure. The new
607 - * buffer has a reference count of 1. If the allocation fails the
608 - * function returns %NULL otherwise the new buffer is returned.
610 - * If this function is called from an interrupt gfp_mask() must be
614 -struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
619 - if (skb->fclone == SKB_FCLONE_ORIG &&
620 - n->fclone == SKB_FCLONE_UNAVAILABLE) {
621 - atomic_t *fclone_ref = (atomic_t *) (n + 1);
622 - n->fclone = SKB_FCLONE_CLONE;
623 - atomic_inc(fclone_ref);
625 - n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
628 - n->fclone = SKB_FCLONE_UNAVAILABLE;
631 - return __skb_clone(n, skb);
634 -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
636 -#ifndef NET_SKBUFF_DATA_USES_OFFSET
638 - * Shift between the two data areas in bytes
640 - unsigned long offset = new->data - old->data;
643 - __copy_skb_header(new, old);
645 -#ifndef NET_SKBUFF_DATA_USES_OFFSET
646 - /* {transport,network,mac}_header are relative to skb->head */
647 - new->transport_header += offset;
648 - new->network_header += offset;
649 - new->mac_header += offset;
651 - skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
652 - skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
653 - skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
657 - * skb_copy - create private copy of an sk_buff
658 - * @skb: buffer to copy
659 - * @gfp_mask: allocation priority
661 - * Make a copy of both an &sk_buff and its data. This is used when the
662 - * caller wishes to modify the data and needs a private copy of the
663 - * data to alter. Returns %NULL on failure or the pointer to the buffer
664 - * on success. The returned buffer has a reference count of 1.
666 - * As by-product this function converts non-linear &sk_buff to linear
667 - * one, so that &sk_buff becomes completely private and caller is allowed
668 - * to modify all the data of returned buffer. This means that this
669 - * function is not recommended for use in circumstances when only
670 - * header is going to be modified. Use pskb_copy() instead.
673 -struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
675 - int headerlen = skb->data - skb->head;
677 - * Allocate the copy buffer
680 -#ifdef NET_SKBUFF_DATA_USES_OFFSET
681 - n = alloc_skb(skb->end + skb->data_len, gfp_mask);
683 - n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
688 - /* Set the data pointer */
689 - skb_reserve(n, headerlen);
690 - /* Set the tail pointer and length */
691 - skb_put(n, skb->len);
693 - if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
696 - copy_skb_header(n, skb);
702 - * pskb_copy - create copy of an sk_buff with private head.
703 - * @skb: buffer to copy
704 - * @gfp_mask: allocation priority
706 - * Make a copy of both an &sk_buff and part of its data, located
707 - * in header. Fragmented data remain shared. This is used when
708 - * the caller wishes to modify only header of &sk_buff and needs
709 - * private copy of the header to alter. Returns %NULL on failure
710 - * or the pointer to the buffer on success.
711 - * The returned buffer has a reference count of 1.
714 -struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
717 - * Allocate the copy buffer
720 -#ifdef NET_SKBUFF_DATA_USES_OFFSET
721 - n = alloc_skb(skb->end, gfp_mask);
723 - n = alloc_skb(skb->end - skb->head, gfp_mask);
728 - /* Set the data pointer */
729 - skb_reserve(n, skb->data - skb->head);
730 - /* Set the tail pointer and length */
731 - skb_put(n, skb_headlen(skb));
732 - /* Copy the bytes */
733 - skb_copy_from_linear_data(skb, n->data, n->len);
735 - n->truesize += skb->data_len;
736 - n->data_len = skb->data_len;
739 - if (skb_shinfo(skb)->nr_frags) {
742 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
743 - skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
744 - get_page(skb_shinfo(n)->frags[i].page);
746 - skb_shinfo(n)->nr_frags = i;
749 - if (skb_shinfo(skb)->frag_list) {
750 - skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
751 - skb_clone_fraglist(n);
754 - copy_skb_header(n, skb);
760 - * pskb_expand_head - reallocate header of &sk_buff
761 - * @skb: buffer to reallocate
762 - * @nhead: room to add at head
763 - * @ntail: room to add at tail
764 - * @gfp_mask: allocation priority
766 - * Expands (or creates identical copy, if &nhead and &ntail are zero)
767 - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have
768 - * reference count of 1. Returns zero in the case of success or error,
769 - * if expansion failed. In the last case, &sk_buff is not changed.
771 - * All the pointers pointing into skb header may change and must be
772 - * reloaded after call to this function.
775 -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
780 -#ifdef NET_SKBUFF_DATA_USES_OFFSET
781 - int size = nhead + skb->end + ntail;
783 - int size = nhead + (skb->end - skb->head) + ntail;
787 - if (skb_shared(skb))
790 - size = SKB_DATA_ALIGN(size);
792 - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
796 - /* Copy only real data... and, alas, header. This should be
797 - * optimized for the cases when header is void. */
798 -#ifdef NET_SKBUFF_DATA_USES_OFFSET
799 - memcpy(data + nhead, skb->head, skb->tail);
801 - memcpy(data + nhead, skb->head, skb->tail - skb->head);
803 - memcpy(data + size, skb_end_pointer(skb),
804 - sizeof(struct skb_shared_info));
806 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
807 - get_page(skb_shinfo(skb)->frags[i].page);
809 - if (skb_shinfo(skb)->frag_list)
810 - skb_clone_fraglist(skb);
812 - skb_release_data(skb);
814 - off = (data + nhead) - skb->head;
818 -#ifdef NET_SKBUFF_DATA_USES_OFFSET
822 - skb->end = skb->head + size;
824 - /* {transport,network,mac}_header and tail are relative to skb->head */
826 - skb->transport_header += off;
827 - skb->network_header += off;
828 - skb->mac_header += off;
829 - skb->csum_start += nhead;
833 - atomic_set(&skb_shinfo(skb)->dataref, 1);
840 -/* Make private copy of skb with writable head and some headroom */
842 -struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
844 - struct sk_buff *skb2;
845 - int delta = headroom - skb_headroom(skb);
848 - skb2 = pskb_copy(skb, GFP_ATOMIC);
850 - skb2 = skb_clone(skb, GFP_ATOMIC);
851 - if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
862 - * skb_copy_expand - copy and expand sk_buff
863 - * @skb: buffer to copy
864 - * @newheadroom: new free bytes at head
865 - * @newtailroom: new free bytes at tail
866 - * @gfp_mask: allocation priority
868 - * Make a copy of both an &sk_buff and its data and while doing so
869 - * allocate additional space.
871 - * This is used when the caller wishes to modify the data and needs a
872 - * private copy of the data to alter as well as more space for new fields.
873 - * Returns %NULL on failure or the pointer to the buffer
874 - * on success. The returned buffer has a reference count of 1.
876 - * You must pass %GFP_ATOMIC as the allocation priority if this function
877 - * is called from an interrupt.
879 -struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
880 - int newheadroom, int newtailroom,
884 - * Allocate the copy buffer
886 - struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
888 - int oldheadroom = skb_headroom(skb);
889 - int head_copy_len, head_copy_off;
895 - skb_reserve(n, newheadroom);
897 - /* Set the tail pointer and length */
898 - skb_put(n, skb->len);
900 - head_copy_len = oldheadroom;
902 - if (newheadroom <= head_copy_len)
903 - head_copy_len = newheadroom;
905 - head_copy_off = newheadroom - head_copy_len;
907 - /* Copy the linear header and data. */
908 - if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
909 - skb->len + head_copy_len))
912 - copy_skb_header(n, skb);
914 - off = newheadroom - oldheadroom;
915 - n->csum_start += off;
916 -#ifdef NET_SKBUFF_DATA_USES_OFFSET
917 - n->transport_header += off;
918 - n->network_header += off;
919 - n->mac_header += off;
926 - * skb_pad - zero pad the tail of an skb
927 - * @skb: buffer to pad
928 - * @pad: space to pad
930 - * Ensure that a buffer is followed by a padding area that is zero
931 - * filled. Used by network drivers which may DMA or transfer data
932 - * beyond the buffer end onto the wire.
934 - * May return error in out of memory cases. The skb is freed on error.
937 -int skb_pad(struct sk_buff *skb, int pad)
942 - /* If the skbuff is non linear tailroom is always zero.. */
943 - if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
944 - memset(skb->data+skb->len, 0, pad);
948 - ntail = skb->data_len + pad - (skb->end - skb->tail);
949 - if (likely(skb_cloned(skb) || ntail > 0)) {
950 - err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
955 - /* FIXME: The use of this function with non-linear skb's really needs
958 - err = skb_linearize(skb);
962 - memset(skb->data + skb->len, 0, pad);
971 - * skb_put - add data to a buffer
972 - * @skb: buffer to use
973 - * @len: amount of data to add
975 - * This function extends the used data area of the buffer. If this would
976 - * exceed the total buffer size the kernel will panic. A pointer to the
977 - * first byte of the extra data is returned.
979 -unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
981 - unsigned char *tmp = skb_tail_pointer(skb);
982 - SKB_LINEAR_ASSERT(skb);
985 - if (unlikely(skb->tail > skb->end))
986 - skb_over_panic(skb, len, __builtin_return_address(0));
989 -EXPORT_SYMBOL(skb_put);
992 - * skb_push - add data to the start of a buffer
993 - * @skb: buffer to use
994 - * @len: amount of data to add
996 - * This function extends the used data area of the buffer at the buffer
997 - * start. If this would exceed the total buffer headroom the kernel will
998 - * panic. A pointer to the first byte of the extra data is returned.
1000 -unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
1004 - if (unlikely(skb->data<skb->head))
1005 - skb_under_panic(skb, len, __builtin_return_address(0));
1008 -EXPORT_SYMBOL(skb_push);
1011 - * skb_pull - remove data from the start of a buffer
1012 - * @skb: buffer to use
1013 - * @len: amount of data to remove
1015 - * This function removes data from the start of a buffer, returning
1016 - * the memory to the headroom. A pointer to the next data in the buffer
1017 - * is returned. Once the data has been pulled future pushes will overwrite
1020 -unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
1022 - return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
1024 -EXPORT_SYMBOL(skb_pull);
1027 - * skb_trim - remove end from a buffer
1028 - * @skb: buffer to alter
1029 - * @len: new length
1031 - * Cut the length of a buffer down by removing data from the tail. If
1032 - * the buffer is already under the length specified it is not modified.
1033 - * The skb must be linear.
1035 -void skb_trim(struct sk_buff *skb, unsigned int len)
1037 - if (skb->len > len)
1038 - __skb_trim(skb, len);
1040 -EXPORT_SYMBOL(skb_trim);
1042 -/* Trims skb to length len. It can change skb pointers.
1045 -int ___pskb_trim(struct sk_buff *skb, unsigned int len)
1047 - struct sk_buff **fragp;
1048 - struct sk_buff *frag;
1049 - int offset = skb_headlen(skb);
1050 - int nfrags = skb_shinfo(skb)->nr_frags;
1054 - if (skb_cloned(skb) &&
1055 - unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
1059 - if (offset >= len)
1062 - for (; i < nfrags; i++) {
1063 - int end = offset + skb_shinfo(skb)->frags[i].size;
1070 - skb_shinfo(skb)->frags[i++].size = len - offset;
1073 - skb_shinfo(skb)->nr_frags = i;
1075 - for (; i < nfrags; i++)
1076 - put_page(skb_shinfo(skb)->frags[i].page);
1078 - if (skb_shinfo(skb)->frag_list)
1079 - skb_drop_fraglist(skb);
1083 - for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
1084 - fragp = &frag->next) {
1085 - int end = offset + frag->len;
1087 - if (skb_shared(frag)) {
1088 - struct sk_buff *nfrag;
1090 - nfrag = skb_clone(frag, GFP_ATOMIC);
1091 - if (unlikely(!nfrag))
1094 - nfrag->next = frag->next;
1106 - unlikely((err = pskb_trim(frag, len - offset))))
1110 - skb_drop_list(&frag->next);
1115 - if (len > skb_headlen(skb)) {
1116 - skb->data_len -= skb->len - len;
1120 - skb->data_len = 0;
1121 - skb_set_tail_pointer(skb, len);
1128 - * __pskb_pull_tail - advance tail of skb header
1129 - * @skb: buffer to reallocate
1130 - * @delta: number of bytes to advance tail
1132 - * The function makes a sense only on a fragmented &sk_buff,
1133 - * it expands header moving its tail forward and copying necessary
1134 - * data from fragmented part.
1136 - * &sk_buff MUST have reference count of 1.
1138 - * Returns %NULL (and &sk_buff does not change) if pull failed
1139 - * or value of new tail of skb in the case of success.
1141 - * All the pointers pointing into skb header may change and must be
1142 - * reloaded after call to this function.
1145 -/* Moves tail of skb head forward, copying data from fragmented part,
1146 - * when it is necessary.
1147 - * 1. It may fail due to malloc failure.
1148 - * 2. It may change skb pointers.
1150 - * It is pretty complicated. Luckily, it is called only in exceptional cases.
1152 -unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1154 - /* If skb has not enough free space at tail, get new one
1155 - * plus 128 bytes for future expansions. If we have enough
1156 - * room at tail, reallocate without expansion only if skb is cloned.
1158 - int i, k, eat = (skb->tail + delta) - skb->end;
1160 - if (eat > 0 || skb_cloned(skb)) {
1161 - if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
1166 - if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
1169 - /* Optimization: no fragments, no reasons to preestimate
1170 - * size of pulled pages. Superb.
1172 - if (!skb_shinfo(skb)->frag_list)
1175 - /* Estimate size of pulled pages. */
1177 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1178 - if (skb_shinfo(skb)->frags[i].size >= eat)
1180 - eat -= skb_shinfo(skb)->frags[i].size;
1183 - /* If we need update frag list, we are in troubles.
1184 - * Certainly, it possible to add an offset to skb data,
1185 - * but taking into account that pulling is expected to
1186 - * be very rare operation, it is worth to fight against
1187 - * further bloating skb head and crucify ourselves here instead.
1188 - * Pure masohism, indeed. 8)8)
1191 - struct sk_buff *list = skb_shinfo(skb)->frag_list;
1192 - struct sk_buff *clone = NULL;
1193 - struct sk_buff *insp = NULL;
1198 - if (list->len <= eat) {
1199 - /* Eaten as whole. */
1201 - list = list->next;
1204 - /* Eaten partially. */
1206 - if (skb_shared(list)) {
1207 - /* Sucks! We need to fork list. :-( */
1208 - clone = skb_clone(list, GFP_ATOMIC);
1211 - insp = list->next;
1214 - /* This may be pulled without
1218 - if (!pskb_pull(list, eat)) {
1227 - /* Free pulled out fragments. */
1228 - while ((list = skb_shinfo(skb)->frag_list) != insp) {
1229 - skb_shinfo(skb)->frag_list = list->next;
1232 - /* And insert new clone at head. */
1234 - clone->next = list;
1235 - skb_shinfo(skb)->frag_list = clone;
1238 - /* Success! Now we may commit changes to skb data. */
1243 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1244 - if (skb_shinfo(skb)->frags[i].size <= eat) {
1245 - put_page(skb_shinfo(skb)->frags[i].page);
1246 - eat -= skb_shinfo(skb)->frags[i].size;
1248 - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
1250 - skb_shinfo(skb)->frags[k].page_offset += eat;
1251 - skb_shinfo(skb)->frags[k].size -= eat;
1257 - skb_shinfo(skb)->nr_frags = k;
1259 - skb->tail += delta;
1260 - skb->data_len -= delta;
1262 - return skb_tail_pointer(skb);
1265 -/* Copy some data bits from skb to kernel buffer. */
1267 -int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
1270 - int start = skb_headlen(skb);
1272 - if (offset > (int)skb->len - len)
1275 - /* Copy header. */
1276 - if ((copy = start - offset) > 0) {
1279 - skb_copy_from_linear_data_offset(skb, offset, to, copy);
1280 - if ((len -= copy) == 0)
1286 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1289 - WARN_ON(start > offset + len);
1291 - end = start + skb_shinfo(skb)->frags[i].size;
1292 - if ((copy = end - offset) > 0) {
1298 - vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
1300 - vaddr + skb_shinfo(skb)->frags[i].page_offset+
1301 - offset - start, copy);
1302 - kunmap_skb_frag(vaddr);
1304 - if ((len -= copy) == 0)
1312 - if (skb_shinfo(skb)->frag_list) {
1313 - struct sk_buff *list = skb_shinfo(skb)->frag_list;
1315 - for (; list; list = list->next) {
1318 - WARN_ON(start > offset + len);
1320 - end = start + list->len;
1321 - if ((copy = end - offset) > 0) {
1324 - if (skb_copy_bits(list, offset - start,
1327 - if ((len -= copy) == 0)
1343 - * Callback from splice_to_pipe(), if we need to release some pages
1344 - * at the end of the spd in case we error'ed out in filling the pipe.
1346 -static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
1348 - put_page(spd->pages[i]);
1351 -static inline struct page *linear_to_page(struct page *page, unsigned int len,
1352 - unsigned int offset)
1354 - struct page *p = alloc_pages(GFP_KERNEL, 0);
1358 - memcpy(page_address(p) + offset, page_address(page) + offset, len);
1364 - * Fill page/offset/length into spd, if it can hold more pages.
1366 -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
1367 - unsigned int len, unsigned int offset,
1368 - struct sk_buff *skb, int linear)
1370 - if (unlikely(spd->nr_pages == PIPE_BUFFERS))
1374 - page = linear_to_page(page, len, offset);
1380 - spd->pages[spd->nr_pages] = page;
1381 - spd->partial[spd->nr_pages].len = len;
1382 - spd->partial[spd->nr_pages].offset = offset;
1388 -static inline void __segment_seek(struct page **page, unsigned int *poff,
1389 - unsigned int *plen, unsigned int off)
1392 - *page += *poff / PAGE_SIZE;
1393 - *poff = *poff % PAGE_SIZE;
1397 -static inline int __splice_segment(struct page *page, unsigned int poff,
1398 - unsigned int plen, unsigned int *off,
1399 - unsigned int *len, struct sk_buff *skb,
1400 - struct splice_pipe_desc *spd, int linear)
1405 - /* skip this segment if already processed */
1406 - if (*off >= plen) {
1411 - /* ignore any bits we already processed */
1413 - __segment_seek(&page, &poff, &plen, *off);
1418 - unsigned int flen = min(*len, plen);
1420 - /* the linear region may spread across several pages */
1421 - flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
1423 - if (spd_fill_page(spd, page, flen, poff, skb, linear))
1426 - __segment_seek(&page, &poff, &plen, flen);
1429 - } while (*len && plen);
1435 - * Map linear and fragment data from the skb to spd. It reports failure if the
1436 - * pipe is full or if we already spliced the requested length.
1438 -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
1439 - unsigned int *len,
1440 - struct splice_pipe_desc *spd)
1445 - * map the linear part
1447 - if (__splice_segment(virt_to_page(skb->data),
1448 - (unsigned long) skb->data & (PAGE_SIZE - 1),
1450 - offset, len, skb, spd, 1))
1454 - * then map the fragments
1456 - for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
1457 - const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
1459 - if (__splice_segment(f->page, f->page_offset, f->size,
1460 - offset, len, skb, spd, 0))
1468 - * Map data from the skb to a pipe. Should handle both the linear part,
1469 - * the fragments, and the frag list. It does NOT handle frag lists within
1470 - * the frag list, if such a thing exists. We'd probably need to recurse to
1471 - * handle that cleanly.
1473 -int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
1474 - struct pipe_inode_info *pipe, unsigned int tlen,
1475 - unsigned int flags)
1477 - struct partial_page partial[PIPE_BUFFERS];
1478 - struct page *pages[PIPE_BUFFERS];
1479 - struct splice_pipe_desc spd = {
1481 - .partial = partial,
1483 - .ops = &sock_pipe_buf_ops,
1484 - .spd_release = sock_spd_release,
1488 - * __skb_splice_bits() only fails if the output has no room left,
1489 - * so no point in going over the frag_list for the error case.
1491 - if (__skb_splice_bits(skb, &offset, &tlen, &spd))
1497 - * now see if we have a frag_list to map
1499 - if (skb_shinfo(skb)->frag_list) {
1500 - struct sk_buff *list = skb_shinfo(skb)->frag_list;
1502 - for (; list && tlen; list = list->next) {
1503 - if (__skb_splice_bits(list, &offset, &tlen, &spd))
1509 - if (spd.nr_pages) {
1510 - struct sock *sk = skb->sk;
1514 - * Drop the socket lock, otherwise we have reverse
1515 - * locking dependencies between sk_lock and i_mutex
1516 - * here as compared to sendfile(). We enter here
1517 - * with the socket lock held, and splice_to_pipe() will
1518 - * grab the pipe inode lock. For sendfile() emulation,
1519 - * we call into ->sendpage() with the i_mutex lock held
1520 - * and networking will grab the socket lock.
1523 - ret = splice_to_pipe(pipe, &spd);
1532 - * skb_store_bits - store bits from kernel buffer to skb
1533 - * @skb: destination buffer
1534 - * @offset: offset in destination
1535 - * @from: source buffer
1536 - * @len: number of bytes to copy
1538 - * Copy the specified number of bytes from the source buffer to the
1539 - * destination skb. This function handles all the messy bits of
1540 - * traversing fragment lists and such.
1543 -int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
1546 - int start = skb_headlen(skb);
1548 - if (offset > (int)skb->len - len)
1551 - if ((copy = start - offset) > 0) {
1554 - skb_copy_to_linear_data_offset(skb, offset, from, copy);
1555 - if ((len -= copy) == 0)
1561 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1562 - skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1565 - WARN_ON(start > offset + len);
1567 - end = start + frag->size;
1568 - if ((copy = end - offset) > 0) {
1574 - vaddr = kmap_skb_frag(frag);
1575 - memcpy(vaddr + frag->page_offset + offset - start,
1577 - kunmap_skb_frag(vaddr);
1579 - if ((len -= copy) == 0)
1587 - if (skb_shinfo(skb)->frag_list) {
1588 - struct sk_buff *list = skb_shinfo(skb)->frag_list;
1590 - for (; list; list = list->next) {
1593 - WARN_ON(start > offset + len);
1595 - end = start + list->len;
1596 - if ((copy = end - offset) > 0) {
1599 - if (skb_store_bits(list, offset - start,
1602 - if ((len -= copy) == 0)
1617 -EXPORT_SYMBOL(skb_store_bits);
1619 -/* Checksum skb data. */
1621 -__wsum skb_checksum(const struct sk_buff *skb, int offset,
1622 - int len, __wsum csum)
1624 - int start = skb_headlen(skb);
1625 - int i, copy = start - offset;
1628 - /* Checksum header. */
1632 - csum = csum_partial(skb->data + offset, copy, csum);
1633 - if ((len -= copy) == 0)
1639 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1642 - WARN_ON(start > offset + len);
1644 - end = start + skb_shinfo(skb)->frags[i].size;
1645 - if ((copy = end - offset) > 0) {
1648 - skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1652 - vaddr = kmap_skb_frag(frag);
1653 - csum2 = csum_partial(vaddr + frag->page_offset +
1654 - offset - start, copy, 0);
1655 - kunmap_skb_frag(vaddr);
1656 - csum = csum_block_add(csum, csum2, pos);
1657 - if (!(len -= copy))
1665 - if (skb_shinfo(skb)->frag_list) {
1666 - struct sk_buff *list = skb_shinfo(skb)->frag_list;
1668 - for (; list; list = list->next) {
1671 - WARN_ON(start > offset + len);
1673 - end = start + list->len;
1674 - if ((copy = end - offset) > 0) {
1678 - csum2 = skb_checksum(list, offset - start,
1680 - csum = csum_block_add(csum, csum2, pos);
1681 - if ((len -= copy) == 0)
1694 -/* Both of above in one bottle. */
1696 -__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
1697 - u8 *to, int len, __wsum csum)
1699 - int start = skb_headlen(skb);
1700 - int i, copy = start - offset;
1703 - /* Copy header. */
1707 - csum = csum_partial_copy_nocheck(skb->data + offset, to,
1709 - if ((len -= copy) == 0)
1716 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1719 - WARN_ON(start > offset + len);
1721 - end = start + skb_shinfo(skb)->frags[i].size;
1722 - if ((copy = end - offset) > 0) {
1725 - skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1729 - vaddr = kmap_skb_frag(frag);
1730 - csum2 = csum_partial_copy_nocheck(vaddr +
1731 - frag->page_offset +
1732 - offset - start, to,
1734 - kunmap_skb_frag(vaddr);
1735 - csum = csum_block_add(csum, csum2, pos);
1736 - if (!(len -= copy))
1745 - if (skb_shinfo(skb)->frag_list) {
1746 - struct sk_buff *list = skb_shinfo(skb)->frag_list;
1748 - for (; list; list = list->next) {
1752 - WARN_ON(start > offset + len);
1754 - end = start + list->len;
1755 - if ((copy = end - offset) > 0) {
1758 - csum2 = skb_copy_and_csum_bits(list,
1761 - csum = csum_block_add(csum, csum2, pos);
1762 - if ((len -= copy) == 0)
1775 -void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
1780 - if (skb->ip_summed == CHECKSUM_PARTIAL)
1781 - csstart = skb->csum_start - skb_headroom(skb);
1783 - csstart = skb_headlen(skb);
1785 - BUG_ON(csstart > skb_headlen(skb));
1787 - skb_copy_from_linear_data(skb, to, csstart);
1790 - if (csstart != skb->len)
1791 - csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
1792 - skb->len - csstart, 0);
1794 - if (skb->ip_summed == CHECKSUM_PARTIAL) {
1795 - long csstuff = csstart + skb->csum_offset;
1797 - *((__sum16 *)(to + csstuff)) = csum_fold(csum);
1802 - * skb_dequeue - remove from the head of the queue
1803 - * @list: list to dequeue from
1805 - * Remove the head of the list. The list lock is taken so the function
1806 - * may be used safely with other locking list functions. The head item is
1807 - * returned or %NULL if the list is empty.
1810 -struct sk_buff *skb_dequeue(struct sk_buff_head *list)
1812 - unsigned long flags;
1813 - struct sk_buff *result;
1815 - spin_lock_irqsave(&list->lock, flags);
1816 - result = __skb_dequeue(list);
1817 - spin_unlock_irqrestore(&list->lock, flags);
1822 - * skb_dequeue_tail - remove from the tail of the queue
1823 - * @list: list to dequeue from
1825 - * Remove the tail of the list. The list lock is taken so the function
1826 - * may be used safely with other locking list functions. The tail item is
1827 - * returned or %NULL if the list is empty.
1829 -struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
1831 - unsigned long flags;
1832 - struct sk_buff *result;
1834 - spin_lock_irqsave(&list->lock, flags);
1835 - result = __skb_dequeue_tail(list);
1836 - spin_unlock_irqrestore(&list->lock, flags);
1841 - * skb_queue_purge - empty a list
1842 - * @list: list to empty
1844 - * Delete all buffers on an &sk_buff list. Each buffer is removed from
1845 - * the list and one reference dropped. This function takes the list
1846 - * lock and is atomic with respect to other list locking functions.
1848 -void skb_queue_purge(struct sk_buff_head *list)
1850 - struct sk_buff *skb;
1851 - while ((skb = skb_dequeue(list)) != NULL)
1856 - * skb_queue_head - queue a buffer at the list head
1857 - * @list: list to use
1858 - * @newsk: buffer to queue
1860 - * Queue a buffer at the start of the list. This function takes the
1861 - * list lock and can be used safely with other locking &sk_buff functions
1864 - * A buffer cannot be placed on two lists at the same time.
1866 -void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
1868 - unsigned long flags;
1870 - spin_lock_irqsave(&list->lock, flags);
1871 - __skb_queue_head(list, newsk);
1872 - spin_unlock_irqrestore(&list->lock, flags);
1876 - * skb_queue_tail - queue a buffer at the list tail
1877 - * @list: list to use
1878 - * @newsk: buffer to queue
1880 - * Queue a buffer at the tail of the list. This function takes the
1881 - * list lock and can be used safely with other locking &sk_buff functions
1884 - * A buffer cannot be placed on two lists at the same time.
1886 -void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
1888 - unsigned long flags;
1890 - spin_lock_irqsave(&list->lock, flags);
1891 - __skb_queue_tail(list, newsk);
1892 - spin_unlock_irqrestore(&list->lock, flags);
1896 - * skb_unlink - remove a buffer from a list
1897 - * @skb: buffer to remove
1898 - * @list: list to use
1900 - * Remove a packet from a list. The list locks are taken and this
1901 - * function is atomic with respect to other list locked calls
1903 - * You must know what list the SKB is on.
1905 -void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
1907 - unsigned long flags;
1909 - spin_lock_irqsave(&list->lock, flags);
1910 - __skb_unlink(skb, list);
1911 - spin_unlock_irqrestore(&list->lock, flags);
1915 - * skb_append - append a buffer
1916 - * @old: buffer to insert after
1917 - * @newsk: buffer to insert
1918 - * @list: list to use
1920 - * Place a packet after a given packet in a list. The list locks are taken
1921 - * and this function is atomic with respect to other list locked calls.
1922 - * A buffer cannot be placed on two lists at the same time.
1924 -void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
1926 - unsigned long flags;
1928 - spin_lock_irqsave(&list->lock, flags);
1929 - __skb_queue_after(list, old, newsk);
1930 - spin_unlock_irqrestore(&list->lock, flags);
1935 - * skb_insert - insert a buffer
1936 - * @old: buffer to insert before
1937 - * @newsk: buffer to insert
1938 - * @list: list to use
1940 - * Place a packet before a given packet in a list. The list locks are
1941 - * taken and this function is atomic with respect to other list locked
1944 - * A buffer cannot be placed on two lists at the same time.
1946 -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
1948 - unsigned long flags;
1950 - spin_lock_irqsave(&list->lock, flags);
1951 - __skb_insert(newsk, old->prev, old, list);
1952 - spin_unlock_irqrestore(&list->lock, flags);
1955 -static inline void skb_split_inside_header(struct sk_buff *skb,
1956 - struct sk_buff* skb1,
1957 - const u32 len, const int pos)
1961 - skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
1963 - /* And move data appendix as is. */
1964 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1965 - skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
1967 - skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
1968 - skb_shinfo(skb)->nr_frags = 0;
1969 - skb1->data_len = skb->data_len;
1970 - skb1->len += skb1->data_len;
1971 - skb->data_len = 0;
1973 - skb_set_tail_pointer(skb, len);
1976 -static inline void skb_split_no_header(struct sk_buff *skb,
1977 - struct sk_buff* skb1,
1978 - const u32 len, int pos)
1981 - const int nfrags = skb_shinfo(skb)->nr_frags;
1983 - skb_shinfo(skb)->nr_frags = 0;
1984 - skb1->len = skb1->data_len = skb->len - len;
1986 - skb->data_len = len - pos;
1988 - for (i = 0; i < nfrags; i++) {
1989 - int size = skb_shinfo(skb)->frags[i].size;
1991 - if (pos + size > len) {
1992 - skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
1996 - * We have two variants in this case:
1997 - * 1. Move all the frag to the second
1998 - * part, if it is possible. F.e.
1999 - * this approach is mandatory for TUX,
2000 - * where splitting is expensive.
2001 - * 2. Split is accurately. We make this.
2003 - get_page(skb_shinfo(skb)->frags[i].page);
2004 - skb_shinfo(skb1)->frags[0].page_offset += len - pos;
2005 - skb_shinfo(skb1)->frags[0].size -= len - pos;
2006 - skb_shinfo(skb)->frags[i].size = len - pos;
2007 - skb_shinfo(skb)->nr_frags++;
2011 - skb_shinfo(skb)->nr_frags++;
2014 - skb_shinfo(skb1)->nr_frags = k;
2018 - * skb_split - Split fragmented skb to two parts at length len.
2019 - * @skb: the buffer to split
2020 - * @skb1: the buffer to receive the second part
2021 - * @len: new length for skb
2023 -void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
2025 - int pos = skb_headlen(skb);
2027 - if (len < pos) /* Split line is inside header. */
2028 - skb_split_inside_header(skb, skb1, len, pos);
2029 - else /* Second chunk has no header, nothing to copy. */
2030 - skb_split_no_header(skb, skb1, len, pos);
2034 - * skb_prepare_seq_read - Prepare a sequential read of skb data
2035 - * @skb: the buffer to read
2036 - * @from: lower offset of data to be read
2037 - * @to: upper offset of data to be read
2038 - * @st: state variable
2040 - * Initializes the specified state variable. Must be called before
2041 - * invoking skb_seq_read() for the first time.
2043 -void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
2044 - unsigned int to, struct skb_seq_state *st)
2046 - st->lower_offset = from;
2047 - st->upper_offset = to;
2048 - st->root_skb = st->cur_skb = skb;
2049 - st->frag_idx = st->stepped_offset = 0;
2050 - st->frag_data = NULL;
2054 - * skb_seq_read - Sequentially read skb data
2055 - * @consumed: number of bytes consumed by the caller so far
2056 - * @data: destination pointer for data to be returned
2057 - * @st: state variable
2059 - * Reads a block of skb data at &consumed relative to the
2060 - * lower offset specified to skb_prepare_seq_read(). Assigns
2061 - * the head of the data block to &data and returns the length
2062 - * of the block or 0 if the end of the skb data or the upper
2063 - * offset has been reached.
2065 - * The caller is not required to consume all of the data
2066 - * returned, i.e. &consumed is typically set to the number
2067 - * of bytes already consumed and the next call to
2068 - * skb_seq_read() will return the remaining part of the block.
2070 - * Note 1: The size of each block of data returned can be arbitary,
2071 - * this limitation is the cost for zerocopy seqeuental
2072 - * reads of potentially non linear data.
2074 - * Note 2: Fragment lists within fragments are not implemented
2075 - * at the moment, state->root_skb could be replaced with
2076 - * a stack for this purpose.
2078 -unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
2079 - struct skb_seq_state *st)
2081 - unsigned int block_limit, abs_offset = consumed + st->lower_offset;
2084 - if (unlikely(abs_offset >= st->upper_offset))
2088 - block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
2090 - if (abs_offset < block_limit && !st->frag_data) {
2091 - *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
2092 - return block_limit - abs_offset;
2095 - if (st->frag_idx == 0 && !st->frag_data)
2096 - st->stepped_offset += skb_headlen(st->cur_skb);
2098 - while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
2099 - frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
2100 - block_limit = frag->size + st->stepped_offset;
2102 - if (abs_offset < block_limit) {
2103 - if (!st->frag_data)
2104 - st->frag_data = kmap_skb_frag(frag);
2106 - *data = (u8 *) st->frag_data + frag->page_offset +
2107 - (abs_offset - st->stepped_offset);
2109 - return block_limit - abs_offset;
2112 - if (st->frag_data) {
2113 - kunmap_skb_frag(st->frag_data);
2114 - st->frag_data = NULL;
2118 - st->stepped_offset += frag->size;
2121 - if (st->frag_data) {
2122 - kunmap_skb_frag(st->frag_data);
2123 - st->frag_data = NULL;
2126 - if (st->root_skb == st->cur_skb &&
2127 - skb_shinfo(st->root_skb)->frag_list) {
2128 - st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
2131 - } else if (st->cur_skb->next) {
2132 - st->cur_skb = st->cur_skb->next;
2141 - * skb_abort_seq_read - Abort a sequential read of skb data
2142 - * @st: state variable
2144 - * Must be called if skb_seq_read() was not called until it
2147 -void skb_abort_seq_read(struct skb_seq_state *st)
2149 - if (st->frag_data)
2150 - kunmap_skb_frag(st->frag_data);
2153 -#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
2155 -static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
2156 - struct ts_config *conf,
2157 - struct ts_state *state)
2159 - return skb_seq_read(offset, text, TS_SKB_CB(state));
2162 -static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
2164 - skb_abort_seq_read(TS_SKB_CB(state));
2168 - * skb_find_text - Find a text pattern in skb data
2169 - * @skb: the buffer to look in
2170 - * @from: search offset
2171 - * @to: search limit
2172 - * @config: textsearch configuration
2173 - * @state: uninitialized textsearch state variable
2175 - * Finds a pattern in the skb data according to the specified
2176 - * textsearch configuration. Use textsearch_next() to retrieve
2177 - * subsequent occurrences of the pattern. Returns the offset
2178 - * to the first occurrence or UINT_MAX if no match was found.
2180 -unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
2181 - unsigned int to, struct ts_config *config,
2182 - struct ts_state *state)
2186 - config->get_next_block = skb_ts_get_next_block;
2187 - config->finish = skb_ts_finish;
2189 - skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
2191 - ret = textsearch_find(config, state);
2192 - return (ret <= to - from ? ret : UINT_MAX);
2196 - * skb_append_datato_frags: - append the user data to a skb
2197 - * @sk: sock structure
2198 - * @skb: skb structure to be appened with user data.
2199 - * @getfrag: call back function to be used for getting the user data
2200 - * @from: pointer to user message iov
2201 - * @length: length of the iov message
2203 - * Description: This procedure append the user data in the fragment part
2204 - * of the skb if any page alloc fails user this procedure returns -ENOMEM
2206 -int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
2207 - int (*getfrag)(void *from, char *to, int offset,
2208 - int len, int odd, struct sk_buff *skb),
2209 - void *from, int length)
2212 - skb_frag_t *frag = NULL;
2213 - struct page *page = NULL;
2219 - /* Return error if we don't have space for new frag */
2220 - frg_cnt = skb_shinfo(skb)->nr_frags;
2221 - if (frg_cnt >= MAX_SKB_FRAGS)
2224 - /* allocate a new page for next frag */
2225 - page = alloc_pages(sk->sk_allocation, 0);
2227 - /* If alloc_page fails just return failure and caller will
2228 - * free previous allocated pages by doing kfree_skb()
2233 - /* initialize the next frag */
2234 - sk->sk_sndmsg_page = page;
2235 - sk->sk_sndmsg_off = 0;
2236 - skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
2237 - skb->truesize += PAGE_SIZE;
2238 - atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
2240 - /* get the new initialized frag */
2241 - frg_cnt = skb_shinfo(skb)->nr_frags;
2242 - frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
2244 - /* copy the user data to page */
2245 - left = PAGE_SIZE - frag->page_offset;
2246 - copy = (length > left)? left : length;
2248 - ret = getfrag(from, (page_address(frag->page) +
2249 - frag->page_offset + frag->size),
2250 - offset, copy, 0, skb);
2254 - /* copy was successful so update the size parameters */
2255 - sk->sk_sndmsg_off += copy;
2256 - frag->size += copy;
2258 - skb->data_len += copy;
2262 - } while (length > 0);
2268 - * skb_pull_rcsum - pull skb and update receive checksum
2269 - * @skb: buffer to update
2270 - * @len: length of data pulled
2272 - * This function performs an skb_pull on the packet and updates
2273 - * the CHECKSUM_COMPLETE checksum. It should be used on
2274 - * receive path processing instead of skb_pull unless you know
2275 - * that the checksum difference is zero (e.g., a valid IP header)
2276 - * or you are setting ip_summed to CHECKSUM_NONE.
2278 -unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
2280 - BUG_ON(len > skb->len);
2282 - BUG_ON(skb->len < skb->data_len);
2283 - skb_postpull_rcsum(skb, skb->data, len);
2284 - return skb->data += len;
2287 -EXPORT_SYMBOL_GPL(skb_pull_rcsum);
2290 - * skb_segment - Perform protocol segmentation on skb.
2291 - * @skb: buffer to segment
2292 - * @features: features for the output path (see dev->features)
2294 - * This function performs segmentation on the given skb. It returns
2295 - * a pointer to the first in a list of new skbs for the segments.
2296 - * In case of error it returns ERR_PTR(err).
2298 -struct sk_buff *skb_segment(struct sk_buff *skb, int features)
2300 - struct sk_buff *segs = NULL;
2301 - struct sk_buff *tail = NULL;
2302 - unsigned int mss = skb_shinfo(skb)->gso_size;
2303 - unsigned int doffset = skb->data - skb_mac_header(skb);
2304 - unsigned int offset = doffset;
2305 - unsigned int headroom;
2307 - int sg = features & NETIF_F_SG;
2308 - int nfrags = skb_shinfo(skb)->nr_frags;
2309 - int err = -ENOMEM;
2313 - __skb_push(skb, doffset);
2314 - headroom = skb_headroom(skb);
2315 - pos = skb_headlen(skb);
2318 - struct sk_buff *nskb;
2324 - len = skb->len - offset;
2328 - hsize = skb_headlen(skb) - offset;
2331 - if (hsize > len || !sg)
2334 - nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
2335 - if (unlikely(!nskb))
2339 - tail->next = nskb;
2344 - __copy_skb_header(nskb, skb);
2345 - nskb->mac_len = skb->mac_len;
2347 - skb_reserve(nskb, headroom);
2348 - skb_reset_mac_header(nskb);
2349 - skb_set_network_header(nskb, skb->mac_len);
2350 - nskb->transport_header = (nskb->network_header +
2351 - skb_network_header_len(skb));
2352 - skb_copy_from_linear_data(skb, skb_put(nskb, doffset),
2355 - nskb->ip_summed = CHECKSUM_NONE;
2356 - nskb->csum = skb_copy_and_csum_bits(skb, offset,
2357 - skb_put(nskb, len),
2362 - frag = skb_shinfo(nskb)->frags;
2365 - skb_copy_from_linear_data_offset(skb, offset,
2366 - skb_put(nskb, hsize), hsize);
2368 - while (pos < offset + len) {
2369 - BUG_ON(i >= nfrags);
2371 - *frag = skb_shinfo(skb)->frags[i];
2372 - get_page(frag->page);
2373 - size = frag->size;
2375 - if (pos < offset) {
2376 - frag->page_offset += offset - pos;
2377 - frag->size -= offset - pos;
2382 - if (pos + size <= offset + len) {
2386 - frag->size -= pos + size - (offset + len);
2393 - skb_shinfo(nskb)->nr_frags = k;
2394 - nskb->data_len = len - hsize;
2395 - nskb->len += nskb->data_len;
2396 - nskb->truesize += nskb->data_len;
2397 - } while ((offset += len) < skb->len);
2402 - while ((skb = segs)) {
2406 - return ERR_PTR(err);
2409 -EXPORT_SYMBOL_GPL(skb_segment);
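A sketch of how a caller consumes the list skb_segment() returns; in this tree the normal entry point is skb_gso_segment() on the transmit path. The xmit_one() callback is hypothetical and error handling is trimmed.

    #include <linux/err.h>
    #include <linux/skbuff.h>

    static int xmit_gso(struct sk_buff *skb, int features,
                        int (*xmit_one)(struct sk_buff *))
    {
        struct sk_buff *segs = skb_segment(skb, features);

        if (IS_ERR(segs))
            return PTR_ERR(segs);

        while (segs) {
            struct sk_buff *nskb = segs;

            segs = segs->next;
            nskb->next = NULL;
            if (xmit_one(nskb))
                kfree_skb(nskb);
        }
        kfree_skb(skb);    /* the unsegmented original is done */
        return 0;
    }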
2411 -void __init skb_init(void)
2413 - skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
2414 - sizeof(struct sk_buff),
2416 - SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2418 - skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
2419 - (2*sizeof(struct sk_buff)) +
2422 - SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2427 - * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
2428 - * @skb: Socket buffer containing the buffers to be mapped
2429 - * @sg: The scatter-gather list to map into
2430 - * @offset: The offset into the buffer's contents to start mapping
2431 - * @len: Length of buffer space to be mapped
2433 - * Fill the specified scatter-gather list with mappings/pointers into a
2434 - * region of the buffer space attached to a socket buffer.
2437 -__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
2439 - int start = skb_headlen(skb);
2440 - int i, copy = start - offset;
2446 - sg_set_buf(sg, skb->data + offset, copy);
2448 - if ((len -= copy) == 0)
2453 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2456 - WARN_ON(start > offset + len);
2458 - end = start + skb_shinfo(skb)->frags[i].size;
2459 - if ((copy = end - offset) > 0) {
2460 - skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2464 - sg_set_page(&sg[elt], frag->page, copy,
2465 - frag->page_offset+offset-start);
2467 - if (!(len -= copy))
2474 - if (skb_shinfo(skb)->frag_list) {
2475 - struct sk_buff *list = skb_shinfo(skb)->frag_list;
2477 - for (; list; list = list->next) {
2480 - WARN_ON(start > offset + len);
2482 - end = start + list->len;
2483 - if ((copy = end - offset) > 0) {
2486 - elt += __skb_to_sgvec(list, sg+elt, offset - start,
2488 - if ((len -= copy) == 0)
2499 -int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
2501 - int nsg = __skb_to_sgvec(skb, sg, offset, len);
2503 - sg_mark_end(&sg[nsg - 1]);
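Typical calling pattern for skb_to_sgvec(), modeled loosely on its IPsec users: initialize a scatterlist, then let the helper point its entries at the skb's head and frag pages. MAX_SG is a hypothetical bound; real callers size the table from skb_cow_data().

    #include <linux/scatterlist.h>
    #include <linux/skbuff.h>

    #define MAX_SG 16    /* hypothetical upper bound */

    static int map_payload(struct sk_buff *skb, int offset, int len)
    {
        struct scatterlist sg[MAX_SG];
        int nsg;

        sg_init_table(sg, MAX_SG);
        nsg = skb_to_sgvec(skb, sg, offset, len);
        /* sg[0..nsg-1] now reference the skb's data and can be handed
         * to the crypto layer; the final entry is already marked. */
        return nsg;
    }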
2509 - * skb_cow_data - Check that a socket buffer's data buffers are writable
2510 - * @skb: The socket buffer to check.
2511 - * @tailbits: Amount of trailing space to be added
2512 - * @trailer: Returned pointer to the skb where the @tailbits space begins
2514 - * Make sure that the data buffers attached to a socket buffer are
2515 - * writable. If they are not, private copies are made of the data buffers
2516 - * and the socket buffer is set to use these instead.
2518 - * If @tailbits is given, make sure that there is space to write @tailbits
2519 - * bytes of data beyond current end of socket buffer. @trailer will be
2520 - * set to point to the skb in which this space begins.
2522 - * The number of scatterlist elements required to completely map the
2523 - * COW'd and extended socket buffer will be returned.
2525 -int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2529 - struct sk_buff *skb1, **skb_p;
2531 - /* If skb is cloned or its head is paged, reallocate
2532 - * head pulling out all the pages (pages are considered not writable
2533 - * at the moment even if they are anonymous).
2535 - if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
2536 - __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
2539 - /* Easy case. Most packets will go this way. */
2540 - if (!skb_shinfo(skb)->frag_list) {
2541 - /* A little trouble: not enough space for the trailer.
2542 - * This should not happen when the stack is tuned to generate
2543 - * good frames. OK, on a miss we reallocate and reserve even more
2544 - * space; 128 bytes is fair. */
2546 - if (skb_tailroom(skb) < tailbits &&
2547 - pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
2555 - /* Misery. We are in trouble; time to mince the fragments... */
2558 - skb_p = &skb_shinfo(skb)->frag_list;
2561 - while ((skb1 = *skb_p) != NULL) {
2564 - /* The fragment is partially pulled by someone,
2565 - * this can happen on input. Copy it and everything
2568 - if (skb_shared(skb1))
2571 - /* If the skb is the last, worry about trailer. */
2573 - if (skb1->next == NULL && tailbits) {
2574 - if (skb_shinfo(skb1)->nr_frags ||
2575 - skb_shinfo(skb1)->frag_list ||
2576 - skb_tailroom(skb1) < tailbits)
2577 - ntail = tailbits + 128;
2581 - skb_cloned(skb1) ||
2583 - skb_shinfo(skb1)->nr_frags ||
2584 - skb_shinfo(skb1)->frag_list) {
2585 - struct sk_buff *skb2;
2587 - /* Fuck, we are miserable poor guys... */
2589 - skb2 = skb_copy(skb1, GFP_ATOMIC);
2591 - skb2 = skb_copy_expand(skb1,
2592 - skb_headroom(skb1),
2595 - if (unlikely(skb2 == NULL))
2599 - skb_set_owner_w(skb2, skb1->sk);
2601 - /* Looking around. Are we still alive?
2602 - * OK, link new skb, drop old one */
2604 - skb2->next = skb1->next;
2611 - skb_p = &skb1->next;
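A sketch of the intended use of skb_cow_data(), in the spirit of the IPsec transforms that are its main callers: guarantee a writable buffer plus room for an in-place trailer before transforming the payload. The function and trailer length are hypothetical.

    static int prepare_inplace(struct sk_buff *skb, int trailer_len)
    {
        struct sk_buff *trailer;
        int nfrags = skb_cow_data(skb, trailer_len, &trailer);

        /* On success nfrags is exactly the scatterlist size needed to
         * map the (possibly copied) chain, and the reserved
         * trailer_len bytes sit in the tailroom of 'trailer'; on
         * failure it is a negative errno. */
        return nfrags;
    }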
2618 - * skb_partial_csum_set - set up and verify partial csum values for packet
2619 - * @skb: the skb to set
2620 - * @start: the number of bytes after skb->data to start checksumming.
2621 - * @off: the offset from start to place the checksum.
2623 - * For untrusted partially-checksummed packets, we need to make sure the values
2624 - * for skb->csum_start and skb->csum_offset are valid so we don't oops.
2626 - * This function checks and sets those values and skb->ip_summed: if this
2627 - * returns false you should drop the packet.
2629 -bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
2631 - if (unlikely(start > skb->len - 2) ||
2632 - unlikely((int)start + off > skb->len - 2)) {
2633 - if (net_ratelimit())
2634 - printk(KERN_WARNING
2635 - "bad partial csum: csum=%u/%u len=%u\n",
2636 - start, off, skb->len);
2639 - skb->ip_summed = CHECKSUM_PARTIAL;
2640 - skb->csum_start = skb_headroom(skb) + start;
2641 - skb->csum_offset = off;
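Usage sketch for skb_partial_csum_set(): a device backend in the tun/virtio style, importing a partially-checksummed packet from an untrusted source, validates the offsets before injecting the packet. start/off would come from the untrusted metadata header; the wrapper is hypothetical.

    static int import_partial_csum(struct sk_buff *skb, u16 start, u16 off)
    {
        if (!skb_partial_csum_set(skb, start, off))
            return -EINVAL;    /* caller should drop the packet */
        /* skb->ip_summed is now CHECKSUM_PARTIAL with a validated
         * csum_start/csum_offset pair. */
        return 0;
    }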
2645 -void __skb_warn_lro_forwarding(const struct sk_buff *skb)
2647 - if (net_ratelimit())
2648 - pr_warning("%s: received packets cannot be forwarded"
2649 - " while LRO is enabled\n", skb->dev->name);
2652 -EXPORT_SYMBOL(___pskb_trim);
2653 -EXPORT_SYMBOL(__kfree_skb);
2654 -EXPORT_SYMBOL(kfree_skb);
2655 -EXPORT_SYMBOL(__pskb_pull_tail);
2656 -EXPORT_SYMBOL(__alloc_skb);
2657 -EXPORT_SYMBOL(__netdev_alloc_skb);
2658 -EXPORT_SYMBOL(pskb_copy);
2659 -EXPORT_SYMBOL(pskb_expand_head);
2660 -EXPORT_SYMBOL(skb_checksum);
2661 -EXPORT_SYMBOL(skb_clone);
2662 -EXPORT_SYMBOL(skb_copy);
2663 -EXPORT_SYMBOL(skb_copy_and_csum_bits);
2664 -EXPORT_SYMBOL(skb_copy_and_csum_dev);
2665 -EXPORT_SYMBOL(skb_copy_bits);
2666 -EXPORT_SYMBOL(skb_copy_expand);
2667 -EXPORT_SYMBOL(skb_over_panic);
2668 -EXPORT_SYMBOL(skb_pad);
2669 -EXPORT_SYMBOL(skb_realloc_headroom);
2670 -EXPORT_SYMBOL(skb_under_panic);
2671 -EXPORT_SYMBOL(skb_dequeue);
2672 -EXPORT_SYMBOL(skb_dequeue_tail);
2673 -EXPORT_SYMBOL(skb_insert);
2674 -EXPORT_SYMBOL(skb_queue_purge);
2675 -EXPORT_SYMBOL(skb_queue_head);
2676 -EXPORT_SYMBOL(skb_queue_tail);
2677 -EXPORT_SYMBOL(skb_unlink);
2678 -EXPORT_SYMBOL(skb_append);
2679 -EXPORT_SYMBOL(skb_split);
2680 -EXPORT_SYMBOL(skb_prepare_seq_read);
2681 -EXPORT_SYMBOL(skb_seq_read);
2682 -EXPORT_SYMBOL(skb_abort_seq_read);
2683 -EXPORT_SYMBOL(skb_find_text);
2684 -EXPORT_SYMBOL(skb_append_datato_frags);
2685 -EXPORT_SYMBOL(__skb_warn_lro_forwarding);
2687 -EXPORT_SYMBOL_GPL(skb_to_sgvec);
2688 -EXPORT_SYMBOL_GPL(skb_cow_data);
2689 -EXPORT_SYMBOL_GPL(skb_partial_csum_set);
2690 diff -Nurb linux-2.6.27-524/net/core/sock.c.orig linux-2.6.27-525/net/core/sock.c.orig
2691 --- linux-2.6.27-524/net/core/sock.c.orig 2009-12-04 16:03:48.000000000 -0500
2692 +++ linux-2.6.27-525/net/core/sock.c.orig 1969-12-31 19:00:00.000000000 -0500
2695 - * INET An implementation of the TCP/IP protocol suite for the LINUX
2696 - * operating system. INET is implemented using the BSD Socket
2697 - * interface as the means of communication with the user level.
2699 - * Generic socket support routines. Memory allocators, socket lock/release
2700 - * handler for protocols to use and generic option handler.
2703 - * Authors: Ross Biro
2704 - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
2705 - * Florian La Roche, <flla@stud.uni-sb.de>
2706 - * Alan Cox, <A.Cox@swansea.ac.uk>
2709 - * Alan Cox : Numerous verify_area() problems
2710 - * Alan Cox : Connecting on a connecting socket
2711 - * now returns an error for tcp.
2712 - * Alan Cox : sock->protocol is set correctly.
2713 - * and is not sometimes left as 0.
2714 - * Alan Cox : connect handles icmp errors on a
2715 - * connect properly. Unfortunately there
2716 - * is a restart syscall nasty there. I
2717 - * can't match BSD without hacking the C
2718 - * library. Ideas urgently sought!
2719 - * Alan Cox : Disallow bind() to addresses that are
2720 - * not ours - especially broadcast ones!!
2721 - * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
2722 - * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
2723 - * instead they leave that for the DESTROY timer.
2724 - * Alan Cox : Clean up error flag in accept
2725 - * Alan Cox : TCP ack handling is buggy, the DESTROY timer
2726 - * was buggy. Put a remove_sock() in the handler
2727 - * for memory when we hit 0. Also altered the timer
2728 - * code. The ACK stuff can wait and needs major
2729 - * TCP layer surgery.
2730 - * Alan Cox : Fixed TCP ack bug, removed remove sock
2731 - * and fixed timer/inet_bh race.
2732 - * Alan Cox : Added zapped flag for TCP
2733 - * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
2734 - * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
2735 - * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
2736 - * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
2737 - * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
2738 - * Rick Sladkey : Relaxed UDP rules for matching packets.
2739 - * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
2740 - * Pauline Middelink : identd support
2741 - * Alan Cox : Fixed connect() taking signals I think.
2742 - * Alan Cox : SO_LINGER supported
2743 - * Alan Cox : Error reporting fixes
2744 - * Anonymous : inet_create tidied up (sk->reuse setting)
2745 - * Alan Cox : inet sockets don't set sk->type!
2746 - * Alan Cox : Split socket option code
2747 - * Alan Cox : Callbacks
2748 - * Alan Cox : Nagle flag for Charles & Johannes stuff
2749 - * Alex : Removed restriction on inet fioctl
2750 - * Alan Cox : Splitting INET from NET core
2751 - * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
2752 - * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
2753 - * Alan Cox : Split IP from generic code
2754 - * Alan Cox : New kfree_skbmem()
2755 - * Alan Cox : Make SO_DEBUG superuser only.
2756 - * Alan Cox : Allow anyone to clear SO_DEBUG
2757 - * (compatibility fix)
2758 - * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
2759 - * Alan Cox : Allocator for a socket is settable.
2760 - * Alan Cox : SO_ERROR includes soft errors.
2761 - * Alan Cox : Allow NULL arguments on some SO_ opts
2762 - * Alan Cox : Generic socket allocation to make hooks
2763 - * easier (suggested by Craig Metz).
2764 - * Michael Pall : SO_ERROR returns positive errno again
2765 - * Steve Whitehouse: Added default destructor to free
2766 - * protocol private data.
2767 - * Steve Whitehouse: Added various other default routines
2768 - * common to several socket families.
2769 - * Chris Evans : Call suser() check last on F_SETOWN
2770 - * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
2771 - * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
2772 - * Andi Kleen : Fix write_space callback
2773 - * Chris Evans : Security fixes - signedness again
2774 - * Arnaldo C. Melo : cleanups, use skb_queue_purge
2779 - * This program is free software; you can redistribute it and/or
2780 - * modify it under the terms of the GNU General Public License
2781 - * as published by the Free Software Foundation; either version
2782 - * 2 of the License, or (at your option) any later version.
2785 -#include <linux/capability.h>
2786 -#include <linux/errno.h>
2787 -#include <linux/types.h>
2788 -#include <linux/socket.h>
2789 -#include <linux/in.h>
2790 -#include <linux/kernel.h>
2791 -#include <linux/module.h>
2792 -#include <linux/proc_fs.h>
2793 -#include <linux/seq_file.h>
2794 -#include <linux/sched.h>
2795 -#include <linux/timer.h>
2796 -#include <linux/string.h>
2797 -#include <linux/sockios.h>
2798 -#include <linux/net.h>
2799 -#include <linux/mm.h>
2800 -#include <linux/slab.h>
2801 -#include <linux/interrupt.h>
2802 -#include <linux/poll.h>
2803 -#include <linux/tcp.h>
2804 -#include <linux/init.h>
2805 -#include <linux/highmem.h>
2807 -#include <asm/uaccess.h>
2808 -#include <asm/system.h>
2810 -#include <linux/netdevice.h>
2811 -#include <net/protocol.h>
2812 -#include <linux/skbuff.h>
2813 -#include <net/net_namespace.h>
2814 -#include <net/request_sock.h>
2815 -#include <net/sock.h>
2816 -#include <net/xfrm.h>
2817 -#include <linux/ipsec.h>
2819 -#include <linux/filter.h>
2820 -#include <linux/vs_socket.h>
2821 -#include <linux/vs_limit.h>
2822 -#include <linux/vs_context.h>
2823 -#include <linux/vs_network.h>
2826 -#include <net/tcp.h>
2830 - * Each address family might have different locking rules, so we have
2831 - * one slock key per address family:
2833 -static struct lock_class_key af_family_keys[AF_MAX];
2834 -static struct lock_class_key af_family_slock_keys[AF_MAX];
2836 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
2838 - * Make lock validator output more readable. (we pre-construct these
2839 - * strings build-time, so that runtime initialization of socket
2842 -static const char *af_family_key_strings[AF_MAX+1] = {
2843 - "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
2844 - "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
2845 - "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
2846 - "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
2847 - "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
2848 - "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
2849 - "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
2850 - "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
2851 - "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
2852 - "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
2853 - "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV" ,
2854 - "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
2856 -static const char *af_family_slock_key_strings[AF_MAX+1] = {
2857 - "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
2858 - "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
2859 - "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
2860 - "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
2861 - "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
2862 - "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
2863 - "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
2864 - "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" ,
2865 - "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
2866 - "slock-27" , "slock-28" , "slock-AF_CAN" ,
2867 - "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
2868 - "slock-AF_RXRPC" , "slock-AF_MAX"
2870 -static const char *af_family_clock_key_strings[AF_MAX+1] = {
2871 - "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
2872 - "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
2873 - "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
2874 - "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
2875 - "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
2876 - "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
2877 - "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
2878 - "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" ,
2879 - "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
2880 - "clock-27" , "clock-28" , "clock-AF_CAN" ,
2881 - "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
2882 - "clock-AF_RXRPC" , "clock-AF_MAX"
2887 - * sk_callback_lock locking rules are per-address-family,
2888 - * so split the lock classes by using a per-AF key:
2890 -static struct lock_class_key af_callback_keys[AF_MAX];
2892 -/* Take into consideration the size of the struct sk_buff overhead in the
2893 - * determination of these values, since that is non-constant across
2894 - * platforms. This makes socket queueing behavior and performance
2895 - * not depend upon such differences.
2897 -#define _SK_MEM_PACKETS 256
2898 -#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
2899 -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
2900 -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
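To make the sizing concrete: if, say, sizeof(struct sk_buff) were 256 bytes on a given build (an assumption; the real value varies by arch and config), _SK_MEM_OVERHEAD would be 256 + 256 = 512 bytes and SK_WMEM_MAX = SK_RMEM_MAX = 512 * 256 = 131072 bytes, so the default buffer limits scale with the per-skb overhead rather than being fixed constants.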
2902 -/* Run time adjustable parameters. */
2903 -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
2904 -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
2905 -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
2906 -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
2908 -/* Maximal space eaten by iovec or ancillary data plus some space */
2909 -int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
2911 -static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
2913 - struct timeval tv;
2915 - if (optlen < sizeof(tv))
2917 - if (copy_from_user(&tv, optval, sizeof(tv)))
2919 - if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
2922 - if (tv.tv_sec < 0) {
2923 - static int warned __read_mostly;
2926 - if (warned < 10 && net_ratelimit()) {
2928 - printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
2929 - "tries to set negative timeout\n",
2930 - current->comm, task_pid_nr(current));
2934 - *timeo_p = MAX_SCHEDULE_TIMEOUT;
2935 - if (tv.tv_sec == 0 && tv.tv_usec == 0)
2937 - if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
2938 - *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
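Worked example of the conversion above, assuming HZ = 1000: a user timeout of { tv_sec = 2, tv_usec = 500000 } yields *timeo_p = 2*1000 + (500000 + (1000000/1000 - 1)) / (1000000/1000) = 2000 + 500 = 2500 jiffies, i.e. 2.5 seconds, with the tv_usec part rounded up to the next tick.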
2942 -static void sock_warn_obsolete_bsdism(const char *name)
2944 - static int warned;
2945 - static char warncomm[TASK_COMM_LEN];
2946 - if (strcmp(warncomm, current->comm) && warned < 5) {
2947 - strcpy(warncomm, current->comm);
2948 - printk(KERN_WARNING "process `%s' is using obsolete "
2949 - "%s SO_BSDCOMPAT\n", warncomm, name);
2954 -static void sock_disable_timestamp(struct sock *sk)
2956 - if (sock_flag(sk, SOCK_TIMESTAMP)) {
2957 - sock_reset_flag(sk, SOCK_TIMESTAMP);
2958 - net_disable_timestamp();
2963 -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
2968 - /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
2969 - number of warnings when compiling with -W --ANK
2971 - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
2972 - (unsigned)sk->sk_rcvbuf) {
2977 - err = sk_filter(sk, skb);
2981 - if (!sk_rmem_schedule(sk, skb->truesize)) {
2987 - skb_set_owner_r(skb, sk);
2989 - /* Cache the SKB length before we tack it onto the receive
2990 - * queue. Once it is added it no longer belongs to us and
2991 - * may be freed by other threads of control pulling packets
2994 - skb_len = skb->len;
2996 - skb_queue_tail(&sk->sk_receive_queue, skb);
2998 - if (!sock_flag(sk, SOCK_DEAD))
2999 - sk->sk_data_ready(sk, skb_len);
3003 -EXPORT_SYMBOL(sock_queue_rcv_skb);
3005 -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
3007 - int rc = NET_RX_SUCCESS;
3009 - if (sk_filter(sk, skb))
3010 - goto discard_and_relse;
3015 - bh_lock_sock_nested(sk);
3018 - if (!sock_owned_by_user(sk)) {
3020 - * trylock + unlock semantics:
3022 - mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
3024 - rc = sk->sk_backlog_rcv(sk, skb);
3026 - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
3028 - sk_add_backlog(sk, skb);
3029 - bh_unlock_sock(sk);
3037 -EXPORT_SYMBOL(sk_receive_skb);
3039 -struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
3041 - struct dst_entry *dst = sk->sk_dst_cache;
3043 - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
3044 - sk->sk_dst_cache = NULL;
3051 -EXPORT_SYMBOL(__sk_dst_check);
3053 -struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
3055 - struct dst_entry *dst = sk_dst_get(sk);
3057 - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
3065 -EXPORT_SYMBOL(sk_dst_check);
3067 -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
3069 - int ret = -ENOPROTOOPT;
3070 -#ifdef CONFIG_NETDEVICES
3071 - struct net *net = sock_net(sk);
3072 - char devname[IFNAMSIZ];
3077 - if (!capable(CAP_NET_RAW))
3084 - /* Bind this socket to a particular device like "eth0",
3085 - * as specified in the passed interface name. If the
3086 - * name is "" or the option length is zero the socket
3089 - if (optlen > IFNAMSIZ - 1)
3090 - optlen = IFNAMSIZ - 1;
3091 - memset(devname, 0, sizeof(devname));
3094 - if (copy_from_user(devname, optval, optlen))
3097 - if (devname[0] == '\0') {
3100 - struct net_device *dev = dev_get_by_name(net, devname);
3106 - index = dev->ifindex;
3111 - sk->sk_bound_dev_if = index;
3123 -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
3126 - sock_set_flag(sk, bit);
3128 - sock_reset_flag(sk, bit);
3132 - * This is meant for all protocols to use and covers goings on
3133 - * at the socket level. Everything here is generic.
3136 -int sock_setsockopt(struct socket *sock, int level, int optname,
3137 - char __user *optval, int optlen)
3139 - struct sock *sk = sock->sk;
3142 - struct linger ling;
3146 - * Options without arguments
3149 - if (optname == SO_BINDTODEVICE)
3150 - return sock_bindtodevice(sk, optval, optlen);
3152 - if (optlen < sizeof(int))
3155 - if (get_user(val, (int __user *)optval))
3158 - valbool = val ? 1 : 0;
3164 - if (val && !capable(CAP_NET_ADMIN)) {
3167 - sock_valbool_flag(sk, SOCK_DBG, valbool);
3169 - case SO_REUSEADDR:
3170 - sk->sk_reuse = valbool;
3174 - ret = -ENOPROTOOPT;
3176 - case SO_DONTROUTE:
3177 - sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
3179 - case SO_BROADCAST:
3180 - sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
3183 - /* Don't error on this; BSD doesn't, and if you think
3184 - about it this is right. Otherwise apps have to
3185 - play 'guess the biggest size' games. RCVBUF/SNDBUF
3186 - are treated in BSD as hints */
3188 - if (val > sysctl_wmem_max)
3189 - val = sysctl_wmem_max;
3191 - sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
3192 - if ((val * 2) < SOCK_MIN_SNDBUF)
3193 - sk->sk_sndbuf = SOCK_MIN_SNDBUF;
3195 - sk->sk_sndbuf = val * 2;
3198 - * Wake up sending tasks if we
3199 - * upped the value.
3201 - sk->sk_write_space(sk);
3204 - case SO_SNDBUFFORCE:
3205 - if (!capable(CAP_NET_ADMIN)) {
3212 - /* Don't error on this; BSD doesn't, and if you think
3213 - about it this is right. Otherwise apps have to
3214 - play 'guess the biggest size' games. RCVBUF/SNDBUF
3215 - are treated in BSD as hints */
3217 - if (val > sysctl_rmem_max)
3218 - val = sysctl_rmem_max;
3220 - sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
3222 - * We double it on the way in to account for
3223 - * "struct sk_buff" etc. overhead. Applications
3224 - * assume that the SO_RCVBUF setting they make will
3225 - * allow that much actual data to be received on that
3228 - * Applications are unaware that "struct sk_buff" and
3229 - * other overheads allocate from the receive buffer
3230 - * during socket buffer allocation.
3232 - * And after considering the possible alternatives,
3233 - * returning the value we actually used in getsockopt
3234 - * is the most desirable behavior.
3236 - if ((val * 2) < SOCK_MIN_RCVBUF)
3237 - sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
3239 - sk->sk_rcvbuf = val * 2;
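Seen from userspace, the doubling described above looks like this (a sketch; the value read back also assumes the request is within sysctl_rmem_max):

    #include <stdio.h>
    #include <sys/socket.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int val = 65536, out = 0;
        socklen_t len = sizeof(out);

        setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
        getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
        printf("requested %d, effective %d\n", val, out);  /* 131072 */
        return 0;
    }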
3242 - case SO_RCVBUFFORCE:
3243 - if (!capable(CAP_NET_ADMIN)) {
3249 - case SO_KEEPALIVE:
3251 - if (sk->sk_protocol == IPPROTO_TCP)
3252 - tcp_set_keepalive(sk, valbool);
3254 - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
3257 - case SO_OOBINLINE:
3258 - sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
3262 - sk->sk_no_check = valbool;
3266 - if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
3267 - sk->sk_priority = val;
3273 - if (optlen < sizeof(ling)) {
3274 - ret = -EINVAL; /* 1003.1g */
3277 - if (copy_from_user(&ling,optval,sizeof(ling))) {
3281 - if (!ling.l_onoff)
3282 - sock_reset_flag(sk, SOCK_LINGER);
3284 -#if (BITS_PER_LONG == 32)
3285 - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
3286 - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
3289 - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
3290 - sock_set_flag(sk, SOCK_LINGER);
3294 - case SO_BSDCOMPAT:
3295 - sock_warn_obsolete_bsdism("setsockopt");
3300 - set_bit(SOCK_PASSCRED, &sock->flags);
3302 - clear_bit(SOCK_PASSCRED, &sock->flags);
3305 - case SO_TIMESTAMP:
3306 - case SO_TIMESTAMPNS:
3308 - if (optname == SO_TIMESTAMP)
3309 - sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
3311 - sock_set_flag(sk, SOCK_RCVTSTAMPNS);
3312 - sock_set_flag(sk, SOCK_RCVTSTAMP);
3313 - sock_enable_timestamp(sk);
3315 - sock_reset_flag(sk, SOCK_RCVTSTAMP);
3316 - sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
3323 - sk->sk_rcvlowat = val ? : 1;
3327 - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
3331 - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
3334 - case SO_ATTACH_FILTER:
3336 - if (optlen == sizeof(struct sock_fprog)) {
3337 - struct sock_fprog fprog;
3340 - if (copy_from_user(&fprog, optval, sizeof(fprog)))
3343 - ret = sk_attach_filter(&fprog, sk);
3347 - case SO_DETACH_FILTER:
3348 - ret = sk_detach_filter(sk);
3353 - set_bit(SOCK_PASSSEC, &sock->flags);
3355 - clear_bit(SOCK_PASSSEC, &sock->flags);
3358 - if (!capable(CAP_NET_ADMIN))
3361 - sk->sk_mark = val;
3365 - /* We implement the SO_SNDLOWAT etc to
3366 - not be settable (1003.1g 5.3) */
3368 - ret = -ENOPROTOOPT;
3376 -int sock_getsockopt(struct socket *sock, int level, int optname,
3377 - char __user *optval, int __user *optlen)
3379 - struct sock *sk = sock->sk;
3383 - struct linger ling;
3384 - struct timeval tm;
3387 - unsigned int lv = sizeof(int);
3390 - if (get_user(len, optlen))
3395 - memset(&v, 0, sizeof(v));
3399 - v.val = sock_flag(sk, SOCK_DBG);
3402 - case SO_DONTROUTE:
3403 - v.val = sock_flag(sk, SOCK_LOCALROUTE);
3406 - case SO_BROADCAST:
3407 - v.val = !!sock_flag(sk, SOCK_BROADCAST);
3411 - v.val = sk->sk_sndbuf;
3415 - v.val = sk->sk_rcvbuf;
3418 - case SO_REUSEADDR:
3419 - v.val = sk->sk_reuse;
3422 - case SO_KEEPALIVE:
3423 - v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
3427 - v.val = sk->sk_type;
3431 - v.val = -sock_error(sk);
3433 - v.val = xchg(&sk->sk_err_soft, 0);
3436 - case SO_OOBINLINE:
3437 - v.val = !!sock_flag(sk, SOCK_URGINLINE);
3441 - v.val = sk->sk_no_check;
3445 - v.val = sk->sk_priority;
3449 - lv = sizeof(v.ling);
3450 - v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
3451 - v.ling.l_linger = sk->sk_lingertime / HZ;
3454 - case SO_BSDCOMPAT:
3455 - sock_warn_obsolete_bsdism("getsockopt");
3458 - case SO_TIMESTAMP:
3459 - v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
3460 - !sock_flag(sk, SOCK_RCVTSTAMPNS);
3463 - case SO_TIMESTAMPNS:
3464 - v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
3468 - lv = sizeof(struct timeval);
3469 - if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
3473 - v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
3474 - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
3479 - lv = sizeof(struct timeval);
3480 - if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
3484 - v.tm.tv_sec = sk->sk_sndtimeo / HZ;
3485 - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
3490 - v.val = sk->sk_rcvlowat;
3498 - v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
3502 - if (len > sizeof(sk->sk_peercred))
3503 - len = sizeof(sk->sk_peercred);
3504 - if (copy_to_user(optval, &sk->sk_peercred, len))
3510 - char address[128];
3512 - if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
3516 - if (copy_to_user(optval, address, len))
3521 - /* Dubious BSD thing... Probably nobody even uses it, but
3522 - * the UNIX standard wants it for whatever reason... -DaveM
3524 - case SO_ACCEPTCONN:
3525 - v.val = sk->sk_state == TCP_LISTEN;
3529 - v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
3533 - return security_socket_getpeersec_stream(sock, optval, optlen, len);
3536 - v.val = sk->sk_mark;
3540 - return -ENOPROTOOPT;
3545 - if (copy_to_user(optval, &v, len))
3548 - if (put_user(len, optlen))
3554 - * Initialize an sk_lock.
3556 - * (We also register the sk_lock with the lock validator.)
3558 -static inline void sock_lock_init(struct sock *sk)
3560 - sock_lock_init_class_and_name(sk,
3561 - af_family_slock_key_strings[sk->sk_family],
3562 - af_family_slock_keys + sk->sk_family,
3563 - af_family_key_strings[sk->sk_family],
3564 - af_family_keys + sk->sk_family);
3567 -static void sock_copy(struct sock *nsk, const struct sock *osk)
3569 -#ifdef CONFIG_SECURITY_NETWORK
3570 - void *sptr = nsk->sk_security;
3573 - memcpy(nsk, osk, osk->sk_prot->obj_size);
3574 -#ifdef CONFIG_SECURITY_NETWORK
3575 - nsk->sk_security = sptr;
3576 - security_sk_clone(osk, nsk);
3580 -static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
3584 - struct kmem_cache *slab;
3586 - slab = prot->slab;
3588 - sk = kmem_cache_alloc(slab, priority);
3590 - sk = kmalloc(prot->obj_size, priority);
3593 - if (security_sk_alloc(sk, family, priority))
3596 - if (!try_module_get(prot->owner))
3597 - goto out_free_sec;
3605 - security_sk_free(sk);
3608 - kmem_cache_free(slab, sk);
3614 -static void sk_prot_free(struct proto *prot, struct sock *sk)
3616 - struct kmem_cache *slab;
3617 - struct module *owner;
3619 - owner = prot->owner;
3620 - slab = prot->slab;
3622 - security_sk_free(sk);
3624 - kmem_cache_free(slab, sk);
3627 - module_put(owner);
3631 - * sk_alloc - All socket objects are allocated here
3632 - * @net: the applicable net namespace
3633 - * @family: protocol family
3634 - * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
3635 - * @prot: struct proto associated with this new sock instance
3637 -struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
3638 - struct proto *prot)
3642 - sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
3644 - sk->sk_family = family;
3646 - * See comment in struct sock definition to understand
3647 - * why we need sk_prot_creator -acme
3649 - sk->sk_prot = sk->sk_prot_creator = prot;
3650 - sock_lock_init(sk);
3651 - sock_net_set(sk, get_net(net));
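Sketch of a protocol family's ->create() hook built on sk_alloc(); the proto object and handler are hypothetical, and error handling beyond allocation is trimmed.

    static struct proto my_proto;    /* hypothetical, registered elsewhere */

    static int my_create(struct net *net, struct socket *sock, int protocol)
    {
        struct sock *sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);

        if (!sk)
            return -ENOMEM;
        sock_init_data(sock, sk);    /* queues, locks, default callbacks */
        return 0;
    }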
3657 -void sk_free(struct sock *sk)
3659 - struct sk_filter *filter;
3661 - if (sk->sk_destruct)
3662 - sk->sk_destruct(sk);
3664 - filter = rcu_dereference(sk->sk_filter);
3666 - sk_filter_uncharge(sk, filter);
3667 - rcu_assign_pointer(sk->sk_filter, NULL);
3670 - sock_disable_timestamp(sk);
3672 - if (atomic_read(&sk->sk_omem_alloc))
3673 - printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
3674 - __func__, atomic_read(&sk->sk_omem_alloc));
3676 - put_net(sock_net(sk));
3678 - clr_vx_info(&sk->sk_vx_info);
3680 - clr_nx_info(&sk->sk_nx_info);
3682 - sk_prot_free(sk->sk_prot_creator, sk);
3686 - * The last sock_put should drop the reference to sk->sk_net. It has already
3687 - * been dropped in sk_change_net. Taking a reference to the stopping namespace
3688 - * is not an option.
3689 - * Take a reference to the socket to remove it from the hash _alive_, and after
3690 - * that destroy it in the context of init_net.
3692 -void sk_release_kernel(struct sock *sk)
3694 - if (sk == NULL || sk->sk_socket == NULL)
3698 - sock_release(sk->sk_socket);
3699 - release_net(sock_net(sk));
3700 - sock_net_set(sk, get_net(&init_net));
3703 -EXPORT_SYMBOL(sk_release_kernel);
3705 -struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
3707 - struct sock *newsk;
3709 - newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
3710 - if (newsk != NULL) {
3711 - struct sk_filter *filter;
3713 - sock_copy(newsk, sk);
3716 - get_net(sock_net(newsk));
3717 - sock_vx_init(newsk);
3718 - sock_nx_init(newsk);
3719 - sk_node_init(&newsk->sk_node);
3720 - sock_lock_init(newsk);
3721 - bh_lock_sock(newsk);
3722 - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
3724 - atomic_set(&newsk->sk_rmem_alloc, 0);
3725 - atomic_set(&newsk->sk_wmem_alloc, 0);
3726 - atomic_set(&newsk->sk_omem_alloc, 0);
3727 - skb_queue_head_init(&newsk->sk_receive_queue);
3728 - skb_queue_head_init(&newsk->sk_write_queue);
3729 -#ifdef CONFIG_NET_DMA
3730 - skb_queue_head_init(&newsk->sk_async_wait_queue);
3733 - rwlock_init(&newsk->sk_dst_lock);
3734 - rwlock_init(&newsk->sk_callback_lock);
3735 - lockdep_set_class_and_name(&newsk->sk_callback_lock,
3736 - af_callback_keys + newsk->sk_family,
3737 - af_family_clock_key_strings[newsk->sk_family]);
3739 - newsk->sk_dst_cache = NULL;
3740 - newsk->sk_wmem_queued = 0;
3741 - newsk->sk_forward_alloc = 0;
3742 - newsk->sk_send_head = NULL;
3743 - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
3745 - sock_reset_flag(newsk, SOCK_DONE);
3746 - skb_queue_head_init(&newsk->sk_error_queue);
3748 - filter = newsk->sk_filter;
3749 - if (filter != NULL)
3750 - sk_filter_charge(newsk, filter);
3752 - if (unlikely(xfrm_sk_clone_policy(newsk))) {
3753 - /* It is still raw copy of parent, so invalidate
3754 - * destructor and make plain sk_free() */
3755 - newsk->sk_destruct = NULL;
3761 - newsk->sk_err = 0;
3762 - newsk->sk_priority = 0;
3763 - atomic_set(&newsk->sk_refcnt, 2);
3765 - set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info);
3766 - newsk->sk_xid = sk->sk_xid;
3767 - vx_sock_inc(newsk);
3768 - set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info);
3769 - newsk->sk_nid = sk->sk_nid;
3772 - * Increment the counter in the same struct proto as the master
3773 - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
3774 - * is the same as sk->sk_prot->socks, as this field was copied
3777 - * This _changes_ the previous behaviour, where
3778 - * tcp_create_openreq_child always was incrementing the
3779 - * equivalent to tcp_prot->socks (inet_sock_nr), so this has
3780 - * to be taken into account in all callers. -acme
3782 - sk_refcnt_debug_inc(newsk);
3783 - sk_set_socket(newsk, NULL);
3784 - newsk->sk_sleep = NULL;
3786 - if (newsk->sk_prot->sockets_allocated)
3787 - atomic_inc(newsk->sk_prot->sockets_allocated);
3793 -EXPORT_SYMBOL_GPL(sk_clone);
3795 -void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
3797 - __sk_dst_set(sk, dst);
3798 - sk->sk_route_caps = dst->dev->features;
3799 - if (sk->sk_route_caps & NETIF_F_GSO)
3800 - sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
3801 - if (sk_can_gso(sk)) {
3802 - if (dst->header_len) {
3803 - sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
3805 - sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
3806 - sk->sk_gso_max_size = dst->dev->gso_max_size;
3810 -EXPORT_SYMBOL_GPL(sk_setup_caps);
3812 -void __init sk_init(void)
3814 - if (num_physpages <= 4096) {
3815 - sysctl_wmem_max = 32767;
3816 - sysctl_rmem_max = 32767;
3817 - sysctl_wmem_default = 32767;
3818 - sysctl_rmem_default = 32767;
3819 - } else if (num_physpages >= 131072) {
3820 - sysctl_wmem_max = 131071;
3821 - sysctl_rmem_max = 131071;
3826 - * Simple resource managers for sockets.
3831 - * Write buffer destructor automatically called from kfree_skb.
3833 -void sock_wfree(struct sk_buff *skb)
3835 - struct sock *sk = skb->sk;
3837 - /* In case it might be waiting for more memory. */
3838 - atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
3839 - if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
3840 - sk->sk_write_space(sk);
3845 - * Read buffer destructor automatically called from kfree_skb.
3847 -void sock_rfree(struct sk_buff *skb)
3849 - struct sock *sk = skb->sk;
3851 - atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
3852 - sk_mem_uncharge(skb->sk, skb->truesize);
3856 -int sock_i_uid(struct sock *sk)
3860 - read_lock(&sk->sk_callback_lock);
3861 - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
3862 - read_unlock(&sk->sk_callback_lock);
3866 -unsigned long sock_i_ino(struct sock *sk)
3868 - unsigned long ino;
3870 - read_lock(&sk->sk_callback_lock);
3871 - ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
3872 - read_unlock(&sk->sk_callback_lock);
3877 - * Allocate a skb from the socket's send buffer.
3879 -struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
3882 - if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
3883 - struct sk_buff * skb = alloc_skb(size, priority);
3885 - skb_set_owner_w(skb, sk);
3893 - * Allocate a skb from the socket's receive buffer.
3895 -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
3898 - if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
3899 - struct sk_buff *skb = alloc_skb(size, priority);
3901 - skb_set_owner_r(skb, sk);
3909 - * Allocate a memory block from the socket's option memory buffer.
3911 -void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
3913 - if ((unsigned)size <= sysctl_optmem_max &&
3914 - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
3916 - /* First do the add, to avoid the race if kmalloc
3919 - atomic_add(size, &sk->sk_omem_alloc);
3920 - mem = kmalloc(size, priority);
3923 - atomic_sub(size, &sk->sk_omem_alloc);
3929 - * Free an option memory block.
3931 -void sock_kfree_s(struct sock *sk, void *mem, int size)
3934 - atomic_sub(size, &sk->sk_omem_alloc);
3937 -/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
3938 - I think these locks should be removed for datagram sockets.
3940 -static long sock_wait_for_wmem(struct sock * sk, long timeo)
3942 - DEFINE_WAIT(wait);
3944 - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
3948 - if (signal_pending(current))
3950 - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
3951 - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
3952 - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
3954 - if (sk->sk_shutdown & SEND_SHUTDOWN)
3958 - timeo = schedule_timeout(timeo);
3960 - finish_wait(sk->sk_sleep, &wait);
3966 - * Generic send/receive buffer handlers
3969 -static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
3970 - unsigned long header_len,
3971 - unsigned long data_len,
3972 - int noblock, int *errcode)
3974 - struct sk_buff *skb;
3979 - gfp_mask = sk->sk_allocation;
3980 - if (gfp_mask & __GFP_WAIT)
3981 - gfp_mask |= __GFP_REPEAT;
3983 - timeo = sock_sndtimeo(sk, noblock);
3985 - err = sock_error(sk);
3990 - if (sk->sk_shutdown & SEND_SHUTDOWN)
3993 - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
3994 - skb = alloc_skb(header_len, gfp_mask);
3999 - /* No pages, we're done... */
4003 - npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
4004 - skb->truesize += data_len;
4005 - skb_shinfo(skb)->nr_frags = npages;
4006 - for (i = 0; i < npages; i++) {
4007 - struct page *page;
4010 - page = alloc_pages(sk->sk_allocation, 0);
4013 - skb_shinfo(skb)->nr_frags = i;
4018 - frag = &skb_shinfo(skb)->frags[i];
4019 - frag->page = page;
4020 - frag->page_offset = 0;
4021 - frag->size = (data_len >= PAGE_SIZE ?
4024 - data_len -= PAGE_SIZE;
4027 - /* Full success... */
4033 - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
4034 - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
4038 - if (signal_pending(current))
4040 - timeo = sock_wait_for_wmem(sk, timeo);
4043 - skb_set_owner_w(skb, sk);
4047 - err = sock_intr_errno(timeo);
4053 -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
4054 - int noblock, int *errcode)
4056 - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
4059 -static void __lock_sock(struct sock *sk)
4061 - DEFINE_WAIT(wait);
4064 - prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
4065 - TASK_UNINTERRUPTIBLE);
4066 - spin_unlock_bh(&sk->sk_lock.slock);
4068 - spin_lock_bh(&sk->sk_lock.slock);
4069 - if (!sock_owned_by_user(sk))
4072 - finish_wait(&sk->sk_lock.wq, &wait);
4075 -static void __release_sock(struct sock *sk)
4077 - struct sk_buff *skb = sk->sk_backlog.head;
4080 - sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
4081 - bh_unlock_sock(sk);
4084 - struct sk_buff *next = skb->next;
4087 - sk->sk_backlog_rcv(sk, skb);
4090 - * We are in process context here with softirqs
4091 - * disabled, use cond_resched_softirq() to preempt.
4092 - * This is safe to do because we've taken the backlog
4095 - cond_resched_softirq();
4098 - } while (skb != NULL);
4101 - } while ((skb = sk->sk_backlog.head) != NULL);
4105 - * sk_wait_data - wait for data to arrive at sk_receive_queue
4106 - * @sk: sock to wait on
4107 - * @timeo: for how long
4109 - * Now the socket state, including sk->sk_err, is changed only under the lock,
4110 - * hence we may omit checks after joining the wait queue.
4111 - * We check the receive queue before schedule() only as an optimization;
4112 - * it is very likely that release_sock() added new data.
4114 -int sk_wait_data(struct sock *sk, long *timeo)
4117 - DEFINE_WAIT(wait);
4119 - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
4120 - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
4121 - rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
4122 - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
4123 - finish_wait(sk->sk_sleep, &wait);
4127 -EXPORT_SYMBOL(sk_wait_data);
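The canonical blocking-receive shape built on sk_wait_data(), as a sketch: the socket is assumed locked by the caller, and signal/error handling is trimmed.

    static struct sk_buff *wait_for_packet(struct sock *sk, int noblock)
    {
        long timeo = sock_rcvtimeo(sk, noblock);
        struct sk_buff *skb;

        while (!(skb = skb_dequeue(&sk->sk_receive_queue))) {
            if (!timeo || !sk_wait_data(sk, &timeo))
                return NULL;    /* would block, or timed out */
        }
        return skb;
    }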
4130 - * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
4132 - * @size: memory size to allocate
4133 - * @kind: allocation type
4135 - * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
4136 - * rmem allocation. This function assumes that protocols which have
4137 - * memory_pressure use sk_wmem_queued as write buffer accounting.
4139 -int __sk_mem_schedule(struct sock *sk, int size, int kind)
4141 - struct proto *prot = sk->sk_prot;
4142 - int amt = sk_mem_pages(size);
4145 - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
4146 - allocated = atomic_add_return(amt, prot->memory_allocated);
4148 - /* Under limit. */
4149 - if (allocated <= prot->sysctl_mem[0]) {
4150 - if (prot->memory_pressure && *prot->memory_pressure)
4151 - *prot->memory_pressure = 0;
4155 - /* Under pressure. */
4156 - if (allocated > prot->sysctl_mem[1])
4157 - if (prot->enter_memory_pressure)
4158 - prot->enter_memory_pressure(sk);
4160 - /* Over hard limit. */
4161 - if (allocated > prot->sysctl_mem[2])
4162 - goto suppress_allocation;
4164 - /* guarantee minimum buffer size under pressure */
4165 - if (kind == SK_MEM_RECV) {
4166 - if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
4168 - } else { /* SK_MEM_SEND */
4169 - if (sk->sk_type == SOCK_STREAM) {
4170 - if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
4172 - } else if (atomic_read(&sk->sk_wmem_alloc) <
4173 - prot->sysctl_wmem[0])
4177 - if (prot->memory_pressure) {
4178 - if (!*prot->memory_pressure ||
4179 - prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
4180 - sk_mem_pages(sk->sk_wmem_queued +
4181 - atomic_read(&sk->sk_rmem_alloc) +
4182 - sk->sk_forward_alloc))
4186 -suppress_allocation:
4188 - if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
4189 - sk_stream_moderate_sndbuf(sk);
4191 - /* Fail only if socket is _under_ its sndbuf.
4192 - * In this case we cannot block, so we have to fail.
4194 - if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
4198 - /* Alas. Undo changes. */
4199 - sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
4200 - atomic_sub(amt, prot->memory_allocated);
4204 -EXPORT_SYMBOL(__sk_mem_schedule);
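Concretely, with SK_MEM_QUANTUM equal to PAGE_SIZE = 4096 (an assumption for the example): charging 6000 bytes gives amt = sk_mem_pages(6000) = 2, so sk_forward_alloc grows by 8192 and prot->memory_allocated by 2 quanta. The 2192 bytes not yet consumed stay in sk_forward_alloc to satisfy later charges, and __sk_mem_reclaim() below returns only whole quanta, keeping the sub-quantum remainder in sk_forward_alloc.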
4207 - * __sk_mem_reclaim - reclaim memory_allocated
4210 -void __sk_mem_reclaim(struct sock *sk)
4212 - struct proto *prot = sk->sk_prot;
4214 - atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
4215 - prot->memory_allocated);
4216 - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
4218 - if (prot->memory_pressure && *prot->memory_pressure &&
4219 - (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
4220 - *prot->memory_pressure = 0;
4223 -EXPORT_SYMBOL(__sk_mem_reclaim);
4227 - * Set of default routines for initialising struct proto_ops when
4228 - * the protocol does not support a particular function. In certain
4229 - * cases where it makes no sense for a protocol to have a "do nothing"
4230 - * function, some default processing is provided.
4233 -int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
4235 - return -EOPNOTSUPP;
4238 -int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
4239 - int len, int flags)
4241 - return -EOPNOTSUPP;
4244 -int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
4246 - return -EOPNOTSUPP;
4249 -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
4251 - return -EOPNOTSUPP;
4254 -int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
4255 - int *len, int peer)
4257 - return -EOPNOTSUPP;
4260 -unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
4265 -int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
4267 - return -EOPNOTSUPP;
4270 -int sock_no_listen(struct socket *sock, int backlog)
4272 - return -EOPNOTSUPP;
4275 -int sock_no_shutdown(struct socket *sock, int how)
4277 - return -EOPNOTSUPP;
4280 -int sock_no_setsockopt(struct socket *sock, int level, int optname,
4281 - char __user *optval, int optlen)
4283 - return -EOPNOTSUPP;
4286 -int sock_no_getsockopt(struct socket *sock, int level, int optname,
4287 - char __user *optval, int __user *optlen)
4289 - return -EOPNOTSUPP;
4292 -int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
4295 - return -EOPNOTSUPP;
4298 -int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
4299 - size_t len, int flags)
4301 - return -EOPNOTSUPP;
4304 -int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
4306 - /* Mirror missing mmap method error code */
4310 -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
4313 - struct msghdr msg = {.msg_flags = flags};
4315 - char *kaddr = kmap(page);
4316 - iov.iov_base = kaddr + offset;
4317 - iov.iov_len = size;
4318 - res = kernel_sendmsg(sock, &msg, &iov, 1, size);
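How a protocol typically consumes these stubs: every proto_ops slot it does not implement is filled with the matching sock_no_*() entry. The family choice and the my_*() handlers here are hypothetical, and unrelated slots are trimmed.

    static const struct proto_ops my_dgram_ops = {
        .family     = PF_INET,
        .owner      = THIS_MODULE,
        .bind       = my_bind,       /* hypothetical */
        .sendmsg    = my_sendmsg,    /* hypothetical */
        .recvmsg    = my_recvmsg,    /* hypothetical */
        .connect    = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept     = sock_no_accept,
        .listen     = sock_no_listen,
        .shutdown   = sock_no_shutdown,
        .mmap       = sock_no_mmap,
        .sendpage   = sock_no_sendpage,
    };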
4324 - * Default Socket Callbacks
4327 -static void sock_def_wakeup(struct sock *sk)
4329 - read_lock(&sk->sk_callback_lock);
4330 - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
4331 - wake_up_interruptible_all(sk->sk_sleep);
4332 - read_unlock(&sk->sk_callback_lock);
4335 -static void sock_def_error_report(struct sock *sk)
4337 - read_lock(&sk->sk_callback_lock);
4338 - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
4339 - wake_up_interruptible(sk->sk_sleep);
4340 - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
4341 - read_unlock(&sk->sk_callback_lock);
4344 -static void sock_def_readable(struct sock *sk, int len)
4346 - read_lock(&sk->sk_callback_lock);
4347 - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
4348 - wake_up_interruptible_sync(sk->sk_sleep);
4349 - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
4350 - read_unlock(&sk->sk_callback_lock);
4353 -static void sock_def_write_space(struct sock *sk)
4355 - read_lock(&sk->sk_callback_lock);
4357 - /* Do not wake up a writer until he can make "significant"
4358 - * progress. --DaveM
4360 - if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
4361 - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
4362 - wake_up_interruptible_sync(sk->sk_sleep);
4364 - /* Should agree with poll, otherwise some programs break */
4365 - if (sock_writeable(sk))
4366 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
4369 - read_unlock(&sk->sk_callback_lock);
4372 -static void sock_def_destruct(struct sock *sk)
4374 - kfree(sk->sk_protinfo);
4377 -void sk_send_sigurg(struct sock *sk)
4379 - if (sk->sk_socket && sk->sk_socket->file)
4380 - if (send_sigurg(&sk->sk_socket->file->f_owner))
4381 - sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
4384 -void sk_reset_timer(struct sock *sk, struct timer_list* timer,
4385 - unsigned long expires)
4387 - if (!mod_timer(timer, expires))
4391 -EXPORT_SYMBOL(sk_reset_timer);
4393 -void sk_stop_timer(struct sock *sk, struct timer_list* timer)
4395 - if (timer_pending(timer) && del_timer(timer))
4399 -EXPORT_SYMBOL(sk_stop_timer);
4401 -void sock_init_data(struct socket *sock, struct sock *sk)
4403 - skb_queue_head_init(&sk->sk_receive_queue);
4404 - skb_queue_head_init(&sk->sk_write_queue);
4405 - skb_queue_head_init(&sk->sk_error_queue);
4406 -#ifdef CONFIG_NET_DMA
4407 - skb_queue_head_init(&sk->sk_async_wait_queue);
4410 - sk->sk_send_head = NULL;
4412 - init_timer(&sk->sk_timer);
4414 - sk->sk_allocation = GFP_KERNEL;
4415 - sk->sk_rcvbuf = sysctl_rmem_default;
4416 - sk->sk_sndbuf = sysctl_wmem_default;
4417 - sk->sk_state = TCP_CLOSE;
4418 - sk_set_socket(sk, sock);
4420 - sock_set_flag(sk, SOCK_ZAPPED);
4423 - sk->sk_type = sock->type;
4424 - sk->sk_sleep = &sock->wait;
4427 - sk->sk_sleep = NULL;
4429 - rwlock_init(&sk->sk_dst_lock);
4430 - rwlock_init(&sk->sk_callback_lock);
4431 - lockdep_set_class_and_name(&sk->sk_callback_lock,
4432 - af_callback_keys + sk->sk_family,
4433 - af_family_clock_key_strings[sk->sk_family]);
4435 - sk->sk_state_change = sock_def_wakeup;
4436 - sk->sk_data_ready = sock_def_readable;
4437 - sk->sk_write_space = sock_def_write_space;
4438 - sk->sk_error_report = sock_def_error_report;
4439 - sk->sk_destruct = sock_def_destruct;
4441 - sk->sk_sndmsg_page = NULL;
4442 - sk->sk_sndmsg_off = 0;
4444 - sk->sk_peercred.pid = 0;
4445 - sk->sk_peercred.uid = -1;
4446 - sk->sk_peercred.gid = -1;
4447 - sk->sk_write_pending = 0;
4448 - sk->sk_rcvlowat = 1;
4449 - sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
4450 - sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
4452 - sk->sk_stamp = ktime_set(-1L, 0);
4454 - set_vx_info(&sk->sk_vx_info, current->vx_info);
4455 - sk->sk_xid = vx_current_xid();
4457 - set_nx_info(&sk->sk_nx_info, current->nx_info);
4458 - sk->sk_nid = nx_current_nid();
4459 - atomic_set(&sk->sk_refcnt, 1);
4460 - atomic_set(&sk->sk_drops, 0);
4463 -void lock_sock_nested(struct sock *sk, int subclass)
4466 - spin_lock_bh(&sk->sk_lock.slock);
4467 - if (sk->sk_lock.owned)
4469 - sk->sk_lock.owned = 1;
4470 - spin_unlock(&sk->sk_lock.slock);
4472 - * The sk_lock has mutex_lock() semantics here:
4474 - mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
4475 - local_bh_enable();
4478 -EXPORT_SYMBOL(lock_sock_nested);
4480 -void release_sock(struct sock *sk)
4483 - * The sk_lock has mutex_unlock() semantics:
4485 - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
4487 - spin_lock_bh(&sk->sk_lock.slock);
4488 - if (sk->sk_backlog.tail)
4489 - __release_sock(sk);
4490 - sk->sk_lock.owned = 0;
4491 - if (waitqueue_active(&sk->sk_lock.wq))
4492 - wake_up(&sk->sk_lock.wq);
4493 - spin_unlock_bh(&sk->sk_lock.slock);
4495 -EXPORT_SYMBOL(release_sock);
4497 -int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4499 - struct timeval tv;
4500 - if (!sock_flag(sk, SOCK_TIMESTAMP))
4501 - sock_enable_timestamp(sk);
4502 - tv = ktime_to_timeval(sk->sk_stamp);
4503 - if (tv.tv_sec == -1)
4505 - if (tv.tv_sec == 0) {
4506 - sk->sk_stamp = ktime_get_real();
4507 - tv = ktime_to_timeval(sk->sk_stamp);
4509 - return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4511 -EXPORT_SYMBOL(sock_get_timestamp);
4513 -int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
4515 - struct timespec ts;
4516 - if (!sock_flag(sk, SOCK_TIMESTAMP))
4517 - sock_enable_timestamp(sk);
4518 - ts = ktime_to_timespec(sk->sk_stamp);
4519 - if (ts.tv_sec == -1)
4521 - if (ts.tv_sec == 0) {
4522 - sk->sk_stamp = ktime_get_real();
4523 - ts = ktime_to_timespec(sk->sk_stamp);
4525 - return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
4527 -EXPORT_SYMBOL(sock_get_timestampns);
4529 -void sock_enable_timestamp(struct sock *sk)
4531 - if (!sock_flag(sk, SOCK_TIMESTAMP)) {
4532 - sock_set_flag(sk, SOCK_TIMESTAMP);
4533 - net_enable_timestamp();
4538 - * Get a socket option on an socket.
4540 - * FIX: POSIX 1003.1g is very ambiguous here. It states that
4541 - * asynchronous errors should be reported by getsockopt. We assume
4542 - * this means if you specify SO_ERROR (otherwise what's the point of it).
4544 -int sock_common_getsockopt(struct socket *sock, int level, int optname,
4545 - char __user *optval, int __user *optlen)
4547 - struct sock *sk = sock->sk;
4549 - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
4552 -EXPORT_SYMBOL(sock_common_getsockopt);
4554 -#ifdef CONFIG_COMPAT
4555 -int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
4556 - char __user *optval, int __user *optlen)
4558 - struct sock *sk = sock->sk;
4560 - if (sk->sk_prot->compat_getsockopt != NULL)
4561 - return sk->sk_prot->compat_getsockopt(sk, level, optname,
4563 - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
4565 -EXPORT_SYMBOL(compat_sock_common_getsockopt);
4568 -int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
4569 - struct msghdr *msg, size_t size, int flags)
4571 - struct sock *sk = sock->sk;
4575 - err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
4576 - flags & ~MSG_DONTWAIT, &addr_len);
4578 - msg->msg_namelen = addr_len;
4582 -EXPORT_SYMBOL(sock_common_recvmsg);
4585 - * Set socket options on an inet socket.
4587 -int sock_common_setsockopt(struct socket *sock, int level, int optname,
4588 - char __user *optval, int optlen)
4590 - struct sock *sk = sock->sk;
4592 - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
4595 -EXPORT_SYMBOL(sock_common_setsockopt);
4597 -#ifdef CONFIG_COMPAT
4598 -int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
4599 - char __user *optval, int optlen)
4601 - struct sock *sk = sock->sk;
4603 - if (sk->sk_prot->compat_setsockopt != NULL)
4604 - return sk->sk_prot->compat_setsockopt(sk, level, optname,
4606 - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
4608 -EXPORT_SYMBOL(compat_sock_common_setsockopt);
4611 -void sk_common_release(struct sock *sk)
4613 - if (sk->sk_prot->destroy)
4614 - sk->sk_prot->destroy(sk);
4617 - * Observation: when sock_common_release is called, processes have
4618 - * no access to the socket any more, but the network still does.
4619 - * Step one: detach it from networking:
4621 - * A. Remove it from the hash tables.
4624 - sk->sk_prot->unhash(sk);
4627 - * At this point the socket cannot receive new packets, but it is possible
4628 - * that some packets are still in flight, because some CPU running the
4629 - * receiver did the hash-table lookup before we unhashed the socket. They
4630 - * will reach the receive queue and be purged by the socket destructor.
4632 - * Also, we still have packets pending on the receive queue and, probably,
4633 - * our own packets waiting in device queues. sock_destroy will drain the
4634 - * receive queue, but transmitted packets will delay socket destruction
4635 - * until the last reference is released.
4640 - xfrm_sk_free_policy(sk);
4642 - sk_refcnt_debug_release(sk);
4646 -EXPORT_SYMBOL(sk_common_release);
4648 -static DEFINE_RWLOCK(proto_list_lock);
4649 -static LIST_HEAD(proto_list);
4651 -#ifdef CONFIG_PROC_FS
4652 -#define PROTO_INUSE_NR 64 /* should be enough for the first time */
4653 -struct prot_inuse {
4654 - int val[PROTO_INUSE_NR];
4657 -static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4659 -#ifdef CONFIG_NET_NS
4660 -void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
4662 - int cpu = smp_processor_id();
4663 - per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
4665 -EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
4667 -int sock_prot_inuse_get(struct net *net, struct proto *prot)
4669 - int cpu, idx = prot->inuse_idx;
4672 - for_each_possible_cpu(cpu)
4673 - res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
4675 - return res >= 0 ? res : 0;
4677 -EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4679 -static int sock_inuse_init_net(struct net *net)
4681 - net->core.inuse = alloc_percpu(struct prot_inuse);
4682 - return net->core.inuse ? 0 : -ENOMEM;
4685 -static void sock_inuse_exit_net(struct net *net)
4687 - free_percpu(net->core.inuse);
4690 -static struct pernet_operations net_inuse_ops = {
4691 - .init = sock_inuse_init_net,
4692 - .exit = sock_inuse_exit_net,
4695 -static __init int net_inuse_init(void)
4697 - if (register_pernet_subsys(&net_inuse_ops))
4698 - panic("Cannot initialize net inuse counters");
4703 -core_initcall(net_inuse_init);
4705 -static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
4707 -void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
4709 - __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
4711 -EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
4713 -int sock_prot_inuse_get(struct net *net, struct proto *prot)
4715 - int cpu, idx = prot->inuse_idx;
4718 - for_each_possible_cpu(cpu)
4719 - res += per_cpu(prot_inuse, cpu).val[idx];
4721 - return res >= 0 ? res : 0;
4723 -EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4726 -static void assign_proto_idx(struct proto *prot)
4728 - prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4730 - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
4731 - printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
4735 - set_bit(prot->inuse_idx, proto_inuse_idx);
4738 -static void release_proto_idx(struct proto *prot)
4740 - if (prot->inuse_idx != PROTO_INUSE_NR - 1)
4741 - clear_bit(prot->inuse_idx, proto_inuse_idx);
4744 -static inline void assign_proto_idx(struct proto *prot)
4748 -static inline void release_proto_idx(struct proto *prot)
4753 -int proto_register(struct proto *prot, int alloc_slab)
4755 - char *request_sock_slab_name = NULL;
4756 - char *timewait_sock_slab_name;
4759 - prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
4760 - SLAB_HWCACHE_ALIGN, NULL);
4762 - if (prot->slab == NULL) {
4763 - printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
4768 - if (prot->rsk_prot != NULL) {
4769 - static const char mask[] = "request_sock_%s";
4771 - request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
4772 - if (request_sock_slab_name == NULL)
4773 - goto out_free_sock_slab;
4775 - sprintf(request_sock_slab_name, mask, prot->name);
4776 - prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
4777 - prot->rsk_prot->obj_size, 0,
4778 - SLAB_HWCACHE_ALIGN, NULL);
4780 - if (prot->rsk_prot->slab == NULL) {
4781 - printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
4783 - goto out_free_request_sock_slab_name;
4787 - if (prot->twsk_prot != NULL) {
4788 - static const char mask[] = "tw_sock_%s";
4790 - timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
4792 - if (timewait_sock_slab_name == NULL)
4793 - goto out_free_request_sock_slab;
4795 - sprintf(timewait_sock_slab_name, mask, prot->name);
4796 - prot->twsk_prot->twsk_slab =
4797 - kmem_cache_create(timewait_sock_slab_name,
4798 - prot->twsk_prot->twsk_obj_size,
4799 - 0, SLAB_HWCACHE_ALIGN,
4801 - if (prot->twsk_prot->twsk_slab == NULL)
4802 - goto out_free_timewait_sock_slab_name;
4806 - write_lock(&proto_list_lock);
4807 - list_add(&prot->node, &proto_list);
4808 - assign_proto_idx(prot);
4809 - write_unlock(&proto_list_lock);
4812 -out_free_timewait_sock_slab_name:
4813 - kfree(timewait_sock_slab_name);
4814 -out_free_request_sock_slab:
4815 - if (prot->rsk_prot && prot->rsk_prot->slab) {
4816 - kmem_cache_destroy(prot->rsk_prot->slab);
4817 - prot->rsk_prot->slab = NULL;
4819 -out_free_request_sock_slab_name:
4820 - kfree(request_sock_slab_name);
4821 -out_free_sock_slab:
4822 - kmem_cache_destroy(prot->slab);
4823 - prot->slab = NULL;
4828 -EXPORT_SYMBOL(proto_register);
4830 -void proto_unregister(struct proto *prot)
4832 - write_lock(&proto_list_lock);
4833 - release_proto_idx(prot);
4834 - list_del(&prot->node);
4835 - write_unlock(&proto_list_lock);
4837 - if (prot->slab != NULL) {
4838 - kmem_cache_destroy(prot->slab);
4839 - prot->slab = NULL;
4842 - if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
4843 - const char *name = kmem_cache_name(prot->rsk_prot->slab);
4845 - kmem_cache_destroy(prot->rsk_prot->slab);
4847 - prot->rsk_prot->slab = NULL;
4850 - if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
4851 - const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
4853 - kmem_cache_destroy(prot->twsk_prot->twsk_slab);
4855 - prot->twsk_prot->twsk_slab = NULL;
4859 -EXPORT_SYMBOL(proto_unregister);
4861 -#ifdef CONFIG_PROC_FS
4862 -static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4863 - __acquires(proto_list_lock)
4865 - read_lock(&proto_list_lock);
4866 - return seq_list_start_head(&proto_list, *pos);
4869 -static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4871 - return seq_list_next(v, &proto_list, pos);
4874 -static void proto_seq_stop(struct seq_file *seq, void *v)
4875 - __releases(proto_list_lock)
4877 - read_unlock(&proto_list_lock);
4880 -static char proto_method_implemented(const void *method)
4882 - return method == NULL ? 'n' : 'y';
4885 -static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4887 - seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
4888 - "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4891 - proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
4892 - proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
4893 - proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
4894 - proto->max_header,
4895 - proto->slab == NULL ? "no" : "yes",
4896 - module_name(proto->owner),
4897 - proto_method_implemented(proto->close),
4898 - proto_method_implemented(proto->connect),
4899 - proto_method_implemented(proto->disconnect),
4900 - proto_method_implemented(proto->accept),
4901 - proto_method_implemented(proto->ioctl),
4902 - proto_method_implemented(proto->init),
4903 - proto_method_implemented(proto->destroy),
4904 - proto_method_implemented(proto->shutdown),
4905 - proto_method_implemented(proto->setsockopt),
4906 - proto_method_implemented(proto->getsockopt),
4907 - proto_method_implemented(proto->sendmsg),
4908 - proto_method_implemented(proto->recvmsg),
4909 - proto_method_implemented(proto->sendpage),
4910 - proto_method_implemented(proto->bind),
4911 - proto_method_implemented(proto->backlog_rcv),
4912 - proto_method_implemented(proto->hash),
4913 - proto_method_implemented(proto->unhash),
4914 - proto_method_implemented(proto->get_port),
4915 - proto_method_implemented(proto->enter_memory_pressure));
4918 -static int proto_seq_show(struct seq_file *seq, void *v)
4920 - if (v == &proto_list)
4921 - seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4930 - "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
4932 - proto_seq_printf(seq, list_entry(v, struct proto, node));
4936 -static const struct seq_operations proto_seq_ops = {
4937 - .start = proto_seq_start,
4938 - .next = proto_seq_next,
4939 - .stop = proto_seq_stop,
4940 - .show = proto_seq_show,
4943 -static int proto_seq_open(struct inode *inode, struct file *file)
4945 - return seq_open(file, &proto_seq_ops);
4948 -static const struct file_operations proto_seq_fops = {
4949 - .owner = THIS_MODULE,
4950 - .open = proto_seq_open,
4952 - .llseek = seq_lseek,
4953 - .release = seq_release,
4956 -static int __init proto_init(void)
4958 - /* register /proc/net/protocols */
4959 - return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
4962 -subsys_initcall(proto_init);
4964 -#endif /* PROC_FS */
4966 -EXPORT_SYMBOL(sk_alloc);
4967 -EXPORT_SYMBOL(sk_free);
4968 -EXPORT_SYMBOL(sk_send_sigurg);
4969 -EXPORT_SYMBOL(sock_alloc_send_skb);
4970 -EXPORT_SYMBOL(sock_init_data);
4971 -EXPORT_SYMBOL(sock_kfree_s);
4972 -EXPORT_SYMBOL(sock_kmalloc);
4973 -EXPORT_SYMBOL(sock_no_accept);
4974 -EXPORT_SYMBOL(sock_no_bind);
4975 -EXPORT_SYMBOL(sock_no_connect);
4976 -EXPORT_SYMBOL(sock_no_getname);
4977 -EXPORT_SYMBOL(sock_no_getsockopt);
4978 -EXPORT_SYMBOL(sock_no_ioctl);
4979 -EXPORT_SYMBOL(sock_no_listen);
4980 -EXPORT_SYMBOL(sock_no_mmap);
4981 -EXPORT_SYMBOL(sock_no_poll);
4982 -EXPORT_SYMBOL(sock_no_recvmsg);
4983 -EXPORT_SYMBOL(sock_no_sendmsg);
4984 -EXPORT_SYMBOL(sock_no_sendpage);
4985 -EXPORT_SYMBOL(sock_no_setsockopt);
4986 -EXPORT_SYMBOL(sock_no_shutdown);
4987 -EXPORT_SYMBOL(sock_no_socketpair);
4988 -EXPORT_SYMBOL(sock_rfree);
4989 -EXPORT_SYMBOL(sock_setsockopt);
4990 -EXPORT_SYMBOL(sock_wfree);
4991 -EXPORT_SYMBOL(sock_wmalloc);
4992 -EXPORT_SYMBOL(sock_i_uid);
4993 -EXPORT_SYMBOL(sock_i_ino);
4994 -EXPORT_SYMBOL(sysctl_optmem_max);
4995 diff -Nurb linux-2.6.27-524/net/ipv4/udp.c.orig linux-2.6.27-525/net/ipv4/udp.c.orig
4996 --- linux-2.6.27-524/net/ipv4/udp.c.orig 2009-12-04 16:03:48.000000000 -0500
4997 +++ linux-2.6.27-525/net/ipv4/udp.c.orig 1969-12-31 19:00:00.000000000 -0500
5000 - * INET An implementation of the TCP/IP protocol suite for the LINUX
5001 - * operating system. INET is implemented using the BSD Socket
5002 - * interface as the means of communication with the user level.
5004 - * The User Datagram Protocol (UDP).
5006 - * Authors: Ross Biro
5007 - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
5008 - * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
5009 - * Alan Cox, <Alan.Cox@linux.org>
5010 - * Hirokazu Takahashi, <taka@valinux.co.jp>
5013 - * Alan Cox : verify_area() calls
5014 - * Alan Cox : stopped close while in use off icmp
5015 - * messages. Not a fix but a botch that
5016 - * for udp at least is 'valid'.
5017 - * Alan Cox : Fixed icmp handling properly
5018 - * Alan Cox : Correct error for oversized datagrams
5019 - * Alan Cox : Tidied select() semantics.
5020 - * Alan Cox : udp_err() fixed properly, also now
5021 - * select and read wake correctly on errors
5022 - * Alan Cox : udp_send verify_area moved to avoid mem leak
5023 - * Alan Cox : UDP can count its memory
5024 - * Alan Cox : send to an unknown connection causes
5025 - * an ECONNREFUSED off the icmp, but
5027 - * Alan Cox : Switched to new sk_buff handlers. No more backlog!
5028 - * Alan Cox : Using generic datagram code. Even smaller and the PEEK
5029 - * bug no longer crashes it.
5030 - * Fred Van Kempen : Net2e support for sk->broadcast.
5031 - * Alan Cox : Uses skb_free_datagram
5032 - * Alan Cox : Added get/set sockopt support.
5033 - * Alan Cox : Broadcasting without option set returns EACCES.
5034 - * Alan Cox : No wakeup calls. Instead we now use the callbacks.
5035 - * Alan Cox : Use ip_tos and ip_ttl
5036 - * Alan Cox : SNMP Mibs
5037 - * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
5038 - * Matt Dillon : UDP length checks.
5039 - * Alan Cox : Smarter af_inet used properly.
5040 - * Alan Cox : Use new kernel side addressing.
5041 - * Alan Cox : Incorrect return on truncated datagram receive.
5042 - * Arnt Gulbrandsen : New udp_send and stuff
5043 - * Alan Cox : Cache last socket
5044 - * Alan Cox : Route cache
5045 - * Jon Peatfield : Minor efficiency fix to sendto().
5046 - * Mike Shaver : RFC1122 checks.
5047 - * Alan Cox : Nonblocking error fix.
5048 - * Willy Konynenberg : Transparent proxying support.
5049 - * Mike McLagan : Routing by source
5050 - * David S. Miller : New socket lookup architecture.
5051 - * Last socket cache retained as it
5052 - * does have a high hit rate.
5053 - * Olaf Kirch : Don't linearise iovec on sendmsg.
5054 - * Andi Kleen : Some cleanups, cache destination entry
5056 - * Vitaly E. Lavrov : Transparent proxy revived after year coma.
5057 - * Melvin Smith : Check msg_name not msg_namelen in sendto(),
5058 - * return ENOTCONN for unconnected sockets (POSIX)
5059 - * Janos Farkas : don't deliver multi/broadcasts to a different
5060 - * bound-to-device socket
5061 - * Hirokazu Takahashi : HW checksumming for outgoing UDP
5063 - * Hirokazu Takahashi : sendfile() on UDP works now.
5064 - * Arnaldo C. Melo : convert /proc/net/udp to seq_file
5065 - * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
5066 - * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
5067 - * a single port at the same time.
5068 - *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
5069 - * James Chapman : Add L2TP encapsulation type.
5072 - * This program is free software; you can redistribute it and/or
5073 - * modify it under the terms of the GNU General Public License
5074 - * as published by the Free Software Foundation; either version
5075 - * 2 of the License, or (at your option) any later version.
5078 -#include <asm/system.h>
5079 -#include <asm/uaccess.h>
5080 -#include <asm/ioctls.h>
5081 -#include <linux/bootmem.h>
5082 -#include <linux/types.h>
5083 -#include <linux/fcntl.h>
5084 -#include <linux/module.h>
5085 -#include <linux/socket.h>
5086 -#include <linux/sockios.h>
5087 -#include <linux/igmp.h>
5088 -#include <linux/in.h>
5089 -#include <linux/errno.h>
5090 -#include <linux/timer.h>
5091 -#include <linux/mm.h>
5092 -#include <linux/inet.h>
5093 -#include <linux/netdevice.h>
5094 -#include <net/tcp_states.h>
5095 -#include <linux/skbuff.h>
5096 -#include <linux/proc_fs.h>
5097 -#include <linux/seq_file.h>
5098 -#include <net/net_namespace.h>
5099 -#include <net/icmp.h>
5100 -#include <net/route.h>
5101 -#include <net/checksum.h>
5102 -#include <net/xfrm.h>
5103 -#include "udp_impl.h"
5106 - * Snmp MIB for the UDP layer
5109 -DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
5110 -EXPORT_SYMBOL(udp_stats_in6);
5112 -struct hlist_head udp_hash[UDP_HTABLE_SIZE];
5113 -DEFINE_RWLOCK(udp_hash_lock);
5115 -int sysctl_udp_mem[3] __read_mostly;
5116 -int sysctl_udp_rmem_min __read_mostly;
5117 -int sysctl_udp_wmem_min __read_mostly;
5119 -EXPORT_SYMBOL(sysctl_udp_mem);
5120 -EXPORT_SYMBOL(sysctl_udp_rmem_min);
5121 -EXPORT_SYMBOL(sysctl_udp_wmem_min);
5123 -atomic_t udp_memory_allocated;
5124 -EXPORT_SYMBOL(udp_memory_allocated);
5126 -static inline int __udp_lib_lport_inuse(struct net *net, __u16 num,
5127 - const struct hlist_head udptable[])
5130 - struct hlist_node *node;
5132 - sk_for_each(sk, node, &udptable[udp_hashfn(net, num)])
5133 - if (net_eq(sock_net(sk), net) && sk->sk_hash == num)
5139 - * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
5141 - * @sk: socket struct in question
5142 - * @snum: port number to look up
5143 - * @saddr_comp: AF-dependent comparison of bound local IP addresses
5145 -int udp_lib_get_port(struct sock *sk, unsigned short snum,
5146 - int (*saddr_comp)(const struct sock *sk1,
5147 - const struct sock *sk2 ) )
5149 - struct hlist_head *udptable = sk->sk_prot->h.udp_hash;
5150 - struct hlist_node *node;
5151 - struct hlist_head *head;
5154 - struct net *net = sock_net(sk);
5156 - write_lock_bh(&udp_hash_lock);
5159 - int i, low, high, remaining;
5160 - unsigned rover, best, best_size_so_far;
5162 - inet_get_local_port_range(&low, &high);
5163 - remaining = (high - low) + 1;
5165 - best_size_so_far = UINT_MAX;
5166 - best = rover = net_random() % remaining + low;
5168 - /* 1st pass: look for empty (or shortest) hash chain */
5169 - for (i = 0; i < UDP_HTABLE_SIZE; i++) {
5172 - head = &udptable[udp_hashfn(net, rover)];
5173 - if (hlist_empty(head))
5176 - sk_for_each(sk2, node, head) {
5177 - if (++size >= best_size_so_far)
5180 - best_size_so_far = size;
5183 - /* fold back if end of range */
5184 - if (++rover > high)
5185 - rover = low + ((rover - low)
5186 - & (UDP_HTABLE_SIZE - 1));
5191 - /* 2nd pass: find hole in shortest hash chain */
5193 - for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
5194 - if (! __udp_lib_lport_inuse(net, rover, udptable))
5196 - rover += UDP_HTABLE_SIZE;
5198 - rover = low + ((rover - low)
5199 - & (UDP_HTABLE_SIZE - 1));
5203 - /* All ports in use! */
5209 - head = &udptable[udp_hashfn(net, snum)];
5211 - sk_for_each(sk2, node, head)
5212 - if (sk2->sk_hash == snum &&
5214 - net_eq(sock_net(sk2), net) &&
5215 - (!sk2->sk_reuse || !sk->sk_reuse) &&
5216 - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
5217 - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
5218 - (*saddr_comp)(sk, sk2) )
5222 - inet_sk(sk)->num = snum;
5223 - sk->sk_hash = snum;
5224 - if (sk_unhashed(sk)) {
5225 - head = &udptable[udp_hashfn(net, snum)];
5226 - sk_add_node(sk, head);
5227 - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
5231 - write_unlock_bh(&udp_hash_lock);
5235 -extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *);
5237 -int udp_v4_get_port(struct sock *sk, unsigned short snum)
5239 - return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
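
The two-pass search documented in the deleted udp_lib_get_port() above ("1st pass" /
"2nd pass") is easy to model outside the kernel. Below is a minimal userspace sketch
of that strategy; the table size, port range, and helper names are illustrative
stand-ins, not the kernel's:

/* Userspace model of the two-pass ephemeral-port search above. */
#include <stdio.h>
#include <stdlib.h>

#define HTABLE_SIZE 16			/* stand-in for UDP_HTABLE_SIZE */
#define LOW  32768
#define HIGH 61000

static int chain_len[HTABLE_SIZE];	/* sockets per hash bucket */
static char port_in_use[1 << 16];

static int pick_port(void)
{
	int remaining = HIGH - LOW + 1;
	int rover = LOW + rand() % remaining;
	int best = rover, best_size = -1;
	int i, p;

	/* 1st pass: among HTABLE_SIZE consecutive rover values, remember
	 * the one whose hash chain is shortest (empty wins immediately). */
	for (i = 0; i < HTABLE_SIZE; i++) {
		int len = chain_len[rover % HTABLE_SIZE];
		if (len == 0) {
			best = rover;
			break;
		}
		if (best_size < 0 || len < best_size) {
			best_size = len;
			best = rover;
		}
		if (++rover > HIGH)	/* fold back if end of range */
			rover = LOW + ((rover - LOW) & (HTABLE_SIZE - 1));
	}

	/* 2nd pass: probe the chosen chain's ports (stride HTABLE_SIZE)
	 * until a free one is found. */
	for (p = best, i = 0; i < (1 << 16) / HTABLE_SIZE; i++) {
		if (!port_in_use[p])
			return p;
		p += HTABLE_SIZE;
		if (p > HIGH)
			p = LOW + ((p - LOW) & (HTABLE_SIZE - 1));
	}
	return -1;	/* all ports in use */
}

int main(void)
{
	printf("chose port %d\n", pick_port());
	return 0;
}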
5243 -/* UDP is nearly always wildcarded out the wazoo; it makes no sense to try
5244 - * harder than this. -DaveM
5246 -static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
5247 - __be16 sport, __be32 daddr, __be16 dport,
5248 - int dif, struct hlist_head udptable[])
5250 - struct sock *sk, *result = NULL;
5251 - struct hlist_node *node;
5252 - unsigned short hnum = ntohs(dport);
5255 - read_lock(&udp_hash_lock);
5256 - sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
5257 - struct inet_sock *inet = inet_sk(sk);
5259 - if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
5260 - !ipv6_only_sock(sk)) {
5261 - int score = (sk->sk_family == PF_INET ? 1 : 0);
5263 - if (inet->rcv_saddr) {
5264 - if (inet->rcv_saddr != daddr)
5268 -				/* block IPs that are not in this socket's nx_info */
5269 - if (!v4_addr_in_nx_info(sk->sk_nx_info,
5270 - daddr, NXA_MASK_BIND))
5273 - if (inet->daddr) {
5274 - if (inet->daddr != saddr)
5278 - if (inet->dport) {
5279 - if (inet->dport != sport)
5283 - if (sk->sk_bound_dev_if) {
5284 - if (sk->sk_bound_dev_if != dif)
5291 - } else if (score > badness) {
5299 - sock_hold(result);
5300 - read_unlock(&udp_hash_lock);
5304 -static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
5305 - __be16 loc_port, __be32 loc_addr,
5306 - __be16 rmt_port, __be32 rmt_addr,
5309 - struct hlist_node *node;
5310 - struct sock *s = sk;
5311 - unsigned short hnum = ntohs(loc_port);
5313 - sk_for_each_from(s, node) {
5314 - struct inet_sock *inet = inet_sk(s);
5316 - if (!net_eq(sock_net(s), net) ||
5317 - s->sk_hash != hnum ||
5318 - (inet->daddr && inet->daddr != rmt_addr) ||
5319 - (inet->dport != rmt_port && inet->dport) ||
5320 - !v4_sock_addr_match(sk->sk_nx_info, inet, loc_addr) ||
5321 - ipv6_only_sock(s) ||
5322 - (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
5324 - if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
5334 - * This routine is called by the ICMP module when it gets some
5335 - * sort of error condition. If err < 0 then the socket should
5336 - * be closed and the error returned to the user. If err > 0
5337 - * it's just the icmp type << 8 | icmp code.
5338 - * Header points to the ip header of the error packet. We move
5339 - * on past this. Then (as it used to claim before adjustment)
5340 - * header points to the first 8 bytes of the udp header. We need
5341 - * to find the appropriate port.
5344 -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
5346 - struct inet_sock *inet;
5347 - struct iphdr *iph = (struct iphdr*)skb->data;
5348 - struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2));
5349 - const int type = icmp_hdr(skb)->type;
5350 - const int code = icmp_hdr(skb)->code;
5354 - struct net *net = dev_net(skb->dev);
5356 - sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
5357 - iph->saddr, uh->source, skb->dev->ifindex, udptable);
5359 - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
5360 - return; /* No socket for error */
5365 - inet = inet_sk(sk);
5369 - case ICMP_TIME_EXCEEDED:
5370 - err = EHOSTUNREACH;
5372 - case ICMP_SOURCE_QUENCH:
5374 - case ICMP_PARAMETERPROB:
5378 - case ICMP_DEST_UNREACH:
5379 - if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
5380 - if (inet->pmtudisc != IP_PMTUDISC_DONT) {
5387 - err = EHOSTUNREACH;
5388 - if (code <= NR_ICMP_UNREACH) {
5389 - harderr = icmp_err_convert[code].fatal;
5390 - err = icmp_err_convert[code].errno;
5396 - * RFC1122: OK. Passes ICMP errors back to application, as per
5399 - if (!inet->recverr) {
5400 - if (!harderr || sk->sk_state != TCP_ESTABLISHED)
5403 - ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
5406 - sk->sk_error_report(sk);
5411 -void udp_err(struct sk_buff *skb, u32 info)
5413 - __udp4_lib_err(skb, info, udp_hash);
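
The positive-error convention described in the comment above __udp4_lib_err()
("it's just the icmp type << 8 | icmp code") packs and unpacks as follows; a
trivial, self-contained illustration:

#include <stdio.h>

#define ICMP_DEST_UNREACH 3
#define ICMP_PORT_UNREACH 3

int main(void)
{
	int err = ICMP_DEST_UNREACH << 8 | ICMP_PORT_UNREACH;
	printf("packed err=0x%03x type=%d code=%d\n",
	       err, err >> 8, err & 0xff);
	return 0;
}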
5417 - * Throw away all pending data and cancel the corking. Socket is locked.
5419 -void udp_flush_pending_frames(struct sock *sk)
5421 - struct udp_sock *up = udp_sk(sk);
5423 - if (up->pending) {
5426 - ip_flush_pending_frames(sk);
5429 -EXPORT_SYMBOL(udp_flush_pending_frames);
5432 - * udp4_hwcsum_outgoing - handle outgoing HW checksumming
5433 - * @sk: socket we are sending on
5434 - * @skb: sk_buff containing the filled-in UDP header
5435 - * (checksum field must be zeroed out)
5437 -static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
5438 - __be32 src, __be32 dst, int len )
5440 - unsigned int offset;
5441 - struct udphdr *uh = udp_hdr(skb);
5444 - if (skb_queue_len(&sk->sk_write_queue) == 1) {
5446 - * Only one fragment on the socket.
5448 - skb->csum_start = skb_transport_header(skb) - skb->head;
5449 - skb->csum_offset = offsetof(struct udphdr, check);
5450 - uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
5453 -		 * HW checksumming won't work when there are two or more
5454 -		 * fragments on the socket, since the csums of all the
5455 -		 * sk_buffs must be combined into one
5457 - offset = skb_transport_offset(skb);
5458 - skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
5460 - skb->ip_summed = CHECKSUM_NONE;
5462 - skb_queue_walk(&sk->sk_write_queue, skb) {
5463 - csum = csum_add(csum, skb->csum);
5466 - uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
5467 - if (uh->check == 0)
5468 - uh->check = CSUM_MANGLED_0;
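
The multi-fragment branch of udp4_hwcsum_outgoing() above relies on the fact that
ones'-complement sums can be accumulated fragment by fragment and folded once at
the end. Here is a small userspace sketch of that property, with simplified
stand-ins for csum_partial()/csum_fold() rather than the kernel's optimized
versions:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t csum_partial(const uint8_t *buf, size_t len, uint32_t sum)
{
	size_t i;

	/* Accumulate 16-bit big-endian words into a 32-bit sum. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)buf[i] << 8 | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)	/* fold carries back into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t data[64];
	uint32_t acc;
	uint16_t whole, parts;

	memset(data, 0xa5, sizeof(data));

	whole = csum_fold(csum_partial(data, 64, 0));

	/* Same data, summed as two "fragments", then folded once. */
	acc = csum_partial(data, 32, 0);
	acc = csum_partial(data + 32, 32, acc);
	parts = csum_fold(acc);

	printf("whole=%04x parts=%04x (%s)\n", whole, parts,
	       whole == parts ? "equal" : "different");
	return 0;
}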
5473 - * Push out all pending data as one UDP datagram. Socket is locked.
5475 -static int udp_push_pending_frames(struct sock *sk)
5477 - struct udp_sock *up = udp_sk(sk);
5478 - struct inet_sock *inet = inet_sk(sk);
5479 - struct flowi *fl = &inet->cork.fl;
5480 - struct sk_buff *skb;
5481 - struct udphdr *uh;
5483 - int is_udplite = IS_UDPLITE(sk);
5486 - /* Grab the skbuff where UDP header space exists. */
5487 - if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
5491 - * Create a UDP header
5493 - uh = udp_hdr(skb);
5494 - uh->source = fl->fl_ip_sport;
5495 - uh->dest = fl->fl_ip_dport;
5496 - uh->len = htons(up->len);
5499 - if (is_udplite) /* UDP-Lite */
5500 - csum = udplite_csum_outgoing(sk, skb);
5502 - else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
5504 - skb->ip_summed = CHECKSUM_NONE;
5507 - } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
5509 - udp4_hwcsum_outgoing(sk, skb, fl->fl4_src,fl->fl4_dst, up->len);
5512 - } else /* `normal' UDP */
5513 - csum = udp_csum_outgoing(sk, skb);
5515 - /* add protocol-dependent pseudo-header */
5516 - uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
5517 - sk->sk_protocol, csum );
5518 - if (uh->check == 0)
5519 - uh->check = CSUM_MANGLED_0;
5522 - err = ip_push_pending_frames(sk);
5527 - UDP_INC_STATS_USER(sock_net(sk),
5528 - UDP_MIB_OUTDATAGRAMS, is_udplite);
5532 -int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
5535 - struct inet_sock *inet = inet_sk(sk);
5536 - struct udp_sock *up = udp_sk(sk);
5538 - struct ipcm_cookie ipc;
5539 - struct rtable *rt = NULL;
5541 - int connected = 0;
5542 - __be32 daddr, faddr, saddr;
5545 - int err, is_udplite = IS_UDPLITE(sk);
5546 - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
5547 - int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
5553 - * Check the flags.
5556 - if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
5557 - return -EOPNOTSUPP;
5561 - if (up->pending) {
5563 - * There are pending frames.
5564 - * The socket lock must be held while it's corked.
5567 - if (likely(up->pending)) {
5568 - if (unlikely(up->pending != AF_INET)) {
5572 - goto do_append_data;
5576 - ulen += sizeof(struct udphdr);
5579 - * Get and verify the address.
5581 - if (msg->msg_name) {
5582 - struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
5583 - if (msg->msg_namelen < sizeof(*usin))
5585 - if (usin->sin_family != AF_INET) {
5586 - if (usin->sin_family != AF_UNSPEC)
5587 - return -EAFNOSUPPORT;
5590 - daddr = usin->sin_addr.s_addr;
5591 - dport = usin->sin_port;
5595 - if (sk->sk_state != TCP_ESTABLISHED)
5596 - return -EDESTADDRREQ;
5597 - daddr = inet->daddr;
5598 - dport = inet->dport;
5599 - /* Open fast path for connected socket.
5600 - Route will not be used, if at least one option is set.
5604 - ipc.addr = inet->saddr;
5606 - ipc.oif = sk->sk_bound_dev_if;
5607 - if (msg->msg_controllen) {
5608 - err = ip_cmsg_send(sock_net(sk), msg, &ipc);
5616 - ipc.opt = inet->opt;
5619 - ipc.addr = faddr = daddr;
5621 - if (ipc.opt && ipc.opt->srr) {
5624 - faddr = ipc.opt->faddr;
5627 - tos = RT_TOS(inet->tos);
5628 - if (sock_flag(sk, SOCK_LOCALROUTE) ||
5629 - (msg->msg_flags & MSG_DONTROUTE) ||
5630 - (ipc.opt && ipc.opt->is_strictroute)) {
5631 - tos |= RTO_ONLINK;
5635 - if (ipv4_is_multicast(daddr)) {
5637 - ipc.oif = inet->mc_index;
5639 - saddr = inet->mc_addr;
5644 - rt = (struct rtable*)sk_dst_check(sk, 0);
5647 - struct flowi fl = { .oif = ipc.oif,
5648 - .nl_u = { .ip4_u =
5652 - .proto = sk->sk_protocol,
5653 - .uli_u = { .ports =
5654 - { .sport = inet->sport,
5655 - .dport = dport } } };
5656 - struct net *net = sock_net(sk);
5657 - struct nx_info *nxi = sk->sk_nx_info;
5659 - security_sk_classify_flow(sk, &fl);
5660 - err = ip_v4_find_src(net, nxi, &rt, &fl);
5664 - err = ip_route_output_flow(net, &rt, &fl, sk, 1);
5666 - if (err == -ENETUNREACH)
5667 - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
5672 - if ((rt->rt_flags & RTCF_BROADCAST) &&
5673 - !sock_flag(sk, SOCK_BROADCAST))
5676 - sk_dst_set(sk, dst_clone(&rt->u.dst));
5679 - if (msg->msg_flags&MSG_CONFIRM)
5683 - saddr = rt->rt_src;
5685 - daddr = ipc.addr = rt->rt_dst;
5688 - if (unlikely(up->pending)) {
5689 - /* The socket is already corked while preparing it. */
5690 - /* ... which is an evident application bug. --ANK */
5693 - LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
5698 - * Now cork the socket to pend data.
5700 - inet->cork.fl.fl4_dst = daddr;
5701 - inet->cork.fl.fl_ip_dport = dport;
5702 - inet->cork.fl.fl4_src = saddr;
5703 - inet->cork.fl.fl_ip_sport = inet->sport;
5704 - up->pending = AF_INET;
5708 - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
5709 - err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
5710 - sizeof(struct udphdr), &ipc, rt,
5711 - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
5713 - udp_flush_pending_frames(sk);
5714 - else if (!corkreq)
5715 - err = udp_push_pending_frames(sk);
5716 - else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
5727 - * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
5728 - * ENOBUFS might not be good (it's not tunable per se), but otherwise
5729 - * we don't have a good statistic (IpOutDiscards but it can be too many
5730 - * things). We could add another new stat but at least for now that
5731 - * seems like overkill.
5733 - if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5734 - UDP_INC_STATS_USER(sock_net(sk),
5735 - UDP_MIB_SNDBUFERRORS, is_udplite);
5740 - dst_confirm(&rt->u.dst);
5741 - if (!(msg->msg_flags&MSG_PROBE) || len)
5742 - goto back_from_confirm;
5747 -int udp_sendpage(struct sock *sk, struct page *page, int offset,
5748 - size_t size, int flags)
5750 - struct udp_sock *up = udp_sk(sk);
5753 - if (!up->pending) {
5754 - struct msghdr msg = { .msg_flags = flags|MSG_MORE };
5756 - /* Call udp_sendmsg to specify destination address which
5757 - * sendpage interface can't pass.
5758 - * This will succeed only when the socket is connected.
5760 - ret = udp_sendmsg(NULL, sk, &msg, 0);
5767 - if (unlikely(!up->pending)) {
5770 - LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
5774 - ret = ip_append_page(sk, page, offset, size, flags);
5775 - if (ret == -EOPNOTSUPP) {
5777 - return sock_no_sendpage(sk->sk_socket, page, offset,
5781 - udp_flush_pending_frames(sk);
5786 - if (!(up->corkflag || (flags&MSG_MORE)))
5787 - ret = udp_push_pending_frames(sk);
5796 - * IOCTL requests applicable to the UDP protocol
5799 -int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
5804 - int amount = atomic_read(&sk->sk_wmem_alloc);
5805 - return put_user(amount, (int __user *)arg);
5810 - struct sk_buff *skb;
5811 - unsigned long amount;
5814 - spin_lock_bh(&sk->sk_receive_queue.lock);
5815 - skb = skb_peek(&sk->sk_receive_queue);
5816 - if (skb != NULL) {
5818 - * We will only return the amount
5819 - * of this packet since that is all
5820 - * that will be read.
5822 - amount = skb->len - sizeof(struct udphdr);
5824 - spin_unlock_bh(&sk->sk_receive_queue.lock);
5825 - return put_user(amount, (int __user *)arg);
5829 - return -ENOIOCTLCMD;
5836 - * 	This should be easy: if there is something there, we
5837 - * 	return it; otherwise we block.
5840 -int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
5841 - size_t len, int noblock, int flags, int *addr_len)
5843 - struct inet_sock *inet = inet_sk(sk);
5844 - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
5845 - struct sk_buff *skb;
5846 - unsigned int ulen, copied;
5849 - int is_udplite = IS_UDPLITE(sk);
5852 - * Check any passed addresses
5855 -		*addr_len = sizeof(*sin);
5857 - if (flags & MSG_ERRQUEUE)
5858 - return ip_recv_error(sk, msg, len);
5861 - skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
5866 - ulen = skb->len - sizeof(struct udphdr);
5868 - if (copied > ulen)
5870 - else if (copied < ulen)
5871 - msg->msg_flags |= MSG_TRUNC;
5874 - * If checksum is needed at all, try to do it while copying the
5875 - * data. If the data is truncated, or if we only want a partial
5876 - * coverage checksum (UDP-Lite), do it before the copy.
5879 - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
5880 - if (udp_lib_checksum_complete(skb))
5881 - goto csum_copy_err;
5884 - if (skb_csum_unnecessary(skb))
5885 - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
5886 - msg->msg_iov, copied );
5888 - err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
5890 - if (err == -EINVAL)
5891 - goto csum_copy_err;
5898 - UDP_INC_STATS_USER(sock_net(sk),
5899 - UDP_MIB_INDATAGRAMS, is_udplite);
5901 - sock_recv_timestamp(msg, sk, skb);
5903 - /* Copy the address. */
5906 - sin->sin_family = AF_INET;
5907 - sin->sin_port = udp_hdr(skb)->source;
5908 - sin->sin_addr.s_addr = nx_map_sock_lback(
5909 - skb->sk->sk_nx_info, ip_hdr(skb)->saddr);
5910 - memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
5912 - if (inet->cmsg_flags)
5913 - ip_cmsg_recv(msg, skb);
5916 - if (flags & MSG_TRUNC)
5921 - skb_free_datagram(sk, skb);
5928 - if (!skb_kill_datagram(sk, skb, flags))
5929 - UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
5938 -int udp_disconnect(struct sock *sk, int flags)
5940 - struct inet_sock *inet = inet_sk(sk);
5942 - * 1003.1g - break association.
5945 - sk->sk_state = TCP_CLOSE;
5948 - sk->sk_bound_dev_if = 0;
5949 - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
5950 - inet_reset_saddr(sk);
5952 - if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
5953 - sk->sk_prot->unhash(sk);
5960 -static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
5962 - int is_udplite = IS_UDPLITE(sk);
5965 - if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
5966 - /* Note that an ENOMEM error is charged twice */
5967 - if (rc == -ENOMEM) {
5968 - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
5970 - atomic_inc(&sk->sk_drops);
5978 - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
5986 - * >0: "udp encap" protocol resubmission
5988 - * Note that in the success and error cases, the skb is assumed to
5989 - * have either been requeued or freed.
5991 -int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
5993 - struct udp_sock *up = udp_sk(sk);
5995 - int is_udplite = IS_UDPLITE(sk);
5998 - * Charge it to the socket, dropping if the queue is full.
6000 - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
6004 - if (up->encap_type) {
6006 - * This is an encapsulation socket so pass the skb to
6007 - * the socket's udp_encap_rcv() hook. Otherwise, just
6008 - * fall through and pass this up the UDP socket.
6009 - * up->encap_rcv() returns the following value:
6010 - * =0 if skb was successfully passed to the encap
6011 - * handler or was discarded by it.
6012 - * >0 if skb should be passed on to UDP.
6013 - * <0 if skb should be resubmitted as proto -N
6016 - /* if we're overly short, let UDP handle it */
6017 - if (skb->len > sizeof(struct udphdr) &&
6018 - up->encap_rcv != NULL) {
6021 - ret = (*up->encap_rcv)(sk, skb);
6023 - UDP_INC_STATS_BH(sock_net(sk),
6024 - UDP_MIB_INDATAGRAMS,
6030 - /* FALLTHROUGH -- it's a UDP Packet */
6034 - * UDP-Lite specific tests, ignored on UDP sockets
6036 - if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
6039 - * MIB statistics other than incrementing the error count are
6040 - * disabled for the following two types of errors: these depend
6041 - * on the application settings, not on the functioning of the
6042 - * protocol stack as such.
6044 - * RFC 3828 here recommends (sec 3.3): "There should also be a
6045 - * way ... to ... at least let the receiving application block
6046 - * delivery of packets with coverage values less than a value
6047 - * provided by the application."
6049 - if (up->pcrlen == 0) { /* full coverage was set */
6050 - LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
6051 - "%d while full coverage %d requested\n",
6052 - UDP_SKB_CB(skb)->cscov, skb->len);
6055 -		/* The next case involves violating the minimum coverage requested
6056 -		 * by the receiver. This is subtle: if the receiver wants x and x is
6057 -		 * greater than the buffer size/MTU, then the receiver will complain
6058 -		 * that it wants x while the sender emits packets of smaller size y.
6059 -		 * Therefore the above ...()->partial_cov statement is essential.
6061 - if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
6062 - LIMIT_NETDEBUG(KERN_WARNING
6063 - "UDPLITE: coverage %d too small, need min %d\n",
6064 - UDP_SKB_CB(skb)->cscov, up->pcrlen);
6069 - if (sk->sk_filter) {
6070 - if (udp_lib_checksum_complete(skb))
6077 - if (!sock_owned_by_user(sk))
6078 - rc = __udp_queue_rcv_skb(sk, skb);
6080 - sk_add_backlog(sk, skb);
6081 - bh_unlock_sock(sk);
6086 - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
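
The UDP-Lite coverage rules enforced in the deleted udp_queue_rcv_skb() above
(RFC 3828, sec. 3.3) reduce to two checks on partially covered packets. A hedged
userspace model, with illustrative field names in place of UDP_SKB_CB() and
udp_sock state:

#include <stdio.h>

struct lite_pkt {
	int len;	/* datagram length */
	int cscov;	/* sender's checksum coverage */
	int partial;	/* coverage < len? */
};

/* pcrlen is the minimum coverage the receiver demands; 0 means "full". */
static int coverage_ok(const struct lite_pkt *p, int pcrlen)
{
	if (!p->partial)
		return 1;	/* fully covered packets always pass */
	if (pcrlen == 0) {
		fprintf(stderr, "partial coverage %d while full coverage of %d requested\n",
			p->cscov, p->len);
		return 0;
	}
	if (p->cscov < pcrlen) {
		fprintf(stderr, "coverage %d too small, need min %d\n",
			p->cscov, pcrlen);
		return 0;
	}
	return 1;
}

int main(void)
{
	struct lite_pkt p = { .len = 100, .cscov = 20, .partial = 1 };

	printf("pcrlen=0:  %s\n", coverage_ok(&p, 0) ? "accept" : "drop");
	printf("pcrlen=8:  %s\n", coverage_ok(&p, 8) ? "accept" : "drop");
	printf("pcrlen=32: %s\n", coverage_ok(&p, 32) ? "accept" : "drop");
	return 0;
}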
6092 - * Multicasts and broadcasts go to each listener.
6094 - * Note: called only from the BH handler context,
6095 - * so we don't need to lock the hashes.
6097 -static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
6098 - struct udphdr *uh,
6099 - __be32 saddr, __be32 daddr,
6100 - struct hlist_head udptable[])
6105 - read_lock(&udp_hash_lock);
6106 - sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
6107 - dif = skb->dev->ifindex;
6108 - sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
6110 - struct sock *sknext = NULL;
6113 - struct sk_buff *skb1 = skb;
6115 - sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest,
6116 - daddr, uh->source, saddr,
6119 - skb1 = skb_clone(skb, GFP_ATOMIC);
6122 - int ret = udp_queue_rcv_skb(sk, skb1);
6124 - /* we should probably re-process instead
6125 - * of dropping packets here. */
6132 - read_unlock(&udp_hash_lock);
6136 -/* Initialize the UDP checksum. If this returns zero (success),
6137 - * CHECKSUM_UNNECESSARY means that no more checks are required.
6138 - * Otherwise, checksum completion requires checksumming the packet body,
6139 - * including the UDP header, and folding it into skb->csum.
6141 -static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
6144 - const struct iphdr *iph;
6147 - UDP_SKB_CB(skb)->partial_cov = 0;
6148 - UDP_SKB_CB(skb)->cscov = skb->len;
6150 - if (proto == IPPROTO_UDPLITE) {
6151 - err = udplite_checksum_init(skb, uh);
6156 - iph = ip_hdr(skb);
6157 - if (uh->check == 0) {
6158 - skb->ip_summed = CHECKSUM_UNNECESSARY;
6159 - } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
6160 - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
6161 - proto, skb->csum))
6162 - skb->ip_summed = CHECKSUM_UNNECESSARY;
6164 - if (!skb_csum_unnecessary(skb))
6165 - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
6166 - skb->len, proto, 0);
6167 - /* Probably, we should checksum udp header (it should be in cache
6168 - * in any case) and data in tiny packets (< rx copybreak).
6175 - * All we need to do is get the socket, and then do a checksum.
6178 -int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
6182 - struct udphdr *uh;
6183 - unsigned short ulen;
6184 - struct rtable *rt = (struct rtable*)skb->dst;
6185 - __be32 saddr = ip_hdr(skb)->saddr;
6186 - __be32 daddr = ip_hdr(skb)->daddr;
6187 - struct net *net = dev_net(skb->dev);
6190 - * Validate the packet.
6192 - if (!pskb_may_pull(skb, sizeof(struct udphdr)))
6193 - goto drop; /* No space for header. */
6195 - uh = udp_hdr(skb);
6196 - ulen = ntohs(uh->len);
6197 - if (ulen > skb->len)
6198 - goto short_packet;
6200 - if (proto == IPPROTO_UDP) {
6201 - /* UDP validates ulen. */
6202 - if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
6203 - goto short_packet;
6204 - uh = udp_hdr(skb);
6207 - if (udp4_csum_init(skb, uh, proto))
6210 - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
6211 - return __udp4_lib_mcast_deliver(net, skb, uh,
6212 - saddr, daddr, udptable);
6214 - sk = __udp4_lib_lookup(net, saddr, uh->source, daddr,
6215 - uh->dest, inet_iif(skb), udptable);
6218 - int ret = udp_queue_rcv_skb(sk, skb);
6221 -		/* a return value > 0 means we must resubmit the input, but
6222 -		 * the caller wants the return value to be -protocol or 0
6229 - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
6233 - /* No socket. Drop packet silently, if checksum is wrong */
6234 - if (udp_lib_checksum_complete(skb))
6237 - UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
6238 - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
6241 -	 * Hmm. We got a UDP packet to a port on which we
6242 -	 * are not listening. Ignore it.
6248 - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From " NIPQUAD_FMT ":%u %d/%d to " NIPQUAD_FMT ":%u\n",
6249 - proto == IPPROTO_UDPLITE ? "-Lite" : "",
6251 - ntohs(uh->source),
6260 - * RFC1122: OK. Discards the bad packet silently (as far as
6261 - * the network is concerned, anyway) as per 4.1.3.4 (MUST).
6263 - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From " NIPQUAD_FMT ":%u to " NIPQUAD_FMT ":%u ulen %d\n",
6264 - proto == IPPROTO_UDPLITE ? "-Lite" : "",
6266 - ntohs(uh->source),
6271 - UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
6276 -int udp_rcv(struct sk_buff *skb)
6278 - return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP);
6281 -void udp_destroy_sock(struct sock *sk)
6284 - udp_flush_pending_frames(sk);
6289 - * Socket option code for UDP
6291 -int udp_lib_setsockopt(struct sock *sk, int level, int optname,
6292 - char __user *optval, int optlen,
6293 - int (*push_pending_frames)(struct sock *))
6295 - struct udp_sock *up = udp_sk(sk);
6298 - int is_udplite = IS_UDPLITE(sk);
6300 - if (optlen<sizeof(int))
6303 - if (get_user(val, (int __user *)optval))
6306 - switch (optname) {
6313 - (*push_pending_frames)(sk);
6321 - case UDP_ENCAP_ESPINUDP:
6322 - case UDP_ENCAP_ESPINUDP_NON_IKE:
6323 - up->encap_rcv = xfrm4_udp_encap_rcv;
6325 - case UDP_ENCAP_L2TPINUDP:
6326 - up->encap_type = val;
6329 - err = -ENOPROTOOPT;
6335 - * UDP-Lite's partial checksum coverage (RFC 3828).
6337 - /* The sender sets actual checksum coverage length via this option.
6338 - * The case coverage > packet length is handled by send module. */
6339 - case UDPLITE_SEND_CSCOV:
6340 - if (!is_udplite) /* Disable the option on UDP sockets */
6341 - return -ENOPROTOOPT;
6342 - if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
6344 - else if (val > USHORT_MAX)
6347 - up->pcflag |= UDPLITE_SEND_CC;
6350 - /* The receiver specifies a minimum checksum coverage value. To make
6351 - * sense, this should be set to at least 8 (as done below). If zero is
6352 - * used, this again means full checksum coverage. */
6353 - case UDPLITE_RECV_CSCOV:
6354 - if (!is_udplite) /* Disable the option on UDP sockets */
6355 - return -ENOPROTOOPT;
6356 - if (val != 0 && val < 8) /* Avoid silly minimal values. */
6358 - else if (val > USHORT_MAX)
6361 - up->pcflag |= UDPLITE_RECV_CC;
6365 - err = -ENOPROTOOPT;
6372 -int udp_setsockopt(struct sock *sk, int level, int optname,
6373 - char __user *optval, int optlen)
6375 - if (level == SOL_UDP || level == SOL_UDPLITE)
6376 - return udp_lib_setsockopt(sk, level, optname, optval, optlen,
6377 - udp_push_pending_frames);
6378 - return ip_setsockopt(sk, level, optname, optval, optlen);
6381 -#ifdef CONFIG_COMPAT
6382 -int compat_udp_setsockopt(struct sock *sk, int level, int optname,
6383 - char __user *optval, int optlen)
6385 - if (level == SOL_UDP || level == SOL_UDPLITE)
6386 - return udp_lib_setsockopt(sk, level, optname, optval, optlen,
6387 - udp_push_pending_frames);
6388 - return compat_ip_setsockopt(sk, level, optname, optval, optlen);
6392 -int udp_lib_getsockopt(struct sock *sk, int level, int optname,
6393 - char __user *optval, int __user *optlen)
6395 - struct udp_sock *up = udp_sk(sk);
6398 -	if (get_user(len, optlen))
6401 - len = min_t(unsigned int, len, sizeof(int));
6406 - switch (optname) {
6408 - val = up->corkflag;
6412 - val = up->encap_type;
6415 -	/* The following two cannot be changed on UDP sockets; the return is
6416 -	 * always 0 (which corresponds to the full checksum coverage of UDP). */
6417 - case UDPLITE_SEND_CSCOV:
6421 - case UDPLITE_RECV_CSCOV:
6426 - return -ENOPROTOOPT;
6429 - if (put_user(len, optlen))
6431 -	if (copy_to_user(optval, &val, len))
6436 -int udp_getsockopt(struct sock *sk, int level, int optname,
6437 - char __user *optval, int __user *optlen)
6439 - if (level == SOL_UDP || level == SOL_UDPLITE)
6440 - return udp_lib_getsockopt(sk, level, optname, optval, optlen);
6441 - return ip_getsockopt(sk, level, optname, optval, optlen);
6444 -#ifdef CONFIG_COMPAT
6445 -int compat_udp_getsockopt(struct sock *sk, int level, int optname,
6446 - char __user *optval, int __user *optlen)
6448 - if (level == SOL_UDP || level == SOL_UDPLITE)
6449 - return udp_lib_getsockopt(sk, level, optname, optval, optlen);
6450 - return compat_ip_getsockopt(sk, level, optname, optval, optlen);
6454 - * udp_poll - wait for a UDP event.
6455 - * @file - file struct
6457 - * @wait - poll table
6459 - *	This is the same as datagram poll, except for the special case of
6460 - *	blocking sockets. If an application is using a blocking fd
6461 - *	and a packet with a checksum error is in the queue,
6462 - *	it could get a return from select indicating data available,
6463 - *	but then block when reading it. Add special-case code
6464 - *	to work around these arguably broken applications.
6466 -unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
6468 - unsigned int mask = datagram_poll(file, sock, wait);
6469 - struct sock *sk = sock->sk;
6470 - int is_lite = IS_UDPLITE(sk);
6472 - /* Check for false positives due to checksum errors */
6473 - if ( (mask & POLLRDNORM) &&
6474 - !(file->f_flags & O_NONBLOCK) &&
6475 - !(sk->sk_shutdown & RCV_SHUTDOWN)){
6476 - struct sk_buff_head *rcvq = &sk->sk_receive_queue;
6477 - struct sk_buff *skb;
6479 - spin_lock_bh(&rcvq->lock);
6480 - while ((skb = skb_peek(rcvq)) != NULL &&
6481 - udp_lib_checksum_complete(skb)) {
6482 - UDP_INC_STATS_BH(sock_net(sk),
6483 - UDP_MIB_INERRORS, is_lite);
6484 - __skb_unlink(skb, rcvq);
6487 - spin_unlock_bh(&rcvq->lock);
6489 - /* nothing to see, move along */
6491 - mask &= ~(POLLIN | POLLRDNORM);
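
The workaround in udp_poll() above can be modeled as purging bad-checksum
datagrams from the head of the receive queue before answering "readable". A
minimal sketch, assuming a toy ring buffer in place of sk_receive_queue:

#include <stdio.h>

#define QLEN 4

struct rxq {
	int csum_ok[QLEN];	/* per-slot checksum verdict */
	int head, tail;
};

static int readable(struct rxq *q)
{
	/* Purge bad-checksum datagrams at the head of the queue. */
	while (q->head != q->tail && !q->csum_ok[q->head]) {
		printf("purging bad datagram at slot %d\n", q->head);
		q->head = (q->head + 1) % QLEN;
	}
	return q->head != q->tail;	/* anything left is really readable */
}

int main(void)
{
	struct rxq q = { .csum_ok = { 0, 0, 1, 0 }, .head = 0, .tail = 3 };

	printf("readable: %s\n", readable(&q) ? "yes" : "no");
	return 0;
}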
6498 -struct proto udp_prot = {
6500 - .owner = THIS_MODULE,
6501 - .close = udp_lib_close,
6502 - .connect = ip4_datagram_connect,
6503 - .disconnect = udp_disconnect,
6504 - .ioctl = udp_ioctl,
6505 - .destroy = udp_destroy_sock,
6506 - .setsockopt = udp_setsockopt,
6507 - .getsockopt = udp_getsockopt,
6508 - .sendmsg = udp_sendmsg,
6509 - .recvmsg = udp_recvmsg,
6510 - .sendpage = udp_sendpage,
6511 - .backlog_rcv = __udp_queue_rcv_skb,
6512 - .hash = udp_lib_hash,
6513 - .unhash = udp_lib_unhash,
6514 - .get_port = udp_v4_get_port,
6515 - .memory_allocated = &udp_memory_allocated,
6516 - .sysctl_mem = sysctl_udp_mem,
6517 - .sysctl_wmem = &sysctl_udp_wmem_min,
6518 - .sysctl_rmem = &sysctl_udp_rmem_min,
6519 - .obj_size = sizeof(struct udp_sock),
6520 - .h.udp_hash = udp_hash,
6521 -#ifdef CONFIG_COMPAT
6522 - .compat_setsockopt = compat_udp_setsockopt,
6523 - .compat_getsockopt = compat_udp_getsockopt,
6527 -/* ------------------------------------------------------------------------ */
6528 -#ifdef CONFIG_PROC_FS
6530 -static struct sock *udp_get_first(struct seq_file *seq)
6533 - struct udp_iter_state *state = seq->private;
6534 - struct net *net = seq_file_net(seq);
6536 - for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
6537 - struct hlist_node *node;
6538 - sk_for_each(sk, node, state->hashtable + state->bucket) {
6539 - if (!net_eq(sock_net(sk), net))
6541 - if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))
6543 - if (sk->sk_family == state->family)
6552 -static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
6554 - struct udp_iter_state *state = seq->private;
6555 - struct net *net = seq_file_net(seq);
6561 - } while (sk && (!net_eq(sock_net(sk), net) ||
6562 - sk->sk_family != state->family ||
6563 - !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)));
6565 - if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
6566 - sk = sk_head(state->hashtable + state->bucket);
6572 -static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
6574 - struct sock *sk = udp_get_first(seq);
6577 - while (pos && (sk = udp_get_next(seq, sk)) != NULL)
6579 - return pos ? NULL : sk;
6582 -static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
6583 - __acquires(udp_hash_lock)
6585 - read_lock(&udp_hash_lock);
6586 - return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
6589 -static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6593 - if (v == SEQ_START_TOKEN)
6594 - sk = udp_get_idx(seq, 0);
6596 - sk = udp_get_next(seq, v);
6602 -static void udp_seq_stop(struct seq_file *seq, void *v)
6603 - __releases(udp_hash_lock)
6605 - read_unlock(&udp_hash_lock);
6608 -static int udp_seq_open(struct inode *inode, struct file *file)
6610 - struct udp_seq_afinfo *afinfo = PDE(inode)->data;
6611 - struct udp_iter_state *s;
6614 - err = seq_open_net(inode, file, &afinfo->seq_ops,
6615 - sizeof(struct udp_iter_state));
6619 - s = ((struct seq_file *)file->private_data)->private;
6620 - s->family = afinfo->family;
6621 - s->hashtable = afinfo->hashtable;
6625 -/* ------------------------------------------------------------------------ */
6626 -int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
6628 - struct proc_dir_entry *p;
6631 - afinfo->seq_fops.open = udp_seq_open;
6632 - afinfo->seq_fops.read = seq_read;
6633 - afinfo->seq_fops.llseek = seq_lseek;
6634 - afinfo->seq_fops.release = seq_release_net;
6636 - afinfo->seq_ops.start = udp_seq_start;
6637 - afinfo->seq_ops.next = udp_seq_next;
6638 - afinfo->seq_ops.stop = udp_seq_stop;
6640 - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
6641 - &afinfo->seq_fops, afinfo);
6647 -void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
6649 - proc_net_remove(net, afinfo->name);
6652 -/* ------------------------------------------------------------------------ */
6653 -static void udp4_format_sock(struct sock *sp, struct seq_file *f,
6654 - int bucket, int *len)
6656 - struct inet_sock *inet = inet_sk(sp);
6657 - __be32 dest = inet->daddr;
6658 - __be32 src = inet->rcv_saddr;
6659 - __u16 destp = ntohs(inet->dport);
6660 - __u16 srcp = ntohs(inet->sport);
6662 - seq_printf(f, "%4d: %08X:%04X %08X:%04X"
6663 - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
6665 - nx_map_sock_lback(current_nx_info(), src), srcp,
6666 - nx_map_sock_lback(current_nx_info(), dest), destp,
6668 - atomic_read(&sp->sk_wmem_alloc),
6669 - atomic_read(&sp->sk_rmem_alloc),
6670 - 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
6671 - atomic_read(&sp->sk_refcnt), sp,
6672 - atomic_read(&sp->sk_drops), len);
6675 -int udp4_seq_show(struct seq_file *seq, void *v)
6677 - if (v == SEQ_START_TOKEN)
6678 - seq_printf(seq, "%-127s\n",
6679 - " sl local_address rem_address st tx_queue "
6680 - "rx_queue tr tm->when retrnsmt uid timeout "
6681 - "inode ref pointer drops");
6683 - struct udp_iter_state *state = seq->private;
6686 - udp4_format_sock(v, seq, state->bucket, &len);
6687 - seq_printf(seq, "%*s\n", 127 - len ,"");
6692 -/* ------------------------------------------------------------------------ */
6693 -static struct udp_seq_afinfo udp4_seq_afinfo = {
6695 - .family = AF_INET,
6696 - .hashtable = udp_hash,
6698 - .owner = THIS_MODULE,
6701 - .show = udp4_seq_show,
6705 -static int udp4_proc_init_net(struct net *net)
6707 - return udp_proc_register(net, &udp4_seq_afinfo);
6710 -static void udp4_proc_exit_net(struct net *net)
6712 - udp_proc_unregister(net, &udp4_seq_afinfo);
6715 -static struct pernet_operations udp4_net_ops = {
6716 - .init = udp4_proc_init_net,
6717 - .exit = udp4_proc_exit_net,
6720 -int __init udp4_proc_init(void)
6722 - return register_pernet_subsys(&udp4_net_ops);
6725 -void udp4_proc_exit(void)
6727 - unregister_pernet_subsys(&udp4_net_ops);
6729 -#endif /* CONFIG_PROC_FS */
6731 -void __init udp_init(void)
6733 - unsigned long limit;
6735 -	/* Set the pressure threshold using the same strategy as TCP. It is a
6736 - * fraction of global memory that is up to 1/2 at 256 MB, decreasing
6737 - * toward zero with the amount of memory, with a floor of 128 pages.
6739 - limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
6740 - limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
6741 - limit = max(limit, 128UL);
6742 - sysctl_udp_mem[0] = limit / 4 * 3;
6743 - sysctl_udp_mem[1] = limit;
6744 - sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
6746 - sysctl_udp_rmem_min = SK_MEM_QUANTUM;
6747 - sysctl_udp_wmem_min = SK_MEM_QUANTUM;
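
To make the sizing in the deleted udp_init() above concrete, here is the same
arithmetic evaluated in userspace for an assumed machine with 1 GiB of RAM and
4 KiB pages (nr_all_pages = 262144); the inputs are illustrative, the kernel
computes them from the real page count at boot:

#include <stdio.h>

int main(void)
{
	unsigned long nr_all_pages = 262144;	/* assumed: 1 GiB / 4 KiB */
	unsigned long page_shift = 12;		/* assumed: 4 KiB pages */
	unsigned long cap = 1UL << (28 - page_shift);
	unsigned long limit;

	/* Mirrors the deleted computation above. */
	limit = (nr_all_pages < cap ? nr_all_pages : cap) >> (20 - page_shift);
	limit = (limit * (nr_all_pages >> (20 - page_shift))) >> (page_shift - 11);
	if (limit < 128)
		limit = 128;	/* floor of 128 pages */

	/* Prints: udp_mem = { 98304, 131072, 196608 } pages. */
	printf("udp_mem = { %lu, %lu, %lu } pages\n",
	       limit / 4 * 3, limit, limit / 4 * 3 * 2);
	return 0;
}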
6750 -EXPORT_SYMBOL(udp_disconnect);
6751 -EXPORT_SYMBOL(udp_hash);
6752 -EXPORT_SYMBOL(udp_hash_lock);
6753 -EXPORT_SYMBOL(udp_ioctl);
6754 -EXPORT_SYMBOL(udp_prot);
6755 -EXPORT_SYMBOL(udp_sendmsg);
6756 -EXPORT_SYMBOL(udp_lib_getsockopt);
6757 -EXPORT_SYMBOL(udp_lib_setsockopt);
6758 -EXPORT_SYMBOL(udp_poll);
6759 -EXPORT_SYMBOL(udp_lib_get_port);
6761 -#ifdef CONFIG_PROC_FS
6762 -EXPORT_SYMBOL(udp_proc_register);
6763 -EXPORT_SYMBOL(udp_proc_unregister);
6765 diff -Nurb linux-2.6.27-524/net/packet/af_packet.c linux-2.6.27-525/net/packet/af_packet.c
6766 --- linux-2.6.27-524/net/packet/af_packet.c 2009-12-04 16:03:47.000000000 -0500
6767 +++ linux-2.6.27-525/net/packet/af_packet.c 2009-12-04 16:09:31.000000000 -0500
6769 #include <linux/poll.h>
6770 #include <linux/module.h>
6771 #include <linux/init.h>
6772 +#include <linux/vs_network.h>
6773 #include <linux/mutex.h>
6776 @@ -278,10 +279,53 @@
6778 static const struct proto_ops packet_ops_spkt;
6780 +extern DEFINE_PER_CPU(int, sknid_elevator);
6782 +static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
6783 + /* This mechanism is quite involved, and caused us a lot of pain
6784 + * including crashes and packet loss during the 4.2 rollout. This
6785 + * function decides if a slice is allowed to see a given packet.
6786 + * Unfortunately, the first time it is invoked for a packet it does not
6787 + * have enough information to make this call, since xt_MARK has not had
6788 + * a chance to tag it with the slice id. There is also no way of
6789 + * passing state between xt_MARK and this function through a packet --
6790 + * because the skb gets cloned quite a few times between these two
6791 + * points. I'd rather not use skb_shared_info because it's treated as
6792 + * a blob of memory, and so it would be quite hard to maintain.
6794 + * What we do instead is keep a per-CPU global variable that transfers
6795 + * the required state between xt_MARK and af_packet.c. As an
6796 + * optimization, this state transfer and the step that follows are only
6797 + * executed for packets that would first be dropped here. When we drop
6798 + * such a packet, we mark it for 'elevation' (that is what this trick
6799 + * is called). When xt_MARK then tags the packet with the right slice,
6800 + * it intercepts this mark and sets sknid_elevator; the packet is sent
6801 + * back here for a second round, this time with the xid tag set (see the sketch after this hunk).
6804 +	int *elevator = &__get_cpu_var(sknid_elevator);
6805 +	int tag = skb->skb_tag;
6807 +	if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
6808 +		if (skb->pkt_type == PACKET_HOST) {
6809 +			*elevator = -2; /* Reject this packet; mark it for elevation in xt_MARK */
6813 +	else if (!sk->sk_nx_info && (*elevator > 0)) {
6814 +		/* Root has already seen this packet once, since it has been elevated */
6821 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
6824 struct sockaddr_pkt *spkt;
6825 + int tag = skb->skb_tag;
6829 * When we registered the protocol we saved the socket in the data
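
The elevation handshake described in slice_check_and_elevate() above is easier to
follow as a state machine. Below is a self-contained userspace model of one packet
making two passes: deliver(), classify(), and packet_handler() are illustrative
stand-ins for the receive path, xt_MARK, and the PF_PACKET hook, and a single int
stands in for the per-CPU sknid_elevator. All names here are mine, not the kernel's:

#include <stdio.h>

static int elevator;		/* stands in for the per-CPU sknid_elevator */

struct pkt {
	int tag;		/* stands in for skb->skb_tag */
};

static void packet_handler(struct pkt *p, int sock_nid)
{
	/* Deliver only packets tagged for this slice (tag 1 = visible to all). */
	if (p->tag != 1 && p->tag != sock_nid) {
		elevator = -2;	/* reject and request elevation */
		printf("hook: drop (tag %d), mark for elevation\n", p->tag);
		return;
	}
	printf("hook: deliver (tag %d)\n", p->tag);
}

static void classify(struct pkt *p)
{
	(void)p;		/* a real classifier would inspect the packet */
	if (elevator == -2) {
		elevator = 7;	/* publish the slice id it resolved */
		printf("classifier: packet belongs to slice %d\n", elevator);
	}
}

static void deliver(struct pkt *p, int sock_nid)
{
	elevator = 0;
	packet_handler(p, sock_nid);	/* first pass: tag not yet known */
	classify(p);
	if (elevator > 0) {
		p->tag = elevator;	/* second pass: tag now set */
		packet_handler(p, sock_nid);
	}
}

int main(void)
{
	struct pkt p = { .tag = 0 };	/* not yet tagged by the classifier */

	deliver(&p, 7);			/* the receiving socket is in slice 7 */
	return 0;
}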
6830 @@ -301,6 +345,16 @@
6831 * so that this procedure is noop.
6835 +	 * Note: the slice check below is applied here, in packet_rcv_spkt()
6836 +	 * itself, even though PF_PACKET already has a socket-filter
6837 +	 * mechanism (see run_filter()). Placing it here is admittedly
6838 +	 * inelegant, but it also covers SOCK_PACKET sockets.
6841 + if (!slice_check_and_elevate(skb, sk))
6844 if (skb->pkt_type == PACKET_LOOPBACK)
6851 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
6855 * Get and verify the address.
6857 @@ -451,11 +508,16 @@
6863 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
6866 struct sk_filter *filter;
6868 + if (!slice_check_and_elevate(skb, sk))
6872 filter = rcu_dereference(sk->sk_filter);
6875 unsigned char *addr;
6876 int ifindex, err, reserve = 0;
6878 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
6882 * Get and verify the address.
6884 @@ -941,6 +1006,7 @@
6887 po->prot_hook.type = protocol;
6888 + po->prot_hook.sknid_elevator = 1;
6889 po->prot_hook.dev = dev;
6891 po->ifindex = dev ? dev->ifindex : 0;
6892 @@ -1039,8 +1105,9 @@
6893 __be16 proto = (__force __be16)protocol; /* weird, but documented */
6896 - if (!capable(CAP_NET_RAW))
6897 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
6900 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
6901 sock->type != SOCK_PACKET)
6902 return -ESOCKTNOSUPPORT;
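
The capable() to nx_capable() substitutions in these hunks gate raw sockets on
either a global capability or a per-slice grant (NXC_RAW_SOCKET for creating the
socket, NXC_RAW_SEND for transmitting on it). Here is a hedged sketch of such a
two-level check; the semantics shown are an assumption based on how the calls are
used in this patch, not VServer's exact implementation:

#include <stdio.h>
#include <stdbool.h>

#define NXC_RAW_SOCKET 0x1	/* illustrative flag values */
#define NXC_RAW_SEND   0x2

struct nx_ctx {
	unsigned int flags;	/* rights granted to this slice */
};

static bool model_capable(bool has_global_cap)
{
	return has_global_cap;	/* stands in for capable(CAP_NET_RAW) */
}

static bool model_nx_capable(bool has_global_cap,
			     const struct nx_ctx *ctx, unsigned int need)
{
	if (model_capable(has_global_cap))
		return true;			/* real root: always allowed */
	return ctx && (ctx->flags & need);	/* else: was the slice granted it? */
}

int main(void)
{
	struct nx_ctx slice = { .flags = NXC_RAW_SOCKET };

	printf("root, no ctx:        %d\n", model_nx_capable(true, NULL, NXC_RAW_SOCKET));
	printf("slice w/ RAW_SOCKET: %d\n", model_nx_capable(false, &slice, NXC_RAW_SOCKET));
	printf("slice w/o RAW_SEND:  %d\n", model_nx_capable(false, &slice, NXC_RAW_SEND));
	return 0;
}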
6903 @@ -1072,6 +1139,7 @@
6904 spin_lock_init(&po->bind_lock);
6905 mutex_init(&po->pg_vec_lock);
6906 po->prot_hook.func = packet_rcv;
6907 + po->prot_hook.sknid_elevator = 1;
6909 if (sock->type == SOCK_PACKET)
6910 po->prot_hook.func = packet_rcv_spkt;
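
Both socket-creation paths above set prot_hook.sknid_elevator = 1, declaring the
packet socket willing to receive packets re-delivered on the elevation round
described in slice_check_and_elevate(). A minimal model of such an opt-in
registry, with illustrative names:

#include <stdio.h>

struct hook {
	const char *name;
	int sknid_elevator;	/* opt in to second-round delivery */
};

static void second_round_deliver(struct hook *hooks, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!hooks[i].sknid_elevator)
			continue;	/* only elevator-aware hooks re-run */
		printf("re-delivering to %s\n", hooks[i].name);
	}
}

int main(void)
{
	struct hook hooks[] = {
		{ "af_packet (SOCK_RAW/DGRAM)", 1 },	/* po->prot_hook.sknid_elevator = 1 */
		{ "af_packet (SOCK_PACKET)",    1 },
		{ "plain protocol handler",     0 },
	};

	second_round_deliver(hooks, 3);
	return 0;
}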