X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=include%2Fnet%2Fsock.h;h=aab07e510c480f56f2495a3ba390ea85152ced0d;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=276d8a61279aca8e53053626586d7ddcdf761967;hpb=a8e794ca871505c8ea96cc102f4ad555c5231d7f;p=linux-2.6.git diff --git a/include/net/sock.h b/include/net/sock.h index 276d8a612..aab07e510 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -7,7 +7,7 @@ * * Version: @(#)sock.h 1.0.4 05/13/93 * - * Authors: Ross Biro, + * Authors: Ross Biro * Fred N. van Kempen, * Corey Minyard * Florian La Roche @@ -50,10 +50,10 @@ #include #include -#include #include #include +#include /* * This structure really needs to be cleaned up. @@ -61,10 +61,10 @@ * the other protocols. */ -/* Define this to get the sk->sk_debug debugging facility. */ -#define SOCK_DEBUGGING +/* Define this to get the SOCK_DBG debugging facility. */ +//#define SOCK_DEBUGGING #ifdef SOCK_DEBUGGING -#define SOCK_DEBUG(sk, msg...) do { if ((sk) && ((sk)->sk_debug)) \ +#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ printk(KERN_DEBUG msg); } while (0) #else #define SOCK_DEBUG(sk, msg...) do { } while (0) @@ -88,20 +88,23 @@ do { spin_lock_init(&((__sk)->sk_lock.slock)); \ } while(0) struct sock; +struct proto; /** - * struct sock_common - minimal network layer representation of sockets - * @skc_family - network address family - * @skc_state - Connection state - * @skc_reuse - %SO_REUSEADDR setting - * @skc_bound_dev_if - bound device index if != 0 - * @skc_node - main hash linkage for various protocol lookup tables - * @skc_bind_node - bind hash linkage for various protocol lookup tables - * @skc_refcnt - reference count - * - * This is the minimal network layer representation of sockets, the header - * for struct sock and struct tcp_tw_bucket. - */ + * struct sock_common - minimal network layer representation of sockets + * @skc_family: network address family + * @skc_state: Connection state + * @skc_reuse: %SO_REUSEADDR setting + * @skc_bound_dev_if: bound device index if != 0 + * @skc_node: main hash linkage for various protocol lookup tables + * @skc_bind_node: bind hash linkage for various protocol lookup tables + * @skc_refcnt: reference count + * @skc_hash: hash value used with various protocol lookup tables + * @skc_prot: protocol handlers inside a network family + * + * This is the minimal network layer representation of sockets, the header + * for struct sock and struct inet_timewait_sock. 
+ */ struct sock_common { unsigned short skc_family; volatile unsigned char skc_state; @@ -110,77 +113,75 @@ struct sock_common { struct hlist_node skc_node; struct hlist_node skc_bind_node; atomic_t skc_refcnt; + unsigned int skc_hash; + struct proto *skc_prot; xid_t skc_xid; - struct vx_info *skc_vx_info; + struct vx_info *skc_vx_info; nid_t skc_nid; - struct nx_info *skc_nx_info; + struct nx_info *skc_nx_info; }; /** * struct sock - network layer representation of sockets - * @__sk_common - shared layout with tcp_tw_bucket - * @sk_zapped - ax25 & ipx means !linked - * @sk_shutdown - mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN - * @sk_use_write_queue - wheter to call sk->sk_write_space in sock_wfree - * @sk_userlocks - %SO_SNDBUF and %SO_RCVBUF settings - * @sk_lock - synchronizer - * @sk_rcvbuf - size of receive buffer in bytes - * @sk_sleep - sock wait queue - * @sk_dst_cache - destination cache - * @sk_dst_lock - destination cache lock - * @sk_policy - flow policy - * @sk_rmem_alloc - receive queue bytes committed - * @sk_receive_queue - incoming packets - * @sk_wmem_alloc - transmit queue bytes committed - * @sk_write_queue - Packet sending queue - * @sk_omem_alloc - "o" is "option" or "other" - * @sk_wmem_queued - persistent queue size - * @sk_forward_alloc - space allocated forward - * @sk_allocation - allocation mode - * @sk_sndbuf - size of send buffer in bytes - * @sk_flags - %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, %SO_OOBINLINE settings - * @sk_no_check - %SO_NO_CHECK setting, wether or not checkup packets - * @sk_debug - %SO_DEBUG setting - * @sk_rcvtstamp - %SO_TIMESTAMP setting - * @sk_no_largesend - whether to sent large segments or not - * @sk_route_caps - route capabilities (e.g. %NETIF_F_TSO) - * @sk_lingertime - %SO_LINGER l_linger setting - * @sk_hashent - hash entry in several tables (e.g. tcp_ehash) - * @sk_pair - socket pair (e.g. AF_UNIX/unix_peer) - * @sk_backlog - always used with the per-socket spinlock held - * @sk_callback_lock - used with the callbacks in the end of this struct - * @sk_error_queue - rarely used - * @sk_prot - protocol handlers inside a network family - * @sk_err - last error - * @sk_err_soft - errors that don't cause failure but are the cause of a persistent failure not just 'timed out' - * @sk_ack_backlog - current listen backlog - * @sk_max_ack_backlog - listen backlog set in listen() - * @sk_priority - %SO_PRIORITY setting - * @sk_type - socket type (%SOCK_STREAM, etc) - * @sk_localroute - route locally only, %SO_DONTROUTE setting - * @sk_protocol - which protocol this socket belongs in this network family - * @sk_peercred - %SO_PEERCRED setting - * @sk_rcvlowat - %SO_RCVLOWAT setting - * @sk_rcvtimeo - %SO_RCVTIMEO setting - * @sk_sndtimeo - %SO_SNDTIMEO setting - * @sk_filter - socket filtering instructions - * @sk_protinfo - private area, net family specific, when not using slab - * @sk_slab - the slabcache this instance was allocated from - * @sk_timer - sock cleanup timer - * @sk_stamp - time stamp of last packet received - * @sk_socket - Identd and reporting IO signals - * @sk_user_data - RPC layer private data - * @sk_owner - module that owns this socket - * @sk_state_change - callback to indicate change in the state of the sock - * @sk_data_ready - callback to indicate there is data to be processed - * @sk_write_space - callback to indicate there is bf sending space available - * @sk_error_report - callback to indicate errors (e.g. 
%MSG_ERRQUEUE) - * @sk_backlog_rcv - callback to process the backlog - * @sk_destruct - called at sock freeing time, i.e. when all refcnt == 0 + * @__sk_common: shared layout with inet_timewait_sock + * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN + * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings + * @sk_lock: synchronizer + * @sk_rcvbuf: size of receive buffer in bytes + * @sk_sleep: sock wait queue + * @sk_dst_cache: destination cache + * @sk_dst_lock: destination cache lock + * @sk_policy: flow policy + * @sk_rmem_alloc: receive queue bytes committed + * @sk_receive_queue: incoming packets + * @sk_wmem_alloc: transmit queue bytes committed + * @sk_write_queue: Packet sending queue + * @sk_omem_alloc: "o" is "option" or "other" + * @sk_wmem_queued: persistent queue size + * @sk_forward_alloc: space allocated forward + * @sk_allocation: allocation mode + * @sk_sndbuf: size of send buffer in bytes + * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, %SO_OOBINLINE settings + * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets + * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) + * @sk_lingertime: %SO_LINGER l_linger setting + * @sk_backlog: always used with the per-socket spinlock held + * @sk_callback_lock: used with the callbacks in the end of this struct + * @sk_error_queue: rarely used + * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance) + * @sk_err: last error + * @sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out' + * @sk_ack_backlog: current listen backlog + * @sk_max_ack_backlog: listen backlog set in listen() + * @sk_priority: %SO_PRIORITY setting + * @sk_type: socket type (%SOCK_STREAM, etc) + * @sk_protocol: which protocol this socket belongs in this network family + * @sk_peercred: %SO_PEERCRED setting + * @sk_rcvlowat: %SO_RCVLOWAT setting + * @sk_rcvtimeo: %SO_RCVTIMEO setting + * @sk_sndtimeo: %SO_SNDTIMEO setting + * @sk_filter: socket filtering instructions + * @sk_protinfo: private area, net family specific, when not using slab + * @sk_timer: sock cleanup timer + * @sk_stamp: time stamp of last packet received + * @sk_socket: Identd and reporting IO signals + * @sk_user_data: RPC and Tux layer private data + * @sk_sndmsg_page: cached page for sendmsg + * @sk_sndmsg_off: cached offset for sendmsg + * @sk_send_head: front of stuff to transmit + * @sk_security: used by security modules + * @sk_write_pending: a write to stream socket waits to start + * @sk_state_change: callback to indicate change in the state of the sock + * @sk_data_ready: callback to indicate there is data to be processed + * @sk_write_space: callback to indicate there is bf sending space available + * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) + * @sk_create_child - callback to get new socket events + * @sk_backlog_rcv: callback to process the backlog + * @sk_destruct: called at sock freeing time, i.e. 
when all refcnt == 0 */ struct sock { /* - * Now struct tcp_tw_bucket also uses sock_common, so please just + * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; @@ -191,38 +192,36 @@ struct sock { #define sk_node __sk_common.skc_node #define sk_bind_node __sk_common.skc_bind_node #define sk_refcnt __sk_common.skc_refcnt +#define sk_hash __sk_common.skc_hash +#define sk_prot __sk_common.skc_prot #define sk_xid __sk_common.skc_xid #define sk_vx_info __sk_common.skc_vx_info #define sk_nid __sk_common.skc_nid #define sk_nx_info __sk_common.skc_nx_info - volatile unsigned char sk_zapped; - unsigned char sk_shutdown; - unsigned char sk_use_write_queue; - unsigned char sk_userlocks; - socket_lock_t sk_lock; + unsigned char sk_shutdown : 2, + sk_no_check : 2, + sk_userlocks : 4; + unsigned char sk_protocol; + unsigned short sk_type; int sk_rcvbuf; + socket_lock_t sk_lock; wait_queue_head_t *sk_sleep; struct dst_entry *sk_dst_cache; - rwlock_t sk_dst_lock; struct xfrm_policy *sk_policy[2]; + rwlock_t sk_dst_lock; atomic_t sk_rmem_alloc; - struct sk_buff_head sk_receive_queue; atomic_t sk_wmem_alloc; - struct sk_buff_head sk_write_queue; atomic_t sk_omem_alloc; + struct sk_buff_head sk_receive_queue; + struct sk_buff_head sk_write_queue; int sk_wmem_queued; int sk_forward_alloc; - unsigned int sk_allocation; + gfp_t sk_allocation; int sk_sndbuf; - unsigned long sk_flags; - char sk_no_check; - unsigned char sk_debug; - unsigned char sk_rcvtstamp; - unsigned char sk_no_largesend; int sk_route_caps; + int sk_rcvlowat; + unsigned long sk_flags; unsigned long sk_lingertime; - int sk_hashent; - struct sock *sk_pair; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency @@ -232,29 +231,27 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; - rwlock_t sk_callback_lock; struct sk_buff_head sk_error_queue; - struct proto *sk_prot; + struct proto *sk_prot_creator; + rwlock_t sk_callback_lock; int sk_err, sk_err_soft; unsigned short sk_ack_backlog; unsigned short sk_max_ack_backlog; __u32 sk_priority; - unsigned short sk_type; - unsigned char sk_localroute; - unsigned char sk_protocol; struct ucred sk_peercred; - int sk_rcvlowat; long sk_rcvtimeo; long sk_sndtimeo; struct sk_filter *sk_filter; void *sk_protinfo; - kmem_cache_t *sk_slab; struct timer_list sk_timer; struct timeval sk_stamp; struct socket *sk_socket; void *sk_user_data; - struct module *sk_owner; + struct page *sk_sndmsg_page; + struct sk_buff *sk_send_head; + __u32 sk_sndmsg_off; + int sk_write_pending; void *sk_security; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk, int bytes); @@ -262,36 +259,37 @@ struct sock { void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); + void (*sk_create_child)(struct sock *sk, struct sock *newsk); void (*sk_destruct)(struct sock *sk); }; /* * Hashed lists helper routines */ -static inline struct sock *__sk_head(struct hlist_head *head) +static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } -static inline struct sock *sk_head(struct hlist_head *head) +static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? 
NULL : __sk_head(head); } -static inline struct sock *sk_next(struct sock *sk) +static inline struct sock *sk_next(const struct sock *sk) { return sk->sk_node.next ? hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; } -static inline int sk_unhashed(struct sock *sk) +static inline int sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } -static inline int sk_hashed(struct sock *sk) +static inline int sk_hashed(const struct sock *sk) { - return sk->sk_node.pprev != NULL; + return !sk_unhashed(sk); } static __inline__ void sk_node_init(struct hlist_node *node) @@ -390,8 +388,20 @@ enum sock_flags { SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, + SOCK_ZAPPED, + SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ + SOCK_DBG, /* %SO_DEBUG setting */ + SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ + SOCK_NO_LARGESEND, /* whether to sent large segments or not */ + SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ + SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ }; +static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) +{ + nsk->sk_flags = osk->sk_flags; +} + static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); @@ -407,19 +417,96 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag) return test_bit(flag, &sk->sk_flags); } +static inline void sk_acceptq_removed(struct sock *sk) +{ + sk->sk_ack_backlog--; +} + +static inline void sk_acceptq_added(struct sock *sk) +{ + sk->sk_ack_backlog++; +} + +static inline int sk_acceptq_is_full(struct sock *sk) +{ + return sk->sk_ack_backlog > sk->sk_max_ack_backlog; +} + +/* + * Compute minimal free write space needed to queue new packets. + */ +static inline int sk_stream_min_wspace(struct sock *sk) +{ + return sk->sk_wmem_queued / 2; +} + +static inline int sk_stream_wspace(struct sock *sk) +{ + return sk->sk_sndbuf - sk->sk_wmem_queued; +} + +extern void sk_stream_write_space(struct sock *sk); + +static inline int sk_stream_memory_free(struct sock *sk) +{ + return sk->sk_wmem_queued < sk->sk_sndbuf; +} + +extern void sk_stream_rfree(struct sk_buff *skb); + +static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + skb->sk = sk; + skb->destructor = sk_stream_rfree; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk->sk_forward_alloc -= skb->truesize; +} + +static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb) +{ + skb_truesize_check(skb); + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + sk->sk_wmem_queued -= skb->truesize; + sk->sk_forward_alloc += skb->truesize; + __kfree_skb(skb); +} + /* The per-socket spinlock must be held here. 
*/ -#define sk_add_backlog(__sk, __skb) \ -do { if (!(__sk)->sk_backlog.tail) { \ - (__sk)->sk_backlog.head = \ - (__sk)->sk_backlog.tail = (__skb); \ - } else { \ - ((__sk)->sk_backlog.tail)->next = (__skb); \ - (__sk)->sk_backlog.tail = (__skb); \ +static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + if (!sk->sk_backlog.tail) { + sk->sk_backlog.head = sk->sk_backlog.tail = skb; + } else { + sk->sk_backlog.tail->next = skb; + sk->sk_backlog.tail = skb; + } + skb->next = NULL; +} + +#define sk_wait_event(__sk, __timeo, __condition) \ +({ int rc; \ + release_sock(__sk); \ + rc = __condition; \ + if (!rc) { \ + *(__timeo) = schedule_timeout(*(__timeo)); \ } \ - (__skb)->next = NULL; \ -} while(0) + lock_sock(__sk); \ + rc = __condition; \ + rc; \ +}) + +extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); +extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); +extern void sk_stream_wait_close(struct sock *sk, long timeo_p); +extern int sk_stream_error(struct sock *sk, int flags, int err); +extern void sk_stream_kill_queues(struct sock *sk); + +extern int sk_wait_data(struct sock *sk, long *timeo); -/* IP protocol blocks we attach to sockets. +struct request_sock_ops; +struct timewait_sock_ops; + +/* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface * transport -> network interface is defined by struct inet_proto */ @@ -439,10 +526,19 @@ struct proto { int (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, - int optname, char *optval, int optlen); + int optname, char __user *optval, + int optlen); int (*getsockopt)(struct sock *sk, int level, - int optname, char *optval, - int *option); + int optname, char __user *optval, + int __user *option); + int (*compat_setsockopt)(struct sock *sk, + int level, + int optname, char __user *optval, + int optlen); + int (*compat_getsockopt)(struct sock *sk, + int level, + int optname, char __user *optval, + int __user *option); int (*sendmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct kiocb *iocb, struct sock *sk, @@ -462,31 +558,72 @@ struct proto { void (*unhash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); + /* Memory pressure */ + void (*enter_memory_pressure)(void); + atomic_t *memory_allocated; /* Current allocated memory. */ + atomic_t *sockets_allocated; /* Current number of sockets. */ + /* + * Pressure flag: try to collapse. + * Technical note: it is used by multiple contexts non atomically. + * All the sk_stream_mem_schedule() is of this nature: accounting + * is strict, actions are advisory and have some latency. 
+ */ + int *memory_pressure; + int *sysctl_mem; + int *sysctl_wmem; + int *sysctl_rmem; + int max_header; + + kmem_cache_t *slab; + unsigned int obj_size; + + atomic_t *orphan_count; + + struct request_sock_ops *rsk_prot; + struct timewait_sock_ops *twsk_prot; + + struct module *owner; + char name[32]; + struct list_head node; +#ifdef SOCK_REFCNT_DEBUG + atomic_t socks; +#endif struct { int inuse; u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; } stats[NR_CPUS]; }; -static __inline__ void sk_set_owner(struct sock *sk, struct module *owner) +extern int proto_register(struct proto *prot, int alloc_slab); +extern void proto_unregister(struct proto *prot); + +#ifdef SOCK_REFCNT_DEBUG +static inline void sk_refcnt_debug_inc(struct sock *sk) { - /* - * One should use sk_set_owner just once, after struct sock creation, - * be it shortly after sk_alloc or after a function that returns a new - * struct sock (and that down the call chain called sk_alloc), e.g. the - * IPv4 and IPv6 modules share tcp_create_openreq_child, so if - * tcp_create_openreq_child called sk_set_owner IPv6 would have to - * change the ownership of this struct sock, with one not needed - * transient sk_set_owner call. - */ - BUG_ON(sk->sk_owner != NULL); + atomic_inc(&sk->sk_prot->socks); +} - sk->sk_owner = owner; - __module_get(owner); +static inline void sk_refcnt_debug_dec(struct sock *sk) +{ + atomic_dec(&sk->sk_prot->socks); + printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); } +static inline void sk_refcnt_debug_release(const struct sock *sk) +{ + if (atomic_read(&sk->sk_refcnt) != 1) + printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); +} +#else /* SOCK_REFCNT_DEBUG */ +#define sk_refcnt_debug_inc(sk) do { } while (0) +#define sk_refcnt_debug_dec(sk) do { } while (0) +#define sk_refcnt_debug_release(sk) do { } while (0) +#endif /* SOCK_REFCNT_DEBUG */ + /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { @@ -498,6 +635,15 @@ static __inline__ void sock_prot_dec_use(struct proto *prot) prot->stats[smp_processor_id()].inuse--; } +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. 
+ */ +static inline void __sk_prot_rehash(struct sock *sk) +{ + sk->sk_prot->unhash(sk); + sk->sk_prot->hash(sk); +} + /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) @@ -524,17 +670,17 @@ struct sock_iocb { struct scm_cookie *scm; struct msghdr *msg, async_msg; struct iovec async_iov; + struct kiocb *kiocb; }; static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb) { - BUG_ON(sizeof(struct sock_iocb) > KIOCB_PRIVATE_SIZE); return (struct sock_iocb *)iocb->private; } static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si) { - return container_of((void *)si, struct kiocb, private); + return si->kiocb; } struct socket_alloc { @@ -552,6 +698,43 @@ static inline struct inode *SOCK_INODE(struct socket *socket) return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } +extern void __sk_stream_mem_reclaim(struct sock *sk); +extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind); + +#define SK_STREAM_MEM_QUANTUM ((int)PAGE_SIZE) + +static inline int sk_stream_pages(int amt) +{ + return (amt + SK_STREAM_MEM_QUANTUM - 1) / SK_STREAM_MEM_QUANTUM; +} + +static inline void sk_stream_mem_reclaim(struct sock *sk) +{ + if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) + __sk_stream_mem_reclaim(sk); +} + +static inline void sk_stream_writequeue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) + sk_stream_free_skb(sk, skb); + sk_stream_mem_reclaim(sk); +} + +static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) +{ + return (int)skb->truesize <= sk->sk_forward_alloc || + sk_stream_mem_schedule(sk, skb->truesize, 1); +} + +static inline int sk_stream_wmem_schedule(struct sock *sk, int size) +{ + return size <= sk->sk_forward_alloc || + sk_stream_mem_schedule(sk, size, 0); +} + /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming @@ -565,8 +748,6 @@ static inline struct inode *SOCK_INODE(struct socket *socket) * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. 
*/ -extern void __lock_sock(struct sock *sk); -extern void __release_sock(struct sock *sk); #define sock_owned_by_user(sk) ((sk)->sk_lock.owner) extern void FASTCALL(lock_sock(struct sock *sk)); @@ -576,16 +757,19 @@ extern void FASTCALL(release_sock(struct sock *sk)); #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) -extern struct sock * sk_alloc(int family, int priority, int zero_it, - kmem_cache_t *slab); +extern struct sock *sk_alloc(int family, + gfp_t priority, + struct proto *prot, int zero_it); extern void sk_free(struct sock *sk); +extern struct sock *sk_clone(struct sock *sk, + const gfp_t priority); extern struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, - int priority); + gfp_t priority); extern struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, - int priority); + gfp_t priority); extern void sock_wfree(struct sk_buff *skb); extern void sock_rfree(struct sk_buff *skb); @@ -600,12 +784,8 @@ extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode); -extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, - unsigned long header_len, - unsigned long data_len, - int noblock, - int *errcode); -extern void *sock_kmalloc(struct sock *sk, int size, int priority); +extern void *sock_kmalloc(struct sock *sk, int size, + gfp_t priority); extern void sock_kfree_s(struct sock *sk, void *mem, int size); extern void sk_send_sigurg(struct sock *sk); @@ -613,7 +793,6 @@ extern void sk_send_sigurg(struct sock *sk); * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ -extern int sock_no_release(struct socket *); extern int sock_no_bind(struct socket *, struct sockaddr *, int); extern int sock_no_connect(struct socket *, @@ -646,12 +825,27 @@ extern ssize_t sock_no_sendpage(struct socket *sock, int offset, size_t size, int flags); +/* + * Functions to fill in entries in struct proto_ops when a protocol + * uses the inet style. + */ +extern int sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen); +extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags); +extern int sock_common_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen); +extern int compat_sock_common_getsockopt(struct socket *sock, int level, + int optname, char __user *optval, int __user *optlen); +extern int compat_sock_common_setsockopt(struct socket *sock, int level, + int optname, char __user *optval, int optlen); + +extern void sk_common_release(struct sock *sk); + /* * Default socket callbacks and setup code */ -extern void sock_def_destruct(struct sock *); - /* Initialise core socket variables */ extern void sock_init_data(struct socket *sock, struct sock *sk); @@ -685,8 +879,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock) filter = sk->sk_filter; if (filter) { - int pkt_len = sk_run_filter(skb, filter->insns, - filter->len); + unsigned int pkt_len = sk_run_filter(skb, filter->insns, + filter->len); if (!pkt_len) err = -EPERM; else @@ -755,6 +949,8 @@ static inline void sock_put(struct sock *sk) sk_free(sk); } +extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb); + /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. 
* Note that parent inode held reference count on this struct sock, @@ -780,25 +976,8 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } -static inline int sock_i_uid(struct sock *sk) -{ - int uid; - - read_lock(&sk->sk_callback_lock); - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; - read_unlock(&sk->sk_callback_lock); - return uid; -} - -static inline unsigned long sock_i_ino(struct sock *sk) -{ - unsigned long ino; - - read_lock(&sk->sk_callback_lock); - ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock(&sk->sk_callback_lock); - return ino; -} +extern int sock_i_uid(struct sock *sk); +extern unsigned long sock_i_ino(struct sock *sk); static inline struct dst_entry * __sk_dst_get(struct sock *sk) @@ -855,32 +1034,48 @@ sk_dst_reset(struct sock *sk) write_unlock(&sk->sk_dst_lock); } -static inline struct dst_entry * -__sk_dst_check(struct sock *sk, u32 cookie) -{ - struct dst_entry *dst = sk->sk_dst_cache; - - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { - sk->sk_dst_cache = NULL; - return NULL; - } +extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); - return dst; -} +extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); -static inline struct dst_entry * -sk_dst_check(struct sock *sk, u32 cookie) +static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { - struct dst_entry *dst = sk_dst_get(sk); - - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { - sk_dst_reset(sk); - return NULL; + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_TSO) { + if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) + sk->sk_route_caps &= ~NETIF_F_TSO; } +} - return dst; +static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_wmem_queued += skb->truesize; + sk->sk_forward_alloc -= skb->truesize; } +static inline int skb_copy_to_page(struct sock *sk, char __user *from, + struct sk_buff *skb, struct page *page, + int off, int copy) +{ + if (skb->ip_summed == CHECKSUM_NONE) { + int err = 0; + unsigned int csum = csum_and_copy_from_user(from, + page_address(page) + off, + copy, 0, &err); + if (err) + return err; + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } else if (copy_from_user(page_address(page) + off, from, copy)) + return -EFAULT; + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk->sk_wmem_queued += copy; + sk->sk_forward_alloc -= copy; + return 0; +} /* * Queue a received datagram if it will fit. Stream and sequenced @@ -906,45 +1101,12 @@ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) atomic_add(skb->truesize, &sk->sk_rmem_alloc); } -static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) -{ - int err = 0; - int skb_len; - - /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces - number of warnings when compiling with -W --ANK - */ - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned)sk->sk_rcvbuf) { - err = -ENOMEM; - goto out; - } - - /* It would be deadlock, if sock_queue_rcv_skb is used - with socket lock! We assume that users of this - function are lock free. 
- */ - err = sk_filter(sk, skb, 1); - if (err) - goto out; - - skb->dev = NULL; - skb_set_owner_r(skb, sk); +extern void sk_reset_timer(struct sock *sk, struct timer_list* timer, + unsigned long expires); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. - */ - skb_len = skb->len; - - skb_queue_tail(&sk->sk_receive_queue, skb); +extern void sk_stop_timer(struct sock *sk, struct timer_list* timer); - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); -out: - return err; -} +extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { @@ -967,7 +1129,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) static inline int sock_error(struct sock *sk) { - int err = xchg(&sk->sk_err, 0); + int err; + if (likely(!sk->sk_err)) + return 0; + err = xchg(&sk->sk_err, 0); return -err; } @@ -992,30 +1157,92 @@ static inline void sk_wake_async(struct sock *sk, int how, int band) #define SOCK_MIN_SNDBUF 2048 #define SOCK_MIN_RCVBUF 256 +static inline void sk_stream_moderate_sndbuf(struct sock *sk) +{ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { + sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued / 2); + sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF); + } +} + +static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, + int size, int mem, + gfp_t gfp) +{ + struct sk_buff *skb; + int hdr_len; + + hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header); + skb = alloc_skb_fclone(size + hdr_len, gfp); + if (skb) { + skb->truesize += mem; + if (sk_stream_wmem_schedule(sk, skb->truesize)) { + skb_reserve(skb, hdr_len); + return skb; + } + __kfree_skb(skb); + } else { + sk->sk_prot->enter_memory_pressure(); + sk_stream_moderate_sndbuf(sk); + } + return NULL; +} + +static inline struct sk_buff *sk_stream_alloc_skb(struct sock *sk, + int size, + gfp_t gfp) +{ + return sk_stream_alloc_pskb(sk, size, 0, gfp); +} + +static inline struct page *sk_stream_alloc_page(struct sock *sk) +{ + struct page *page = NULL; + + page = alloc_pages(sk->sk_allocation, 0); + if (!page) { + sk->sk_prot->enter_memory_pressure(); + sk_stream_moderate_sndbuf(sk); + } + return page; +} + +#define sk_stream_for_retrans_queue(skb, sk) \ + for (skb = (sk)->sk_write_queue.next; \ + (skb != (sk)->sk_send_head) && \ + (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ + skb = skb->next) + +/*from STCP for fast SACK Process*/ +#define sk_stream_for_retrans_queue_from(skb, sk) \ + for (; (skb != (sk)->sk_send_head) && \ + (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ + skb = skb->next) + /* * Default write policy as shown to user space via poll/select/SIGIO */ -static inline int sock_writeable(struct sock *sk) +static inline int sock_writeable(const struct sock *sk) { return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf / 2); } -static inline int gfp_any(void) +static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } -static inline long sock_rcvtimeo(struct sock *sk, int noblock) +static inline long sock_rcvtimeo(const struct sock *sk, int noblock) { return noblock ? 0 : sk->sk_rcvtimeo; } -static inline long sock_sndtimeo(struct sock *sk, int noblock) +static inline long sock_sndtimeo(const struct sock *sk, int noblock) { return noblock ? 
0 : sk->sk_sndtimeo; } -static inline int sock_rcvlowat(struct sock *sk, int waitall, int len) +static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; } @@ -1031,44 +1258,48 @@ static inline int sock_intr_errno(long timeo) static __inline__ void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - struct timeval *stamp = &skb->stamp; - if (sk->sk_rcvtstamp) { + struct timeval stamp; + + skb_get_timestamp(skb, &stamp); + if (sock_flag(sk, SOCK_RCVTSTAMP)) { /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (stamp->tv_sec == 0) - do_gettimeofday(stamp); + if (stamp.tv_sec == 0) + do_gettimeofday(&stamp); + skb_set_timestamp(skb, &stamp); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval), - stamp); + &stamp); } else - sk->sk_stamp = *stamp; + sk->sk_stamp = stamp; } -extern atomic_t netstamp_needed; -extern void sock_enable_timestamp(struct sock *sk); -extern void sock_disable_timestamp(struct sock *sk); - -static inline void net_timestamp(struct timeval *stamp) -{ - if (atomic_read(&netstamp_needed)) - do_gettimeofday(stamp); - else { - stamp->tv_sec = 0; - stamp->tv_usec = 0; - } -} +/** + * sk_eat_skb - Release a skb if it is no longer needed + * @sk: socket to eat this skb from + * @skb: socket buffer to eat + * + * This routine must be called with interrupts disabled or with the socket + * locked so that the sk_buff queue operation is ok. +*/ +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) +{ + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); +} -extern int sock_get_timestamp(struct sock *, struct timeval *); +extern void sock_enable_timestamp(struct sock *sk); +extern int sock_get_timestamp(struct sock *, struct timeval __user *); /* * Enable debug/info messages */ -#if 0 -#define NETDEBUG(x) do { } while (0) -#define LIMIT_NETDEBUG(x) do {} while(0) +#ifdef CONFIG_NETDEBUG +#define NETDEBUG(fmt, args...) printk(fmt,##args) +#define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) printk(fmt,##args); } while(0) #else -#define NETDEBUG(x) do { x; } while (0) -#define LIMIT_NETDEBUG(x) do { if (net_ratelimit()) { x; } } while(0) +#define NETDEBUG(fmt, args...) do { } while (0) +#define LIMIT_NETDEBUG(fmt, args...) do { } while(0) #endif /* @@ -1106,6 +1337,24 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; +#ifdef CONFIG_NET int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); +#else +static inline int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + return -ENODEV; +} +#endif + +extern void sk_init(void); + +#ifdef CONFIG_SYSCTL +extern struct ctl_table core_table[]; +#endif + +extern int sysctl_optmem_max; + +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; #endif /* _SOCK_H */
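
One of the more visible changes across these hunks is that the old per-field booleans in struct sock (sk_debug, sk_rcvtstamp, sk_no_largesend, sk_zapped, sk_use_write_queue, sk_localroute) are folded into the single sk_flags bitmap and tested through sock_flag(), which is why SOCK_DEBUG() now checks sock_flag(sk, SOCK_DBG) instead of sk->sk_debug. Below is a minimal user-space sketch of that pattern, not kernel code: the names mirror the header, but plain bit arithmetic stands in for the kernel's __set_bit()/__clear_bit()/test_bit() helpers, and the toy enum carries only a few of the real flag values.

/* Illustrative sketch only: user-space model of the sk_flags bitmap. */
#include <stdio.h>

enum sock_flags { SOCK_DEAD, SOCK_DBG, SOCK_RCVTSTAMP, SOCK_NO_LARGESEND };

struct sock { unsigned long sk_flags; };

static void sock_set_flag(struct sock *sk, enum sock_flags flag)
{
	sk->sk_flags |= 1UL << flag;
}

static void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
	sk->sk_flags &= ~(1UL << flag);
}

static int sock_flag(const struct sock *sk, enum sock_flags flag)
{
	return (sk->sk_flags >> flag) & 1UL;
}

int main(void)
{
	struct sock sk = { .sk_flags = 0 };

	/* Old code tested sk->sk_debug; new code tests the SOCK_DBG bit. */
	sock_set_flag(&sk, SOCK_DBG);
	printf("SOCK_DBG=%d SOCK_RCVTSTAMP=%d\n",
	       sock_flag(&sk, SOCK_DBG), sock_flag(&sk, SOCK_RCVTSTAMP));

	sock_reset_flag(&sk, SOCK_DBG);
	printf("SOCK_DBG=%d\n", sock_flag(&sk, SOCK_DBG));
	return 0;
}

Keeping all of these bits in one word is also what lets the new sock_copy_flags() in the diff inherit a parent socket's settings with a single assignment.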
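
Similarly, the sk_add_backlog() conversion from a multi-line macro to a static inline keeps the same singly-linked head/tail queue: packets that arrive while the socket is owned by a process are chained onto sk_backlog and later handed to sk_backlog_rcv() when the lock is released. A hedged user-space sketch of just the queueing step follows; the per-socket spinlock that the header's comment requires is deliberately omitted from this toy model.

#include <stdio.h>
#include <stddef.h>

struct sk_buff { struct sk_buff *next; int id; };

struct sock {
	struct {
		struct sk_buff *head;
		struct sk_buff *tail;
	} sk_backlog;
};

static void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	if (!sk->sk_backlog.tail) {
		sk->sk_backlog.head = sk->sk_backlog.tail = skb;
	} else {
		sk->sk_backlog.tail->next = skb;
		sk->sk_backlog.tail = skb;
	}
	skb->next = NULL;
}

int main(void)
{
	struct sock sk = { { NULL, NULL } };
	struct sk_buff a = { NULL, 1 }, b = { NULL, 2 };
	struct sk_buff *p;

	sk_add_backlog(&sk, &a);
	sk_add_backlog(&sk, &b);

	for (p = sk.sk_backlog.head; p; p = p->next)
		printf("backlogged skb %d\n", p->id);
	return 0;
}

The inline form behaves exactly like the removed macro; the change is about type checking and readability, not semantics.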