X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=include%2Fnet%2Fsock.h;h=aab07e510c480f56f2495a3ba390ea85152ced0d;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=d6926d65b60903b7ad56e457a7e3ab0af120e74e;hpb=f7f1b0f1e2fbadeab12d24236000e778aa9b1ead;p=linux-2.6.git

diff --git a/include/net/sock.h b/include/net/sock.h
index d6926d65b..aab07e510 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -62,7 +62,7 @@
  */
 
 /* Define this to get the SOCK_DBG debugging facility. */
-#define SOCK_DEBUGGING
+//#define SOCK_DEBUGGING
 #ifdef SOCK_DEBUGGING
 #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
 					printk(KERN_DEBUG msg); } while (0)
@@ -88,6 +88,7 @@ do {	spin_lock_init(&((__sk)->sk_lock.slock)); \
 } while(0)
 
 struct sock;
+struct proto;
 
 /**
  *	struct sock_common - minimal network layer representation of sockets
@@ -98,10 +99,12 @@ struct sock;
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_refcnt: reference count
+ *	@skc_hash: hash value used with various protocol lookup tables
+ *	@skc_prot: protocol handlers inside a network family
  *
  *	This is the minimal network layer representation of sockets, the header
- *	for struct sock and struct tcp_tw_bucket.
- */
+ *	for struct sock and struct inet_timewait_sock.
+ */
 struct sock_common {
 	unsigned short		skc_family;
 	volatile unsigned char	skc_state;
@@ -110,15 +113,17 @@ struct sock_common {
 	struct hlist_node	skc_node;
 	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
+	unsigned int		skc_hash;
+	struct proto		*skc_prot;
 	xid_t			skc_xid;
-	struct vx_info		*skc_vx_info;
+	struct vx_info		*skc_vx_info;
 	nid_t			skc_nid;
-	struct nx_info		*skc_nx_info;
+	struct nx_info		*skc_nx_info;
 };
 
 /**
  *	struct sock - network layer representation of sockets
- *	@__sk_common: shared layout with tcp_tw_bucket
+ *	@__sk_common: shared layout with inet_timewait_sock
  *	@sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *	@sk_lock:	synchronizer
@@ -140,11 +145,9 @@ struct sock_common {
  *	@sk_no_check: %SO_NO_CHECK setting, whether or not checksum packets
  *	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *	@sk_lingertime: %SO_LINGER l_linger setting
- *	@sk_hashent: hash entry in several tables (e.g. tcp_ehash)
 *	@sk_backlog: always used with the per-socket spinlock held
 *	@sk_callback_lock: used with the callbacks in the end of this struct
 *	@sk_error_queue: rarely used
- *	@sk_prot: protocol handlers inside a network family
 *	@sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance)
 *	@sk_err: last error
 *	@sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out'
@@ -162,7 +165,7 @@ struct sock_common {
 *	@sk_timer: sock cleanup timer
 *	@sk_stamp: time stamp of last packet received
 *	@sk_socket: Identd and reporting IO signals
- *	@sk_user_data: RPC layer private data
+ *	@sk_user_data: RPC and Tux layer private data
 *	@sk_sndmsg_page: cached page for sendmsg
 *	@sk_sndmsg_off: cached offset for sendmsg
 *	@sk_send_head: front of stuff to transmit
@@ -172,12 +175,13 @@ struct sock_common {
 *	@sk_data_ready: callback to indicate there is data to be processed
 *	@sk_write_space: callback to indicate there is buffer sending space available
 *	@sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
+ *	@sk_create_child: callback to get new socket events
 *	@sk_backlog_rcv: callback to process the backlog
 *	@sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 */
 struct sock {
 	/*
-	 * Now struct tcp_tw_bucket also uses sock_common, so please just
+	 * Now struct inet_timewait_sock also uses sock_common, so please just
 	 * don't add anything before this first member (__sk_common) --acme
 	 */
 	struct sock_common	__sk_common;
@@ -188,6 +192,8 @@ struct sock {
 #define sk_node			__sk_common.skc_node
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_refcnt		__sk_common.skc_refcnt
+#define sk_hash			__sk_common.skc_hash
+#define sk_prot			__sk_common.skc_prot
 #define sk_xid			__sk_common.skc_xid
 #define sk_vx_info		__sk_common.skc_vx_info
 #define sk_nid			__sk_common.skc_nid
@@ -210,10 +216,10 @@ struct sock {
 	struct sk_buff_head	sk_write_queue;
 	int			sk_wmem_queued;
 	int			sk_forward_alloc;
-	unsigned int		sk_allocation;
+	gfp_t			sk_allocation;
 	int			sk_sndbuf;
 	int			sk_route_caps;
-	int			sk_hashent;
+	int			sk_rcvlowat;
 	unsigned long 		sk_flags;
 	unsigned long	        sk_lingertime;
 	/*
@@ -226,7 +232,6 @@ struct sock {
 		struct sk_buff *tail;
 	} sk_backlog;
 	struct sk_buff_head	sk_error_queue;
-	struct proto		*sk_prot;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
@@ -235,7 +240,6 @@ struct sock {
 	unsigned short		sk_max_ack_backlog;
 	__u32			sk_priority;
 	struct ucred		sk_peercred;
-	int			sk_rcvlowat;
 	long			sk_rcvtimeo;
 	long			sk_sndtimeo;
 	struct sk_filter      	*sk_filter;
@@ -255,36 +259,37 @@ struct sock {
 	void			(*sk_error_report)(struct sock *sk);
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
+	void			(*sk_create_child)(struct sock *sk, struct sock *newsk);
 	void                    (*sk_destruct)(struct sock *sk);
 };
 
 /*
  * Hashed lists helper routines
  */
-static inline struct sock *__sk_head(struct hlist_head *head)
+static inline struct sock *__sk_head(const struct hlist_head *head)
 {
 	return hlist_entry(head->first, struct sock, sk_node);
 }
 
-static inline struct sock *sk_head(struct hlist_head *head)
+static inline struct sock *sk_head(const struct hlist_head *head)
 {
 	return hlist_empty(head) ? NULL : __sk_head(head);
 }
 
-static inline struct sock *sk_next(struct sock *sk)
+static inline struct sock *sk_next(const struct sock *sk)
 {
 	return sk->sk_node.next ?
 		hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
 }
 
-static inline int sk_unhashed(struct sock *sk)
+static inline int sk_unhashed(const struct sock *sk)
 {
 	return hlist_unhashed(&sk->sk_node);
 }
 
-static inline int sk_hashed(struct sock *sk)
+static inline int sk_hashed(const struct sock *sk)
 {
-	return sk->sk_node.pprev != NULL;
+	return !sk_unhashed(sk);
 }
 
 static __inline__ void sk_node_init(struct hlist_node *node)
@@ -392,6 +397,11 @@ enum sock_flags {
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
 };
 
+static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
+{
+	nsk->sk_flags = osk->sk_flags;
+}
+
 static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
 {
 	__set_bit(flag, &sk->sk_flags);
@@ -454,6 +464,7 @@ static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 
 static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	skb_truesize_check(skb);
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued   -= skb->truesize;
 	sk->sk_forward_alloc += skb->truesize;
@@ -461,16 +472,16 @@ static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb)
 }
 
 /* The per-socket spinlock must be held here. */
-#define sk_add_backlog(__sk, __skb)				\
-do {	if (!(__sk)->sk_backlog.tail) {				\
-		(__sk)->sk_backlog.head =			\
-		(__sk)->sk_backlog.tail = (__skb);		\
-	} else {						\
-		((__sk)->sk_backlog.tail)->next = (__skb);	\
-		(__sk)->sk_backlog.tail = (__skb);		\
-	}							\
-	(__skb)->next = NULL;					\
-} while(0)
+static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
+{
+	if (!sk->sk_backlog.tail) {
+		sk->sk_backlog.head = sk->sk_backlog.tail = skb;
+	} else {
+		sk->sk_backlog.tail->next = skb;
+		sk->sk_backlog.tail = skb;
+	}
+	skb->next = NULL;
+}
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
 ({	int rc;							\
@@ -478,9 +489,9 @@ do { if (!(__sk)->sk_backlog.tail) { \
 	rc = __condition;					\
 	if (!rc) {						\
 		*(__timeo) = schedule_timeout(*(__timeo));	\
-		rc = __condition;				\
 	}							\
 	lock_sock(__sk);					\
+	rc = __condition;					\
 	rc;							\
 })
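For context on the two changes above: sk_add_backlog() is now a type-checked inline with exactly the old macro's behaviour, and sk_wait_event() now re-evaluates __condition only after lock_sock() has been retaken, so a wakeup racing with the timeout can no longer be misreported as failure. The backlog helper is meant to run in softirq context while a process owns the socket; release_sock() later replays the queued packets through sk_backlog_rcv(). A minimal sketch of the usual caller follows (modelled on tcp_v4_rcv(); it is essentially the pattern that the sk_receive_skb() helper declared further down in this diff factors out, and is not itself part of the patch):

static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int rc = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		rc = sk->sk_backlog_rcv(sk, skb);	/* process immediately */
	else
		sk_add_backlog(sk, skb);	/* drained by release_sock() */
	bh_unlock_sock(sk);
	return rc;
}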
@@ -492,6 +503,9 @@ extern void sk_stream_kill_queues(struct sock *sk);
 
 extern int sk_wait_data(struct sock *sk, long *timeo);
 
+struct request_sock_ops;
+struct timewait_sock_ops;
+
 /* Networking protocol blocks we attach to sockets.
  * socket layer -> transport layer interface
  * transport -> network interface is defined by struct inet_proto
@@ -517,6 +531,14 @@ struct proto {
 	int			(*getsockopt)(struct sock *sk, int level,
 					int optname, char __user *optval,
 					int __user *option);
+	int			(*compat_setsockopt)(struct sock *sk,
+					int level,
+					int optname, char __user *optval,
+					int optlen);
+	int			(*compat_getsockopt)(struct sock *sk,
+					int level,
+					int optname, char __user *optval,
+					int __user *option);
 	int			(*sendmsg)(struct kiocb *iocb, struct sock *sk,
 					   struct msghdr *msg, size_t len);
 	int			(*recvmsg)(struct kiocb *iocb, struct sock *sk,
@@ -555,12 +577,19 @@ struct proto {
 	kmem_cache_t		*slab;
 	unsigned int		obj_size;
 
+	atomic_t		*orphan_count;
+
+	struct request_sock_ops	*rsk_prot;
+	struct timewait_sock_ops *twsk_prot;
+
 	struct module		*owner;
 
 	char			name[32];
 
 	struct list_head	node;
-
+#ifdef SOCK_REFCNT_DEBUG
+	atomic_t		socks;
+#endif
 	struct {
 		int inuse;
 		u8  __pad[SMP_CACHE_BYTES - sizeof(int)];
@@ -570,6 +599,31 @@ extern int proto_register(struct proto *prot, int alloc_slab);
 extern void proto_unregister(struct proto *prot);
 
+#ifdef SOCK_REFCNT_DEBUG
+static inline void sk_refcnt_debug_inc(struct sock *sk)
+{
+	atomic_inc(&sk->sk_prot->socks);
+}
+
+static inline void sk_refcnt_debug_dec(struct sock *sk)
+{
+	atomic_dec(&sk->sk_prot->socks);
+	printk(KERN_DEBUG "%s socket %p released, %d are still alive\n",
+	       sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks));
+}
+
+static inline void sk_refcnt_debug_release(const struct sock *sk)
+{
+	if (atomic_read(&sk->sk_refcnt) != 1)
+		printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
+		       sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt));
+}
+#else /* SOCK_REFCNT_DEBUG */
+#define sk_refcnt_debug_inc(sk) do { } while (0)
+#define sk_refcnt_debug_dec(sk) do { } while (0)
+#define sk_refcnt_debug_release(sk) do { } while (0)
+#endif /* SOCK_REFCNT_DEBUG */
+
 /* Called with local bh disabled */
 static __inline__ void sock_prot_inc_use(struct proto *prot)
 {
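The sk_refcnt_debug_*() helpers just added are active only when SOCK_REFCNT_DEBUG is defined, in which case the new socks counter in struct proto tracks how many sockets of each protocol are alive; otherwise they compile to nothing. A protocol opts in at creation, destruction and final release, roughly as below (hypothetical myproto_* functions, shown purely for the calling convention; the inet code wires these helpers in at the analogous spots):

/* Hypothetical protocol glue for the refcount-debug helpers. */
static int myproto_init_sock(struct sock *sk)
{
	sk_refcnt_debug_inc(sk);	/* one more live myproto socket */
	return 0;
}

static void myproto_destruct(struct sock *sk)
{
	sk_refcnt_debug_dec(sk);	/* logs how many sockets remain */
}

static void myproto_destroy(struct sock *sk)
{
	sk_refcnt_debug_release(sk);	/* complains if sk_refcnt != 1 here */
	sock_put(sk);			/* drop the final reference */
}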
@@ -581,6 +635,15 @@ static __inline__ void sock_prot_dec_use(struct proto *prot)
 	prot->stats[smp_processor_id()].inuse--;
 }
 
+/* With per-bucket locks this operation is not atomic, so
+ * this version is no worse.
+ */
+static inline void __sk_prot_rehash(struct sock *sk)
+{
+	sk->sk_prot->unhash(sk);
+	sk->sk_prot->hash(sk);
+}
+
 /* About 10 seconds */
 #define SOCK_DESTROY_TIME (10*HZ)
 
@@ -666,6 +729,12 @@ static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
 	       sk_stream_mem_schedule(sk, skb->truesize, 1);
 }
 
+static inline int sk_stream_wmem_schedule(struct sock *sk, int size)
+{
+	return size <= sk->sk_forward_alloc ||
+	       sk_stream_mem_schedule(sk, size, 0);
+}
+
 /* Used by processes to "lock" a socket state, so that
  * interrupts and bottom half handlers won't change it
  * from under us. It essentially blocks any incoming
@@ -688,16 +757,19 @@ extern void FASTCALL(release_sock(struct sock *sk));
 #define bh_lock_sock(__sk)	spin_lock(&((__sk)->sk_lock.slock))
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
-extern struct sock		*sk_alloc(int family, int priority,
+extern struct sock		*sk_alloc(int family,
+					  gfp_t priority,
 					  struct proto *prot, int zero_it);
 extern void			sk_free(struct sock *sk);
+extern struct sock		*sk_clone(struct sock *sk,
+					  const gfp_t priority);
 extern struct sk_buff		*sock_wmalloc(struct sock *sk,
 					      unsigned long size, int force,
-					      int priority);
+					      gfp_t priority);
 extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      unsigned long size, int force,
-					      int priority);
+					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
 
@@ -712,7 +784,8 @@ extern struct sk_buff 		*sock_alloc_send_skb(struct sock *sk,
 						     unsigned long size,
 						     int noblock,
 						     int *errcode);
-extern void *sock_kmalloc(struct sock *sk, int size, int priority);
+extern void *sock_kmalloc(struct sock *sk, int size,
+			  gfp_t priority);
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
 extern void sk_send_sigurg(struct sock *sk);
 
@@ -762,6 +835,10 @@ extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
 			       struct msghdr *msg, size_t size, int flags);
 extern int sock_common_setsockopt(struct socket *sock, int level, int optname,
 				  char __user *optval, int optlen);
+extern int compat_sock_common_getsockopt(struct socket *sock, int level,
+		int optname, char __user *optval, int __user *optlen);
+extern int compat_sock_common_setsockopt(struct socket *sock, int level,
+		int optname, char __user *optval, int optlen);
 
 extern void sk_common_release(struct sock *sk);
 
@@ -802,8 +879,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
 
 	filter = sk->sk_filter;
 	if (filter) {
-		int pkt_len = sk_run_filter(skb, filter->insns,
-					    filter->len);
+		unsigned int pkt_len = sk_run_filter(skb, filter->insns,
+						     filter->len);
 		if (!pkt_len)
 			err = -EPERM;
 		else
@@ -872,6 +949,8 @@ static inline void sock_put(struct sock *sk)
 		sk_free(sk);
 }
 
+extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb);
+
 /* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
@@ -955,32 +1034,18 @@ sk_dst_reset(struct sock *sk)
 	write_unlock(&sk->sk_dst_lock);
 }
 
-static inline struct dst_entry *
-__sk_dst_check(struct sock *sk, u32 cookie)
-{
-	struct dst_entry *dst = sk->sk_dst_cache;
-
-	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
-		sk->sk_dst_cache = NULL;
-		dst_release(dst);
-		return NULL;
-	}
+extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
 
-	return dst;
-}
+extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
-static inline struct dst_entry *
-sk_dst_check(struct sock *sk, u32 cookie)
+static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
-	struct dst_entry *dst = sk_dst_get(sk);
-
-	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
-		sk_dst_reset(sk);
-		dst_release(dst);
-		return NULL;
+	__sk_dst_set(sk, dst);
+	sk->sk_route_caps = dst->dev->features;
+	if (sk->sk_route_caps & NETIF_F_TSO) {
+		if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
+			sk->sk_route_caps &= ~NETIF_F_TSO;
 	}
-
-	return dst;
 }
 
 static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb)
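The new sk_setup_caps() above centralises what the protocols used to open-code after each route lookup: it caches the new dst with __sk_dst_set() and recomputes sk_route_caps from the output device's features, masking out NETIF_F_TSO when the socket forbids large sends or the path demands an extra header. A hedged sketch of a call site follows (hypothetical example_* helper; in this kernel a struct rtable embeds its dst_entry as rt->u.dst, and the real callers sit at the tail of the IPv4/IPv6 connect paths):

/* Hypothetical connect() tail: publish the freshly resolved route to the
 * socket and derive the hardware offloads the socket may now rely on.
 */
static void example_attach_route(struct sock *sk, struct rtable *rt)
{
	sk_setup_caps(sk, &rt->u.dst);	/* sets sk_dst_cache + sk_route_caps */
}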
@@ -1041,45 +1106,7 @@ extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
 
 extern void sk_stop_timer(struct sock *sk, struct timer_list* timer);
 
-static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
-{
-	int err = 0;
-	int skb_len;
-
-	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
-	   number of warnings when compiling with -W --ANK
-	 */
-	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned)sk->sk_rcvbuf) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	/* It would be deadlock, if sock_queue_rcv_skb is used
-	   with socket lock! We assume that users of this
-	   function are lock free.
-	*/
-	err = sk_filter(sk, skb, 1);
-	if (err)
-		goto out;
-
-	skb->dev = NULL;
-	skb_set_owner_r(skb, sk);
-
-	/* Cache the SKB length before we tack it onto the receive
-	 * queue. Once it is added it no longer belongs to us and
-	 * may be freed by other threads of control pulling packets
-	 * from the queue.
-	 */
-	skb_len = skb->len;
-
-	skb_queue_tail(&sk->sk_receive_queue, skb);
-
-	if (!sock_flag(sk, SOCK_DEAD))
-		sk->sk_data_ready(sk, skb_len);
-out:
-	return err;
-}
+extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
 static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
 {
@@ -1102,7 +1129,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
 
 static inline int sock_error(struct sock *sk)
 {
-	int err = xchg(&sk->sk_err, 0);
+	int err;
+	if (likely(!sk->sk_err))
+		return 0;
+	err = xchg(&sk->sk_err, 0);
 	return -err;
 }
 
@@ -1136,15 +1166,18 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 }
 
 static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
-						   int size, int mem, int gfp)
+						   int size, int mem,
+						   gfp_t gfp)
 {
-	struct sk_buff *skb = alloc_skb(size + sk->sk_prot->max_header, gfp);
+	struct sk_buff *skb;
+	int hdr_len;
+	hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
+	skb = alloc_skb_fclone(size + hdr_len, gfp);
 
 	if (skb) {
 		skb->truesize += mem;
-		if (sk->sk_forward_alloc >= (int)skb->truesize ||
-		    sk_stream_mem_schedule(sk, skb->truesize, 0)) {
-			skb_reserve(skb, sk->sk_prot->max_header);
+		if (sk_stream_wmem_schedule(sk, skb->truesize)) {
+			skb_reserve(skb, hdr_len);
 			return skb;
 		}
 		__kfree_skb(skb);
@@ -1156,7 +1189,8 @@ static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
 }
 
 static inline struct sk_buff *sk_stream_alloc_skb(struct sock *sk,
-						  int size, int gfp)
+						  int size,
+						  gfp_t gfp)
 {
 	return sk_stream_alloc_pskb(sk, size, 0, gfp);
 }
@@ -1165,10 +1199,8 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
 {
 	struct page *page = NULL;
 
-	if (sk->sk_forward_alloc >= (int)PAGE_SIZE ||
-	    sk_stream_mem_schedule(sk, PAGE_SIZE, 0))
-		page = alloc_pages(sk->sk_allocation, 0);
-	else {
+	page = alloc_pages(sk->sk_allocation, 0);
+	if (!page) {
 		sk->sk_prot->enter_memory_pressure();
 		sk_stream_moderate_sndbuf(sk);
 	}
@@ -1181,6 +1213,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
 		(skb != (struct sk_buff *)&(sk)->sk_write_queue);	\
 		skb = skb->next)
 
+/* From STCP, for fast SACK processing */
+#define sk_stream_for_retrans_queue_from(skb, sk)			\
+	for (; (skb != (sk)->sk_send_head) &&				\
+	     (skb != (struct sk_buff *)&(sk)->sk_write_queue);		\
+	     skb = skb->next)
+
 /*
 *	Default write policy as shown to user space via poll/select/SIGIO
 */
@@ -1189,7 +1227,7 @@ static inline int sock_writeable(const struct sock *sk)
 	return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf / 2);
 }
 
-static inline int gfp_any(void)
+static inline gfp_t gfp_any(void)
 {
 	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
 }
@@ -1220,16 +1258,19 @@ static inline int sock_intr_errno(long timeo)
 static __inline__ void
 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 {
-	struct timeval *stamp = &skb->stamp;
+	struct timeval stamp;
+
+	skb_get_timestamp(skb, &stamp);
 	if (sock_flag(sk, SOCK_RCVTSTAMP)) {
 		/* Race occurred between timestamp enabling and packet
 		   receiving. Fill in the current time for now. */
-		if (stamp->tv_sec == 0)
-			do_gettimeofday(stamp);
+		if (stamp.tv_sec == 0)
+			do_gettimeofday(&stamp);
+		skb_set_timestamp(skb, &stamp);
 		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval),
-			 stamp);
+			 &stamp);
 	} else
-		sk->sk_stamp = *stamp;
+		sk->sk_stamp = stamp;
 }
 
 /**
@@ -1253,12 +1294,12 @@ extern int sock_get_timestamp(struct sock *, struct timeval __user *);
 *	Enable debug/info messages
 */
-#if 0
-#define NETDEBUG(x)	do { } while (0)
-#define LIMIT_NETDEBUG(x) do {} while(0)
+#ifdef CONFIG_NETDEBUG
+#define NETDEBUG(fmt, args...)	printk(fmt,##args)
+#define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) printk(fmt,##args); } while(0)
 #else
-#define NETDEBUG(x)	do { x; } while (0)
-#define LIMIT_NETDEBUG(x) do { if (net_ratelimit()) { x; } } while(0)
+#define NETDEBUG(fmt, args...)	do { } while (0)
+#define LIMIT_NETDEBUG(fmt, args...) do { } while(0)
 #endif
 
 /*
@@ -1305,4 +1346,15 @@ static inline int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsign
 }
 #endif
 
+extern void sk_init(void);
+
+#ifdef CONFIG_SYSCTL
+extern struct ctl_table core_table[];
+#endif
+
+extern int sysctl_optmem_max;
+
+extern __u32 sysctl_wmem_default;
+extern __u32 sysctl_rmem_default;
+
 #endif /* _SOCK_H */
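A closing note on the NETDEBUG rework above: the macros now take a printk()-style format list and expand to a (rate-limited) printk() only when CONFIG_NETDEBUG is set, so call sites supply the format directly instead of wrapping a whole statement. An illustrative, hypothetical caller:

/* Old style: LIMIT_NETDEBUG(printk(KERN_DEBUG "...\n"));
 * new style: the macro owns both the printk() and the net_ratelimit() test.
 */
static void example_report_short_packet(struct sk_buff *skb, int len)
{
	LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: %d/%u\n",
		       len, skb->len);
}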