X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp.c;fp=net%2Fipv4%2Ftcp.c;h=072ae9c00da9bbbef5d6c1594d846eb36da76fbf;hb=64ba3f394c830ec48a1c31b53dcae312c56f1604;hp=b04e96e08adabf80c773b1e26418f178bfe22866;hpb=be1e6109ac94a859551f8e1774eb9a8469fe055c;p=linux-2.6.git diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b04e96e08..072ae9c00 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -247,6 +247,7 @@ * TCP_CLOSE socket is finished */ +#include #include #include #include @@ -256,15 +257,13 @@ #include #include #include -#include -#include #include #include #include #include #include -#include + #include #include @@ -277,9 +276,9 @@ atomic_t tcp_orphan_count = ATOMIC_INIT(0); EXPORT_SYMBOL_GPL(tcp_orphan_count); -int sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; +int sysctl_tcp_mem[3]; +int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); @@ -367,7 +366,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLIN | POLLRDNORM | POLLRDHUP; + mask |= POLLIN | POLLRDNORM; /* Connected? */ if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { @@ -572,7 +571,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->tso_segs = 0; if (!copied) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; @@ -623,10 +622,14 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, ssize_t res; struct sock *sk = sock->sk; +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) + if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) + !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) return sock_no_sendpage(sock, page, offset, size, flags); +#undef TCP_ZC_CSUM_FLAGS + lock_sock(sk); TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); @@ -643,7 +646,7 @@ static inline int select_size(struct sock *sk, struct tcp_sock *tp) int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { - if (sk_can_gso(sk)) + if (sk->sk_route_caps & NETIF_F_TSO) tmp = 0; else { int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); @@ -723,7 +726,9 @@ new_segment: /* * Check whether we can use HW checksum. */ - if (sk->sk_route_caps & NETIF_F_ALL_CSUM) + if (sk->sk_route_caps & + (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM)) skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); @@ -819,7 +824,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->tso_segs = 0; from += copy; copied += copy; @@ -932,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo, * calculation of whether or not we must ACK for the sake of * a window update. */ -void tcp_cleanup_rbuf(struct sock *sk, int copied) +static void cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1067,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } if (skb->h.th->fin) { - sk_eat_skb(sk, skb, 0); + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, 0); + sk_eat_skb(sk, skb); if (!desc->count) break; } @@ -1081,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, /* Clean up data we have read: This will do ACK frames. */ if (copied) - tcp_cleanup_rbuf(sk, copied); + cleanup_rbuf(sk, copied); return copied; } @@ -1105,7 +1110,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - int copied_early = 0; lock_sock(sk); @@ -1129,17 +1133,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) { - preempt_enable_no_resched(); - tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); - } else - preempt_enable_no_resched(); -#endif - do { struct sk_buff *skb; u32 offset; @@ -1227,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } } - tcp_cleanup_rbuf(sk, copied); + cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { /* Install new reader */ @@ -1281,10 +1274,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); -#ifdef CONFIG_NET_DMA - tp->ucopy.wakeup = 0; -#endif - if (user_recv) { int chunk; @@ -1340,39 +1329,13 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - printk(KERN_ALERT "dma_cookie < 0\n"); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - if ((offset + used) == skb->len) - copied_early = 1; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1392,19 +1355,15 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = 0; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = 0; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1427,42 +1386,12 @@ skip_copy: tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - struct sk_buff *skb; - dma_cookie_t done, used; - - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); - - while (dma_async_memcpy_complete(tp->ucopy.dma_chan, - tp->ucopy.dma_cookie, &done, - &used) == DMA_IN_PROGRESS) { - /* do partial cleanup of sk_async_wait_queue */ - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - dma_chan_put(tp->ucopy.dma_chan); - tp->ucopy.dma_chan = NULL; - } - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ - tcp_cleanup_rbuf(sk, copied); + cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1539,7 +1468,6 @@ void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; - int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; @@ -1616,11 +1544,6 @@ void tcp_close(struct sock *sk, long timeout) sk_stream_wait_close(sk, timeout); adjudge_to_death: - state = sk->sk_state; - sock_hold(sk); - sock_orphan(sk); - atomic_inc(sk->sk_prot->orphan_count); - /* It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); @@ -1632,9 +1555,8 @@ adjudge_to_death: bh_lock_sock(sk); BUG_TRAP(!sock_owned_by_user(sk)); - /* Have we already been destroyed by a softirq or backlog? */ - if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) - goto out; + sock_hold(sk); + sock_orphan(sk); /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could @@ -1660,9 +1582,9 @@ adjudge_to_death: const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, - tmo - TCP_TIMEWAIT_LEN); + inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { + atomic_inc(sk->sk_prot->orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } @@ -1681,6 +1603,7 @@ adjudge_to_death: NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); } } + atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == TCP_CLOSE) inet_csk_destroy_sock(sk); @@ -1730,9 +1653,6 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); sk_stream_writequeue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->dport = 0; @@ -1768,14 +1688,18 @@ int tcp_disconnect(struct sock *sk, int flags) /* * Socket option code for TCP. */ -static int do_tcp_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, int optlen) +int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, + int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int val; int err = 0; + if (level != SOL_TCP) + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); + /* This is a string value all the others are int's */ if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; @@ -1933,7 +1857,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); + cleanup_rbuf(sk, 1); if (!(val & 1)) icsk->icsk_ack.pingpong = 1; } @@ -1948,30 +1872,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return err; } -int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, - int optlen) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (level != SOL_TCP) - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); - return do_tcp_setsockopt(sk, level, optname, optval, optlen); -} - -#ifdef CONFIG_COMPAT -int compat_tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_setsockopt(sk, level, optname, - optval, optlen); - return do_tcp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL(compat_tcp_setsockopt); -#endif - /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { @@ -2032,13 +1932,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) EXPORT_SYMBOL_GPL(tcp_get_info); -static int do_tcp_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, int __user *optlen) +int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, + int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int val, len; + if (level != SOL_TCP) + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); + if (get_user(len, optlen)) return -EFAULT; @@ -2122,112 +2026,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; } -int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, - int __user *optlen) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (level != SOL_TCP) - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); -} - -#ifdef CONFIG_COMPAT -int compat_tcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL(compat_tcp_getsockopt); -#endif - -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct tcphdr *th; - unsigned thlen; - unsigned int seq; - unsigned int delta; - unsigned int oldlen; - unsigned int len; - - if (!pskb_may_pull(skb, sizeof(*th))) - goto out; - - th = skb->h.th; - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - goto out; - - if (!pskb_may_pull(skb, thlen)) - goto out; - - oldlen = (u16)~skb->len; - __skb_pull(skb, thlen); - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - int mss; - - if (unlikely(type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCPV6 | - 0) || - !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) - goto out; - - mss = skb_shinfo(skb)->gso_size; - skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss; - - segs = NULL; - goto out; - } - - segs = skb_segment(skb, features); - if (IS_ERR(segs)) - goto out; - - len = skb_shinfo(skb)->gso_size; - delta = htonl(oldlen + (thlen + len)); - - skb = segs; - th = skb->h.th; - seq = ntohl(th->seq); - - do { - th->fin = th->psh = 0; - - th->check = ~csum_fold(th->check + delta); - if (skb->ip_summed != CHECKSUM_HW) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); - - seq += len; - skb = skb->next; - th = skb->h.th; - - th->seq = htonl(seq); - th->cwr = 0; - } while (skb->next); - - delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len); - th->check = ~csum_fold(th->check + delta); - if (skb->ip_summed != CHECKSUM_HW) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); - -out: - return segs; -} -EXPORT_SYMBOL(tcp_tso_segment); extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; @@ -2245,8 +2043,7 @@ __setup("thash_entries=", set_thash_entries); void __init tcp_init(void) { struct sk_buff *skb = NULL; - unsigned long limit; - int order, i, max_share; + int order, i; if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), @@ -2320,16 +2117,12 @@ void __init tcp_init(void) sysctl_tcp_mem[1] = 1024 << order; sysctl_tcp_mem[2] = 1536 << order; - limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); - max_share = min(4UL*1024*1024, limit); - - sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_share); - - sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_share); + if (order < 3) { + sysctl_tcp_wmem[2] = 64 * 1024; + sysctl_tcp_rmem[0] = PAGE_SIZE; + sysctl_tcp_rmem[1] = 43689; + sysctl_tcp_rmem[2] = 2 * 43689; + } printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", @@ -2350,4 +2143,3 @@ EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); EXPORT_SYMBOL(tcp_statistics); -EXPORT_SYMBOL_GPL(tcp_cleanup_rbuf);