linux 2.6.16.38 w/ vs2.0.3-rc1
[linux-2.6.git] / net / ipv4 / tcp.c
index b04e96e..072ae9c 100644 (file)
  *     TCP_CLOSE               socket is finished
  */
 
+#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/fcntl.h>
 #include <linux/fs.h>
 #include <linux/random.h>
 #include <linux/bootmem.h>
-#include <linux/cache.h>
-#include <linux/err.h>
 #include <linux/in.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
-#include <net/netdma.h>
+
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -277,9 +276,9 @@ atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
-int sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
+int sysctl_tcp_mem[3];
+int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
+int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 
 EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
@@ -367,7 +366,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
                mask |= POLLHUP;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
-               mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+               mask |= POLLIN | POLLRDNORM;
 
        /* Connected? */
        if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
@@ -572,7 +571,7 @@ new_segment:
                skb->ip_summed = CHECKSUM_HW;
                tp->write_seq += copy;
                TCP_SKB_CB(skb)->end_seq += copy;
-               skb_shinfo(skb)->gso_segs = 0;
+               skb_shinfo(skb)->tso_segs = 0;
 
                if (!copied)
                        TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
@@ -623,10 +622,14 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
        ssize_t res;
        struct sock *sk = sock->sk;
 
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
+
        if (!(sk->sk_route_caps & NETIF_F_SG) ||
-           !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
+           !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
                return sock_no_sendpage(sock, page, offset, size, flags);
 
+#undef TCP_ZC_CSUM_FLAGS
+
        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
        res = do_tcp_sendpages(sk, &page, offset, size, flags);
@@ -643,7 +646,7 @@ static inline int select_size(struct sock *sk, struct tcp_sock *tp)
        int tmp = tp->mss_cache;
 
        if (sk->sk_route_caps & NETIF_F_SG) {
-               if (sk_can_gso(sk))
+               if (sk->sk_route_caps & NETIF_F_TSO)
                        tmp = 0;
                else {
                        int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
@@ -723,7 +726,9 @@ new_segment:
                                /*
                                 * Check whether we can use HW checksum.
                                 */
-                               if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
+                               if (sk->sk_route_caps &
+                                   (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
+                                    NETIF_F_HW_CSUM))
                                        skb->ip_summed = CHECKSUM_HW;
 
                                skb_entail(sk, tp, skb);
@@ -819,7 +824,7 @@ new_segment:
 
                        tp->write_seq += copy;
                        TCP_SKB_CB(skb)->end_seq += copy;
-                       skb_shinfo(skb)->gso_segs = 0;
+                       skb_shinfo(skb)->tso_segs = 0;
 
                        from += copy;
                        copied += copy;
@@ -932,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-void tcp_cleanup_rbuf(struct sock *sk, int copied)
+static void cleanup_rbuf(struct sock *sk, int copied)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int time_to_ack = 0;
@@ -1067,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                                break;
                }
                if (skb->h.th->fin) {
-                       sk_eat_skb(sk, skb, 0);
+                       sk_eat_skb(sk, skb);
                        ++seq;
                        break;
                }
-               sk_eat_skb(sk, skb, 0);
+               sk_eat_skb(sk, skb);
                if (!desc->count)
                        break;
        }
@@ -1081,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
        /* Clean up data we have read: This will do ACK frames. */
        if (copied)
-               tcp_cleanup_rbuf(sk, copied);
+               cleanup_rbuf(sk, copied);
        return copied;
 }
 
@@ -1105,7 +1110,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
        int target;             /* Read at least this many bytes */
        long timeo;
        struct task_struct *user_recv = NULL;
-       int copied_early = 0;
 
        lock_sock(sk);
 
@@ -1129,17 +1133,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
-#ifdef CONFIG_NET_DMA
-       tp->ucopy.dma_chan = NULL;
-       preempt_disable();
-       if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
-           !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) {
-               preempt_enable_no_resched();
-               tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
-       } else
-               preempt_enable_no_resched();
-#endif
-
        do {
                struct sk_buff *skb;
                u32 offset;
@@ -1227,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                        }
                }
 
-               tcp_cleanup_rbuf(sk, copied);
+               cleanup_rbuf(sk, copied);
 
                if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
                        /* Install new reader */
@@ -1281,10 +1274,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                } else
                        sk_wait_data(sk, &timeo);
 
-#ifdef CONFIG_NET_DMA
-               tp->ucopy.wakeup = 0;
-#endif
-
                if (user_recv) {
                        int chunk;
 
@@ -1340,39 +1329,13 @@ do_prequeue:
                }
 
                if (!(flags & MSG_TRUNC)) {
-#ifdef CONFIG_NET_DMA
-                       if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-                               tp->ucopy.dma_chan = get_softnet_dma();
-
-                       if (tp->ucopy.dma_chan) {
-                               tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
-                                       tp->ucopy.dma_chan, skb, offset,
-                                       msg->msg_iov, used,
-                                       tp->ucopy.pinned_list);
-
-                               if (tp->ucopy.dma_cookie < 0) {
-
-                                       printk(KERN_ALERT "dma_cookie < 0\n");
-
-                                       /* Exception. Bailout! */
-                                       if (!copied)
-                                               copied = -EFAULT;
-                                       break;
-                               }
-                               if ((offset + used) == skb->len)
-                                       copied_early = 1;
-
-                       } else
-#endif
-                       {
-                               err = skb_copy_datagram_iovec(skb, offset,
-                                               msg->msg_iov, used);
-                               if (err) {
-                                       /* Exception. Bailout! */
-                                       if (!copied)
-                                               copied = -EFAULT;
-                                       break;
-                               }
+                       err = skb_copy_datagram_iovec(skb, offset,
+                                                     msg->msg_iov, used);
+                       if (err) {
+                               /* Exception. Bailout! */
+                               if (!copied)
+                                       copied = -EFAULT;
+                               break;
                        }
                }
 
@@ -1392,19 +1355,15 @@ skip_copy:
 
                if (skb->h.th->fin)
                        goto found_fin_ok;
-               if (!(flags & MSG_PEEK)) {
-                       sk_eat_skb(sk, skb, copied_early);
-                       copied_early = 0;
-               }
+               if (!(flags & MSG_PEEK))
+                       sk_eat_skb(sk, skb);
                continue;
 
        found_fin_ok:
                /* Process the FIN. */
                ++*seq;
-               if (!(flags & MSG_PEEK)) {
-                       sk_eat_skb(sk, skb, copied_early);
-                       copied_early = 0;
-               }
+               if (!(flags & MSG_PEEK))
+                       sk_eat_skb(sk, skb);
                break;
        } while (len > 0);
 
@@ -1427,42 +1386,12 @@ skip_copy:
                tp->ucopy.len = 0;
        }
 
-#ifdef CONFIG_NET_DMA
-       if (tp->ucopy.dma_chan) {
-               struct sk_buff *skb;
-               dma_cookie_t done, used;
-
-               dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
-               while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
-                                                tp->ucopy.dma_cookie, &done,
-                                                &used) == DMA_IN_PROGRESS) {
-                       /* do partial cleanup of sk_async_wait_queue */
-                       while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
-                              (dma_async_is_complete(skb->dma_cookie, done,
-                                                     used) == DMA_SUCCESS)) {
-                               __skb_dequeue(&sk->sk_async_wait_queue);
-                               kfree_skb(skb);
-                       }
-               }
-
-               /* Safe to free early-copied skbs now */
-               __skb_queue_purge(&sk->sk_async_wait_queue);
-               dma_chan_put(tp->ucopy.dma_chan);
-               tp->ucopy.dma_chan = NULL;
-       }
-       if (tp->ucopy.pinned_list) {
-               dma_unpin_iovec_pages(tp->ucopy.pinned_list);
-               tp->ucopy.pinned_list = NULL;
-       }
-#endif
-
        /* According to UNIX98, msg_name/msg_namelen are ignored
         * on connected socket. I was just happy when found this 8) --ANK
         */
 
        /* Clean up data we have read: This will do ACK frames. */
-       tcp_cleanup_rbuf(sk, copied);
+       cleanup_rbuf(sk, copied);
 
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
@@ -1539,7 +1468,6 @@ void tcp_close(struct sock *sk, long timeout)
 {
        struct sk_buff *skb;
        int data_was_unread = 0;
-       int state;
 
        lock_sock(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
@@ -1616,11 +1544,6 @@ void tcp_close(struct sock *sk, long timeout)
        sk_stream_wait_close(sk, timeout);
 
 adjudge_to_death:
-       state = sk->sk_state;
-       sock_hold(sk);
-       sock_orphan(sk);
-       atomic_inc(sk->sk_prot->orphan_count);
-
        /* It is the last release_sock in its life. It will remove backlog. */
        release_sock(sk);
 
@@ -1632,9 +1555,8 @@ adjudge_to_death:
        bh_lock_sock(sk);
        BUG_TRAP(!sock_owned_by_user(sk));
 
-       /* Have we already been destroyed by a softirq or backlog? */
-       if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
-               goto out;
+       sock_hold(sk);
+       sock_orphan(sk);
 
        /*      This is a (useful) BSD violating of the RFC. There is a
         *      problem with TCP as specified in that the other end could
@@ -1660,9 +1582,9 @@ adjudge_to_death:
                        const int tmo = tcp_fin_time(sk);
 
                        if (tmo > TCP_TIMEWAIT_LEN) {
-                               inet_csk_reset_keepalive_timer(sk,
-                                               tmo - TCP_TIMEWAIT_LEN);
+                               inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
                        } else {
+                               atomic_inc(sk->sk_prot->orphan_count);
                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                                goto out;
                        }
@@ -1681,6 +1603,7 @@ adjudge_to_death:
                        NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
                }
        }
+       atomic_inc(sk->sk_prot->orphan_count);
 
        if (sk->sk_state == TCP_CLOSE)
                inet_csk_destroy_sock(sk);
@@ -1730,9 +1653,6 @@ int tcp_disconnect(struct sock *sk, int flags)
        __skb_queue_purge(&sk->sk_receive_queue);
        sk_stream_writequeue_purge(sk);
        __skb_queue_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_NET_DMA
-       __skb_queue_purge(&sk->sk_async_wait_queue);
-#endif
 
        inet->dport = 0;
 
@@ -1768,14 +1688,18 @@ int tcp_disconnect(struct sock *sk, int flags)
 /*
  *     Socket option code for TCP.
  */
-static int do_tcp_setsockopt(struct sock *sk, int level,
-               int optname, char __user *optval, int optlen)
+int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+                  int optlen)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int val;
        int err = 0;
 
+       if (level != SOL_TCP)
+               return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+                                                    optval, optlen);
+
        /* This is a string value all the others are int's */
        if (optname == TCP_CONGESTION) {
                char name[TCP_CA_NAME_MAX];
@@ -1933,7 +1857,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                            (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
                            inet_csk_ack_scheduled(sk)) {
                                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-                               tcp_cleanup_rbuf(sk, 1);
+                               cleanup_rbuf(sk, 1);
                                if (!(val & 1))
                                        icsk->icsk_ack.pingpong = 1;
                        }
@@ -1948,30 +1872,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
        return err;
 }
 
-int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
-                  int optlen)
-{
-       struct inet_connection_sock *icsk = inet_csk(sk);
-
-       if (level != SOL_TCP)
-               return icsk->icsk_af_ops->setsockopt(sk, level, optname,
-                                                    optval, optlen);
-       return do_tcp_setsockopt(sk, level, optname, optval, optlen);
-}
-
-#ifdef CONFIG_COMPAT
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
-                         char __user *optval, int optlen)
-{
-       if (level != SOL_TCP)
-               return inet_csk_compat_setsockopt(sk, level, optname,
-                                                 optval, optlen);
-       return do_tcp_setsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL(compat_tcp_setsockopt);
-#endif
-
 /* Return information about state of tcp endpoint in API format. */
 void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
@@ -2032,13 +1932,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
-static int do_tcp_getsockopt(struct sock *sk, int level,
-               int optname, char __user *optval, int __user *optlen)
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+                  int __user *optlen)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int val, len;
 
+       if (level != SOL_TCP)
+               return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+                                                    optval, optlen);
+
        if (get_user(len, optlen))
                return -EFAULT;
 
@@ -2122,112 +2026,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
        return 0;
 }
 
-int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
-                  int __user *optlen)
-{
-       struct inet_connection_sock *icsk = inet_csk(sk);
-
-       if (level != SOL_TCP)
-               return icsk->icsk_af_ops->getsockopt(sk, level, optname,
-                                                    optval, optlen);
-       return do_tcp_getsockopt(sk, level, optname, optval, optlen);
-}
-
-#ifdef CONFIG_COMPAT
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
-                         char __user *optval, int __user *optlen)
-{
-       if (level != SOL_TCP)
-               return inet_csk_compat_getsockopt(sk, level, optname,
-                                                 optval, optlen);
-       return do_tcp_getsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL(compat_tcp_getsockopt);
-#endif
-
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
-{
-       struct sk_buff *segs = ERR_PTR(-EINVAL);
-       struct tcphdr *th;
-       unsigned thlen;
-       unsigned int seq;
-       unsigned int delta;
-       unsigned int oldlen;
-       unsigned int len;
-
-       if (!pskb_may_pull(skb, sizeof(*th)))
-               goto out;
-
-       th = skb->h.th;
-       thlen = th->doff * 4;
-       if (thlen < sizeof(*th))
-               goto out;
-
-       if (!pskb_may_pull(skb, thlen))
-               goto out;
-
-       oldlen = (u16)~skb->len;
-       __skb_pull(skb, thlen);
-
-       if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
-               /* Packet is from an untrusted source, reset gso_segs. */
-               int type = skb_shinfo(skb)->gso_type;
-               int mss;
-
-               if (unlikely(type &
-                            ~(SKB_GSO_TCPV4 |
-                              SKB_GSO_DODGY |
-                              SKB_GSO_TCP_ECN |
-                              SKB_GSO_TCPV6 |
-                              0) ||
-                            !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
-                       goto out;
-
-               mss = skb_shinfo(skb)->gso_size;
-               skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
-
-               segs = NULL;
-               goto out;
-       }
-
-       segs = skb_segment(skb, features);
-       if (IS_ERR(segs))
-               goto out;
-
-       len = skb_shinfo(skb)->gso_size;
-       delta = htonl(oldlen + (thlen + len));
-
-       skb = segs;
-       th = skb->h.th;
-       seq = ntohl(th->seq);
-
-       do {
-               th->fin = th->psh = 0;
-
-               th->check = ~csum_fold(th->check + delta);
-               if (skb->ip_summed != CHECKSUM_HW)
-                       th->check = csum_fold(csum_partial(skb->h.raw, thlen,
-                                                          skb->csum));
-
-               seq += len;
-               skb = skb->next;
-               th = skb->h.th;
-
-               th->seq = htonl(seq);
-               th->cwr = 0;
-       } while (skb->next);
-
-       delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
-       th->check = ~csum_fold(th->check + delta);
-       if (skb->ip_summed != CHECKSUM_HW)
-               th->check = csum_fold(csum_partial(skb->h.raw, thlen,
-                                                  skb->csum));
-
-out:
-       return segs;
-}
-EXPORT_SYMBOL(tcp_tso_segment);
 
 extern void __skb_cb_too_small_for_tcp(int, int);
 extern struct tcp_congestion_ops tcp_reno;
@@ -2245,8 +2043,7 @@ __setup("thash_entries=", set_thash_entries);
 void __init tcp_init(void)
 {
        struct sk_buff *skb = NULL;
-       unsigned long limit;
-       int order, i, max_share;
+       int order, i;
 
        if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
                __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
@@ -2320,16 +2117,12 @@ void __init tcp_init(void)
        sysctl_tcp_mem[1] = 1024 << order;
        sysctl_tcp_mem[2] = 1536 << order;
 
-       limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
-       max_share = min(4UL*1024*1024, limit);
-
-       sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
-       sysctl_tcp_wmem[1] = 16*1024;
-       sysctl_tcp_wmem[2] = max(64*1024, max_share);
-
-       sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
-       sysctl_tcp_rmem[1] = 87380;
-       sysctl_tcp_rmem[2] = max(87380, max_share);
+       if (order < 3) {
+               sysctl_tcp_wmem[2] = 64 * 1024;
+               sysctl_tcp_rmem[0] = PAGE_SIZE;
+               sysctl_tcp_rmem[1] = 43689;
+               sysctl_tcp_rmem[2] = 2 * 43689;
+       }
 
        printk(KERN_INFO "TCP: Hash tables configured "
               "(established %d bind %d)\n",
@@ -2350,4 +2143,3 @@ EXPORT_SYMBOL(tcp_sendpage);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
 EXPORT_SYMBOL(tcp_statistics);
-EXPORT_SYMBOL_GPL(tcp_cleanup_rbuf);