fedora core 6 1.2949 + vserver 2.2.0
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f28f406..01fda57 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
 #include <linux/bootmem.h>
 #include <linux/cache.h>
 #include <linux/err.h>
+#include <linux/crypto.h>
 #include <linux/in.h>
 
 #include <net/icmp.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
-int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
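
Note: __read_mostly places the sysctl in a linker section of rarely-written
data, keeping it off cache lines that hot write paths dirty; the
DEFINE_SNMP_STAT declaration above already carries the same attribute.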
 
@@ -463,11 +464,12 @@ static inline int forced_push(struct tcp_sock *tp)
 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
                              struct sk_buff *skb)
 {
-       skb->csum = 0;
-       TCP_SKB_CB(skb)->seq = tp->write_seq;
-       TCP_SKB_CB(skb)->end_seq = tp->write_seq;
-       TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
-       TCP_SKB_CB(skb)->sacked = 0;
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+       skb->csum    = 0;
+       tcb->seq     = tcb->end_seq = tp->write_seq;
+       tcb->flags   = TCPCB_FLAG_ACK;
+       tcb->sacked  = 0;
        skb_header_release(skb);
        __skb_queue_tail(&sk->sk_write_queue, skb);
        sk_charge_skb(sk, skb);
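
Note: the skb_entail() rewrite is behaviour-neutral; caching TCP_SKB_CB(skb)
in a local tcb avoids evaluating the macro five times, and seq/end_seq
(equal for a freshly entailed, empty segment) are set in one statement.
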
@@ -569,7 +571,7 @@ new_segment:
                skb->truesize += copy;
                sk->sk_wmem_queued += copy;
                sk->sk_forward_alloc -= copy;
-               skb->ip_summed = CHECKSUM_HW;
+               skb->ip_summed = CHECKSUM_PARTIAL;
                tp->write_seq += copy;
                TCP_SKB_CB(skb)->end_seq += copy;
                skb_shinfo(skb)->gso_segs = 0;
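
Note: this hunk and the next track the kernel-wide rename that split the
ambiguous CHECKSUM_HW into CHECKSUM_PARTIAL (transmit: the checksum is left
for the device to finish) and CHECKSUM_COMPLETE (receive: the device already
summed the packet). Both sites here are transmit paths, hence
CHECKSUM_PARTIAL.
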
@@ -724,7 +726,7 @@ new_segment:
                                 * Check whether we can use HW checksum.
                                 */
                                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
-                                       skb->ip_summed = CHECKSUM_HW;
+                                       skb->ip_summed = CHECKSUM_PARTIAL;
 
                                skb_entail(sk, tp, skb);
                                copy = size_goal;
@@ -956,8 +958,11 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
                     * receive buffer and there was a small segment
                     * in queue.
                     */
-                   (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
-                    !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+                   (copied > 0 &&
+                    ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+                     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+                      !icsk->icsk_ack.pingpong)) &&
+                     !atomic_read(&sk->sk_rmem_alloc)))
                        time_to_ack = 1;
        }
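
Note: receiver-side ACK policy change. Before, a pending ICSK_ACK_PUSHED
forced an ACK only outside pingpong (interactive) mode; the stronger
ICSK_ACK_PUSHED2 hint now forces one regardless, provided the receive queue
has been fully drained (sk_rmem_alloc == 0).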
 
@@ -1940,6 +1945,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                }
                break;
 
+#ifdef CONFIG_TCP_MD5SIG
+       case TCP_MD5SIG:
+               /* Read the IP->Key mappings from userspace */
+               err = tp->af_specific->md5_parse(sk, optval, optlen);
+               break;
+#endif
+
        default:
                err = -ENOPROTOOPT;
                break;
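
Note: the new option hands a struct tcp_md5sig to the address-family parser
behind tp->af_specific->md5_parse (tcp_v4_parse_md5_keys() for IPv4). A
minimal userspace sketch, assuming a kernel with CONFIG_TCP_MD5SIG and the
struct tcp_md5sig / TCP_MD5SIG definitions that this patch series adds to
<linux/tcp.h>; set_md5_key() is an illustrative name:

        #include <string.h>
        #include <sys/socket.h>
        #include <netinet/in.h>
        #include <linux/tcp.h>

        /* Bind an RFC 2385 MD5 key to one peer address on a TCP socket.
         * keylen must not exceed TCP_MD5SIG_MAXKEYLEN (80 bytes). */
        static int set_md5_key(int fd, const struct sockaddr_in *peer,
                               const void *key, size_t keylen)
        {
                struct tcp_md5sig md5;

                memset(&md5, 0, sizeof(md5));
                memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
                md5.tcpm_keylen = keylen;
                memcpy(md5.tcpm_key, key, keylen);

                return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
                                  &md5, sizeof(md5));
        }
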
@@ -2152,7 +2164,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
        struct tcphdr *th;
        unsigned thlen;
        unsigned int seq;
-       unsigned int delta;
+       __be32 delta;
        unsigned int oldlen;
        unsigned int len;
 
@@ -2205,8 +2217,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
        do {
                th->fin = th->psh = 0;
 
-               th->check = ~csum_fold(th->check + delta);
-               if (skb->ip_summed != CHECKSUM_HW)
+               th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+                                      (__force u32)delta));
+               if (skb->ip_summed != CHECKSUM_PARTIAL)
                        th->check = csum_fold(csum_partial(skb->h.raw, thlen,
                                                           skb->csum));
 
@@ -2219,8 +2232,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
        } while (skb->next);
 
        delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
-       th->check = ~csum_fold(th->check + delta);
-       if (skb->ip_summed != CHECKSUM_HW)
+       th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+                               (__force u32)delta));
+       if (skb->ip_summed != CHECKSUM_PARTIAL)
                th->check = csum_fold(csum_partial(skb->h.raw, thlen,
                                                   skb->csum));
 
@@ -2229,6 +2243,136 @@ out:
 }
 EXPORT_SYMBOL(tcp_tso_segment);
 
+#ifdef CONFIG_TCP_MD5SIG
+static unsigned long tcp_md5sig_users;
+static struct tcp_md5sig_pool **tcp_md5sig_pool;
+static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
+
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
+{
+       int cpu;
+       for_each_possible_cpu(cpu) {
+               struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
+               if (p) {
+                       if (p->md5_desc.tfm)
+                               crypto_free_hash(p->md5_desc.tfm);
+                       kfree(p);
+                       p = NULL;
+               }
+       }
+       free_percpu(pool);
+}
+
+void tcp_free_md5sig_pool(void)
+{
+       struct tcp_md5sig_pool **pool = NULL;
+
+       spin_lock_bh(&tcp_md5sig_pool_lock);
+       if (--tcp_md5sig_users == 0) {
+               pool = tcp_md5sig_pool;
+               tcp_md5sig_pool = NULL;
+       }
+       spin_unlock_bh(&tcp_md5sig_pool_lock);
+       if (pool)
+               __tcp_free_md5sig_pool(pool);
+}
+
+EXPORT_SYMBOL(tcp_free_md5sig_pool);
+
+static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
+{
+       int cpu;
+       struct tcp_md5sig_pool **pool;
+
+       pool = alloc_percpu(struct tcp_md5sig_pool *);
+       if (!pool)
+               return NULL;
+
+       for_each_possible_cpu(cpu) {
+               struct tcp_md5sig_pool *p;
+               struct crypto_hash *hash;
+
+               p = kzalloc(sizeof(*p), GFP_KERNEL);
+               if (!p)
+                       goto out_free;
+               *per_cpu_ptr(pool, cpu) = p;
+
+               hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+               if (!hash || IS_ERR(hash))
+                       goto out_free;
+
+               p->md5_desc.tfm = hash;
+       }
+       return pool;
+out_free:
+       __tcp_free_md5sig_pool(pool);
+       return NULL;
+}
+
+struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
+{
+       struct tcp_md5sig_pool **pool;
+       int alloc = 0;
+
+retry:
+       spin_lock_bh(&tcp_md5sig_pool_lock);
+       pool = tcp_md5sig_pool;
+       if (tcp_md5sig_users++ == 0) {
+               alloc = 1;
+               spin_unlock_bh(&tcp_md5sig_pool_lock);
+       } else if (!pool) {
+               tcp_md5sig_users--;
+               spin_unlock_bh(&tcp_md5sig_pool_lock);
+               cpu_relax();
+               goto retry;
+       } else
+               spin_unlock_bh(&tcp_md5sig_pool_lock);
+
+       if (alloc) {
+               /* we cannot hold spinlock here because this may sleep. */
+               struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
+               spin_lock_bh(&tcp_md5sig_pool_lock);
+               if (!p) {
+                       tcp_md5sig_users--;
+                       spin_unlock_bh(&tcp_md5sig_pool_lock);
+                       return NULL;
+               }
+               pool = tcp_md5sig_pool;
+               if (pool) {
+                       /* oops, it has already been assigned. */
+                       spin_unlock_bh(&tcp_md5sig_pool_lock);
+                       __tcp_free_md5sig_pool(p);
+               } else {
+                       tcp_md5sig_pool = pool = p;
+                       spin_unlock_bh(&tcp_md5sig_pool_lock);
+               }
+       }
+       return pool;
+}
+
+EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
+
+struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
+{
+       struct tcp_md5sig_pool **p;
+       spin_lock_bh(&tcp_md5sig_pool_lock);
+       p = tcp_md5sig_pool;
+       if (p)
+               tcp_md5sig_users++;
+       spin_unlock_bh(&tcp_md5sig_pool_lock);
+       return (p ? *per_cpu_ptr(p, cpu) : NULL);
+}
+
+EXPORT_SYMBOL(__tcp_get_md5sig_pool);
+
+void __tcp_put_md5sig_pool(void)
+{
+       tcp_free_md5sig_pool();
+}
+
+EXPORT_SYMBOL(__tcp_put_md5sig_pool);
+#endif
+
 extern void __skb_cb_too_small_for_tcp(int, int);
 extern struct tcp_congestion_ops tcp_reno;
 
@@ -2255,9 +2399,7 @@ void __init tcp_init(void)
        tcp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("tcp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
-                                 SLAB_HWCACHE_ALIGN, NULL, NULL);
-       if (!tcp_hashinfo.bind_bucket_cachep)
-               panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+                                 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
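
Note: SLAB_PANIC makes kmem_cache_create() panic internally on failure,
subsuming the NULL check and explicit panic() removed here; there is no
sensible recovery from a failed boot-time cache allocation anyway.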
 
        /* Size and allocate the main established and bind bucket
         * hash tables.
@@ -2270,7 +2412,7 @@ void __init tcp_init(void)
                                        thash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        13 : 15,
-                                       HASH_HIGHMEM,
+                                       0,
                                        &tcp_hashinfo.ehash_size,
                                        NULL,
                                        0);
@@ -2286,7 +2428,7 @@ void __init tcp_init(void)
                                        tcp_hashinfo.ehash_size,
                                        (num_physpages >= 128 * 1024) ?
                                        13 : 15,
-                                       HASH_HIGHMEM,
+                                       0,
                                        &tcp_hashinfo.bhash_size,
                                        NULL,
                                        64 * 1024);
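
Note: the established and bind hash tables are always allocated in lowmem,
so sizing them with HASH_HIGHMEM (which counts highmem pages too) over-sized
them on large-memory 32-bit machines; passing 0 scales both tables by the
memory the kernel can actually map.
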
@@ -2316,10 +2458,18 @@ void __init tcp_init(void)
                sysctl_max_syn_backlog = 128;
        }
 
-       sysctl_tcp_mem[0] =  768 << order;
-       sysctl_tcp_mem[1] = 1024 << order;
-       sysctl_tcp_mem[2] = 1536 << order;
-
+       /* Set the pressure threshold to be a fraction of global memory that
+        * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
+        * memory, with a floor of 128 pages.
+        */
+       limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+       limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+       limit = max(limit, 128UL);
+       sysctl_tcp_mem[0] = limit / 4 * 3;
+       sysctl_tcp_mem[1] = limit;
+       sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+
+       /* Set per-socket limits to no more than 1/128 the pressure threshold */
        limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
        max_share = min(4UL*1024*1024, limit);
 
@@ -2350,4 +2500,3 @@ EXPORT_SYMBOL(tcp_sendpage);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
 EXPORT_SYMBOL(tcp_statistics);
-EXPORT_SYMBOL_GPL(tcp_cleanup_rbuf);