vserver 1.9.3
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ac92115..6886042 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -74,6 +74,7 @@
 #include <linux/stddef.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/vserver/debug.h>
 
 extern int sysctl_ip_dynaddr;
 int sysctl_tcp_tw_reuse;
@@ -181,7 +182,6 @@ void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 
 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 {
-       const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
        struct sock *sk2;
        struct hlist_node *node;
        int reuse = sk->sk_reuse;
@@ -194,9 +194,8 @@ static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
                        if (!reuse || !sk2->sk_reuse ||
                            sk2->sk_state == TCP_LISTEN) {
-                               const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
-                               if (!sk2_rcv_saddr || !sk_rcv_saddr ||
-                                   sk2_rcv_saddr == sk_rcv_saddr)
+                               if (nx_addr_conflict(sk->sk_nx_info,
+                                       tcp_v4_rcv_saddr(sk), sk2))
                                        break;
                        }
                }
@@ -405,6 +404,26 @@ void tcp_unhash(struct sock *sk)
                wake_up(&tcp_lhash_wait);
 }
 
+
+/*
+ *      Check whether a given address matches a tcp socket's bound address
+ *
+ *      nxi:   the socket's nx_info, if any
+ *      addr:  the address to be verified
+ *      saddr: the socket's bound address
+ */
+static inline int tcp_addr_match(
+       struct nx_info *nxi,
+       uint32_t addr,
+       uint32_t saddr)
+{
+       if (addr && (saddr == addr))
+               return 1;
+       if (!saddr)
+               return addr_in_nx_info(nxi, addr);
+       return 0;
+}
+
 /* Don't inline this cruft.  Here are some nice properties to
  * exploit here.  The BSD API does not allow a listening TCP
  * to specify the remote port nor the remote address for the
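
The tcp_addr_match() helper added above generalizes the old wildcard test: an
explicitly bound address must still match exactly, but an unbound (0.0.0.0)
socket now only matches addresses that belong to its network context. The
following standalone sketch reproduces only that decision; struct nx_info and
addr_in_nx_info() here are simplified stand-ins for the vserver definitions,
not the kernel code.

/*
 * Illustrative sketch only -- not part of the patch above.
 */
#include <stdint.h>

struct nx_info {
	int nbipv4;		/* number of addresses assigned to the context */
	uint32_t ipv4[16];	/* the context's IPv4 addresses */
};

/* Does addr belong to the network context? (no context = plain Linux) */
static int addr_in_nx_info(struct nx_info *nxi, uint32_t addr)
{
	int i;

	if (!nxi)
		return 1;
	for (i = 0; i < nxi->nbipv4; i++)
		if (nxi->ipv4[i] == addr)
			return 1;
	return 0;
}

/* Same decision as the tcp_addr_match() added in the hunk above:
 * an explicit bind address must match exactly; an unbound socket
 * (saddr == 0) matches only addresses owned by its context. */
static int tcp_addr_match(struct nx_info *nxi, uint32_t addr, uint32_t saddr)
{
	if (addr && saddr == addr)
		return 1;
	if (!saddr)
		return addr_in_nx_info(nxi, addr);
	return 0;
}
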
@@ -426,11 +445,10 @@ static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
                        __u32 rcv_saddr = inet->rcv_saddr;
 
                        score = (sk->sk_family == PF_INET ? 1 : 0);
-                       if (rcv_saddr) {
-                               if (rcv_saddr != daddr)
-                                       continue;
+                       if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr))
                                score+=2;
-                       }
+                       else
+                               continue;
                        if (sk->sk_bound_dev_if) {
                                if (sk->sk_bound_dev_if != dif)
                                        continue;
@@ -460,8 +478,8 @@ inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
                struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
 
                if (inet->num == hnum && !sk->sk_node.next &&
-                   (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
                    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
+                   tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) &&
                    !sk->sk_bound_dev_if)
                        goto sherry_cache;
                sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
@@ -618,11 +636,11 @@ unique:
 
        if (twp) {
                *twp = tw;
-               NET_INC_STATS_BH(TimeWaitRecycled);
+               NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                tcp_tw_deschedule(tw);
-               NET_INC_STATS_BH(TimeWaitRecycled);
+               NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 
                tcp_tw_put(tw);
        }
@@ -998,14 +1016,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
        int err;
 
        if (skb->len < (iph->ihl << 2) + 8) {
-               ICMP_INC_STATS_BH(IcmpInErrors);
+               ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
 
        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
                           th->source, tcp_v4_iif(skb));
        if (!sk) {
-               ICMP_INC_STATS_BH(IcmpInErrors);
+               ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
@@ -1018,7 +1036,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
-               NET_INC_STATS_BH(LockDroppedIcmps);
+               NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
 
        if (sk->sk_state == TCP_CLOSE)
                goto out;
@@ -1027,17 +1045,13 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
-               NET_INC_STATS(OutOfWindowIcmps);
+               NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }
 
        switch (type) {
        case ICMP_SOURCE_QUENCH:
-               /* This is deprecated, but if someone generated it,
-                * we have no reasons to ignore it.
-                */
-               if (!sock_owned_by_user(sk))
-                       tcp_enter_cwr(tp);
+               /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
@@ -1078,7 +1092,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
                BUG_TRAP(!req->sk);
 
                if (seq != req->snt_isn) {
-                       NET_INC_STATS_BH(OutOfWindowIcmps);
+                       NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }
 
@@ -1096,7 +1110,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
                               It can f.e. if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
-                       TCP_INC_STATS_BH(TcpAttemptFails);
+                       TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
                        sk->sk_err = err;
 
                        sk->sk_error_report(sk);
@@ -1205,8 +1219,8 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
 
        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
 
-       TCP_INC_STATS_BH(TcpOutSegs);
-       TCP_INC_STATS_BH(TcpOutRsts);
+       TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
+       TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 }
 
 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
@@ -1253,7 +1267,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 
        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 
-       TCP_INC_STATS_BH(TcpOutSegs);
+       TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 }
 
 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -1290,12 +1304,12 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
                                         .dport = req->rmt_port } } };
 
        if (ip_route_output_flow(&rt, &fl, sk, 0)) {
-               IP_INC_STATS_BH(IpOutNoRoutes);
+               IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return NULL;
        }
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
                ip_rt_put(rt);
-               IP_INC_STATS_BH(IpOutNoRoutes);
+               IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return NULL;
        }
        return &rt->u.dst;
@@ -1442,7 +1456,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
-       if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+       if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
                goto drop;
 
        req = tcp_openreq_alloc();
@@ -1505,7 +1519,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
-                               NET_INC_STATS_BH(PAWSPassiveRejected);
+                               NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
@@ -1550,7 +1564,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 drop_and_free:
        tcp_openreq_free(req);
 drop:
-       TCP_INC_STATS_BH(TcpAttemptFails);
+       TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        return 0;
 }
 
@@ -1567,7 +1581,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        struct tcp_opt *newtp;
        struct sock *newsk;
 
-       if (tcp_acceptq_is_full(sk))
+       if (sk_acceptq_is_full(sk))
                goto exit_overflow;
 
        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
@@ -1605,9 +1619,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        return newsk;
 
 exit_overflow:
-       NET_INC_STATS_BH(ListenOverflows);
+       NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
 exit:
-       NET_INC_STATS_BH(ListenDrops);
+       NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
        dst_release(dst);
        return NULL;
 }
@@ -1725,7 +1739,7 @@ discard:
        return 0;
 
 csum_err:
-       TCP_INC_STATS_BH(TcpInErrs);
+       TCP_INC_STATS_BH(TCP_MIB_INERRS);
        goto discard;
 }
 
@@ -1743,7 +1757,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
                goto discard_it;
 
        /* Count it even if it's bad */
-       TCP_INC_STATS_BH(TcpInSegs);
+       TCP_INC_STATS_BH(TCP_MIB_INSEGS);
 
        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;
@@ -1810,7 +1824,7 @@ no_tcp_socket:
 
        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
 bad_packet:
-               TCP_INC_STATS_BH(TcpInErrs);
+               TCP_INC_STATS_BH(TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(skb);
        }
@@ -1831,7 +1845,7 @@ do_time_wait:
        }
 
        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
-               TCP_INC_STATS_BH(TcpInErrs);
+               TCP_INC_STATS_BH(TCP_MIB_INERRS);
                tcp_tw_put((struct tcp_tw_bucket *) sk);
                goto discard_it;
        }
@@ -2075,13 +2089,13 @@ static int tcp_v4_init_sock(struct sock *sk)
         */
        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
        tp->snd_cwnd_clamp = ~0;
-       tp->mss_cache = 536;
+       tp->mss_cache_std = tp->mss_cache = 536;
 
        tp->reordering = sysctl_tcp_reordering;
 
        sk->sk_state = TCP_CLOSE;
 
-       sk->sk_write_space = tcp_write_space;
+       sk->sk_write_space = sk_stream_write_space;
        sk->sk_use_write_queue = 1;
 
        tp->af_specific = &ipv4_specific;
@@ -2094,14 +2108,14 @@ static int tcp_v4_init_sock(struct sock *sk)
        return 0;
 }
 
-static int tcp_v4_destroy_sock(struct sock *sk)
+int tcp_v4_destroy_sock(struct sock *sk)
 {
        struct tcp_opt *tp = tcp_sk(sk);
 
        tcp_clear_xmit_timers(sk);
 
        /* Cleanup up the write buffer. */
-       tcp_writequeue_purge(sk);
+       sk_stream_writequeue_purge(sk);
 
        /* Cleans up our, hopefully empty, out_of_order_queue. */
        __skb_queue_purge(&tp->out_of_order_queue);
@@ -2113,15 +2127,21 @@ static int tcp_v4_destroy_sock(struct sock *sk)
        if (tp->bind_hash)
                tcp_put_port(sk);
 
-       /* If sendmsg cached page exists, toss it. */
-       if (inet_sk(sk)->sndmsg_page)
-               __free_page(inet_sk(sk)->sndmsg_page);
+       /*
+        * If sendmsg cached page exists, toss it.
+        */
+       if (sk->sk_sndmsg_page) {
+               __free_page(sk->sk_sndmsg_page);
+               sk->sk_sndmsg_page = NULL;
+       }
 
        atomic_dec(&tcp_sockets_allocated);
 
        return 0;
 }
 
+EXPORT_SYMBOL(tcp_v4_destroy_sock);
+
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
@@ -2159,6 +2179,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
                req = req->dl_next;
                while (1) {
                        while (req) {
+                               vxdprintk(VXD_CBIT(net, 6),
+                                       "sk,req: %p [#%d] (from %d)", req->sk,
+                                       (req->sk)?req->sk->sk_xid:0, current->xid);
+                               if (req->sk &&
+                                       !vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH)) {
+                                       req = req->dl_next;
+                                       continue;
+                               }
                                if (req->class->family == st->family) {
                                        cur = req;
                                        goto out;
@@ -2173,10 +2199,20 @@ get_req:
                sk        = sk_next(st->syn_wait_sk);
                st->state = TCP_SEQ_STATE_LISTENING;
                read_unlock_bh(&tp->syn_wait_lock);
-       } else
+       } else {
+               tp = tcp_sk(sk);
+               read_lock_bh(&tp->syn_wait_lock);
+               if (tp->listen_opt && tp->listen_opt->qlen)
+                       goto start_req;
+               read_unlock_bh(&tp->syn_wait_lock);
                sk = sk_next(sk);
+       }
 get_sk:
        sk_for_each_from(sk, node) {
+               vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
+                       sk, sk->sk_xid, current->xid);
+               if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
+                       continue;
                if (sk->sk_family == st->family) {
                        cur = sk;
                        goto out;
@@ -2184,6 +2220,7 @@ get_sk:
                tp = tcp_sk(sk);
                read_lock_bh(&tp->syn_wait_lock);
                if (tp->listen_opt && tp->listen_opt->qlen) {
+start_req:
                        st->uid         = sock_i_uid(sk);
                        st->syn_wait_sk = sk;
                        st->state       = TCP_SEQ_STATE_OPENREQ;
@@ -2224,18 +2261,26 @@ static void *established_get_first(struct seq_file *seq)
               
                read_lock(&tcp_ehash[st->bucket].lock);
                sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
-                       if (sk->sk_family != st->family) {
+                       vxdprintk(VXD_CBIT(net, 6),
+                               "sk,egf: %p [#%d] (from %d)",
+                               sk, sk->sk_xid, current->xid);
+                       if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
+                               continue;
+                       if (sk->sk_family != st->family)
                                continue;
-                       }
                        rc = sk;
                        goto out;
                }
                st->state = TCP_SEQ_STATE_TIME_WAIT;
                tw_for_each(tw, node,
                            &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
-                       if (tw->tw_family != st->family) {
+                       vxdprintk(VXD_CBIT(net, 6),
+                               "tw: %p [#%d] (from %d)",
+                               tw, tw->tw_xid, current->xid);
+                       if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
+                               continue;
+                       if (tw->tw_family != st->family)
                                continue;
-                       }
                        rc = tw;
                        goto out;
                }
@@ -2259,7 +2304,8 @@ static void *established_get_next(struct seq_file *seq, void *cur)
                tw = cur;
                tw = tw_next(tw);
 get_tw:
-               while (tw && tw->tw_family != st->family) {
+               while (tw && (tw->tw_family != st->family ||
+                       !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
                        tw = tw_next(tw);
                }
                if (tw) {
@@ -2279,6 +2325,11 @@ get_tw:
                sk = sk_next(sk);
 
        sk_for_each_from(sk, node) {
+               vxdprintk(VXD_CBIT(net, 6),
+                       "sk,egn: %p [#%d] (from %d)",
+                       sk, sk->sk_xid, current->xid);
+               if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
+                       continue;
                if (sk->sk_family == st->family)
                        goto found;
        }
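
The /proc changes above (the listening, established, and time-wait walks) all
apply the same visibility filter: an entry is only emitted when vx_check()
accepts its xid from the caller's context. The sketch below is a conceptual
restatement, not the vserver implementation of vx_check(); the assumed flag
semantics and the proc_sock_visible() helper are purely illustrative.

/*
 * Assumed semantics: VX_IDENT passes when the socket's xid equals the
 * caller's xid; VX_WATCH passes when the caller runs in a privileged
 * spectator context that may observe every socket.
 */
static int proc_sock_visible(unsigned int sk_xid, unsigned int cur_xid,
			     int cur_is_spectator)
{
	if (cur_is_spectator)		/* VX_WATCH */
		return 1;
	return sk_xid == cur_xid;	/* VX_IDENT */
}
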
@@ -2452,7 +2503,7 @@ static void get_openreq4(struct sock *sk, struct open_request *req,
        int ttd = req->expires - jiffies;
 
        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
-               " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
+               " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
                i,
                req->af.v4_req.loc_addr,
                ntohs(inet_sk(sk)->sport),
@@ -2526,7 +2577,7 @@ static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
        srcp  = ntohs(tw->tw_sport);
 
        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
-               " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
+               " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
                atomic_read(&tw->tw_refcnt), tw);
@@ -2586,23 +2637,32 @@ void tcp4_proc_exit(void)
 #endif /* CONFIG_PROC_FS */
 
 struct proto tcp_prot = {
-       .name           =       "TCP",
-       .close          =       tcp_close,
-       .connect        =       tcp_v4_connect,
-       .disconnect     =       tcp_disconnect,
-       .accept         =       tcp_accept,
-       .ioctl          =       tcp_ioctl,
-       .init           =       tcp_v4_init_sock,
-       .destroy        =       tcp_v4_destroy_sock,
-       .shutdown       =       tcp_shutdown,
-       .setsockopt     =       tcp_setsockopt,
-       .getsockopt     =       tcp_getsockopt,
-       .sendmsg        =       tcp_sendmsg,
-       .recvmsg        =       tcp_recvmsg,
-       .backlog_rcv    =       tcp_v4_do_rcv,
-       .hash           =       tcp_v4_hash,
-       .unhash         =       tcp_unhash,
-       .get_port       =       tcp_v4_get_port,
+       .name                   = "TCP",
+       .close                  = tcp_close,
+       .connect                = tcp_v4_connect,
+       .disconnect             = tcp_disconnect,
+       .accept                 = tcp_accept,
+       .ioctl                  = tcp_ioctl,
+       .init                   = tcp_v4_init_sock,
+       .destroy                = tcp_v4_destroy_sock,
+       .shutdown               = tcp_shutdown,
+       .setsockopt             = tcp_setsockopt,
+       .getsockopt             = tcp_getsockopt,
+       .sendmsg                = tcp_sendmsg,
+       .recvmsg                = tcp_recvmsg,
+       .backlog_rcv            = tcp_v4_do_rcv,
+       .hash                   = tcp_v4_hash,
+       .unhash                 = tcp_unhash,
+       .get_port               = tcp_v4_get_port,
+       .enter_memory_pressure  = tcp_enter_memory_pressure,
+       .sockets_allocated      = &tcp_sockets_allocated,
+       .memory_allocated       = &tcp_memory_allocated,
+       .memory_pressure        = &tcp_memory_pressure,
+       .sysctl_mem             = sysctl_tcp_mem,
+       .sysctl_wmem            = sysctl_tcp_wmem,
+       .sysctl_rmem            = sysctl_tcp_rmem,
+       .max_header             = MAX_TCP_HEADER,
+       .slab_obj_size          = sizeof(struct tcp_sock),
 };