VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      open_request handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/vserver/debug.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
94         .__tcp_lhash_users      =       ATOMIC_INIT(0),
95         .__tcp_lhash_wait
96           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106 int tcp_port_rover = 1024 - 1;
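
/*
 * Ephemeral ports are handed out by tcp_v4_get_port()/tcp_v4_hash_connect()
 * below: tcp_port_rover remembers the last port tried and the search wraps
 * around inside sysctl_local_port_range.  The range is exported as
 * net.ipv4.ip_local_port_range, so on a busy box it can be widened at run
 * time, e.g.:
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */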
107
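/*
 * tcp_hashfn() folds the connection 4-tuple (local addr/port, foreign
 * addr/port) into an index for the established hash: the xor of the two
 * halves is mixed down with two shifts and masked with tcp_ehash_size - 1.
 * The mask is a cheap modulo because tcp_ehash_size is sized to a power of
 * two at boot.  tcp_sk_hashfn() simply applies the same hash to a socket's
 * own identity (rcv_saddr/num vs. daddr/dport).
 */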
108 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
109                                  __u32 faddr, __u16 fport)
110 {
111         int h = (laddr ^ lport) ^ (faddr ^ fport);
112         h ^= h >> 16;
113         h ^= h >> 8;
114         return h & (tcp_ehash_size - 1);
115 }
116
117 static __inline__ int tcp_sk_hashfn(struct sock *sk)
118 {
119         struct inet_opt *inet = inet_sk(sk);
120         __u32 laddr = inet->rcv_saddr;
121         __u16 lport = inet->num;
122         __u32 faddr = inet->daddr;
123         __u16 fport = inet->dport;
124
125         return tcp_hashfn(laddr, lport, faddr, fport);
126 }
127
128 /* Allocate and initialize a new TCP local port bind bucket.
129  * The bindhash mutex for snum's hash chain must be held here.
130  */
131 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
132                                           unsigned short snum)
133 {
134         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
135                                                       SLAB_ATOMIC);
136         if (tb) {
137                 tb->port = snum;
138                 tb->fastreuse = 0;
139                 INIT_HLIST_HEAD(&tb->owners);
140                 hlist_add_head(&tb->node, &head->chain);
141         }
142         return tb;
143 }
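
/*
 * tb->fastreuse is a small tri-state used by the port allocators below:
 * 1 means every current owner set SO_REUSEADDR and none is listening, so a
 * like-minded socket may take the port without a conflict scan; 0 forces
 * the full tcp_bind_conflict() walk; -1 is set by tcp_v4_hash_connect() for
 * ports grabbed by an autobound connect(), and only such buckets are
 * considered again by connect()'s own port search (subject to the
 * established-hash uniqueness check).
 */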
144
145 /* Caller must hold hashbucket lock for this tb with local BH disabled */
146 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
147 {
148         if (hlist_empty(&tb->owners)) {
149                 __hlist_del(&tb->node);
150                 kmem_cache_free(tcp_bucket_cachep, tb);
151         }
152 }
153
154 /* Caller must disable local BH processing. */
155 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
156 {
157         struct tcp_bind_hashbucket *head =
158                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
159         struct tcp_bind_bucket *tb;
160
161         spin_lock(&head->lock);
162         tb = tcp_sk(sk)->bind_hash;
163         sk_add_bind_node(child, &tb->owners);
164         tcp_sk(child)->bind_hash = tb;
165         spin_unlock(&head->lock);
166 }
167
168 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
169 {
170         local_bh_disable();
171         __tcp_inherit_port(sk, child);
172         local_bh_enable();
173 }
174
175 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
176                    unsigned short snum)
177 {
178         inet_sk(sk)->num = snum;
179         sk_add_bind_node(sk, &tb->owners);
180         tcp_sk(sk)->bind_hash = tb;
181 }
182
183 /*
184         Return 1 if addr matches the socket's IP list
185         or the socket is bound to INADDR_ANY
186 */
187 static inline int tcp_in_list(struct sock *sk, u32 addr)
188 {
189         struct nx_info *nxi = sk->sk_nx_info;
190
191         vxdprintk(VXD_CBIT(net, 2), "tcp_in_list(%p) %p,%p;%lx",
192                 sk, nxi, sk->sk_socket,
193                 (sk->sk_socket?sk->sk_socket->flags:0));
194
195         if (nxi) {
196                 int n = nxi->nbipv4;
197                 int i;
198
199                 for (i=0; i<n; i++)
200                         if (nxi->ipv4[i] == addr)
201                                 return 1;
202         }
203         else if (!tcp_v4_rcv_saddr(sk) || tcp_v4_rcv_saddr(sk) == addr)
204                 return 1;
205         return 0;
206 }
207         
208 /*
209         Check if the addresses in sk1 conflict with those in sk2
210 */
211 int tcp_ipv4_addr_conflict(struct sock *sk1, struct sock *sk2)
212 {
213         if (sk1 && sk2)
214                 vxdprintk(VXD_CBIT(net, 5),
215                         "tcp_ipv4_addr_conflict(%p,%p) %p,%p;%lx %p,%p;%lx",
216                         sk1, sk2,
217                         sk1->sk_nx_info, sk1->sk_socket,
218                         (sk1->sk_socket?sk1->sk_socket->flags:0),
219                         sk2->sk_nx_info, sk2->sk_socket,
220                         (sk2->sk_socket?sk2->sk_socket->flags:0));
221
222         if (tcp_v4_rcv_saddr(sk1)) {
223                 /* Bind to one address only */
224                 return tcp_in_list (sk2, tcp_v4_rcv_saddr(sk1));
225         } else if (sk1->sk_nx_info) {
226                 /* A restricted bind(any) */
227                 struct nx_info *nxi = sk1->sk_nx_info;
228                 int n = nxi->nbipv4;
229                 int i;
230
231                 for (i=0; i<n; i++)
232                         if (tcp_in_list (sk2, nxi->ipv4[i]))
233                                 return 1;
234         } else  /* A bind(any) does not allow any other bind on the same port */
235                 return 1;
236         return 0;
237 }
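
/*
 * Net effect of the two helpers above: a socket bound to a specific address
 * conflicts only with sockets that cover that address; a bind(any) inside a
 * network context (nx_info) is treated as a bind to every address in the
 * context's ipv4[] list, so two contexts with disjoint address lists may
 * share a port; a bind(any) outside any context conflicts with every other
 * socket on the port as far as addresses are concerned (SO_REUSEADDR and
 * device binding are checked separately in tcp_bind_conflict()).
 */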
238
239 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
240 {
241         struct sock *sk2;
242         struct hlist_node *node;
243         int reuse = sk->sk_reuse;
244
245         sk_for_each_bound(sk2, node, &tb->owners) {
246                 if (sk != sk2 &&
247                     !tcp_v6_ipv6only(sk2) &&
248                     (!sk->sk_bound_dev_if ||
249                      !sk2->sk_bound_dev_if ||
250                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
251                         if (!reuse || !sk2->sk_reuse ||
252                             sk2->sk_state == TCP_LISTEN) {
253                                 if (tcp_ipv4_addr_conflict(sk, sk2))
254                                         break;
255                         }
256                 }
257         }
258         return node != NULL;
259 }
260
261 /* Obtain a reference to a local port for the given sock,
262  * if snum is zero it means select any available local port.
263  */
264 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
265 {
266         struct tcp_bind_hashbucket *head;
267         struct hlist_node *node;
268         struct tcp_bind_bucket *tb;
269         int ret;
270
271         local_bh_disable();
272         if (!snum) {
273                 int low = sysctl_local_port_range[0];
274                 int high = sysctl_local_port_range[1];
275                 int remaining = (high - low) + 1;
276                 int rover;
277
278                 spin_lock(&tcp_portalloc_lock);
279                 rover = tcp_port_rover;
280                 do {
281                         rover++;
282                         if (rover < low || rover > high)
283                                 rover = low;
284                         head = &tcp_bhash[tcp_bhashfn(rover)];
285                         spin_lock(&head->lock);
286                         tb_for_each(tb, node, &head->chain)
287                                 if (tb->port == rover)
288                                         goto next;
289                         break;
290                 next:
291                         spin_unlock(&head->lock);
292                 } while (--remaining > 0);
293                 tcp_port_rover = rover;
294                 spin_unlock(&tcp_portalloc_lock);
295
296                 /* Exhausted local port range during search? */
297                 ret = 1;
298                 if (remaining <= 0)
299                         goto fail;
300
301                 /* OK, here is the one we will use.  HEAD is
302                  * non-NULL and we hold its mutex.
303                  */
304                 snum = rover;
305         } else {
306                 head = &tcp_bhash[tcp_bhashfn(snum)];
307                 spin_lock(&head->lock);
308                 tb_for_each(tb, node, &head->chain)
309                         if (tb->port == snum)
310                                 goto tb_found;
311         }
312         tb = NULL;
313         goto tb_not_found;
314 tb_found:
315         if (!hlist_empty(&tb->owners)) {
316                 if (sk->sk_reuse > 1)
317                         goto success;
318                 if (tb->fastreuse > 0 &&
319                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
320                         goto success;
321                 } else {
322                         ret = 1;
323                         if (tcp_bind_conflict(sk, tb))
324                                 goto fail_unlock;
325                 }
326         }
327 tb_not_found:
328         ret = 1;
329         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
330                 goto fail_unlock;
331         if (hlist_empty(&tb->owners)) {
332                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
333                         tb->fastreuse = 1;
334                 else
335                         tb->fastreuse = 0;
336         } else if (tb->fastreuse &&
337                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
338                 tb->fastreuse = 0;
339 success:
340         if (!tcp_sk(sk)->bind_hash)
341                 tcp_bind_hash(sk, tb, snum);
342         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
343         ret = 0;
344
345 fail_unlock:
346         spin_unlock(&head->lock);
347 fail:
348         local_bh_enable();
349         return ret;
350 }
351
352 /* Get rid of any references to a local port held by the
353  * given sock.
354  */
355 static void __tcp_put_port(struct sock *sk)
356 {
357         struct inet_opt *inet = inet_sk(sk);
358         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
359         struct tcp_bind_bucket *tb;
360
361         spin_lock(&head->lock);
362         tb = tcp_sk(sk)->bind_hash;
363         __sk_del_bind_node(sk);
364         tcp_sk(sk)->bind_hash = NULL;
365         inet->num = 0;
366         tcp_bucket_destroy(tb);
367         spin_unlock(&head->lock);
368 }
369
370 void tcp_put_port(struct sock *sk)
371 {
372         local_bh_disable();
373         __tcp_put_port(sk);
374         local_bh_enable();
375 }
376
377 /* This lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP.
378  * Look: when several writers sleep and the reader wakes them up, all but one
379  * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
380  * this, _but_ remember that it adds useless work on UP machines (a wake up on
381  * each exclusive lock release). It should really be ifdefed.
382  */
383
384 void tcp_listen_wlock(void)
385 {
386         write_lock(&tcp_lhash_lock);
387
388         if (atomic_read(&tcp_lhash_users)) {
389                 DEFINE_WAIT(wait);
390
391                 for (;;) {
392                         prepare_to_wait_exclusive(&tcp_lhash_wait,
393                                                 &wait, TASK_UNINTERRUPTIBLE);
394                         if (!atomic_read(&tcp_lhash_users))
395                                 break;
396                         write_unlock_bh(&tcp_lhash_lock);
397                         schedule();
398                         write_lock_bh(&tcp_lhash_lock);
399                 }
400
401                 finish_wait(&tcp_lhash_wait, &wait);
402         }
403 }
404
405 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
406 {
407         struct hlist_head *list;
408         rwlock_t *lock;
409
410         BUG_TRAP(sk_unhashed(sk));
411         if (listen_possible && sk->sk_state == TCP_LISTEN) {
412                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
413                 lock = &tcp_lhash_lock;
414                 tcp_listen_wlock();
415         } else {
416                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
417                 lock = &tcp_ehash[sk->sk_hashent].lock;
418                 write_lock(lock);
419         }
420         __sk_add_node(sk, list);
421         sock_prot_inc_use(sk->sk_prot);
422         write_unlock(lock);
423         if (listen_possible && sk->sk_state == TCP_LISTEN)
424                 wake_up(&tcp_lhash_wait);
425 }
426
427 static void tcp_v4_hash(struct sock *sk)
428 {
429         if (sk->sk_state != TCP_CLOSE) {
430                 local_bh_disable();
431                 __tcp_v4_hash(sk, 1);
432                 local_bh_enable();
433         }
434 }
435
436 void tcp_unhash(struct sock *sk)
437 {
438         rwlock_t *lock;
439
440         if (sk_unhashed(sk))
441                 goto ende;
442
443         if (sk->sk_state == TCP_LISTEN) {
444                 local_bh_disable();
445                 tcp_listen_wlock();
446                 lock = &tcp_lhash_lock;
447         } else {
448                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
449                 lock = &head->lock;
450                 write_lock_bh(&head->lock);
451         }
452
453         if (__sk_del_node_init(sk))
454                 sock_prot_dec_use(sk->sk_prot);
455         write_unlock_bh(lock);
456
457  ende:
458         if (sk->sk_state == TCP_LISTEN)
459                 wake_up(&tcp_lhash_wait);
460 }
461
462 /*
463         Check if an address is in the list
464 */
465 static inline int tcp_addr_in_list(
466         u32 rcv_saddr,
467         u32 daddr,
468         struct nx_info *nx_info)
469 {
470         if (rcv_saddr == daddr)
471                 return 1;
472         else if (rcv_saddr == 0) {
473                 /* Accept any address or check the list */
474                 if (!nx_info)
475                         return 1;
476                 else {
477                         int n = nx_info->nbipv4;
478                         int i;
479
480                         for (i=0; i<n; i++)
481                                 if (nx_info->ipv4[i] == daddr)
482                                         return 1;
483                 }
484         }
485         return 0;
486 }
487
488
489
490 /* Don't inline this cruft.  There are some nice properties to
491  * exploit here.  The BSD API does not allow a listening TCP
492  * to specify the remote port nor the remote address for the
493  * connection.  So always assume those are both wildcarded
494  * during the search since they can never be otherwise.
495  */
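/*
 * Listener scoring, used below: +1 for a pure PF_INET socket (an IPv6
 * socket that also accepts v4-mapped connections scores 0 here), +2 when
 * the packet's destination matches the socket's bound address (or, for a
 * bind(any) socket, falls within its nx_info address list), +2 when the
 * socket is bound to the incoming interface.  A perfect score of 5 ends
 * the walk early; otherwise the best-scoring listener wins.
 */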
496 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
497                                              unsigned short hnum, int dif)
498 {
499         struct sock *result = NULL, *sk;
500         struct hlist_node *node;
501         int score, hiscore;
502
503         hiscore=-1;
504         sk_for_each(sk, node, head) {
505                 struct inet_opt *inet = inet_sk(sk);
506
507                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
508                         __u32 rcv_saddr = inet->rcv_saddr;
509
510                         score = (sk->sk_family == PF_INET ? 1 : 0);
511                         if (tcp_addr_in_list(rcv_saddr, daddr, sk->sk_nx_info))
512                                 score+=2;
513                         else
514                                 continue;
515                         if (sk->sk_bound_dev_if) {
516                                 if (sk->sk_bound_dev_if != dif)
517                                         continue;
518                                 score+=2;
519                         }
520                         if (score == 5)
521                                 return sk;
522                         if (score > hiscore) {
523                                 hiscore = score;
524                                 result = sk;
525                         }
526                 }
527         }
528         return result;
529 }
530
531 /* Optimize the common listener case. */
532 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
533                                            int dif)
534 {
535         struct sock *sk = NULL;
536         struct hlist_head *head;
537
538         read_lock(&tcp_lhash_lock);
539         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
540         if (!hlist_empty(head)) {
541                 struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
542
543                 if (inet->num == hnum && !sk->sk_node.next &&
544                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
545                     tcp_addr_in_list(inet->rcv_saddr, daddr, sk->sk_nx_info) &&
546                     !sk->sk_bound_dev_if)
547                         goto sherry_cache;
548                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
549         }
550         if (sk) {
551 sherry_cache:
552                 sock_hold(sk);
553         }
554         read_unlock(&tcp_lhash_lock);
555         return sk;
556 }
557
558 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
559  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
560  *
561  * Local BH must be disabled here.
562  */
563
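/*
 * The established table is laid out as two halves of tcp_ehash_size chains
 * each: [0, tcp_ehash_size) holds established sockets, while the second
 * half at (head + tcp_ehash_size) holds the TIME_WAIT buckets (see the
 * changelog note at the top of this file), which is why both chains are
 * scanned below.
 */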
564 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
565                                                        u32 daddr, u16 hnum,
566                                                        int dif)
567 {
568         struct tcp_ehash_bucket *head;
569         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
570         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
571         struct sock *sk;
572         struct hlist_node *node;
573         /* Optimize here for direct hit, only listening connections can
574          * have wildcards anyway.
575          */
576         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
577         head = &tcp_ehash[hash];
578         read_lock(&head->lock);
579         sk_for_each(sk, node, &head->chain) {
580                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
581                         goto hit; /* You sunk my battleship! */
582         }
583
584         /* Must check for a TIME_WAIT'er before going to listener hash. */
585         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
586                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
587                         goto hit;
588         }
589         sk = NULL;
590 out:
591         read_unlock(&head->lock);
592         return sk;
593 hit:
594         sock_hold(sk);
595         goto out;
596 }
597
598 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
599                                            u32 daddr, u16 hnum, int dif)
600 {
601         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
602                                                       daddr, hnum, dif);
603
604         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
605 }
606
607 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
608                                   u16 dport, int dif)
609 {
610         struct sock *sk;
611
612         local_bh_disable();
613         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
614         local_bh_enable();
615
616         return sk;
617 }
618
619 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
620 {
621         return secure_tcp_sequence_number(skb->nh.iph->daddr,
622                                           skb->nh.iph->saddr,
623                                           skb->h.th->dest,
624                                           skb->h.th->source);
625 }
626
627 /* called with local bh disabled */
628 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
629                                       struct tcp_tw_bucket **twp)
630 {
631         struct inet_opt *inet = inet_sk(sk);
632         u32 daddr = inet->rcv_saddr;
633         u32 saddr = inet->daddr;
634         int dif = sk->sk_bound_dev_if;
635         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
636         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
637         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
638         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
639         struct sock *sk2;
640         struct hlist_node *node;
641         struct tcp_tw_bucket *tw;
642
643         write_lock(&head->lock);
644
645         /* Check TIME-WAIT sockets first. */
646         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
647                 tw = (struct tcp_tw_bucket *)sk2;
648
649                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
650                         struct tcp_opt *tp = tcp_sk(sk);
651
652                         /* With PAWS, it is safe from the viewpoint
653                            of data integrity. Even without PAWS it
654                            is safe provided sequence spaces do not
655                            overlap i.e. at data rates <= 80Mbit/sec.
656
657                            Actually, the idea is close to VJ's one,
658                            only timestamp cache is held not per host,
659                            but per port pair and TW bucket is used
660                            as state holder.
661
662                            If the TW bucket has already been destroyed we
663                            fall back to VJ's scheme and use the initial
664                            timestamp retrieved from the peer table.
665                          */
666                         if (tw->tw_ts_recent_stamp &&
667                             (!twp || (sysctl_tcp_tw_reuse &&
668                                       xtime.tv_sec -
669                                       tw->tw_ts_recent_stamp > 1))) {
670                                 if ((tp->write_seq =
671                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
672                                         tp->write_seq = 1;
673                                 tp->ts_recent       = tw->tw_ts_recent;
674                                 tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
675                                 sock_hold(sk2);
676                                 goto unique;
677                         } else
678                                 goto not_unique;
679                 }
680         }
681         tw = NULL;
682
683         /* And established part... */
684         sk_for_each(sk2, node, &head->chain) {
685                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
686                         goto not_unique;
687         }
688
689 unique:
690         /* Must record num and sport now. Otherwise we will see a
691          * socket with a funny identity in the hash table. */
692         inet->num = lport;
693         inet->sport = htons(lport);
694         sk->sk_hashent = hash;
695         BUG_TRAP(sk_unhashed(sk));
696         __sk_add_node(sk, &head->chain);
697         sock_prot_inc_use(sk->sk_prot);
698         write_unlock(&head->lock);
699
700         if (twp) {
701                 *twp = tw;
702                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
703         } else if (tw) {
704                 /* Silly. Should hash-dance instead... */
705                 tcp_tw_deschedule(tw);
706                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
707
708                 tcp_tw_put(tw);
709         }
710
711         return 0;
712
713 not_unique:
714         write_unlock(&head->lock);
715         return -EADDRNOTAVAIL;
716 }
717
718 /*
719  * Bind a port for a connect operation and hash it.
720  */
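/*
 * Roughly: if the socket is not yet bound, walk tcp_port_rover much like
 * tcp_v4_get_port(), but skip any bucket with fastreuse >= 0 and instead
 * let __tcp_v4_check_established() decide whether the resulting 4-tuple is
 * unique, possibly recycling a matching TIME-WAIT bucket; freshly created
 * buckets are marked fastreuse = -1.  A pre-bound port is hashed directly
 * when this socket is the bucket's only owner, otherwise the established
 * check decides.
 */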
721 static int tcp_v4_hash_connect(struct sock *sk)
722 {
723         unsigned short snum = inet_sk(sk)->num;
724         struct tcp_bind_hashbucket *head;
725         struct tcp_bind_bucket *tb;
726         int ret;
727
728         if (!snum) {
729                 int rover;
730                 int low = sysctl_local_port_range[0];
731                 int high = sysctl_local_port_range[1];
732                 int remaining = (high - low) + 1;
733                 struct hlist_node *node;
734                 struct tcp_tw_bucket *tw = NULL;
735
736                 local_bh_disable();
737
738                 /* TODO. Actually it is not such a bad idea to remove
739                  * tcp_portalloc_lock before the next submission to Linus.
740                  * As soon as we touch this place at all it is time to think.
741                  *
742                  * Now it protects a single _advisory_ variable, tcp_port_rover,
743                  * hence it is mostly useless.
744                  * The code will work nicely if we just delete it, but
745                  * I am afraid that in the contended case it will work no better
746                  * or even worse: another cpu will just hit the same bucket
747                  * and spin there.
748                  * So some cpu salt could remove both contention and
749                  * memory pingpong. Any ideas how to do this in a nice way?
750                  */
751                 spin_lock(&tcp_portalloc_lock);
752                 rover = tcp_port_rover;
753
754                 do {
755                         rover++;
756                         if ((rover < low) || (rover > high))
757                                 rover = low;
758                         head = &tcp_bhash[tcp_bhashfn(rover)];
759                         spin_lock(&head->lock);
760
761                         /* Does not bother with rcv_saddr checks,
762                          * because the established check is already
763                          * unique enough.
764                          */
765                         tb_for_each(tb, node, &head->chain) {
766                                 if (tb->port == rover) {
767                                         BUG_TRAP(!hlist_empty(&tb->owners));
768                                         if (tb->fastreuse >= 0)
769                                                 goto next_port;
770                                         if (!__tcp_v4_check_established(sk,
771                                                                         rover,
772                                                                         &tw))
773                                                 goto ok;
774                                         goto next_port;
775                                 }
776                         }
777
778                         tb = tcp_bucket_create(head, rover);
779                         if (!tb) {
780                                 spin_unlock(&head->lock);
781                                 break;
782                         }
783                         tb->fastreuse = -1;
784                         goto ok;
785
786                 next_port:
787                         spin_unlock(&head->lock);
788                 } while (--remaining > 0);
789                 tcp_port_rover = rover;
790                 spin_unlock(&tcp_portalloc_lock);
791
792                 local_bh_enable();
793
794                 return -EADDRNOTAVAIL;
795
796 ok:
797                 /* All locks still held and bhs disabled */
798                 tcp_port_rover = rover;
799                 spin_unlock(&tcp_portalloc_lock);
800
801                 tcp_bind_hash(sk, tb, rover);
802                 if (sk_unhashed(sk)) {
803                         inet_sk(sk)->sport = htons(rover);
804                         __tcp_v4_hash(sk, 0);
805                 }
806                 spin_unlock(&head->lock);
807
808                 if (tw) {
809                         tcp_tw_deschedule(tw);
810                         tcp_tw_put(tw);
811                 }
812
813                 ret = 0;
814                 goto out;
815         }
816
817         head  = &tcp_bhash[tcp_bhashfn(snum)];
818         tb  = tcp_sk(sk)->bind_hash;
819         spin_lock_bh(&head->lock);
820         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
821                 __tcp_v4_hash(sk, 0);
822                 spin_unlock_bh(&head->lock);
823                 return 0;
824         } else {
825                 spin_unlock(&head->lock);
826                 /* No definite answer... Walk to established hash table */
827                 ret = __tcp_v4_check_established(sk, snum, NULL);
828 out:
829                 local_bh_enable();
830                 return ret;
831         }
832 }
833
834 /* This will initiate an outgoing connection. */
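/*
 * The sequence is roughly: route to the destination (honouring a source
 * route's first hop), fill in saddr/rcv_saddr from the route if unset,
 * optionally seed ts_recent from the inet_peer when tcp_tw_recycle is on,
 * enter SYN-SENT and let tcp_v4_hash_connect() pick and hash the source
 * port, rebind the route to the final ports, choose the ISN with
 * secure_tcp_sequence_number() if needed, and finally let tcp_connect()
 * build and send the SYN.
 */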
835 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
836 {
837         struct inet_opt *inet = inet_sk(sk);
838         struct tcp_opt *tp = tcp_sk(sk);
839         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
840         struct rtable *rt;
841         u32 daddr, nexthop;
842         int tmp;
843         int err;
844
845         if (addr_len < sizeof(struct sockaddr_in))
846                 return -EINVAL;
847
848         if (usin->sin_family != AF_INET)
849                 return -EAFNOSUPPORT;
850
851         nexthop = daddr = usin->sin_addr.s_addr;
852         if (inet->opt && inet->opt->srr) {
853                 if (!daddr)
854                         return -EINVAL;
855                 nexthop = inet->opt->faddr;
856         }
857
858         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
859                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
860                                IPPROTO_TCP,
861                                inet->sport, usin->sin_port, sk);
862         if (tmp < 0)
863                 return tmp;
864
865         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
866                 ip_rt_put(rt);
867                 return -ENETUNREACH;
868         }
869
870         if (!inet->opt || !inet->opt->srr)
871                 daddr = rt->rt_dst;
872
873         if (!inet->saddr)
874                 inet->saddr = rt->rt_src;
875         inet->rcv_saddr = inet->saddr;
876
877         if (tp->ts_recent_stamp && inet->daddr != daddr) {
878                 /* Reset inherited state */
879                 tp->ts_recent       = 0;
880                 tp->ts_recent_stamp = 0;
881                 tp->write_seq       = 0;
882         }
883
884         if (sysctl_tcp_tw_recycle &&
885             !tp->ts_recent_stamp && rt->rt_dst == daddr) {
886                 struct inet_peer *peer = rt_get_peer(rt);
887
888                 /* VJ's idea. We save the last timestamp seen from
889                  * the destination in the peer table when entering TIME-WAIT state,
890                  * and initialize ts_recent from it when trying a new connection.
891                  */
892
893                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
894                         tp->ts_recent_stamp = peer->tcp_ts_stamp;
895                         tp->ts_recent = peer->tcp_ts;
896                 }
897         }
898
899         inet->dport = usin->sin_port;
900         inet->daddr = daddr;
901
902         tp->ext_header_len = 0;
903         if (inet->opt)
904                 tp->ext_header_len = inet->opt->optlen;
905
906         tp->mss_clamp = 536;
907
908         /* Socket identity is still unknown (sport may be zero).
909          * However, we set the state to SYN-SENT and, without releasing the socket
910          * lock, select a source port, enter ourselves into the hash tables and
911          * complete initialization after this.
912          */
913         tcp_set_state(sk, TCP_SYN_SENT);
914         err = tcp_v4_hash_connect(sk);
915         if (err)
916                 goto failure;
917
918         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
919         if (err)
920                 goto failure;
921
922         /* OK, now commit destination to socket.  */
923         __sk_dst_set(sk, &rt->u.dst);
924         tcp_v4_setup_caps(sk, &rt->u.dst);
925         tp->ext2_header_len = rt->u.dst.header_len;
926
927         if (!tp->write_seq)
928                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
929                                                            inet->daddr,
930                                                            inet->sport,
931                                                            usin->sin_port);
932
933         inet->id = tp->write_seq ^ jiffies;
934
935         err = tcp_connect(sk);
936         rt = NULL;
937         if (err)
938                 goto failure;
939
940         return 0;
941
942 failure:
943         /* This unhashes the socket and releases the local port, if necessary. */
944         tcp_set_state(sk, TCP_CLOSE);
945         ip_rt_put(rt);
946         sk->sk_route_caps = 0;
947         inet->dport = 0;
948         return err;
949 }
950
951 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
952 {
953         return ((struct rtable *)skb->dst)->rt_iif;
954 }
955
956 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
957 {
958         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
959 }
960
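/*
 * Pending connection requests live in the per-listener lopt->syn_table, a
 * hash of TCP_SYNQ_HSIZE chains of open_requests keyed by the jhash above
 * (remote address and port, salted with the listener's hash_rnd).
 * tcp_v4_search_req() walks one chain and hands back the predecessor
 * pointer so the entry can be unlinked; tcp_v4_synq_add() links a new
 * request at the head under syn_wait_lock and updates the listener's
 * SYN-queue accounting via tcp_synq_added().
 */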
961 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
962                                               struct open_request ***prevp,
963                                               __u16 rport,
964                                               __u32 raddr, __u32 laddr)
965 {
966         struct tcp_listen_opt *lopt = tp->listen_opt;
967         struct open_request *req, **prev;
968
969         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
970              (req = *prev) != NULL;
971              prev = &req->dl_next) {
972                 if (req->rmt_port == rport &&
973                     req->af.v4_req.rmt_addr == raddr &&
974                     req->af.v4_req.loc_addr == laddr &&
975                     TCP_INET_FAMILY(req->class->family)) {
976                         BUG_TRAP(!req->sk);
977                         *prevp = prev;
978                         break;
979                 }
980         }
981
982         return req;
983 }
984
985 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
986 {
987         struct tcp_opt *tp = tcp_sk(sk);
988         struct tcp_listen_opt *lopt = tp->listen_opt;
989         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
990
991         req->expires = jiffies + TCP_TIMEOUT_INIT;
992         req->retrans = 0;
993         req->sk = NULL;
994         req->dl_next = lopt->syn_table[h];
995
996         write_lock(&tp->syn_wait_lock);
997         lopt->syn_table[h] = req;
998         write_unlock(&tp->syn_wait_lock);
999
1000         tcp_synq_added(sk);
1001 }
1002
1003
1004 /*
1005  * This routine does path mtu discovery as defined in RFC1191.
1006  */
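/*
 * In short: listening sockets are ignored, the cached route's PMTU is
 * updated from the ICMP report, a soft EMSGSIZE is remembered when the
 * reported MTU is below what the route will accept while we are setting DF,
 * and if the socket's cached pmtu_cookie now exceeds the route PMTU the MSS
 * is re-synced and the dropped segment retransmitted immediately.
 */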
1007 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
1008                                      u32 mtu)
1009 {
1010         struct dst_entry *dst;
1011         struct inet_opt *inet = inet_sk(sk);
1012         struct tcp_opt *tp = tcp_sk(sk);
1013
1014         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
1015          * sent out by Linux are always < 576 bytes, so they should go through
1016          * unfragmented).
1017          */
1018         if (sk->sk_state == TCP_LISTEN)
1019                 return;
1020
1021         /* We don't check in the dst entry if pmtu discovery is forbidden
1022          * on this route. We just assume that no packet-too-big packets
1023          * are sent back when pmtu discovery is not active.
1024          * There is a small race when the user changes this flag in the
1025          * route, but I think that's acceptable.
1026          */
1027         if ((dst = __sk_dst_check(sk, 0)) == NULL)
1028                 return;
1029
1030         dst->ops->update_pmtu(dst, mtu);
1031
1032         /* Something is about to go wrong... Remember the soft error
1033          * in case this connection is not able to recover.
1034          */
1035         if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
1036                 sk->sk_err_soft = EMSGSIZE;
1037
1038         mtu = dst_pmtu(dst);
1039
1040         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
1041             tp->pmtu_cookie > mtu) {
1042                 tcp_sync_mss(sk, mtu);
1043
1044                 /* Resend the TCP packet because it's
1045                  * clear that the old packet has been
1046                  * dropped. This is the new "fast" path mtu
1047                  * discovery.
1048                  */
1049                 tcp_simple_retransmit(sk);
1050         } /* else let the usual retransmit timer handle it */
1051 }
1052
1053 /*
1054  * This routine is called by the ICMP module when it gets some
1055  * sort of error condition.  If err < 0 then the socket should
1056  * be closed and the error returned to the user.  If err > 0
1057  * it's just the icmp type << 8 | icmp code.  After adjustment
1058  * header points to the first 8 bytes of the tcp header.  We need
1059  * to find the appropriate port.
1060  *
1061  * The locking strategy used here is very "optimistic". When
1062  * someone else accesses the socket the ICMP is just dropped
1063  * and for some paths there is no check at all.
1064  * A more general error queue to queue errors for later handling
1065  * is probably better.
1066  *
1067  */
1068
1069 void tcp_v4_err(struct sk_buff *skb, u32 info)
1070 {
1071         struct iphdr *iph = (struct iphdr *)skb->data;
1072         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
1073         struct tcp_opt *tp;
1074         struct inet_opt *inet;
1075         int type = skb->h.icmph->type;
1076         int code = skb->h.icmph->code;
1077         struct sock *sk;
1078         __u32 seq;
1079         int err;
1080
1081         if (skb->len < (iph->ihl << 2) + 8) {
1082                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1083                 return;
1084         }
1085
1086         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1087                            th->source, tcp_v4_iif(skb));
1088         if (!sk) {
1089                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1090                 return;
1091         }
1092         if (sk->sk_state == TCP_TIME_WAIT) {
1093                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1094                 return;
1095         }
1096
1097         bh_lock_sock(sk);
1098         /* If too many ICMPs get dropped on busy
1099          * servers this needs to be solved differently.
1100          */
1101         if (sock_owned_by_user(sk))
1102                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1103
1104         if (sk->sk_state == TCP_CLOSE)
1105                 goto out;
1106
1107         tp = tcp_sk(sk);
1108         seq = ntohl(th->seq);
1109         if (sk->sk_state != TCP_LISTEN &&
1110             !between(seq, tp->snd_una, tp->snd_nxt)) {
1111                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1112                 goto out;
1113         }
1114
1115         switch (type) {
1116         case ICMP_SOURCE_QUENCH:
1117                 /* This is deprecated, but if someone generated it,
1118                  * we have no reason to ignore it.
1119                  */
1120                 if (!sock_owned_by_user(sk))
1121                         tcp_enter_cwr(tp);
1122                 goto out;
1123         case ICMP_PARAMETERPROB:
1124                 err = EPROTO;
1125                 break;
1126         case ICMP_DEST_UNREACH:
1127                 if (code > NR_ICMP_UNREACH)
1128                         goto out;
1129
1130                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1131                         if (!sock_owned_by_user(sk))
1132                                 do_pmtu_discovery(sk, iph, info);
1133                         goto out;
1134                 }
1135
1136                 err = icmp_err_convert[code].errno;
1137                 break;
1138         case ICMP_TIME_EXCEEDED:
1139                 err = EHOSTUNREACH;
1140                 break;
1141         default:
1142                 goto out;
1143         }
1144
1145         switch (sk->sk_state) {
1146                 struct open_request *req, **prev;
1147         case TCP_LISTEN:
1148                 if (sock_owned_by_user(sk))
1149                         goto out;
1150
1151                 req = tcp_v4_search_req(tp, &prev, th->dest,
1152                                         iph->daddr, iph->saddr);
1153                 if (!req)
1154                         goto out;
1155
1156                 /* ICMPs are not backlogged, hence we cannot get
1157                    an established socket here.
1158                  */
1159                 BUG_TRAP(!req->sk);
1160
1161                 if (seq != req->snt_isn) {
1162                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1163                         goto out;
1164                 }
1165
1166                 /*
1167                  * Still in SYN_RECV, just remove it silently.
1168                  * There is no good way to pass the error to the newly
1169                  * created socket, and POSIX does not want network
1170                  * errors returned from accept().
1171                  */
1172                 tcp_synq_drop(sk, req, prev);
1173                 goto out;
1174
1175         case TCP_SYN_SENT:
1176         case TCP_SYN_RECV:  /* Cannot happen.
1177                                It can, for example, happen if SYNs crossed.
1178                              */
1179                 if (!sock_owned_by_user(sk)) {
1180                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1181                         sk->sk_err = err;
1182
1183                         sk->sk_error_report(sk);
1184
1185                         tcp_done(sk);
1186                 } else {
1187                         sk->sk_err_soft = err;
1188                 }
1189                 goto out;
1190         }
1191
1192         /* If we've already connected we will keep trying
1193          * until we time out, or the user gives up.
1194          *
1195          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
1196          * considered hard errors (well, FRAG_FAILED too,
1197          * but it is obsoleted by pmtu discovery).
1198          *
1199          * Note that in the modern internet, where routing is unreliable
1200          * and broken firewalls sit in every dark corner sending random
1201          * errors as ordered by their masters, even these two messages finally
1202          * lose their original sense (even Linux sends invalid PORT_UNREACHs)
1203          *
1204          * Now we are in compliance with RFCs.
1205          *                                                      --ANK (980905)
1206          */
1207
1208         inet = inet_sk(sk);
1209         if (!sock_owned_by_user(sk) && inet->recverr) {
1210                 sk->sk_err = err;
1211                 sk->sk_error_report(sk);
1212         } else  { /* Only an error on timeout */
1213                 sk->sk_err_soft = err;
1214         }
1215
1216 out:
1217         bh_unlock_sock(sk);
1218         sock_put(sk);
1219 }
1220
1221 /* This routine computes an IPv4 TCP checksum. */
1222 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1223                        struct sk_buff *skb)
1224 {
1225         struct inet_opt *inet = inet_sk(sk);
1226
1227         if (skb->ip_summed == CHECKSUM_HW) {
1228                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1229                 skb->csum = offsetof(struct tcphdr, check);
1230         } else {
1231                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1232                                          csum_partial((char *)th,
1233                                                       th->doff << 2,
1234                                                       skb->csum));
1235         }
1236 }
1237
1238 /*
1239  *      This routine will send an RST to the other tcp.
1240  *
1241  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1242  *                    for reset.
1243  *      Answer: if a packet caused RST, it is not for a socket
1244  *              existing in our system; if it is matched to a socket,
1245  *              it is just a duplicate segment or a bug in the other side's TCP.
1246  *              So we build the reply based only on the parameters
1247  *              that arrived with the segment.
1248  *      Exception: precedence violation. We do not implement it in any case.
1249  */
1250
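/*
 * The reply mirrors the offending segment: if it carried an ACK, the RST's
 * sequence number is taken from that ack_seq and no ACK flag is set;
 * otherwise the RST ACKs everything the segment occupied in sequence space
 * (SYN and FIN each count for one, plus the payload length).
 */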
1251 static void tcp_v4_send_reset(struct sk_buff *skb)
1252 {
1253         struct tcphdr *th = skb->h.th;
1254         struct tcphdr rth;
1255         struct ip_reply_arg arg;
1256
1257         /* Never send a reset in response to a reset. */
1258         if (th->rst)
1259                 return;
1260
1261         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1262                 return;
1263
1264         /* Swap the send and the receive. */
1265         memset(&rth, 0, sizeof(struct tcphdr));
1266         rth.dest   = th->source;
1267         rth.source = th->dest;
1268         rth.doff   = sizeof(struct tcphdr) / 4;
1269         rth.rst    = 1;
1270
1271         if (th->ack) {
1272                 rth.seq = th->ack_seq;
1273         } else {
1274                 rth.ack = 1;
1275                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1276                                     skb->len - (th->doff << 2));
1277         }
1278
1279         memset(&arg, 0, sizeof arg);
1280         arg.iov[0].iov_base = (unsigned char *)&rth;
1281         arg.iov[0].iov_len  = sizeof rth;
1282         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1283                                       skb->nh.iph->saddr, /*XXX*/
1284                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1285         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1286
1287         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1288
1289         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1290         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1291 }
1292
1293 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1294    outside socket context, is certainly ugly. What can I do?
1295  */
1296
1297 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1298                             u32 win, u32 ts)
1299 {
1300         struct tcphdr *th = skb->h.th;
1301         struct {
1302                 struct tcphdr th;
1303                 u32 tsopt[3];
1304         } rep;
1305         struct ip_reply_arg arg;
1306
1307         memset(&rep.th, 0, sizeof(struct tcphdr));
1308         memset(&arg, 0, sizeof arg);
1309
1310         arg.iov[0].iov_base = (unsigned char *)&rep;
1311         arg.iov[0].iov_len  = sizeof(rep.th);
1312         if (ts) {
1313                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1314                                      (TCPOPT_TIMESTAMP << 8) |
1315                                      TCPOLEN_TIMESTAMP);
1316                 rep.tsopt[1] = htonl(tcp_time_stamp);
1317                 rep.tsopt[2] = htonl(ts);
1318                 arg.iov[0].iov_len = sizeof(rep);
1319         }
1320
1321         /* Swap the send and the receive. */
1322         rep.th.dest    = th->source;
1323         rep.th.source  = th->dest;
1324         rep.th.doff    = arg.iov[0].iov_len / 4;
1325         rep.th.seq     = htonl(seq);
1326         rep.th.ack_seq = htonl(ack);
1327         rep.th.ack     = 1;
1328         rep.th.window  = htons(win);
1329
1330         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1331                                       skb->nh.iph->saddr, /*XXX*/
1332                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1333         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1334
1335         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1336
1337         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1338 }
1339
1340 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1341 {
1342         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1343
1344         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1345                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1346
1347         tcp_tw_put(tw);
1348 }
1349
1350 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1351 {
1352         tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1353                         req->ts_recent);
1354 }
1355
1356 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1357                                           struct open_request *req)
1358 {
1359         struct rtable *rt;
1360         struct ip_options *opt = req->af.v4_req.opt;
1361         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1362                             .nl_u = { .ip4_u =
1363                                       { .daddr = ((opt && opt->srr) ?
1364                                                   opt->faddr :
1365                                                   req->af.v4_req.rmt_addr),
1366                                         .saddr = req->af.v4_req.loc_addr,
1367                                         .tos = RT_CONN_FLAGS(sk) } },
1368                             .proto = IPPROTO_TCP,
1369                             .uli_u = { .ports =
1370                                        { .sport = inet_sk(sk)->sport,
1371                                          .dport = req->rmt_port } } };
1372
1373         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1374                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1375                 return NULL;
1376         }
1377         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1378                 ip_rt_put(rt);
1379                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1380                 return NULL;
1381         }
1382         return &rt->u.dst;
1383 }
1384
1385 /*
1386  *      Send a SYN-ACK after having received an ACK.
1387  *      This still operates on an open_request only, not on a big
1388  *      socket.
1389  */
1390 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1391                               struct dst_entry *dst)
1392 {
1393         int err = -1;
1394         struct sk_buff * skb;
1395
1396         /* First, grab a route. */
1397         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1398                 goto out;
1399
1400         skb = tcp_make_synack(sk, dst, req);
1401
1402         if (skb) {
1403                 struct tcphdr *th = skb->h.th;
1404
1405                 th->check = tcp_v4_check(th, skb->len,
1406                                          req->af.v4_req.loc_addr,
1407                                          req->af.v4_req.rmt_addr,
1408                                          csum_partial((char *)th, skb->len,
1409                                                       skb->csum));
1410
1411                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1412                                             req->af.v4_req.rmt_addr,
1413                                             req->af.v4_req.opt);
1414                 if (err == NET_XMIT_CN)
1415                         err = 0;
1416         }
1417
1418 out:
1419         dst_release(dst);
1420         return err;
1421 }
1422
1423 /*
1424  *      IPv4 open_request destructor.
1425  */
1426 static void tcp_v4_or_free(struct open_request *req)
1427 {
1428         if (req->af.v4_req.opt)
1429                 kfree(req->af.v4_req.opt);
1430 }
1431
1432 static inline void syn_flood_warning(struct sk_buff *skb)
1433 {
1434         static unsigned long warntime;
1435
1436         if (time_after(jiffies, (warntime + HZ * 60))) {
1437                 warntime = jiffies;
1438                 printk(KERN_INFO
1439                        "possible SYN flooding on port %d. Sending cookies.\n",
1440                        ntohs(skb->h.th->dest));
1441         }
1442 }
1443
1444 /*
1445  * Save and compile IPv4 options into the open_request if needed.
1446  */
1447 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1448                                                      struct sk_buff *skb)
1449 {
1450         struct ip_options *opt = &(IPCB(skb)->opt);
1451         struct ip_options *dopt = NULL;
1452
1453         if (opt && opt->optlen) {
1454                 int opt_size = optlength(opt);
1455                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1456                 if (dopt) {
1457                         if (ip_options_echo(dopt, skb)) {
1458                                 kfree(dopt);
1459                                 dopt = NULL;
1460                         }
1461                 }
1462         }
1463         return dopt;
1464 }
1465
1466 /*
1467  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1468  * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1469  * It would be better to replace it with a global counter for all sockets,
1470  * but then some measure against one socket starving all other sockets
1471  * would be needed.
1472  *
1473  * It was 128 by default. Experiments with real servers show that
1474  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1475  * of the problems. This value is adjusted to 128 for very small machines
1476  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1477  * Increasing it further requires changing the hash table size.
1478  */
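/* This value is exported via sysctl as net.ipv4.tcp_max_syn_backlog
 * (/proc/sys/net/ipv4/tcp_max_syn_backlog), so busy servers can raise it at
 * runtime, e.g. "sysctl -w net.ipv4.tcp_max_syn_backlog=1024".
 */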
1479 int sysctl_max_syn_backlog = 256;
1480
1481 struct or_calltable or_ipv4 = {
1482         .family         =       PF_INET,
1483         .rtx_syn_ack    =       tcp_v4_send_synack,
1484         .send_ack       =       tcp_v4_or_send_ack,
1485         .destructor     =       tcp_v4_or_free,
1486         .send_reset     =       tcp_v4_send_reset,
1487 };
1488
1489 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1490 {
1491         struct tcp_opt tp;
1492         struct open_request *req;
1493         __u32 saddr = skb->nh.iph->saddr;
1494         __u32 daddr = skb->nh.iph->daddr;
1495         __u32 isn = TCP_SKB_CB(skb)->when;
1496         struct dst_entry *dst = NULL;
1497 #ifdef CONFIG_SYN_COOKIES
1498         int want_cookie = 0;
1499 #else
1500 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1501 #endif
1502
1503         /* Never answer SYNs sent to broadcast or multicast addresses */
1504         if (((struct rtable *)skb->dst)->rt_flags &
1505             (RTCF_BROADCAST | RTCF_MULTICAST))
1506                 goto drop;
1507
1508         /* TW buckets are converted to open requests without
1509          * limitation; they conserve resources and the peer is
1510          * evidently a real one.
1511          */
1512         if (tcp_synq_is_full(sk) && !isn) {
1513 #ifdef CONFIG_SYN_COOKIES
1514                 if (sysctl_tcp_syncookies) {
1515                         want_cookie = 1;
1516                 } else
1517 #endif
1518                 goto drop;
1519         }
1520
1521         /* The accept backlog is full. If we have already queued enough
1522          * warm entries in the syn queue, drop the request. That is better
1523          * than clogging the syn queue with openreqs whose timeout grows
1524          * exponentially.
1525          */
1526         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1527                 goto drop;
1528
1529         req = tcp_openreq_alloc();
1530         if (!req)
1531                 goto drop;
1532
1533         tcp_clear_options(&tp);
1534         tp.mss_clamp = 536;
1535         tp.user_mss  = tcp_sk(sk)->user_mss;
1536
1537         tcp_parse_options(skb, &tp, 0);
1538
1539         if (want_cookie) {
1540                 tcp_clear_options(&tp);
1541                 tp.saw_tstamp = 0;
1542         }
1543
1544         if (tp.saw_tstamp && !tp.rcv_tsval) {
1545                 /* Some OSes (unknown ones, but I see them on a web server
1546                  * containing information interesting only to Windows
1547                  * users) do not send their timestamp in the SYN. It is an
1548                  * easy case: we simply do not advertise TS support.
1549                  */
1550                 tp.saw_tstamp = 0;
1551                 tp.tstamp_ok  = 0;
1552         }
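        /* Echo timestamp support back to the peer only if its SYN carried one. */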
1553         tp.tstamp_ok = tp.saw_tstamp;
1554
1555         tcp_openreq_init(req, &tp, skb);
1556
1557         req->af.v4_req.loc_addr = daddr;
1558         req->af.v4_req.rmt_addr = saddr;
1559         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1560         req->class = &or_ipv4;
1561         if (!want_cookie)
1562                 TCP_ECN_create_request(req, skb->h.th);
1563
1564         if (want_cookie) {
1565 #ifdef CONFIG_SYN_COOKIES
1566                 syn_flood_warning(skb);
1567 #endif
1568                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1569         } else if (!isn) {
1570                 struct inet_peer *peer = NULL;
1571
1572                 /* VJ's idea. We save the last timestamp seen
1573                  * from the destination in the peer table when entering
1574                  * TIME-WAIT state, and check against it before
1575                  * accepting a new connection request.
1576                  *
1577                  * If "isn" is not zero, this request hit a live
1578                  * timewait bucket, so all the necessary checks
1579                  * are made in the function processing the timewait state.
1580                  */
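                /* PAWS check for tw_recycle: if this peer's timestamp was
                 * recorded less than TCP_PAWS_MSL seconds ago and the
                 * timestamp in this SYN lags it by more than TCP_PAWS_WINDOW,
                 * the SYN is treated as a stale duplicate and rejected.
                 */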
1581                 if (tp.saw_tstamp &&
1582                     sysctl_tcp_tw_recycle &&
1583                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1584                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1585                     peer->v4daddr == saddr) {
1586                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1587                             (s32)(peer->tcp_ts - req->ts_recent) >
1588                                                         TCP_PAWS_WINDOW) {
1589                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1590                                 dst_release(dst);
1591                                 goto drop_and_free;
1592                         }
1593                 }
1594                 /* Kill the following clause if you dislike this approach. */
1595                 else if (!sysctl_tcp_syncookies &&
1596                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1597                           (sysctl_max_syn_backlog >> 2)) &&
1598                          (!peer || !peer->tcp_ts_stamp) &&
1599                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1600                         /* Without syncookies the last quarter of
1601                          * the backlog is reserved for destinations
1602                          * proven to be alive.
1603                          * It means that we keep communicating with
1604                          * destinations that were already remembered
1605                          * by the time the synflood started.
1606                          */
1607                         NETDEBUG(if (net_ratelimit()) \
1608                                         printk(KERN_DEBUG "TCP: drop open "
1609                                                           "request from %u.%u."
1610                                                           "%u.%u/%u\n", \
1611                                                NIPQUAD(saddr),
1612                                                ntohs(skb->h.th->source)));
1613                         dst_release(dst);
1614                         goto drop_and_free;
1615                 }
1616
1617                 isn = tcp_v4_init_sequence(sk, skb);
1618         }
1619         req->snt_isn = isn;
1620
1621         if (tcp_v4_send_synack(sk, req, dst))
1622                 goto drop_and_free;
1623
1624         if (want_cookie) {
1625                 tcp_openreq_free(req);
1626         } else {
1627                 tcp_v4_synq_add(sk, req);
1628         }
1629         return 0;
1630
1631 drop_and_free:
1632         tcp_openreq_free(req);
1633 drop:
1634         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1635         return 0;
1636 }
1637
1638
1639 /*
1640  * The three-way handshake has completed - we got a valid synack -
1641  * now create the new socket.
1642  */
1643 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1644                                   struct open_request *req,
1645                                   struct dst_entry *dst)
1646 {
1647         struct inet_opt *newinet;
1648         struct tcp_opt *newtp;
1649         struct sock *newsk;
1650
1651         if (sk_acceptq_is_full(sk))
1652                 goto exit_overflow;
1653
1654         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1655                 goto exit;
1656
1657         newsk = tcp_create_openreq_child(sk, req, skb);
1658         if (!newsk)
1659                 goto exit;
1660
1661         newsk->sk_dst_cache = dst;
1662         tcp_v4_setup_caps(newsk, dst);
1663
1664         newtp                 = tcp_sk(newsk);
1665         newinet               = inet_sk(newsk);
1666         newinet->daddr        = req->af.v4_req.rmt_addr;
1667         newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1668         newinet->saddr        = req->af.v4_req.loc_addr;
1669         newinet->opt          = req->af.v4_req.opt;
1670         req->af.v4_req.opt    = NULL;
1671         newinet->mc_index     = tcp_v4_iif(skb);
1672         newinet->mc_ttl       = skb->nh.iph->ttl;
1673         newtp->ext_header_len = 0;
1674         if (newinet->opt)
1675                 newtp->ext_header_len = newinet->opt->optlen;
1676         newtp->ext2_header_len = dst->header_len;
1677         newinet->id = newtp->write_seq ^ jiffies;
1678
1679         tcp_sync_mss(newsk, dst_pmtu(dst));
1680         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1681         tcp_initialize_rcv_mss(newsk);
1682
1683         __tcp_v4_hash(newsk, 0);
1684         __tcp_inherit_port(sk, newsk);
1685
1686         return newsk;
1687
1688 exit_overflow:
1689         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1690 exit:
1691         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1692         dst_release(dst);
1693         return NULL;
1694 }
1695
1696 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1697 {
1698         struct tcphdr *th = skb->h.th;
1699         struct iphdr *iph = skb->nh.iph;
1700         struct tcp_opt *tp = tcp_sk(sk);
1701         struct sock *nsk;
1702         struct open_request **prev;
1703         /* Find possible connection requests. */
1704         struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1705                                                      iph->saddr, iph->daddr);
1706         if (req)
1707                 return tcp_check_req(sk, skb, req, prev);
1708
1709         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1710                                           th->source,
1711                                           skb->nh.iph->daddr,
1712                                           ntohs(th->dest),
1713                                           tcp_v4_iif(skb));
1714
1715         if (nsk) {
1716                 if (nsk->sk_state != TCP_TIME_WAIT) {
1717                         bh_lock_sock(nsk);
1718                         return nsk;
1719                 }
1720                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1721                 return NULL;
1722         }
1723
1724 #ifdef CONFIG_SYN_COOKIES
1725         if (!th->rst && !th->syn && th->ack)
1726                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1727 #endif
1728         return sk;
1729 }
1730
1731 static int tcp_v4_checksum_init(struct sk_buff *skb)
1732 {
1733         if (skb->ip_summed == CHECKSUM_HW) {
1734                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1735                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1736                                   skb->nh.iph->daddr, skb->csum))
1737                         return 0;
1738
1739                 NETDEBUG(if (net_ratelimit())
1740                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1741                 skb->ip_summed = CHECKSUM_NONE;
1742         }
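        /* No hardware checksum: verify small segments (<= 76 bytes) in full
         * right now; for larger ones just record the pseudo-header sum so the
         * copy-to-user checksum path can finish the verification later.
         */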
1743         if (skb->len <= 76) {
1744                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1745                                  skb->nh.iph->daddr,
1746                                  skb_checksum(skb, 0, skb->len, 0)))
1747                         return -1;
1748                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1749         } else {
1750                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1751                                           skb->nh.iph->saddr,
1752                                           skb->nh.iph->daddr, 0);
1753         }
1754         return 0;
1755 }
1756
1757
1758 /* The socket must have its spinlock held when we get
1759  * here.
1760  *
1761  * We have a potential double-lock case here, so even when
1762  * doing backlog processing we use the BH locking scheme.
1763  * This is because we cannot sleep with the original spinlock
1764  * held.
1765  */
1766 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1767 {
1768         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1769                 TCP_CHECK_TIMER(sk);
1770                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1771                         goto reset;
1772                 TCP_CHECK_TIMER(sk);
1773                 return 0;
1774         }
1775
1776         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1777                 goto csum_err;
1778
1779         if (sk->sk_state == TCP_LISTEN) {
1780                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1781                 if (!nsk)
1782                         goto discard;
1783
1784                 if (nsk != sk) {
1785                         if (tcp_child_process(sk, nsk, skb))
1786                                 goto reset;
1787                         return 0;
1788                 }
1789         }
1790
1791         TCP_CHECK_TIMER(sk);
1792         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1793                 goto reset;
1794         TCP_CHECK_TIMER(sk);
1795         return 0;
1796
1797 reset:
1798         tcp_v4_send_reset(skb);
1799 discard:
1800         kfree_skb(skb);
1801         /* Be careful here. If this function gets more complicated and
1802          * gcc suffers from register pressure on the x86, sk (in %ebx)
1803          * might be destroyed here. This current version compiles correctly,
1804          * but you have been warned.
1805          */
1806         return 0;
1807
1808 csum_err:
1809         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1810         goto discard;
1811 }
1812
1813 /*
1814  *      From tcp_input.c
1815  */
1816
1817 int tcp_v4_rcv(struct sk_buff *skb)
1818 {
1819         struct tcphdr *th;
1820         struct sock *sk;
1821         int ret;
1822
1823         if (skb->pkt_type != PACKET_HOST)
1824                 goto discard_it;
1825
1826         /* Count it even if it's bad */
1827         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1828
1829         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1830                 goto discard_it;
1831
1832         th = skb->h.th;
1833
1834         if (th->doff < sizeof(struct tcphdr) / 4)
1835                 goto bad_packet;
1836         if (!pskb_may_pull(skb, th->doff * 4))
1837                 goto discard_it;
1838
1839         /* An explanation is required here, I think.
1840          * Packet length and doff are validated by header prediction,
1841          * provided the case of th->doff == 0 is eliminated.
1842          * So, we defer the checks. */
1843         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1844              tcp_v4_checksum_init(skb) < 0))
1845                 goto bad_packet;
1846
1847         th = skb->h.th;
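        /* end_seq counts the payload plus one sequence number each for the
         * SYN and FIN flags, since both consume sequence space.
         */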
1848         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1849         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1850                                     skb->len - th->doff * 4);
1851         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1852         TCP_SKB_CB(skb)->when    = 0;
1853         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1854         TCP_SKB_CB(skb)->sacked  = 0;
1855
1856         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1857                              skb->nh.iph->daddr, ntohs(th->dest),
1858                              tcp_v4_iif(skb));
1859
1860         if (!sk)
1861                 goto no_tcp_socket;
1862
1863 process:
1864         if (sk->sk_state == TCP_TIME_WAIT)
1865                 goto do_time_wait;
1866
1867         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1868                 goto discard_and_relse;
1869
1870         if (sk_filter(sk, skb, 0))
1871                 goto discard_and_relse;
1872
1873         skb->dev = NULL;
1874
1875         bh_lock_sock(sk);
1876         ret = 0;
1877         if (!sock_owned_by_user(sk)) {
1878                 if (!tcp_prequeue(sk, skb))
1879                         ret = tcp_v4_do_rcv(sk, skb);
1880         } else
1881                 sk_add_backlog(sk, skb);
1882         bh_unlock_sock(sk);
1883
1884         sock_put(sk);
1885
1886         return ret;
1887
1888 no_tcp_socket:
1889         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1890                 goto discard_it;
1891
1892         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1893 bad_packet:
1894                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1895         } else {
1896                 tcp_v4_send_reset(skb);
1897         }
1898
1899 discard_it:
1900         /* Discard frame. */
1901         kfree_skb(skb);
1902         return 0;
1903
1904 discard_and_relse:
1905         sock_put(sk);
1906         goto discard_it;
1907
1908 do_time_wait:
1909         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1910                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1911                 goto discard_it;
1912         }
1913
1914         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1915                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1916                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1917                 goto discard_it;
1918         }
1919         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1920                                            skb, th, skb->len)) {
1921         case TCP_TW_SYN: {
1922                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1923                                                           ntohs(th->dest),
1924                                                           tcp_v4_iif(skb));
1925                 if (sk2) {
1926                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1927                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1928                         sk = sk2;
1929                         goto process;
1930                 }
1931                 /* Fall through to ACK */
1932         }
1933         case TCP_TW_ACK:
1934                 tcp_v4_timewait_ack(sk, skb);
1935                 break;
1936         case TCP_TW_RST:
1937                 goto no_tcp_socket;
1938         case TCP_TW_SUCCESS:;
1939         }
1940         goto discard_it;
1941 }
1942
1943 /* With per-bucket locks this operation is not atomic, so
1944  * this version is no worse.
1945  */
1946 static void __tcp_v4_rehash(struct sock *sk)
1947 {
1948         sk->sk_prot->unhash(sk);
1949         sk->sk_prot->hash(sk);
1950 }
1951
1952 static int tcp_v4_reselect_saddr(struct sock *sk)
1953 {
1954         struct inet_opt *inet = inet_sk(sk);
1955         int err;
1956         struct rtable *rt;
1957         __u32 old_saddr = inet->saddr;
1958         __u32 new_saddr;
1959         __u32 daddr = inet->daddr;
1960
1961         if (inet->opt && inet->opt->srr)
1962                 daddr = inet->opt->faddr;
1963
1964         /* Query new route. */
1965         err = ip_route_connect(&rt, daddr, 0,
1966                                RT_TOS(inet->tos) | sk->sk_localroute,
1967                                sk->sk_bound_dev_if,
1968                                IPPROTO_TCP,
1969                                inet->sport, inet->dport, sk);
1970         if (err)
1971                 return err;
1972
1973         __sk_dst_set(sk, &rt->u.dst);
1974         tcp_v4_setup_caps(sk, &rt->u.dst);
1975         tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1976
1977         new_saddr = rt->rt_src;
1978
1979         if (new_saddr == old_saddr)
1980                 return 0;
1981
1982         if (sysctl_ip_dynaddr > 1) {
1983                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1984                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1985                        NIPQUAD(old_saddr),
1986                        NIPQUAD(new_saddr));
1987         }
1988
1989         inet->saddr = new_saddr;
1990         inet->rcv_saddr = new_saddr;
1991
1992         /* XXX The only ugly spot where we need to
1993          * XXX really change the socket's identity after
1994          * XXX it has entered the hashes. -DaveM
1995          *
1996          * Besides that, it does not check for connection
1997          * uniqueness. Expect trouble.
1998          */
1999         __tcp_v4_rehash(sk);
2000         return 0;
2001 }
2002
2003 int tcp_v4_rebuild_header(struct sock *sk)
2004 {
2005         struct inet_opt *inet = inet_sk(sk);
2006         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
2007         u32 daddr;
2008         int err;
2009
2010         /* Route is OK, nothing to do. */
2011         if (rt)
2012                 return 0;
2013
2014         /* Reroute. */
2015         daddr = inet->daddr;
2016         if (inet->opt && inet->opt->srr)
2017                 daddr = inet->opt->faddr;
2018
2019         {
2020                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
2021                                     .nl_u = { .ip4_u =
2022                                               { .daddr = daddr,
2023                                                 .saddr = inet->saddr,
2024                                                 .tos = RT_CONN_FLAGS(sk) } },
2025                                     .proto = IPPROTO_TCP,
2026                                     .uli_u = { .ports =
2027                                                { .sport = inet->sport,
2028                                                  .dport = inet->dport } } };
2029                                                 
2030                 err = ip_route_output_flow(&rt, &fl, sk, 0);
2031         }
2032         if (!err) {
2033                 __sk_dst_set(sk, &rt->u.dst);
2034                 tcp_v4_setup_caps(sk, &rt->u.dst);
2035                 tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
2036                 return 0;
2037         }
2038
2039         /* Routing failed... */
2040         sk->sk_route_caps = 0;
2041
2042         if (!sysctl_ip_dynaddr ||
2043             sk->sk_state != TCP_SYN_SENT ||
2044             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
2045             (err = tcp_v4_reselect_saddr(sk)) != 0)
2046                 sk->sk_err_soft = -err;
2047
2048         return err;
2049 }
2050
2051 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
2052 {
2053         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
2054         struct inet_opt *inet = inet_sk(sk);
2055
2056         sin->sin_family         = AF_INET;
2057         sin->sin_addr.s_addr    = inet->daddr;
2058         sin->sin_port           = inet->dport;
2059 }
2060
2061 /* VJ's idea. Save the last timestamp seen from this destination
2062  * and hold it at least for the normal timewait interval, to use for duplicate
2063  * segment detection in subsequent connections before they enter the
2064  * synchronized state.
2065  */
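/* Refresh the cached stamp only when it is no newer than what this socket
 * saw, or when the existing entry is more than TCP_PAWS_MSL seconds old and
 * not newer than our ts_recent_stamp, so a stale entry cannot stick forever.
 */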
2066
2067 int tcp_v4_remember_stamp(struct sock *sk)
2068 {
2069         struct inet_opt *inet = inet_sk(sk);
2070         struct tcp_opt *tp = tcp_sk(sk);
2071         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
2072         struct inet_peer *peer = NULL;
2073         int release_it = 0;
2074
2075         if (!rt || rt->rt_dst != inet->daddr) {
2076                 peer = inet_getpeer(inet->daddr, 1);
2077                 release_it = 1;
2078         } else {
2079                 if (!rt->peer)
2080                         rt_bind_peer(rt, 1);
2081                 peer = rt->peer;
2082         }
2083
2084         if (peer) {
2085                 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2086                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2087                      peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2088                         peer->tcp_ts_stamp = tp->ts_recent_stamp;
2089                         peer->tcp_ts = tp->ts_recent;
2090                 }
2091                 if (release_it)
2092                         inet_putpeer(peer);
2093                 return 1;
2094         }
2095
2096         return 0;
2097 }
2098
2099 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2100 {
2101         struct inet_peer *peer = NULL;
2102
2103         peer = inet_getpeer(tw->tw_daddr, 1);
2104
2105         if (peer) {
2106                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2107                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2108                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2109                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2110                         peer->tcp_ts = tw->tw_ts_recent;
2111                 }
2112                 inet_putpeer(peer);
2113                 return 1;
2114         }
2115
2116         return 0;
2117 }
2118
2119 struct tcp_func ipv4_specific = {
2120         .queue_xmit     =       ip_queue_xmit,
2121         .send_check     =       tcp_v4_send_check,
2122         .rebuild_header =       tcp_v4_rebuild_header,
2123         .conn_request   =       tcp_v4_conn_request,
2124         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2125         .remember_stamp =       tcp_v4_remember_stamp,
2126         .net_header_len =       sizeof(struct iphdr),
2127         .setsockopt     =       ip_setsockopt,
2128         .getsockopt     =       ip_getsockopt,
2129         .addr2sockaddr  =       v4_addr2sockaddr,
2130         .sockaddr_len   =       sizeof(struct sockaddr_in),
2131 };
2132
2133 /* NOTE: A lot of things are set to zero explicitly by the call to
2134  *       sk_alloc(), so they need not be done here.
2135  */
2136 static int tcp_v4_init_sock(struct sock *sk)
2137 {
2138         struct tcp_opt *tp = tcp_sk(sk);
2139
2140         skb_queue_head_init(&tp->out_of_order_queue);
2141         tcp_init_xmit_timers(sk);
2142         tcp_prequeue_init(tp);
2143
2144         tp->rto  = TCP_TIMEOUT_INIT;
2145         tp->mdev = TCP_TIMEOUT_INIT;
2146
2147         /* So many TCP implementations out there (incorrectly) count the
2148          * initial SYN frame in their delayed-ACK and congestion control
2149          * algorithms that we must have the following bandaid to talk
2150          * efficiently to them.  -DaveM
2151          */
2152         tp->snd_cwnd = 2;
2153
2154         /* See draft-stevens-tcpca-spec-01 for discussion of the
2155          * initialization of these values.
2156          */
2157         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2158         tp->snd_cwnd_clamp = ~0;
2159         tp->mss_cache = 536;
2160
2161         tp->reordering = sysctl_tcp_reordering;
2162
2163         sk->sk_state = TCP_CLOSE;
2164
2165         sk->sk_write_space = sk_stream_write_space;
2166         sk->sk_use_write_queue = 1;
2167
2168         tp->af_specific = &ipv4_specific;
2169
2170         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2171         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2172
2173         atomic_inc(&tcp_sockets_allocated);
2174
2175         return 0;
2176 }
2177
2178 int tcp_v4_destroy_sock(struct sock *sk)
2179 {
2180         struct tcp_opt *tp = tcp_sk(sk);
2181
2182         tcp_clear_xmit_timers(sk);
2183
2184         /* Clean up the write buffer. */
2185         sk_stream_writequeue_purge(sk);
2186
2187         /* Cleans up our, hopefully empty, out_of_order_queue. */
2188         __skb_queue_purge(&tp->out_of_order_queue);
2189
2190         /* Clean the prequeue; it really must be empty */
2191         __skb_queue_purge(&tp->ucopy.prequeue);
2192
2193         /* Clean up a referenced TCP bind bucket. */
2194         if (tp->bind_hash)
2195                 tcp_put_port(sk);
2196
2197         /*
2198          * If sendmsg cached page exists, toss it.
2199          */
2200         if (sk->sk_sndmsg_page) {
2201                 __free_page(sk->sk_sndmsg_page);
2202                 sk->sk_sndmsg_page = NULL;
2203         }
2204
2205         atomic_dec(&tcp_sockets_allocated);
2206
2207         return 0;
2208 }
2209
2210 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2211
2212 #ifdef CONFIG_PROC_FS
2213 /* Proc filesystem TCP sock list dumping. */
2214
2215 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2216 {
2217         return hlist_empty(head) ? NULL :
2218                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2219 }
2220
2221 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2222 {
2223         return tw->tw_node.next ?
2224                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2225 }
2226
2227 static void *listening_get_next(struct seq_file *seq, void *cur)
2228 {
2229         struct tcp_opt *tp;
2230         struct hlist_node *node;
2231         struct sock *sk = cur;
2232         struct tcp_iter_state* st = seq->private;
2233
2234         if (!sk) {
2235                 st->bucket = 0;
2236                 sk = sk_head(&tcp_listening_hash[0]);
2237                 goto get_sk;
2238         }
2239
2240         ++st->num;
2241
2242         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2243                 struct open_request *req = cur;
2244
2245                 tp = tcp_sk(st->syn_wait_sk);
2246                 req = req->dl_next;
2247                 while (1) {
2248                         while (req) {
2249                                 vxdprintk(VXD_CBIT(net, 6),
2250                                         "sk,req: %p [#%d] (from %d)",
2251                                         req->sk, req->sk->sk_xid, current->xid);
2252                                 if (!vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH))
2253                                         continue;
2254                                 if (req->class->family == st->family) {
2255                                         cur = req;
2256                                         goto out;
2257                                 }
2258                                 req = req->dl_next;
2259                         }
2260                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2261                                 break;
2262 get_req:
2263                         req = tp->listen_opt->syn_table[st->sbucket];
2264                 }
2265                 sk        = sk_next(st->syn_wait_sk);
2266                 st->state = TCP_SEQ_STATE_LISTENING;
2267                 read_unlock_bh(&tp->syn_wait_lock);
2268         } else
2269                 sk = sk_next(sk);
2270 get_sk:
2271         sk_for_each_from(sk, node) {
2272                 vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
2273                         sk, sk->sk_xid, current->xid);
2274                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2275                         continue;
2276                 if (sk->sk_family == st->family) {
2277                         cur = sk;
2278                         goto out;
2279                 }
2280                 tp = tcp_sk(sk);
2281                 read_lock_bh(&tp->syn_wait_lock);
2282                 if (tp->listen_opt && tp->listen_opt->qlen) {
2283                         st->uid         = sock_i_uid(sk);
2284                         st->syn_wait_sk = sk;
2285                         st->state       = TCP_SEQ_STATE_OPENREQ;
2286                         st->sbucket     = 0;
2287                         goto get_req;
2288                 }
2289                 read_unlock_bh(&tp->syn_wait_lock);
2290         }
2291         if (++st->bucket < TCP_LHTABLE_SIZE) {
2292                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2293                 goto get_sk;
2294         }
2295         cur = NULL;
2296 out:
2297         return cur;
2298 }
2299
2300 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2301 {
2302         void *rc = listening_get_next(seq, NULL);
2303
2304         while (rc && *pos) {
2305                 rc = listening_get_next(seq, rc);
2306                 --*pos;
2307         }
2308         return rc;
2309 }
2310
2311 static void *established_get_first(struct seq_file *seq)
2312 {
2313         struct tcp_iter_state* st = seq->private;
2314         void *rc = NULL;
2315
2316         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2317                 struct sock *sk;
2318                 struct hlist_node *node;
2319                 struct tcp_tw_bucket *tw;
2320                
2321                 read_lock(&tcp_ehash[st->bucket].lock);
2322                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2323                         vxdprintk(VXD_CBIT(net, 6),
2324                                 "sk,egf: %p [#%d] (from %d)",
2325                                 sk, sk->sk_xid, current->xid);
2326                         if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2327                                 continue;
2328                         if (sk->sk_family != st->family)
2329                                 continue;
2330                         rc = sk;
2331                         goto out;
2332                 }
2333                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2334                 tw_for_each(tw, node,
2335                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2336                         vxdprintk(VXD_CBIT(net, 6),
2337                                 "tw: %p [#%d] (from %d)",
2338                                 tw, tw->tw_xid, current->xid);
2339                         if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
2340                                 continue;
2341                         if (tw->tw_family != st->family)
2342                                 continue;
2343                         rc = tw;
2344                         goto out;
2345                 }
2346                 read_unlock(&tcp_ehash[st->bucket].lock);
2347                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2348         }
2349 out:
2350         return rc;
2351 }
2352
2353 static void *established_get_next(struct seq_file *seq, void *cur)
2354 {
2355         struct sock *sk = cur;
2356         struct tcp_tw_bucket *tw;
2357         struct hlist_node *node;
2358         struct tcp_iter_state* st = seq->private;
2359
2360         ++st->num;
2361
2362         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2363                 tw = cur;
2364                 tw = tw_next(tw);
2365 get_tw:
2366                 while (tw && (tw->tw_family != st->family ||
2367                         !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
2368                         tw = tw_next(tw);
2369                 }
2370                 if (tw) {
2371                         cur = tw;
2372                         goto out;
2373                 }
2374                 read_unlock(&tcp_ehash[st->bucket].lock);
2375                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2376                 if (++st->bucket < tcp_ehash_size) {
2377                         read_lock(&tcp_ehash[st->bucket].lock);
2378                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2379                 } else {
2380                         cur = NULL;
2381                         goto out;
2382                 }
2383         } else
2384                 sk = sk_next(sk);
2385
2386         sk_for_each_from(sk, node) {
2387                 vxdprintk(VXD_CBIT(net, 6),
2388                         "sk,egn: %p [#%d] (from %d)",
2389                         sk, sk->sk_xid, current->xid);
2390                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2391                         continue;
2392                 if (sk->sk_family == st->family)
2393                         goto found;
2394         }
2395
2396         st->state = TCP_SEQ_STATE_TIME_WAIT;
2397         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2398         goto get_tw;
2399 found:
2400         cur = sk;
2401 out:
2402         return cur;
2403 }
2404
2405 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2406 {
2407         void *rc = established_get_first(seq);
2408
2409         while (rc && pos) {
2410                 rc = established_get_next(seq, rc);
2411                 --pos;
2412         }               
2413         return rc;
2414 }
2415
2416 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2417 {
2418         void *rc;
2419         struct tcp_iter_state* st = seq->private;
2420
2421         tcp_listen_lock();
2422         st->state = TCP_SEQ_STATE_LISTENING;
2423         rc        = listening_get_idx(seq, &pos);
2424
2425         if (!rc) {
2426                 tcp_listen_unlock();
2427                 local_bh_disable();
2428                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2429                 rc        = established_get_idx(seq, pos);
2430         }
2431
2432         return rc;
2433 }
2434
2435 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2436 {
2437         struct tcp_iter_state* st = seq->private;
2438         st->state = TCP_SEQ_STATE_LISTENING;
2439         st->num = 0;
2440         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2441 }
2442
2443 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2444 {
2445         void *rc = NULL;
2446         struct tcp_iter_state* st;
2447
2448         if (v == SEQ_START_TOKEN) {
2449                 rc = tcp_get_idx(seq, 0);
2450                 goto out;
2451         }
2452         st = seq->private;
2453
2454         switch (st->state) {
2455         case TCP_SEQ_STATE_OPENREQ:
2456         case TCP_SEQ_STATE_LISTENING:
2457                 rc = listening_get_next(seq, v);
2458                 if (!rc) {
2459                         tcp_listen_unlock();
2460                         local_bh_disable();
2461                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2462                         rc        = established_get_first(seq);
2463                 }
2464                 break;
2465         case TCP_SEQ_STATE_ESTABLISHED:
2466         case TCP_SEQ_STATE_TIME_WAIT:
2467                 rc = established_get_next(seq, v);
2468                 break;
2469         }
2470 out:
2471         ++*pos;
2472         return rc;
2473 }
2474
2475 static void tcp_seq_stop(struct seq_file *seq, void *v)
2476 {
2477         struct tcp_iter_state* st = seq->private;
2478
2479         switch (st->state) {
2480         case TCP_SEQ_STATE_OPENREQ:
2481                 if (v) {
2482                         struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2483                         read_unlock_bh(&tp->syn_wait_lock);
2484                 }
2485         case TCP_SEQ_STATE_LISTENING:
2486                 if (v != SEQ_START_TOKEN)
2487                         tcp_listen_unlock();
2488                 break;
2489         case TCP_SEQ_STATE_TIME_WAIT:
2490         case TCP_SEQ_STATE_ESTABLISHED:
2491                 if (v)
2492                         read_unlock(&tcp_ehash[st->bucket].lock);
2493                 local_bh_enable();
2494                 break;
2495         }
2496 }
2497
2498 static int tcp_seq_open(struct inode *inode, struct file *file)
2499 {
2500         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2501         struct seq_file *seq;
2502         struct tcp_iter_state *s;
2503         int rc;
2504
2505         if (unlikely(afinfo == NULL))
2506                 return -EINVAL;
2507
2508         s = kmalloc(sizeof(*s), GFP_KERNEL);
2509         if (!s)
2510                 return -ENOMEM;
2511         memset(s, 0, sizeof(*s));
2512         s->family               = afinfo->family;
2513         s->seq_ops.start        = tcp_seq_start;
2514         s->seq_ops.next         = tcp_seq_next;
2515         s->seq_ops.show         = afinfo->seq_show;
2516         s->seq_ops.stop         = tcp_seq_stop;
2517
2518         rc = seq_open(file, &s->seq_ops);
2519         if (rc)
2520                 goto out_kfree;
2521         seq          = file->private_data;
2522         seq->private = s;
2523 out:
2524         return rc;
2525 out_kfree:
2526         kfree(s);
2527         goto out;
2528 }
2529
2530 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2531 {
2532         int rc = 0;
2533         struct proc_dir_entry *p;
2534
2535         if (!afinfo)
2536                 return -EINVAL;
2537         afinfo->seq_fops->owner         = afinfo->owner;
2538         afinfo->seq_fops->open          = tcp_seq_open;
2539         afinfo->seq_fops->read          = seq_read;
2540         afinfo->seq_fops->llseek        = seq_lseek;
2541         afinfo->seq_fops->release       = seq_release_private;
2542         
2543         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2544         if (p)
2545                 p->data = afinfo;
2546         else
2547                 rc = -ENOMEM;
2548         return rc;
2549 }
2550
2551 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2552 {
2553         if (!afinfo)
2554                 return;
2555         proc_net_remove(afinfo->name);
2556         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2557 }
2558
2559 static void get_openreq4(struct sock *sk, struct open_request *req,
2560                          char *tmpbuf, int i, int uid)
2561 {
2562         int ttd = req->expires - jiffies;
2563
2564         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2565                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2566                 i,
2567                 req->af.v4_req.loc_addr,
2568                 ntohs(inet_sk(sk)->sport),
2569                 req->af.v4_req.rmt_addr,
2570                 ntohs(req->rmt_port),
2571                 TCP_SYN_RECV,
2572                 0, 0, /* could print option size, but that is af dependent. */
2573                 1,    /* timers active (only the expire timer) */
2574                 jiffies_to_clock_t(ttd),
2575                 req->retrans,
2576                 uid,
2577                 0,  /* non standard timer */
2578                 0, /* open_requests have no inode */
2579                 atomic_read(&sk->sk_refcnt),
2580                 req);
2581 }
2582
2583 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2584 {
2585         int timer_active;
2586         unsigned long timer_expires;
2587         struct tcp_opt *tp = tcp_sk(sp);
2588         struct inet_opt *inet = inet_sk(sp);
2589         unsigned int dest = inet->daddr;
2590         unsigned int src = inet->rcv_saddr;
2591         __u16 destp = ntohs(inet->dport);
2592         __u16 srcp = ntohs(inet->sport);
2593
2594         if (tp->pending == TCP_TIME_RETRANS) {
2595                 timer_active    = 1;
2596                 timer_expires   = tp->timeout;
2597         } else if (tp->pending == TCP_TIME_PROBE0) {
2598                 timer_active    = 4;
2599                 timer_expires   = tp->timeout;
2600         } else if (timer_pending(&sp->sk_timer)) {
2601                 timer_active    = 2;
2602                 timer_expires   = sp->sk_timer.expires;
2603         } else {
2604                 timer_active    = 0;
2605                 timer_expires = jiffies;
2606         }
2607
2608         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2609                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2610                 i, src, srcp, dest, destp, sp->sk_state,
2611                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2612                 timer_active,
2613                 jiffies_to_clock_t(timer_expires - jiffies),
2614                 tp->retransmits,
2615                 sock_i_uid(sp),
2616                 tp->probes_out,
2617                 sock_i_ino(sp),
2618                 atomic_read(&sp->sk_refcnt), sp,
2619                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2620                 tp->snd_cwnd,
2621                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2622 }
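/* Each row printed above matches the header emitted in tcp4_seq_show():
 * addresses and ports are hexadecimal (e.g. a listener on 0.0.0.0:22 shows
 * as 00000000:0016) and "st" is the numeric TCP state (TCP_LISTEN == 0A).
 */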
2623
2624 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2625 {
2626         unsigned int dest, src;
2627         __u16 destp, srcp;
2628         int ttd = tw->tw_ttd - jiffies;
2629
2630         if (ttd < 0)
2631                 ttd = 0;
2632
2633         dest  = tw->tw_daddr;
2634         src   = tw->tw_rcv_saddr;
2635         destp = ntohs(tw->tw_dport);
2636         srcp  = ntohs(tw->tw_sport);
2637
2638         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2639                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2640                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2641                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2642                 atomic_read(&tw->tw_refcnt), tw);
2643 }
2644
2645 #define TMPSZ 150
2646
2647 static int tcp4_seq_show(struct seq_file *seq, void *v)
2648 {
2649         struct tcp_iter_state* st;
2650         char tmpbuf[TMPSZ + 1];
2651
2652         if (v == SEQ_START_TOKEN) {
2653                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2654                            "  sl  local_address rem_address   st tx_queue "
2655                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2656                            "inode");
2657                 goto out;
2658         }
2659         st = seq->private;
2660
2661         switch (st->state) {
2662         case TCP_SEQ_STATE_LISTENING:
2663         case TCP_SEQ_STATE_ESTABLISHED:
2664                 get_tcp4_sock(v, tmpbuf, st->num);
2665                 break;
2666         case TCP_SEQ_STATE_OPENREQ:
2667                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2668                 break;
2669         case TCP_SEQ_STATE_TIME_WAIT:
2670                 get_timewait4_sock(v, tmpbuf, st->num);
2671                 break;
2672         }
2673         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2674 out:
2675         return 0;
2676 }
2677
2678 static struct file_operations tcp4_seq_fops;
2679 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2680         .owner          = THIS_MODULE,
2681         .name           = "tcp",
2682         .family         = AF_INET,
2683         .seq_show       = tcp4_seq_show,
2684         .seq_fops       = &tcp4_seq_fops,
2685 };
2686
2687 int __init tcp4_proc_init(void)
2688 {
2689         return tcp_proc_register(&tcp4_seq_afinfo);
2690 }
2691
2692 void tcp4_proc_exit(void)
2693 {
2694         tcp_proc_unregister(&tcp4_seq_afinfo);
2695 }
2696 #endif /* CONFIG_PROC_FS */
2697
2698 struct proto tcp_prot = {
2699         .name                   = "TCP",
2700         .close                  = tcp_close,
2701         .connect                = tcp_v4_connect,
2702         .disconnect             = tcp_disconnect,
2703         .accept                 = tcp_accept,
2704         .ioctl                  = tcp_ioctl,
2705         .init                   = tcp_v4_init_sock,
2706         .destroy                = tcp_v4_destroy_sock,
2707         .shutdown               = tcp_shutdown,
2708         .setsockopt             = tcp_setsockopt,
2709         .getsockopt             = tcp_getsockopt,
2710         .sendmsg                = tcp_sendmsg,
2711         .recvmsg                = tcp_recvmsg,
2712         .backlog_rcv            = tcp_v4_do_rcv,
2713         .hash                   = tcp_v4_hash,
2714         .unhash                 = tcp_unhash,
2715         .get_port               = tcp_v4_get_port,
2716         .enter_memory_pressure  = tcp_enter_memory_pressure,
2717         .sockets_allocated      = &tcp_sockets_allocated,
2718         .memory_allocated       = &tcp_memory_allocated,
2719         .memory_pressure        = &tcp_memory_pressure,
2720         .sysctl_mem             = sysctl_tcp_mem,
2721         .sysctl_wmem            = sysctl_tcp_wmem,
2722         .sysctl_rmem            = sysctl_tcp_rmem,
2723         .max_header             = MAX_TCP_HEADER,
2724 };
2725
2726
2727
2728 void __init tcp_v4_init(struct net_proto_family *ops)
2729 {
2730         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2731         if (err < 0)
2732                 panic("Failed to create the TCP control socket.\n");
2733         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2734         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2735
2736         /* Unhash it so that IP input processing does not even
2737          * see it; we do not wish this socket to see incoming
2738          * packets.
2739          */
2740         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2741 }
2742
2743 EXPORT_SYMBOL(ipv4_specific);
2744 EXPORT_SYMBOL(tcp_bind_hash);
2745 EXPORT_SYMBOL(tcp_bucket_create);
2746 EXPORT_SYMBOL(tcp_hashinfo);
2747 EXPORT_SYMBOL(tcp_inherit_port);
2748 EXPORT_SYMBOL(tcp_listen_wlock);
2749 EXPORT_SYMBOL(tcp_port_rover);
2750 EXPORT_SYMBOL(tcp_prot);
2751 EXPORT_SYMBOL(tcp_put_port);
2752 EXPORT_SYMBOL(tcp_unhash);
2753 EXPORT_SYMBOL(tcp_v4_conn_request);
2754 EXPORT_SYMBOL(tcp_v4_connect);
2755 EXPORT_SYMBOL(tcp_v4_do_rcv);
2756 EXPORT_SYMBOL(tcp_v4_lookup_listener);
2757 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2758 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2759 EXPORT_SYMBOL(tcp_v4_send_check);
2760 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2761
2762 #ifdef CONFIG_PROC_FS
2763 EXPORT_SYMBOL(tcp_proc_register);
2764 EXPORT_SYMBOL(tcp_proc_unregister);
2765 #endif
2766 #ifdef CONFIG_SYSCTL
2767 EXPORT_SYMBOL(sysctl_local_port_range);
2768 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2769 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2770 #endif