/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
		= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000.
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}
static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
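/*
 * Illustration only, not part of the kernel source: a minimal userspace
 * sketch of the XOR-fold hash above, assuming a power-of-two table size
 * (EHASH_SIZE is a made-up stand-in for tcp_ehash_size).  The two shifts
 * fold the high bits down so that the ports, which only occupy the low
 * 16 bits, still influence every bucket bit.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>
#include <stdio.h>

#define EHASH_SIZE 512			/* must be a power of two */

static int demo_hashfn(uint32_t laddr, uint16_t lport,
		       uint32_t faddr, uint16_t fport)
{
	uint32_t h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;			/* fold the top half in */
	h ^= h >> 8;			/* and once more */
	return (int)(h & (EHASH_SIZE - 1));	/* mask instead of modulo */
}

int main(void)
{
	/* 10.0.0.1:80 <-> 10.0.0.2:12345, addresses as host-order words */
	printf("bucket=%d\n",
	       demo_hashfn(0x0a000001, 80, 0x0a000002, 12345));
	return 0;
}
#endif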
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}
/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}
/*
 *	Return 1 if addr matches the socket's IP list,
 *	or if the socket is bound to INADDR_ANY.
 */
static inline int tcp_in_list(struct sock *sk, u32 addr)
{
	struct nx_info *nxi = sk->sk_nx_info;

	vxdprintk("tcp_in_list(%p) %p,%p;%lx\n",
		sk, nxi, sk->sk_socket,
		(sk->sk_socket ? sk->sk_socket->flags : 0));

	if (nxi) {
		int n = nxi->nbipv4;
		int i;

		for (i = 0; i < n; i++)
			if (nxi->ipv4[i] == addr)
				return 1;
	}
	else if (!tcp_v4_rcv_saddr(sk) || tcp_v4_rcv_saddr(sk) == addr)
		return 1;
	return 0;
}

/*
 *	Check if the addresses in sk1 conflict with those in sk2.
 */
int tcp_ipv4_addr_conflict(struct sock *sk1, struct sock *sk2)
{
	nxdprintk("inet_bind(%p,%p) %p,%p;%lx %p,%p;%lx\n",
		sk1, sk2,
		sk1->sk_nx_info, sk1->sk_socket,
		(sk1->sk_socket ? sk1->sk_socket->flags : 0),
		sk2->sk_nx_info, sk2->sk_socket,
		(sk2->sk_socket ? sk2->sk_socket->flags : 0));

	if (tcp_v4_rcv_saddr(sk1)) {
		/* Bind to one address only */
		return tcp_in_list(sk2, tcp_v4_rcv_saddr(sk1));
	} else if (sk1->sk_nx_info) {
		/* A restricted bind(any) */
		struct nx_info *nxi = sk1->sk_nx_info;
		int n = nxi->nbipv4;
		int i;

		for (i = 0; i < n; i++)
			if (tcp_in_list(sk2, nxi->ipv4[i]))
				return 1;
	} else	/* A bind(any) does not allow another bind on the same port */
		return 1;
	return 0;
}
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				if (tcp_ipv4_addr_conflict(sk, sk2))
					break;
			}
		}
	}
	return node != NULL;
}
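/*
 * Illustration only: the bind-conflict rule above, reduced to a
 * self-contained sketch that assumes two plain IPv4 sockets with no
 * device binding and no network contexts.  Two owners of the same port
 * may coexist when both set SO_REUSEADDR and the existing owner is not
 * listening, or when their bound addresses simply do not overlap.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

struct demo_sock {
	uint32_t rcv_saddr;	/* 0 means INADDR_ANY */
	int	 reuse;		/* SO_REUSEADDR set? */
	int	 listening;
};

static int demo_bind_conflict(const struct demo_sock *a,
			      const struct demo_sock *b)
{
	/* A wildcard overlaps everything; otherwise exact match required. */
	int addr_overlap = !a->rcv_saddr || !b->rcv_saddr ||
			   a->rcv_saddr == b->rcv_saddr;

	if (!addr_overlap)
		return 0;
	/* SO_REUSEADDR on both sides rescues the overlap, unless the
	 * existing owner is a listener. */
	if (a->reuse && b->reuse && !b->listening)
		return 0;
	return 1;
}
#endif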
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and a reader wakes them up, all but one
 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */

void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}
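/*
 * Illustration only: a minimal userspace analogue of the exclusive-wakeup
 * pattern above, assuming POSIX threads.  pthread_cond_signal() wakes a
 * single waiter, which is exactly what prepare_to_wait_exclusive() buys
 * the kernel: a release does not stampede every sleeping writer at once.
 */
#if 0	/* standalone demo, compile separately with -lpthread */
#include <pthread.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  demo_wait = PTHREAD_COND_INITIALIZER;
static int demo_users;			/* analogue of tcp_lhash_users */

static void demo_writer_enter(void)
{
	pthread_mutex_lock(&demo_lock);
	while (demo_users)		/* wait until readers drain */
		pthread_cond_wait(&demo_wait, &demo_lock);
	/* ... exclusive section ... */
	pthread_mutex_unlock(&demo_lock);
}

static void demo_reader_exit(void)
{
	pthread_mutex_lock(&demo_lock);
	if (--demo_users == 0)
		pthread_cond_signal(&demo_wait);	/* wake ONE writer */
	pthread_mutex_unlock(&demo_lock);
}
#endif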
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}
void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

 ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
/*
 *	Check if an address is in the list.
 */
static inline int tcp_addr_in_list(
	u32 rcv_saddr,
	u32 daddr,
	struct nx_info *nx_info)
{
	if (rcv_saddr == daddr)
		return 1;
	else if (rcv_saddr == 0) {
		/* Accept any address or check the list */
		if (!nx_info)
			return 1;
		else {
			int n = nx_info->nbipv4;
			int i;

			for (i = 0; i < n; i++)
				if (nx_info->ipv4[i] == daddr)
					return 1;
		}
	}
	return 0;
}
/* Don't inline this cruft. Here are some nice properties to
 * exploit here. The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection. So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore;

	hiscore = 0;
	sk_for_each(sk, node, head) {
		struct inet_opt *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (tcp_addr_in_list(rcv_saddr, daddr, sk->sk_nx_info))
				score += 2;
			else
				continue;
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore	= score;
				result	= sk;
			}
		}
	}
	return result;
}
/* Optimize the common listener case. */
inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
					   int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    tcp_addr_in_list(inet->rcv_saddr, daddr, sk->sk_nx_info) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);

	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
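/*
 * Illustration only: what the TCP_COMBINED_PORTS/TCP_V4_ADDR_COOKIE trick
 * buys the lookup above, sketched for userspace.  Packing both 16-bit
 * ports into one 32-bit word (and, on 64-bit builds, both addresses into
 * one 64-bit word) lets the 4-tuple comparison run as one or two word
 * compares instead of four separate field compares.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

static inline uint32_t demo_combined_ports(uint16_t sport, uint16_t dport)
{
	return ((uint32_t)sport << 16) | dport;	/* one word, two ports */
}

static inline int demo_match(uint64_t addr_cookie, uint32_t port_cookie,
			     uint64_t sk_addrs, uint32_t sk_ports)
{
	/* Two compares decide the whole 4-tuple. */
	return sk_addrs == addr_cookie && sk_ports == port_cookie;
}
#endif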
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   as state holder.

			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->ts_recent	    = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(TimeWaitRecycled);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(TimeWaitRecycled);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
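/*
 * Illustration only: why the reuse path above sets the new connection's
 * write_seq to tw_snd_nxt + 65535 + 2.  The old TIME-WAIT peer can hold
 * at most one window (at most 65535 bytes without window scaling) of
 * unacknowledged data, so starting that far past snd_nxt keeps the new
 * sequence space clear of stale segments.  A minimal sketch, assuming
 * ordinary 32-bit wraparound arithmetic:
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

static uint32_t demo_reuse_isn(uint32_t tw_snd_nxt)
{
	uint32_t isn = tw_snd_nxt + 65535 + 2;	/* skip one max window */

	if (isn == 0)		/* zero means "no saved state" elsewhere, */
		isn = 1;	/* so never hand it out as an ISN */
	return isn;
}
#endif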
/*
 * Bind a port for a connect operation and hash it.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();

		/* TODO. Actually it is not so bad idea to remove
		 * tcp_portalloc_lock before next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 *
		 * Now it protects single _advisory_ variable tcp_port_rover,
		 * hence it is mostly useless.
		 * Code will work nicely if we just delete it, but
		 * I am afraid in contented case it will work not better or
		 * even worse: another cpu just will hit the same bucket
		 * and spin there.
		 * So some cpu salt could remove both contention and
		 * memory pingpong. Any ideas how to do this in a nice way?
		 */
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == rover) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, rover);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize ts_recent from it, when trying new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32)rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}
static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

#ifdef CONFIG_ACCEPT_QUEUES
	tcp_synq_added(sk, req);
#else
	tcp_synq_added(sk);
#endif
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_pmtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
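/*
 * Illustration only: the relation tcp_sync_mss() maintains between PMTU
 * and MSS, reduced to its core.  A minimal sketch, assuming plain
 * 20-byte IPv4 and TCP headers and ignoring options and offload caps.
 */
#if 0	/* standalone demo, compile separately */
static int demo_mss_from_pmtu(int pmtu)
{
	const int ip_hdr_len  = 20;	/* no IP options */
	const int tcp_hdr_len = 20;	/* no TCP options */

	/* e.g. a 1500-byte Ethernet PMTU yields the familiar 1460 MSS */
	return pmtu - ip_hdr_len - tcp_hdr_len;
}
#endif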
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_opt *tp;
	struct inet_opt *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LockDroppedIcmps);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(OutOfWindowIcmps);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* This is deprecated, but if someone generated it,
		 * we have no reasons to ignore it.
		 */
		if (!sock_owned_by_user(sk))
			tcp_enter_cwr(tp);
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(OutOfWindowIcmps);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TcpAttemptFails);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_opt *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So that we build reply only basing on parameters
 *	arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TcpOutSegs);
	TCP_INC_STATS_BH(TcpOutRsts);
}
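/*
 * Illustration only: the RST sequence-number rule from RFC 793 that the
 * if/else above implements, as a standalone sketch.  If the offending
 * segment carried an ACK, the RST borrows its ack_seq as our seq;
 * otherwise we ACK everything the segment occupied (SYN and FIN each
 * count for one sequence number).
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

struct demo_rst { uint32_t seq, ack_seq; int ack; };

static struct demo_rst demo_rst_numbers(int had_ack, uint32_t seg_seq,
					uint32_t seg_ack, int syn, int fin,
					uint32_t payload_len)
{
	struct demo_rst r = { 0, 0, 0 };

	if (had_ack) {
		r.seq = seg_ack;	/* <SEQ=SEG.ACK><CTL=RST> */
	} else {
		r.ack = 1;		/* <SEQ=0><ACK=SEG.SEQ+SEG.LEN> */
		r.ack_seq = seg_seq + syn + fin + payload_len;
	}
	return r;
}
#endif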
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TcpOutSegs);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  req->af.v4_req.rmt_addr),
					.saddr = req->af.v4_req.loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = req->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(OutNoRoutes);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(OutNoRoutes);
		return NULL;
	}
	return &rt->u.dst;
}
/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * It would be better to replace it with a global counter for all sockets,
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of the problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;

struct or_calltable or_ipv4 = {
	.family		=	PF_INET,
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_or_send_ack,
	.destructor	=	tcp_v4_or_free,
	.send_reset	=	tcp_v4_send_reset,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_ACCEPT_QUEUES
	int class = 0;
#endif
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

#ifdef CONFIG_ACCEPT_QUEUES
	class = (skb->nfmark <= 0) ? 0 :
		((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0 : skb->nfmark);
	/*
	 * Accept only if the class has shares set, or if the default class
	 * (i.e. class 0) has shares.
	 */
	if (!(tcp_sk(sk)->acceptq[class].aq_valid)) {
		if (tcp_sk(sk)->acceptq[0].aq_valid)
			class = 0;
		else
			goto drop;
	}
#endif

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
#ifdef CONFIG_ACCEPT_QUEUES
	if (tcp_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
#else
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
#endif
		goto drop;

	req = tcp_openreq_alloc();
	if (!req)
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
	tp.user_mss = tcp_sk(sk)->user_mss;

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

	if (tp.saw_tstamp && !tp.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
		tp.tstamp_ok  = 0;
	}
	tp.tstamp_ok = tp.saw_tstamp;

	tcp_openreq_init(req, &tp, skb);
#ifdef CONFIG_ACCEPT_QUEUES
	req->acceptq_class = class;
	req->acceptq_time_stamp = jiffies;
#endif
	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(PAWSPassiveRejected);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
				 printk(KERN_DEBUG "TCP: drop open "
						   "request from %u.%u."
						   "%u.%u/%u\n", \
					NIPQUAD(saddr),
					ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TcpAttemptFails);
	return 0;
}
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct inet_opt *newinet;
	struct tcp_opt *newtp;
	struct sock *newsk;

#ifdef CONFIG_ACCEPT_QUEUES
	if (tcp_acceptq_is_full(sk, req->acceptq_class))
#else
	if (sk_acceptq_is_full(sk))
#endif
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newtp->ext2_header_len = dst->header_len;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_pmtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(ListenOverflows);
exit:
	NET_INC_STATS_BH(ListenDrops);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_opt *tp = tcp_sk(sk);
	struct sock *nsk;
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
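/*
 * Illustration only: the TCP checksum over the IPv4 pseudo header that
 * tcp_v4_check()/csum_tcpudp_nofold() compute above, written out as a
 * portable userspace function.  Assumes the segment bytes are already in
 * network order and the addresses are passed as plain 32-bit values.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>
#include <stddef.h>

static uint16_t demo_tcp_checksum(uint32_t saddr, uint32_t daddr,
				  const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* Pseudo header: src, dst, zero+protocol, TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;			/* IPPROTO_TCP */
	sum += len;

	/* TCP header + payload, 16 bits at a time. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += seg[len - 1] << 8;	/* pad odd byte with zero */

	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif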
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TcpInErrs);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TcpInSegs);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TcpInErrs);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TcpInErrs);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* With per-bucket locks this operation is not-atomic, so that
 * this version is not worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
}

static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->sk_localroute,
			       sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, inet->dport, sk);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	{
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = inet->saddr,
						.tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
				    .uli_u = { .ports =
					       { .sport = inet->sport,
						 .dport = inet->dport } } };

		err = ip_route_output_flow(&rt, &fl, sk, 0);
	}
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
		tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
		return 0;
	}

	/* Routing failed... */
	sk->sk_route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;

	return err;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_opt *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->ts_recent_stamp;
			peer->tcp_ts = tp->ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
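/*
 * Illustration only: the signed-difference trick used by both functions
 * above.  TCP timestamps wrap, so "newer" is decided with 32-bit serial
 * arithmetic: cast the difference to a signed type and test its sign.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

static int demo_ts_newer(uint32_t a, uint32_t b)
{
	/* 1 if a is strictly newer than b, wraparound-safe */
	return (int32_t)(a - b) > 0;
}
/* e.g. demo_ts_newer(0x00000002, 0xfffffffe) == 1 across the wrap */
#endif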
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto	 = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sk->sk_use_write_queue = 1;

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
static int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	tcp_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tcp_sk(sk)->bind_hash)
		tcp_put_port(sk);

	/* If sendmsg cached page exists, toss it. */
	if (inet_sk(sk)->sndmsg_page)
		__free_page(inet_sk(sk)->sndmsg_page);

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_opt *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if ((!req->sk ||
				     vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH)) &&
				    req->class->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen)
			goto start_req;
		read_unlock_bh(&tp->syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->syn_wait_lock);
	}
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
				continue;
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
				continue;
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family ||
			      !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family		= afinfo->family;
	s->seq_ops.start	= tcp_seq_start;
	s->seq_ops.next		= tcp_seq_next;
	s->seq_ops.show		= afinfo->seq_show;
	s->seq_ops.stop		= tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_opt *tp = tcp_sk(sp);
	struct inet_opt *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
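/*
 * Illustration only: the first fields of one /proc/net/tcp line emitted
 * by the show functions above, parsed back from userspace.  Addresses
 * and ports are printed as hex (%08X:%04X), so sscanf with %x recovers
 * them; the sketch assumes a buffer that already holds one line.
 */
#if 0	/* standalone demo, compile separately */
#include <stdio.h>

static void demo_parse_tcp_line(const char *line)
{
	unsigned int slot, local, remote, state;
	unsigned int lport, rport;

	if (sscanf(line, "%u: %x:%x %x:%x %x",
		   &slot, &local, &lport, &remote, &rport, &state) == 6)
		printf("slot %u state %02X local port %u\n",
		       slot, state, lport);
}
#endif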
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name		=	"TCP",
	.close		=	tcp_close,
	.connect	=	tcp_v4_connect,
	.disconnect	=	tcp_disconnect,
	.accept		=	tcp_accept,
	.ioctl		=	tcp_ioctl,
	.init		=	tcp_v4_init_sock,
	.destroy	=	tcp_v4_destroy_sock,
	.shutdown	=	tcp_shutdown,
	.setsockopt	=	tcp_setsockopt,
	.getsockopt	=	tcp_getsockopt,
	.sendmsg	=	tcp_sendmsg,
	.recvmsg	=	tcp_recvmsg,
	.backlog_rcv	=	tcp_v4_do_rcv,
	.hash		=	tcp_v4_hash,
	.unhash		=	tcp_unhash,
	.get_port	=	tcp_v4_get_port,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
#endif