[linux-2.6.git] net/ipv4/tcp_ipv4.c  (vserver 1.9.5.x5)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      open_request handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/vserver/debug.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
94         .__tcp_lhash_users      =       ATOMIC_INIT(0),
95         .__tcp_lhash_wait
96           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106 int tcp_port_rover = 1024 - 1;
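/*
 * Editor's note (the knob name is an assumption, it is not spelled out in
 * this file): sysctl_local_port_range is what sits behind the
 * net.ipv4.ip_local_port_range sysctl, e.g.
 *
 *      echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * tcp_port_rover remembers where the last ephemeral-port search stopped,
 * so successive searches in tcp_v4_get_port() do not always start from the
 * bottom of the range.
 */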
107
108 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
109                                  __u32 faddr, __u16 fport)
110 {
111         int h = (laddr ^ lport) ^ (faddr ^ fport);
112         h ^= h >> 16;
113         h ^= h >> 8;
114         return h & (tcp_ehash_size - 1);
115 }
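/*
 * Editor's sketch (hypothetical, not part of the kernel sources): the same
 * XOR-and-fold scheme as tcp_hashfn() above in self-contained form, assuming
 * an example table of EXAMPLE_EHASH_SIZE buckets (a power of two, as
 * tcp_ehash_size is).
 */
#define EXAMPLE_EHASH_SIZE 512

static inline int example_ehashfn(__u32 laddr, __u16 lport,
                                  __u32 faddr, __u16 fport)
{
        int h = (laddr ^ lport) ^ (faddr ^ fport);

        h ^= h >> 16;   /* fold the top 16 bits into the bottom 16 */
        h ^= h >> 8;    /* fold again so every input bit can affect the index */
        return h & (EXAMPLE_EHASH_SIZE - 1);    /* power-of-two mask, no modulo */
}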
116
117 static __inline__ int tcp_sk_hashfn(struct sock *sk)
118 {
119         struct inet_sock *inet = inet_sk(sk);
120         __u32 laddr = inet->rcv_saddr;
121         __u16 lport = inet->num;
122         __u32 faddr = inet->daddr;
123         __u16 fport = inet->dport;
124
125         return tcp_hashfn(laddr, lport, faddr, fport);
126 }
127
128 /* Allocate and initialize a new TCP local port bind bucket.
129  * The bindhash mutex for snum's hash chain must be held here.
130  */
131 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
132                                           unsigned short snum)
133 {
134         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
135                                                       SLAB_ATOMIC);
136         if (tb) {
137                 tb->port = snum;
138                 tb->fastreuse = 0;
139                 INIT_HLIST_HEAD(&tb->owners);
140                 hlist_add_head(&tb->node, &head->chain);
141         }
142         return tb;
143 }
144
145 /* Caller must hold hashbucket lock for this tb with local BH disabled */
146 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
147 {
148         if (hlist_empty(&tb->owners)) {
149                 __hlist_del(&tb->node);
150                 kmem_cache_free(tcp_bucket_cachep, tb);
151         }
152 }
153
154 /* Caller must disable local BH processing. */
155 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
156 {
157         struct tcp_bind_hashbucket *head =
158                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
159         struct tcp_bind_bucket *tb;
160
161         spin_lock(&head->lock);
162         tb = tcp_sk(sk)->bind_hash;
163         sk_add_bind_node(child, &tb->owners);
164         tcp_sk(child)->bind_hash = tb;
165         spin_unlock(&head->lock);
166 }
167
168 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
169 {
170         local_bh_disable();
171         __tcp_inherit_port(sk, child);
172         local_bh_enable();
173 }
174
175 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
176                    unsigned short snum)
177 {
178         inet_sk(sk)->num = snum;
179         sk_add_bind_node(sk, &tb->owners);
180         tcp_sk(sk)->bind_hash = tb;
181 }
182
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 {
185         struct sock *sk2;
186         struct hlist_node *node;
187         int reuse = sk->sk_reuse;
188
189         sk_for_each_bound(sk2, node, &tb->owners) {
190                 if (sk != sk2 &&
191                     !tcp_v6_ipv6only(sk2) &&
192                     (!sk->sk_bound_dev_if ||
193                      !sk2->sk_bound_dev_if ||
194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195                         if (!reuse || !sk2->sk_reuse ||
196                             sk2->sk_state == TCP_LISTEN) {
197                                 if (nx_addr_conflict(sk->sk_nx_info,
198                                         tcp_v4_rcv_saddr(sk), sk2))
199                                         break;
200                         }
201                 }
202         }
203         return node != NULL;
204 }
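/*
 * Editor's sketch (hypothetical, not from the kernel): the conflict rule
 * above reduced to a pure predicate over the fields it inspects. The vserver
 * nx_addr_conflict() step is simplified here to a plain rcv_saddr overlap
 * test, which is what the non-vserver kernel does.
 */
struct example_bind_id {
        __u32   rcv_saddr;      /* 0 means INADDR_ANY */
        int     bound_dev_if;   /* 0 means "any device" */
        int     reuse;          /* SO_REUSEADDR set */
        int     listening;      /* in TCP_LISTEN */
};

static inline int example_bind_conflict(const struct example_bind_id *sk,
                                        const struct example_bind_id *sk2)
{
        /* Sockets bound to different devices never conflict. */
        if (sk->bound_dev_if && sk2->bound_dev_if &&
            sk->bound_dev_if != sk2->bound_dev_if)
                return 0;
        /* Both ask for reuse and the existing owner is not listening: ok. */
        if (sk->reuse && sk2->reuse && !sk2->listening)
                return 0;
        /* Otherwise they conflict when their local addresses overlap. */
        return !sk->rcv_saddr || !sk2->rcv_saddr ||
               sk->rcv_saddr == sk2->rcv_saddr;
}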
205
206 /* Obtain a reference to a local port for the given sock,
207  * if snum is zero it means select any available local port.
208  */
209 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
210 {
211         struct tcp_bind_hashbucket *head;
212         struct hlist_node *node;
213         struct tcp_bind_bucket *tb;
214         int ret;
215
216         local_bh_disable();
217         if (!snum) {
218                 int low = sysctl_local_port_range[0];
219                 int high = sysctl_local_port_range[1];
220                 int remaining = (high - low) + 1;
221                 int rover;
222
223                 spin_lock(&tcp_portalloc_lock);
224                 rover = tcp_port_rover;
225                 do {
226                         rover++;
227                         if (rover < low || rover > high)
228                                 rover = low;
229                         head = &tcp_bhash[tcp_bhashfn(rover)];
230                         spin_lock(&head->lock);
231                         tb_for_each(tb, node, &head->chain)
232                                 if (tb->port == rover)
233                                         goto next;
234                         break;
235                 next:
236                         spin_unlock(&head->lock);
237                 } while (--remaining > 0);
238                 tcp_port_rover = rover;
239                 spin_unlock(&tcp_portalloc_lock);
240
241                 /* Exhausted local port range during search? */
242                 ret = 1;
243                 if (remaining <= 0)
244                         goto fail;
245
246                 /* OK, here is the one we will use.  HEAD is
247                  * non-NULL and we hold its lock.
248                  */
249                 snum = rover;
250         } else {
251                 head = &tcp_bhash[tcp_bhashfn(snum)];
252                 spin_lock(&head->lock);
253                 tb_for_each(tb, node, &head->chain)
254                         if (tb->port == snum)
255                                 goto tb_found;
256         }
257         tb = NULL;
258         goto tb_not_found;
259 tb_found:
260         if (!hlist_empty(&tb->owners)) {
261                 if (sk->sk_reuse > 1)
262                         goto success;
263                 if (tb->fastreuse > 0 &&
264                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
265                         goto success;
266                 } else {
267                         ret = 1;
268                         if (tcp_bind_conflict(sk, tb))
269                                 goto fail_unlock;
270                 }
271         }
272 tb_not_found:
273         ret = 1;
274         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
275                 goto fail_unlock;
276         if (hlist_empty(&tb->owners)) {
277                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
278                         tb->fastreuse = 1;
279                 else
280                         tb->fastreuse = 0;
281         } else if (tb->fastreuse &&
282                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
283                 tb->fastreuse = 0;
284 success:
285         if (!tcp_sk(sk)->bind_hash)
286                 tcp_bind_hash(sk, tb, snum);
287         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
288         ret = 0;
289
290 fail_unlock:
291         spin_unlock(&head->lock);
292 fail:
293         local_bh_enable();
294         return ret;
295 }
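/*
 * Editor's note (assumption about the callers, they are not visible in this
 * file): tcp_v4_get_port() is installed as the ->get_port hook of tcp_prot,
 * so it runs when a socket is explicitly bound with bind() and, for a still
 * unbound socket, when listen() has to pick a local port.
 */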
296
297 /* Get rid of any references to a local port held by the
298  * given sock.
299  */
300 static void __tcp_put_port(struct sock *sk)
301 {
302         struct inet_sock *inet = inet_sk(sk);
303         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
304         struct tcp_bind_bucket *tb;
305
306         spin_lock(&head->lock);
307         tb = tcp_sk(sk)->bind_hash;
308         __sk_del_bind_node(sk);
309         tcp_sk(sk)->bind_hash = NULL;
310         inet->num = 0;
311         tcp_bucket_destroy(tb);
312         spin_unlock(&head->lock);
313 }
314
315 void tcp_put_port(struct sock *sk)
316 {
317         local_bh_disable();
318         __tcp_put_port(sk);
319         local_bh_enable();
320 }
321
322 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP:
323  * when several writers sleep and a reader wakes them up, all but one
324  * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
325  * this, _but_ remember, it adds useless work on UP machines (a wakeup on each
326  * exclusive lock release). It should really be ifdefed.
327  */
328
329 void tcp_listen_wlock(void)
330 {
331         write_lock(&tcp_lhash_lock);
332
333         if (atomic_read(&tcp_lhash_users)) {
334                 DEFINE_WAIT(wait);
335
336                 for (;;) {
337                         prepare_to_wait_exclusive(&tcp_lhash_wait,
338                                                 &wait, TASK_UNINTERRUPTIBLE);
339                         if (!atomic_read(&tcp_lhash_users))
340                                 break;
341                         write_unlock_bh(&tcp_lhash_lock);
342                         schedule();
343                         write_lock_bh(&tcp_lhash_lock);
344                 }
345
346                 finish_wait(&tcp_lhash_wait, &wait);
347         }
348 }
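/*
 * Editor's note: the writer above spins on __tcp_lhash_users, sleeping on
 * __tcp_lhash_wait until the count drops to zero; readers of the listening
 * hash (tcp_listen_lock()/tcp_listen_unlock() in <net/tcp.h>, an assumption
 * about code outside this file) bump and drop that counter. The exclusive
 * wait keeps several blocked writers from stampeding on a single wakeup.
 */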
349
350 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
351 {
352         struct hlist_head *list;
353         rwlock_t *lock;
354
355         BUG_TRAP(sk_unhashed(sk));
356         if (listen_possible && sk->sk_state == TCP_LISTEN) {
357                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
358                 lock = &tcp_lhash_lock;
359                 tcp_listen_wlock();
360         } else {
361                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
362                 lock = &tcp_ehash[sk->sk_hashent].lock;
363                 write_lock(lock);
364         }
365         __sk_add_node(sk, list);
366         sock_prot_inc_use(sk->sk_prot);
367         write_unlock(lock);
368         if (listen_possible && sk->sk_state == TCP_LISTEN)
369                 wake_up(&tcp_lhash_wait);
370 }
371
372 static void tcp_v4_hash(struct sock *sk)
373 {
374         if (sk->sk_state != TCP_CLOSE) {
375                 local_bh_disable();
376                 __tcp_v4_hash(sk, 1);
377                 local_bh_enable();
378         }
379 }
380
381 void tcp_unhash(struct sock *sk)
382 {
383         rwlock_t *lock;
384
385         if (sk_unhashed(sk))
386                 goto ende;
387
388         if (sk->sk_state == TCP_LISTEN) {
389                 local_bh_disable();
390                 tcp_listen_wlock();
391                 lock = &tcp_lhash_lock;
392         } else {
393                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
394                 lock = &head->lock;
395                 write_lock_bh(&head->lock);
396         }
397
398         if (__sk_del_node_init(sk))
399                 sock_prot_dec_use(sk->sk_prot);
400         write_unlock_bh(lock);
401
402  ende:
403         if (sk->sk_state == TCP_LISTEN)
404                 wake_up(&tcp_lhash_wait);
405 }
406
407
408 /*
409  *      Check if a given address matches for a tcp socket
410  *
411  *      nxi:    the socket's nx_info if any
412  *      addr:   to be verified address
413  *      saddr:  socket addresses
414  */
415 static inline int tcp_addr_match (
416         struct nx_info *nxi,
417         uint32_t addr,
418         uint32_t saddr)
419 {
420         if (addr && (saddr == addr))
421                 return 1;
422         if (!saddr)
423                 return addr_in_nx_info(nxi, addr);
424         return 0;
425 }
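/*
 * Editor's example: a socket bound to a specific local address matches only
 * packets sent exactly to that address; a socket bound to INADDR_ANY
 * (saddr == 0) instead matches any destination that addr_in_nx_info()
 * accepts for its vserver context, i.e. presumably any address at all when
 * the socket has no nx_info.
 */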
426
427 /* Don't inline this cruft.  There are some nice properties to
428  * exploit here.  The BSD API does not allow a listening TCP
429  * to specify the remote port nor the remote address for the
430  * connection.  So always assume those are both wildcarded
431  * during the search since they can never be otherwise.
432  */
433 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
434                                              unsigned short hnum, int dif)
435 {
436         struct sock *result = NULL, *sk;
437         struct hlist_node *node;
438         int score, hiscore;
439
440         hiscore=-1;
441         sk_for_each(sk, node, head) {
442                 struct inet_sock *inet = inet_sk(sk);
443
444                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
445                         __u32 rcv_saddr = inet->rcv_saddr;
446
447                         score = (sk->sk_family == PF_INET ? 1 : 0);
448                         if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr))
449                                 score+=2;
450                         else
451                                 continue;
452                         if (sk->sk_bound_dev_if) {
453                                 if (sk->sk_bound_dev_if != dif)
454                                         continue;
455                                 score+=2;
456                         }
457                         if (score == 5)
458                                 return sk;
459                         if (score > hiscore) {
460                                 hiscore = score;
461                                 result = sk;
462                         }
463                 }
464         }
465         return result;
466 }
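/*
 * Editor's example of the scoring above: a PF_INET listener bound to the
 * exact destination address and to the receiving interface scores
 * 1 + 2 + 2 = 5 and is returned on the spot; a dual-stack PF_INET6 listener
 * on the wildcard address with no bound device scores only 2, so it is kept
 * as a fallback and loses to anything more specific.
 */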
467
468 /* Optimize the common listener case. */
469 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
470                 unsigned short hnum, int dif)
471 {
472         struct sock *sk = NULL;
473         struct hlist_head *head;
474
475         read_lock(&tcp_lhash_lock);
476         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
477         if (!hlist_empty(head)) {
478                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
479
480                 if (inet->num == hnum && !sk->sk_node.next &&
481                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
482                     tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) &&
483                     !sk->sk_bound_dev_if)
484                         goto sherry_cache;
485                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
486         }
487         if (sk) {
488 sherry_cache:
489                 sock_hold(sk);
490         }
491         read_unlock(&tcp_lhash_lock);
492         return sk;
493 }
494
495 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
496  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
497  *
498  * Local BH must be disabled here.
499  */
500
501 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
502                                                        u32 daddr, u16 hnum,
503                                                        int dif)
504 {
505         struct tcp_ehash_bucket *head;
506         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
507         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
508         struct sock *sk;
509         struct hlist_node *node;
510         /* Optimize here for direct hit, only listening connections can
511          * have wildcards anyway.
512          */
513         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
514         head = &tcp_ehash[hash];
515         read_lock(&head->lock);
516         sk_for_each(sk, node, &head->chain) {
517                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
518                         goto hit; /* You sunk my battleship! */
519         }
520
521         /* Must check for a TIME_WAIT'er before going to listener hash. */
522         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
523                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
524                         goto hit;
525         }
526         sk = NULL;
527 out:
528         read_unlock(&head->lock);
529         return sk;
530 hit:
531         sock_hold(sk);
532         goto out;
533 }
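/*
 * Editor's note: as the changelog at the top says, the established table is
 * split in two halves: live connections hash into
 * tcp_ehash[0..tcp_ehash_size-1] and their TIME_WAIT buckets live in the
 * mirror chain at head + tcp_ehash_size, which is why the lookup above walks
 * both chains under the one bucket lock.
 */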
534
535 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
536                                            u32 daddr, u16 hnum, int dif)
537 {
538         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
539                                                       daddr, hnum, dif);
540
541         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
542 }
543
544 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
545                                   u16 dport, int dif)
546 {
547         struct sock *sk;
548
549         local_bh_disable();
550         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
551         local_bh_enable();
552
553         return sk;
554 }
555
556 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
557
558 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
559 {
560         return secure_tcp_sequence_number(skb->nh.iph->daddr,
561                                           skb->nh.iph->saddr,
562                                           skb->h.th->dest,
563                                           skb->h.th->source);
564 }
565
566 /* called with local bh disabled */
567 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
568                                       struct tcp_tw_bucket **twp)
569 {
570         struct inet_sock *inet = inet_sk(sk);
571         u32 daddr = inet->rcv_saddr;
572         u32 saddr = inet->daddr;
573         int dif = sk->sk_bound_dev_if;
574         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
575         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
576         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
577         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
578         struct sock *sk2;
579         struct hlist_node *node;
580         struct tcp_tw_bucket *tw;
581
582         write_lock(&head->lock);
583
584         /* Check TIME-WAIT sockets first. */
585         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
586                 tw = (struct tcp_tw_bucket *)sk2;
587
588                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
589                         struct tcp_sock *tp = tcp_sk(sk);
590
591                         /* With PAWS, it is safe from the viewpoint
592                            of data integrity. Even without PAWS it
593                            is safe provided sequence spaces do not
594                            overlap i.e. at data rates <= 80Mbit/sec.
595
596                            Actually, the idea is close to VJ's: only the
597                            timestamp cache is kept not per host but per
598                            port pair, and the TW bucket is used as the
599                            state holder.
600
601                            If the TW bucket has already been destroyed we
602                            fall back to VJ's scheme and use the initial
603                            timestamp retrieved from the peer table.
604                          */
605                         if (tw->tw_ts_recent_stamp &&
606                             (!twp || (sysctl_tcp_tw_reuse &&
607                                       xtime.tv_sec -
608                                       tw->tw_ts_recent_stamp > 1))) {
609                                 if ((tp->write_seq =
610                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
611                                         tp->write_seq = 1;
612                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
613                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
614                                 sock_hold(sk2);
615                                 goto unique;
616                         } else
617                                 goto not_unique;
618                 }
619         }
620         tw = NULL;
621
622         /* And established part... */
623         sk_for_each(sk2, node, &head->chain) {
624                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
625                         goto not_unique;
626         }
627
628 unique:
629         /* Must record num and sport now. Otherwise we will see
630          * a socket with a funny identity in the hash table. */
631         inet->num = lport;
632         inet->sport = htons(lport);
633         sk->sk_hashent = hash;
634         BUG_TRAP(sk_unhashed(sk));
635         __sk_add_node(sk, &head->chain);
636         sock_prot_inc_use(sk->sk_prot);
637         write_unlock(&head->lock);
638
639         if (twp) {
640                 *twp = tw;
641                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
642         } else if (tw) {
643                 /* Silly. Should hash-dance instead... */
644                 tcp_tw_deschedule(tw);
645                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
646
647                 tcp_tw_put(tw);
648         }
649
650         return 0;
651
652 not_unique:
653         write_unlock(&head->lock);
654         return -EADDRNOTAVAIL;
655 }
656
657 static inline u32 connect_port_offset(const struct sock *sk)
658 {
659         const struct inet_sock *inet = inet_sk(sk);
660
661         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
662                                          inet->dport);
663 }
664
665 /*
666  * Bind a port for a connect operation and hash it.
667  */
668 static inline int tcp_v4_hash_connect(struct sock *sk)
669 {
670         unsigned short snum = inet_sk(sk)->num;
671         struct tcp_bind_hashbucket *head;
672         struct tcp_bind_bucket *tb;
673         int ret;
674
675         if (!snum) {
676                 int low = sysctl_local_port_range[0];
677                 int high = sysctl_local_port_range[1];
678                 int range = high - low;
679                 int i;
680                 int port;
681                 static u32 hint;
682                 u32 offset = hint + connect_port_offset(sk);
683                 struct hlist_node *node;
684                 struct tcp_tw_bucket *tw = NULL;
685
686                 local_bh_disable();
687                 for (i = 1; i <= range; i++) {
688                         port = low + (i + offset) % range;
689                         head = &tcp_bhash[tcp_bhashfn(port)];
690                         spin_lock(&head->lock);
691
692                         /* Does not bother with rcv_saddr checks,
693                          * because the established check is already
694                          * unique enough.
695                          */
696                         tb_for_each(tb, node, &head->chain) {
697                                 if (tb->port == port) {
698                                         BUG_TRAP(!hlist_empty(&tb->owners));
699                                         if (tb->fastreuse >= 0)
700                                                 goto next_port;
701                                         if (!__tcp_v4_check_established(sk,
702                                                                         port,
703                                                                         &tw))
704                                                 goto ok;
705                                         goto next_port;
706                                 }
707                         }
708
709                         tb = tcp_bucket_create(head, port);
710                         if (!tb) {
711                                 spin_unlock(&head->lock);
712                                 break;
713                         }
714                         tb->fastreuse = -1;
715                         goto ok;
716
717                 next_port:
718                         spin_unlock(&head->lock);
719                 }
720                 local_bh_enable();
721
722                 return -EADDRNOTAVAIL;
723
724 ok:
725                 hint += i;
726
727                 /* Head lock still held and bh's disabled */
728                 tcp_bind_hash(sk, tb, port);
729                 if (sk_unhashed(sk)) {
730                         inet_sk(sk)->sport = htons(port);
731                         __tcp_v4_hash(sk, 0);
732                 }
733                 spin_unlock(&head->lock);
734
735                 if (tw) {
736                         tcp_tw_deschedule(tw);
737                         tcp_tw_put(tw);
738                 }
739
740                 ret = 0;
741                 goto out;
742         }
743
744         head  = &tcp_bhash[tcp_bhashfn(snum)];
745         tb  = tcp_sk(sk)->bind_hash;
746         spin_lock_bh(&head->lock);
747         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
748                 __tcp_v4_hash(sk, 0);
749                 spin_unlock_bh(&head->lock);
750                 return 0;
751         } else {
752                 spin_unlock(&head->lock);
753                 /* No definite answer... Walk to established hash table */
754                 ret = __tcp_v4_check_established(sk, snum, NULL);
755 out:
756                 local_bh_enable();
757                 return ret;
758         }
759 }
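/*
 * Editor's note: for an unbound connect() the search starts at
 * low + (hint + connect_port_offset(sk)), modulo the range size, so
 * different destinations probe different parts of the range, and the static
 * `hint' advances by however many slots the last successful search consumed.
 * Bind buckets created via bind() have fastreuse >= 0 and are skipped
 * outright; only connect-created buckets (fastreuse == -1) are shared, and
 * then only after __tcp_v4_check_established() proves the four-tuple unique.
 */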
760
761 /* This will initiate an outgoing connection. */
762 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
763 {
764         struct inet_sock *inet = inet_sk(sk);
765         struct tcp_sock *tp = tcp_sk(sk);
766         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
767         struct rtable *rt;
768         u32 daddr, nexthop;
769         int tmp;
770         int err;
771
772         if (addr_len < sizeof(struct sockaddr_in))
773                 return -EINVAL;
774
775         if (usin->sin_family != AF_INET)
776                 return -EAFNOSUPPORT;
777
778         nexthop = daddr = usin->sin_addr.s_addr;
779         if (inet->opt && inet->opt->srr) {
780                 if (!daddr)
781                         return -EINVAL;
782                 nexthop = inet->opt->faddr;
783         }
784
785         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
786                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
787                                IPPROTO_TCP,
788                                inet->sport, usin->sin_port, sk);
789         if (tmp < 0)
790                 return tmp;
791
792         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
793                 ip_rt_put(rt);
794                 return -ENETUNREACH;
795         }
796
797         if (!inet->opt || !inet->opt->srr)
798                 daddr = rt->rt_dst;
799
800         if (!inet->saddr)
801                 inet->saddr = rt->rt_src;
802         inet->rcv_saddr = inet->saddr;
803
804         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
805                 /* Reset inherited state */
806                 tp->rx_opt.ts_recent       = 0;
807                 tp->rx_opt.ts_recent_stamp = 0;
808                 tp->write_seq              = 0;
809         }
810
811         if (sysctl_tcp_tw_recycle &&
812             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
813                 struct inet_peer *peer = rt_get_peer(rt);
814
815                 /* VJ's idea. We save the last timestamp seen from
816                  * the destination in the peer table when entering state TIME-WAIT,
817                  * and initialize rx_opt.ts_recent from it when trying a new connection.
818                  */
819
820                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
821                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
822                         tp->rx_opt.ts_recent = peer->tcp_ts;
823                 }
824         }
825
826         inet->dport = usin->sin_port;
827         inet->daddr = daddr;
828
829         tp->ext_header_len = 0;
830         if (inet->opt)
831                 tp->ext_header_len = inet->opt->optlen;
832
833         tp->rx_opt.mss_clamp = 536;
834
835         /* Socket identity is still unknown (sport may be zero).
836          * However we set the state to SYN-SENT and, without releasing the socket
837          * lock, select a source port, enter ourselves into the hash tables and
838          * complete initialization after this.
839          */
840         tcp_set_state(sk, TCP_SYN_SENT);
841         err = tcp_v4_hash_connect(sk);
842         if (err)
843                 goto failure;
844
845         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
846         if (err)
847                 goto failure;
848
849         /* OK, now commit destination to socket.  */
850         __sk_dst_set(sk, &rt->u.dst);
851         tcp_v4_setup_caps(sk, &rt->u.dst);
852         tp->ext2_header_len = rt->u.dst.header_len;
853
854         if (!tp->write_seq)
855                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
856                                                            inet->daddr,
857                                                            inet->sport,
858                                                            usin->sin_port);
859
860         inet->id = tp->write_seq ^ jiffies;
861
862         err = tcp_connect(sk);
863         rt = NULL;
864         if (err)
865                 goto failure;
866
867         return 0;
868
869 failure:
870         /* This unhashes the socket and releases the local port, if necessary. */
871         tcp_set_state(sk, TCP_CLOSE);
872         ip_rt_put(rt);
873         sk->sk_route_caps = 0;
874         inet->dport = 0;
875         return err;
876 }
877
878 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
879 {
880         return ((struct rtable *)skb->dst)->rt_iif;
881 }
882
883 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
884 {
885         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
886 }
887
888 static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
889                                               struct open_request ***prevp,
890                                               __u16 rport,
891                                               __u32 raddr, __u32 laddr)
892 {
893         struct tcp_listen_opt *lopt = tp->listen_opt;
894         struct open_request *req, **prev;
895
896         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
897              (req = *prev) != NULL;
898              prev = &req->dl_next) {
899                 if (req->rmt_port == rport &&
900                     req->af.v4_req.rmt_addr == raddr &&
901                     req->af.v4_req.loc_addr == laddr &&
902                     TCP_INET_FAMILY(req->class->family)) {
903                         BUG_TRAP(!req->sk);
904                         *prevp = prev;
905                         break;
906                 }
907         }
908
909         return req;
910 }
911
912 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
913 {
914         struct tcp_sock *tp = tcp_sk(sk);
915         struct tcp_listen_opt *lopt = tp->listen_opt;
916         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
917
918         req->expires = jiffies + TCP_TIMEOUT_INIT;
919         req->retrans = 0;
920         req->sk = NULL;
921         req->dl_next = lopt->syn_table[h];
922
923         write_lock(&tp->syn_wait_lock);
924         lopt->syn_table[h] = req;
925         write_unlock(&tp->syn_wait_lock);
926
927         tcp_synq_added(sk);
928 }
929
930
931 /*
932  * This routine does path mtu discovery as defined in RFC1191.
933  */
934 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
935                                      u32 mtu)
936 {
937         struct dst_entry *dst;
938         struct inet_sock *inet = inet_sk(sk);
939         struct tcp_sock *tp = tcp_sk(sk);
940
941         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
942          * sent out by Linux are always < 576 bytes so they should go through
943          * unfragmented).
944          */
945         if (sk->sk_state == TCP_LISTEN)
946                 return;
947
948         /* We don't check in the dst entry if pmtu discovery is forbidden
949          * on this route. We just assume that no packet-too-big packets
950          * are sent back when pmtu discovery is not active.
951          * There is a small race when the user changes this flag in the
952          * route, but I think that's acceptable.
953          */
954         if ((dst = __sk_dst_check(sk, 0)) == NULL)
955                 return;
956
957         dst->ops->update_pmtu(dst, mtu);
958
959         /* Something is about to go wrong... Remember the soft error
960          * in case this connection is not able to recover.
961          */
962         if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
963                 sk->sk_err_soft = EMSGSIZE;
964
965         mtu = dst_pmtu(dst);
966
967         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
968             tp->pmtu_cookie > mtu) {
969                 tcp_sync_mss(sk, mtu);
970
971                 /* Resend the TCP packet because it's
972                  * clear that the old packet has been
973                  * dropped. This is the new "fast" path mtu
974                  * discovery.
975                  */
976                 tcp_simple_retransmit(sk);
977         } /* else let the usual retransmit timer handle it */
978 }
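/*
 * Editor's example: if tp->pmtu_cookie is 1500 and a FRAG_NEEDED ICMP
 * reports an MTU of 1400, update_pmtu() lowers the cached route MTU,
 * tcp_sync_mss() shrinks the MSS to fit, and tcp_simple_retransmit()
 * resends the oversized segments immediately instead of waiting for the
 * retransmit timer; this is the "fast" path MTU discovery mentioned in the
 * changelog at the top of the file.
 */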
979
980 /*
981  * This routine is called by the ICMP module when it gets some
982  * sort of error condition.  If err < 0 then the socket should
983  * be closed and the error returned to the user.  If err > 0
984  * it's just the icmp type << 8 | icmp code.  After adjustment
985  * header points to the first 8 bytes of the tcp header.  We need
986  * to find the appropriate port.
987  *
988  * The locking strategy used here is very "optimistic". When
989  * someone else accesses the socket the ICMP is just dropped
990  * and for some paths there is no check at all.
991  * A more general error queue to queue errors for later handling
992  * is probably better.
993  *
994  */
995
996 void tcp_v4_err(struct sk_buff *skb, u32 info)
997 {
998         struct iphdr *iph = (struct iphdr *)skb->data;
999         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
1000         struct tcp_sock *tp;
1001         struct inet_sock *inet;
1002         int type = skb->h.icmph->type;
1003         int code = skb->h.icmph->code;
1004         struct sock *sk;
1005         __u32 seq;
1006         int err;
1007
1008         if (skb->len < (iph->ihl << 2) + 8) {
1009                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1010                 return;
1011         }
1012
1013         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1014                            th->source, tcp_v4_iif(skb));
1015         if (!sk) {
1016                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1017                 return;
1018         }
1019         if (sk->sk_state == TCP_TIME_WAIT) {
1020                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1021                 return;
1022         }
1023
1024         bh_lock_sock(sk);
1025         /* If too many ICMPs get dropped on busy
1026          * servers this needs to be solved differently.
1027          */
1028         if (sock_owned_by_user(sk))
1029                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1030
1031         if (sk->sk_state == TCP_CLOSE)
1032                 goto out;
1033
1034         tp = tcp_sk(sk);
1035         seq = ntohl(th->seq);
1036         if (sk->sk_state != TCP_LISTEN &&
1037             !between(seq, tp->snd_una, tp->snd_nxt)) {
1038                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1039                 goto out;
1040         }
1041
1042         switch (type) {
1043         case ICMP_SOURCE_QUENCH:
1044                 /* Just silently ignore these. */
1045                 goto out;
1046         case ICMP_PARAMETERPROB:
1047                 err = EPROTO;
1048                 break;
1049         case ICMP_DEST_UNREACH:
1050                 if (code > NR_ICMP_UNREACH)
1051                         goto out;
1052
1053                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1054                         if (!sock_owned_by_user(sk))
1055                                 do_pmtu_discovery(sk, iph, info);
1056                         goto out;
1057                 }
1058
1059                 err = icmp_err_convert[code].errno;
1060                 break;
1061         case ICMP_TIME_EXCEEDED:
1062                 err = EHOSTUNREACH;
1063                 break;
1064         default:
1065                 goto out;
1066         }
1067
1068         switch (sk->sk_state) {
1069                 struct open_request *req, **prev;
1070         case TCP_LISTEN:
1071                 if (sock_owned_by_user(sk))
1072                         goto out;
1073
1074                 req = tcp_v4_search_req(tp, &prev, th->dest,
1075                                         iph->daddr, iph->saddr);
1076                 if (!req)
1077                         goto out;
1078
1079                 /* ICMPs are not backlogged, hence we cannot get
1080                    an established socket here.
1081                  */
1082                 BUG_TRAP(!req->sk);
1083
1084                 if (seq != req->snt_isn) {
1085                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1086                         goto out;
1087                 }
1088
1089                 /*
1090                  * Still in SYN_RECV, just remove it silently.
1091                  * There is no good way to pass the error to the newly
1092                  * created socket, and POSIX does not want network
1093                  * errors returned from accept().
1094                  */
1095                 tcp_synq_drop(sk, req, prev);
1096                 goto out;
1097
1098         case TCP_SYN_SENT:
1099         case TCP_SYN_RECV:  /* Cannot happen normally.
1100                                It can, e.g., if SYNs crossed.
1101                              */
1102                 if (!sock_owned_by_user(sk)) {
1103                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1104                         sk->sk_err = err;
1105
1106                         sk->sk_error_report(sk);
1107
1108                         tcp_done(sk);
1109                 } else {
1110                         sk->sk_err_soft = err;
1111                 }
1112                 goto out;
1113         }
1114
1115         /* If we've already connected we will keep trying
1116          * until we time out, or the user gives up.
1117          *
1118          * rfc1122 4.2.3.9 allows us to treat only PROTO_UNREACH and
1119          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
1120          * but it is obsoleted by pmtu discovery).
1121          *
1122          * Note that in the modern internet, where routing is unreliable
1123          * and broken firewalls sit in every dark corner sending random
1124          * errors ordered by their masters, even these two messages finally lose
1125          * their original sense (even Linux sends invalid PORT_UNREACHs)
1126          *
1127          * Now we are in compliance with RFCs.
1128          *                                                      --ANK (980905)
1129          */
1130
1131         inet = inet_sk(sk);
1132         if (!sock_owned_by_user(sk) && inet->recverr) {
1133                 sk->sk_err = err;
1134                 sk->sk_error_report(sk);
1135         } else  { /* Only an error on timeout */
1136                 sk->sk_err_soft = err;
1137         }
1138
1139 out:
1140         bh_unlock_sock(sk);
1141         sock_put(sk);
1142 }
1143
1144 /* This routine computes an IPv4 TCP checksum. */
1145 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1146                        struct sk_buff *skb)
1147 {
1148         struct inet_sock *inet = inet_sk(sk);
1149
1150         if (skb->ip_summed == CHECKSUM_HW) {
1151                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1152                 skb->csum = offsetof(struct tcphdr, check);
1153         } else {
1154                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1155                                          csum_partial((char *)th,
1156                                                       th->doff << 2,
1157                                                       skb->csum));
1158         }
1159 }
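/*
 * Editor's note: in the CHECKSUM_HW branch only the inverted pseudo-header
 * sum is written into th->check and skb->csum records the offset of the
 * check field, leaving the payload checksum to the NIC; otherwise the whole
 * thing is computed in software via csum_partial().
 */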
1160
1161 /*
1162  *      This routine will send an RST to the other tcp.
1163  *
1164  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1165  *                    for the reset?
1166  *      Answer: if a packet caused an RST, it is not for a socket
1167  *              existing in our system; if it is matched to a socket,
1168  *              it is just a duplicate segment or a bug in the other side's TCP.
1169  *              So we build the reply based only on the parameters that
1170  *              arrived with the segment.
1171  *      Exception: precedence violation. We do not implement it in any case.
1172  */
1173
1174 static void tcp_v4_send_reset(struct sk_buff *skb)
1175 {
1176         struct tcphdr *th = skb->h.th;
1177         struct tcphdr rth;
1178         struct ip_reply_arg arg;
1179
1180         /* Never send a reset in response to a reset. */
1181         if (th->rst)
1182                 return;
1183
1184         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1185                 return;
1186
1187         /* Swap the send and the receive. */
1188         memset(&rth, 0, sizeof(struct tcphdr));
1189         rth.dest   = th->source;
1190         rth.source = th->dest;
1191         rth.doff   = sizeof(struct tcphdr) / 4;
1192         rth.rst    = 1;
1193
1194         if (th->ack) {
1195                 rth.seq = th->ack_seq;
1196         } else {
1197                 rth.ack = 1;
1198                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1199                                     skb->len - (th->doff << 2));
1200         }
1201
1202         memset(&arg, 0, sizeof arg);
1203         arg.iov[0].iov_base = (unsigned char *)&rth;
1204         arg.iov[0].iov_len  = sizeof rth;
1205         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1206                                       skb->nh.iph->saddr, /*XXX*/
1207                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1208         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1209
1210         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1211
1212         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1213         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1214 }
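/*
 * Editor's example: a stray segment that carries an ACK gets a reset with
 * seq = its ack_seq and no ACK flag; a segment without an ACK (e.g. a bare
 * SYN to a closed port) gets seq 0, the ACK flag set and
 * ack_seq = seq + SYN + FIN + payload length, which is the classic RFC 793
 * reset generation rule.
 */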
1215
1216 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1217    outside of socket context, is certainly ugly. What can I do?
1218  */
1219
1220 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1221                             u32 win, u32 ts)
1222 {
1223         struct tcphdr *th = skb->h.th;
1224         struct {
1225                 struct tcphdr th;
1226                 u32 tsopt[3];
1227         } rep;
1228         struct ip_reply_arg arg;
1229
1230         memset(&rep.th, 0, sizeof(struct tcphdr));
1231         memset(&arg, 0, sizeof arg);
1232
1233         arg.iov[0].iov_base = (unsigned char *)&rep;
1234         arg.iov[0].iov_len  = sizeof(rep.th);
1235         if (ts) {
1236                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1237                                      (TCPOPT_TIMESTAMP << 8) |
1238                                      TCPOLEN_TIMESTAMP);
1239                 rep.tsopt[1] = htonl(tcp_time_stamp);
1240                 rep.tsopt[2] = htonl(ts);
1241                 arg.iov[0].iov_len = sizeof(rep);
1242         }
1243
1244         /* Swap the send and the receive. */
1245         rep.th.dest    = th->source;
1246         rep.th.source  = th->dest;
1247         rep.th.doff    = arg.iov[0].iov_len / 4;
1248         rep.th.seq     = htonl(seq);
1249         rep.th.ack_seq = htonl(ack);
1250         rep.th.ack     = 1;
1251         rep.th.window  = htons(win);
1252
1253         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1254                                       skb->nh.iph->saddr, /*XXX*/
1255                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1256         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1257
1258         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1259
1260         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1261 }
1262
1263 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1264 {
1265         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1266
1267         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1268                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1269
1270         tcp_tw_put(tw);
1271 }
1272
1273 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1274 {
1275         tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1276                         req->ts_recent);
1277 }
1278
1279 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1280                                           struct open_request *req)
1281 {
1282         struct rtable *rt;
1283         struct ip_options *opt = req->af.v4_req.opt;
1284         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1285                             .nl_u = { .ip4_u =
1286                                       { .daddr = ((opt && opt->srr) ?
1287                                                   opt->faddr :
1288                                                   req->af.v4_req.rmt_addr),
1289                                         .saddr = req->af.v4_req.loc_addr,
1290                                         .tos = RT_CONN_FLAGS(sk) } },
1291                             .proto = IPPROTO_TCP,
1292                             .uli_u = { .ports =
1293                                        { .sport = inet_sk(sk)->sport,
1294                                          .dport = req->rmt_port } } };
1295
1296         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1297                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1298                 return NULL;
1299         }
1300         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1301                 ip_rt_put(rt);
1302                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1303                 return NULL;
1304         }
1305         return &rt->u.dst;
1306 }
1307
1308 /*
1309  *      Send a SYN-ACK after having received an ACK.
1310  *      This still operates on a open_request only, not on a big
1311  *      socket.
1312  */
1313 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1314                               struct dst_entry *dst)
1315 {
1316         int err = -1;
1317         struct sk_buff * skb;
1318
1319         /* First, grab a route. */
1320         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1321                 goto out;
1322
1323         skb = tcp_make_synack(sk, dst, req);
1324
1325         if (skb) {
1326                 struct tcphdr *th = skb->h.th;
1327
1328                 th->check = tcp_v4_check(th, skb->len,
1329                                          req->af.v4_req.loc_addr,
1330                                          req->af.v4_req.rmt_addr,
1331                                          csum_partial((char *)th, skb->len,
1332                                                       skb->csum));
1333
1334                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1335                                             req->af.v4_req.rmt_addr,
1336                                             req->af.v4_req.opt);
1337                 if (err == NET_XMIT_CN)
1338                         err = 0;
1339         }
1340
1341 out:
1342         dst_release(dst);
1343         return err;
1344 }
1345
1346 /*
1347  *      IPv4 open_request destructor.
1348  */
1349 static void tcp_v4_or_free(struct open_request *req)
1350 {
1351         if (req->af.v4_req.opt)
1352                 kfree(req->af.v4_req.opt);
1353 }
1354
1355 static inline void syn_flood_warning(struct sk_buff *skb)
1356 {
1357         static unsigned long warntime;
1358
1359         if (time_after(jiffies, (warntime + HZ * 60))) {
1360                 warntime = jiffies;
1361                 printk(KERN_INFO
1362                        "possible SYN flooding on port %d. Sending cookies.\n",
1363                        ntohs(skb->h.th->dest));
1364         }
1365 }
1366
1367 /*
1368  * Save and compile IPv4 options into the open_request if needed.
1369  */
1370 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1371                                                      struct sk_buff *skb)
1372 {
1373         struct ip_options *opt = &(IPCB(skb)->opt);
1374         struct ip_options *dopt = NULL;
1375
1376         if (opt && opt->optlen) {
1377                 int opt_size = optlength(opt);
1378                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1379                 if (dopt) {
1380                         if (ip_options_echo(dopt, skb)) {
1381                                 kfree(dopt);
1382                                 dopt = NULL;
1383                         }
1384                 }
1385         }
1386         return dopt;
1387 }
1388
1389 /*
1390  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1391  * One SYN_RECV socket costs about 80 bytes on a 32bit machine.
1392  * It would be better to replace it with a global counter for all sockets
1393  * but then some measure against one socket starving all other sockets
1394  * would be needed.
1395  *
1396  * It was 128 by default. Experiments with real servers show that
1397  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1398  * of the problems. This value is adjusted to 128 for very small machines
1399  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1400  * Increasing it further requires changing the hash table size.
1401  */
1402 int sysctl_max_syn_backlog = 256;
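/*
 * Editor's note (the sysctl name and the init-time tuning are assumptions,
 * they live outside this file): this is the value behind
 * net.ipv4.tcp_max_syn_backlog, and the 128/1024 adjustment described above
 * is applied from tcp_init() according to how much memory the machine has.
 */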
1403
1404 struct or_calltable or_ipv4 = {
1405         .family         =       PF_INET,
1406         .rtx_syn_ack    =       tcp_v4_send_synack,
1407         .send_ack       =       tcp_v4_or_send_ack,
1408         .destructor     =       tcp_v4_or_free,
1409         .send_reset     =       tcp_v4_send_reset,
1410 };
1411
1412 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1413 {
1414         struct tcp_options_received tmp_opt;
1415         struct open_request *req;
1416         __u32 saddr = skb->nh.iph->saddr;
1417         __u32 daddr = skb->nh.iph->daddr;
1418         __u32 isn = TCP_SKB_CB(skb)->when;
1419         struct dst_entry *dst = NULL;
1420 #ifdef CONFIG_SYN_COOKIES
1421         int want_cookie = 0;
1422 #else
1423 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1424 #endif
1425
1426         /* Never answer SYNs sent to broadcast or multicast addresses */
1427         if (((struct rtable *)skb->dst)->rt_flags &
1428             (RTCF_BROADCAST | RTCF_MULTICAST))
1429                 goto drop;
1430
1431         /* TW buckets are converted to open requests without
1432          * limitation; they conserve resources and the peer is
1433          * evidently a real one.
1434          */
1435         if (tcp_synq_is_full(sk) && !isn) {
1436 #ifdef CONFIG_SYN_COOKIES
1437                 if (sysctl_tcp_syncookies) {
1438                         want_cookie = 1;
1439                 } else
1440 #endif
1441                 goto drop;
1442         }
1443
1444         /* Accept backlog is full. If we have already queued enough
1445          * warm entries in the syn queue, drop the request. That is better than
1446          * clogging the syn queue with openreqs that have exponentially
1447          * increasing timeouts.
1448          */
1449         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1450                 goto drop;
1451
1452         req = tcp_openreq_alloc();
1453         if (!req)
1454                 goto drop;
1455
1456         tcp_clear_options(&tmp_opt);
1457         tmp_opt.mss_clamp = 536;
1458         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1459
1460         tcp_parse_options(skb, &tmp_opt, 0);
1461
1462         if (want_cookie) {
1463                 tcp_clear_options(&tmp_opt);
1464                 tmp_opt.saw_tstamp = 0;
1465         }
1466
1467         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1468                 /* Some OSes (unknown ones, but I see them on web servers which
1469                  * contain information interesting only for windows
1470                  * users) do not send their stamp in the SYN. It is an easy case.
1471                  * We simply do not advertise TS support.
1472                  */
1473                 tmp_opt.saw_tstamp = 0;
1474                 tmp_opt.tstamp_ok  = 0;
1475         }
1476         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1477
1478         tcp_openreq_init(req, &tmp_opt, skb);
1479
1480         req->af.v4_req.loc_addr = daddr;
1481         req->af.v4_req.rmt_addr = saddr;
1482         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1483         req->class = &or_ipv4;
1484         if (!want_cookie)
1485                 TCP_ECN_create_request(req, skb->h.th);
1486
1487         if (want_cookie) {
1488 #ifdef CONFIG_SYN_COOKIES
1489                 syn_flood_warning(skb);
1490 #endif
1491                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1492         } else if (!isn) {
1493                 struct inet_peer *peer = NULL;
1494
1495                 /* VJ's idea. We save the last timestamp seen
1496                  * from the destination in the peer table when entering
1497                  * TIME-WAIT state, and check against it before
1498                  * accepting a new connection request.
1499                  *
1500                  * If "isn" is not zero, this request hit a live
1501                  * timewait bucket, so all the necessary checks
1502                  * are made by the function processing the timewait state.
1503                  */
1504                 if (tmp_opt.saw_tstamp &&
1505                     sysctl_tcp_tw_recycle &&
1506                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1507                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1508                     peer->v4daddr == saddr) {
1509                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1510                             (s32)(peer->tcp_ts - req->ts_recent) >
1511                                                         TCP_PAWS_WINDOW) {
1512                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1513                                 dst_release(dst);
1514                                 goto drop_and_free;
1515                         }
1516                 }
1517                 /* Kill the following clause if you dislike this approach. */
1518                 else if (!sysctl_tcp_syncookies &&
1519                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1520                           (sysctl_max_syn_backlog >> 2)) &&
1521                          (!peer || !peer->tcp_ts_stamp) &&
1522                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1523                         /* Without syncookies the last quarter of the
1524                          * backlog is reserved for destinations proven
1525                          * to be alive.
1526                          * This means we keep communicating with
1527                          * destinations that were already remembered
1528                          * before the synflood started.
1529                          */
1530                         NETDEBUG(if (net_ratelimit()) \
1531                                         printk(KERN_DEBUG "TCP: drop open "
1532                                                           "request from %u.%u."
1533                                                           "%u.%u/%u\n", \
1534                                                NIPQUAD(saddr),
1535                                                ntohs(skb->h.th->source)));
1536                         dst_release(dst);
1537                         goto drop_and_free;
1538                 }
1539
1540                 isn = tcp_v4_init_sequence(sk, skb);
1541         }
1542         req->snt_isn = isn;
1543
1544         if (tcp_v4_send_synack(sk, req, dst))
1545                 goto drop_and_free;
1546
1547         if (want_cookie) {
1548                 tcp_openreq_free(req);
1549         } else {
1550                 tcp_v4_synq_add(sk, req);
1551         }
1552         return 0;
1553
1554 drop_and_free:
1555         tcp_openreq_free(req);
1556 drop:
1557         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1558         return 0;
1559 }
1560
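/*
 * Illustrative stand-alone sketch (not compiled into this file): the PAWS
 * rejection above compares 32-bit timestamps through a signed difference,
 * "(s32)(peer->tcp_ts - req->ts_recent)", so that wraparound of the
 * timestamp clock is handled the same way as TCP sequence numbers.
 * ts_after() below is a made-up helper name used only for this demonstration:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

/* Nonzero if timestamp a is "after" timestamp b, modulo 2^32. */
static int ts_after(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;
}

int main(void)
{
        /* 0x00000002 is after 0xfffffffe although it is numerically smaller. */
        printf("%d\n", ts_after(0x00000002u, 0xfffffffeu));     /* prints 1 */
        printf("%d\n", ts_after(0xfffffffeu, 0x00000002u));     /* prints 0 */
        return 0;
}
#endif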
1561
1562 /*
1563  * The three-way handshake has completed - we received a valid ACK
1564  * for our SYN-ACK - now create the new socket.
1565  */
1566 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1567                                   struct open_request *req,
1568                                   struct dst_entry *dst)
1569 {
1570         struct inet_sock *newinet;
1571         struct tcp_sock *newtp;
1572         struct sock *newsk;
1573
1574         if (sk_acceptq_is_full(sk))
1575                 goto exit_overflow;
1576
1577         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1578                 goto exit;
1579
1580         newsk = tcp_create_openreq_child(sk, req, skb);
1581         if (!newsk)
1582                 goto exit;
1583
1584         newsk->sk_dst_cache = dst;
1585         tcp_v4_setup_caps(newsk, dst);
1586
1587         newtp                 = tcp_sk(newsk);
1588         newinet               = inet_sk(newsk);
1589         newinet->daddr        = req->af.v4_req.rmt_addr;
1590         newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1591         newinet->saddr        = req->af.v4_req.loc_addr;
1592         newinet->opt          = req->af.v4_req.opt;
1593         req->af.v4_req.opt    = NULL;
1594         newinet->mc_index     = tcp_v4_iif(skb);
1595         newinet->mc_ttl       = skb->nh.iph->ttl;
1596         newtp->ext_header_len = 0;
1597         if (newinet->opt)
1598                 newtp->ext_header_len = newinet->opt->optlen;
1599         newtp->ext2_header_len = dst->header_len;
1600         newinet->id = newtp->write_seq ^ jiffies;
1601
1602         tcp_sync_mss(newsk, dst_pmtu(dst));
1603         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1604         tcp_initialize_rcv_mss(newsk);
1605
1606         __tcp_v4_hash(newsk, 0);
1607         __tcp_inherit_port(sk, newsk);
1608
1609         return newsk;
1610
1611 exit_overflow:
1612         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1613 exit:
1614         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1615         dst_release(dst);
1616         return NULL;
1617 }
1618
1619 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1620 {
1621         struct tcphdr *th = skb->h.th;
1622         struct iphdr *iph = skb->nh.iph;
1623         struct tcp_sock *tp = tcp_sk(sk);
1624         struct sock *nsk;
1625         struct open_request **prev;
1626         /* Find possible connection requests. */
1627         struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1628                                                      iph->saddr, iph->daddr);
1629         if (req)
1630                 return tcp_check_req(sk, skb, req, prev);
1631
1632         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1633                                           th->source,
1634                                           skb->nh.iph->daddr,
1635                                           ntohs(th->dest),
1636                                           tcp_v4_iif(skb));
1637
1638         if (nsk) {
1639                 if (nsk->sk_state != TCP_TIME_WAIT) {
1640                         bh_lock_sock(nsk);
1641                         return nsk;
1642                 }
1643                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1644                 return NULL;
1645         }
1646
1647 #ifdef CONFIG_SYN_COOKIES
1648         if (!th->rst && !th->syn && th->ack)
1649                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1650 #endif
1651         return sk;
1652 }
1653
1654 static int tcp_v4_checksum_init(struct sk_buff *skb)
1655 {
1656         if (skb->ip_summed == CHECKSUM_HW) {
1657                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1658                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1659                                   skb->nh.iph->daddr, skb->csum))
1660                         return 0;
1661
1662                 NETDEBUG(if (net_ratelimit())
1663                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1664                 skb->ip_summed = CHECKSUM_NONE;
1665         }
1666         if (skb->len <= 76) {
1667                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1668                                  skb->nh.iph->daddr,
1669                                  skb_checksum(skb, 0, skb->len, 0)))
1670                         return -1;
1671                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1672         } else {
1673                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1674                                           skb->nh.iph->saddr,
1675                                           skb->nh.iph->daddr, 0);
1676         }
1677         return 0;
1678 }
1679
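/*
 * Illustrative stand-alone sketch (not compiled into this file):
 * tcp_v4_check() above is built on the standard Internet checksum - a 16-bit
 * one's-complement sum over the segment plus a pseudo-header, folded and
 * inverted.  A plain byte-oriented version of the sum/fold step, in the
 * spirit of RFC 1071 (pseudo-header handling omitted); inet_checksum() is a
 * made-up name:
 */
#if 0
#include <stdint.h>
#include <stddef.h>

static uint16_t inet_checksum(const void *data, size_t len)
{
        const uint8_t *p = data;
        uint32_t sum = 0;

        while (len > 1) {                       /* sum 16-bit big-endian words */
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)                                /* odd trailing byte pads with zero */
                sum += (uint32_t)p[0] << 8;
        while (sum >> 16)                       /* fold carries back into 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}
#endif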
1680
1681 /* The socket must have its spinlock held when we get
1682  * here.
1683  *
1684  * We have a potential double-lock case here, so even when
1685  * doing backlog processing we use the BH locking scheme.
1686  * This is because we cannot sleep with the original spinlock
1687  * held.
1688  */
1689 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1690 {
1691         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1692                 TCP_CHECK_TIMER(sk);
1693                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1694                         goto reset;
1695                 TCP_CHECK_TIMER(sk);
1696                 return 0;
1697         }
1698
1699         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1700                 goto csum_err;
1701
1702         if (sk->sk_state == TCP_LISTEN) {
1703                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1704                 if (!nsk)
1705                         goto discard;
1706
1707                 if (nsk != sk) {
1708                         if (tcp_child_process(sk, nsk, skb))
1709                                 goto reset;
1710                         return 0;
1711                 }
1712         }
1713
1714         TCP_CHECK_TIMER(sk);
1715         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1716                 goto reset;
1717         TCP_CHECK_TIMER(sk);
1718         return 0;
1719
1720 reset:
1721         tcp_v4_send_reset(skb);
1722 discard:
1723         kfree_skb(skb);
1724         /* Be careful here. If this function gets more complicated and
1725          * gcc suffers from register pressure on the x86, sk (in %ebx)
1726          * might be destroyed here. This current version compiles correctly,
1727          * but you have been warned.
1728          */
1729         return 0;
1730
1731 csum_err:
1732         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1733         goto discard;
1734 }
1735
1736 /*
1737  *      From tcp_input.c
1738  */
1739
1740 int tcp_v4_rcv(struct sk_buff *skb)
1741 {
1742         struct tcphdr *th;
1743         struct sock *sk;
1744         int ret;
1745
1746         if (skb->pkt_type != PACKET_HOST)
1747                 goto discard_it;
1748
1749         /* Count it even if it's bad */
1750         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1751
1752         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1753                 goto discard_it;
1754
1755         th = skb->h.th;
1756
1757         if (th->doff < sizeof(struct tcphdr) / 4)
1758                 goto bad_packet;
1759         if (!pskb_may_pull(skb, th->doff * 4))
1760                 goto discard_it;
1761
1762         /* An explanation is required here, I think.
1763          * Packet length and doff are validated by header prediction,
1764          * provided the case of th->doff==0 is eliminated.
1765          * So, we defer the checks. */
1766         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1767              tcp_v4_checksum_init(skb) < 0))
1768                 goto bad_packet;
1769
1770         th = skb->h.th;
1771         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1772         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1773                                     skb->len - th->doff * 4);
1774         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1775         TCP_SKB_CB(skb)->when    = 0;
1776         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1777         TCP_SKB_CB(skb)->sacked  = 0;
1778
1779         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1780                              skb->nh.iph->daddr, ntohs(th->dest),
1781                              tcp_v4_iif(skb));
1782
1783         if (!sk)
1784                 goto no_tcp_socket;
1785
1786 process:
1787         if (sk->sk_state == TCP_TIME_WAIT)
1788                 goto do_time_wait;
1789
1790         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1791                 goto discard_and_relse;
1792
1793         if (sk_filter(sk, skb, 0))
1794                 goto discard_and_relse;
1795
1796         skb->dev = NULL;
1797
1798         bh_lock_sock(sk);
1799         ret = 0;
1800         if (!sock_owned_by_user(sk)) {
1801                 if (!tcp_prequeue(sk, skb))
1802                         ret = tcp_v4_do_rcv(sk, skb);
1803         } else
1804                 sk_add_backlog(sk, skb);
1805         bh_unlock_sock(sk);
1806
1807         sock_put(sk);
1808
1809         return ret;
1810
1811 no_tcp_socket:
1812         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1813                 goto discard_it;
1814
1815         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1816 bad_packet:
1817                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1818         } else {
1819                 tcp_v4_send_reset(skb);
1820         }
1821
1822 discard_it:
1823         /* Discard frame. */
1824         kfree_skb(skb);
1825         return 0;
1826
1827 discard_and_relse:
1828         sock_put(sk);
1829         goto discard_it;
1830
1831 do_time_wait:
1832         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1833                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1834                 goto discard_it;
1835         }
1836
1837         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1838                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1839                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1840                 goto discard_it;
1841         }
1842         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1843                                            skb, th, skb->len)) {
1844         case TCP_TW_SYN: {
1845                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1846                                                           ntohs(th->dest),
1847                                                           tcp_v4_iif(skb));
1848                 if (sk2) {
1849                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1850                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1851                         sk = sk2;
1852                         goto process;
1853                 }
1854                 /* Fall through to ACK */
1855         }
1856         case TCP_TW_ACK:
1857                 tcp_v4_timewait_ack(sk, skb);
1858                 break;
1859         case TCP_TW_RST:
1860                 goto no_tcp_socket;
1861         case TCP_TW_SUCCESS:;
1862         }
1863         goto discard_it;
1864 }
1865
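/*
 * Illustrative stand-alone sketch (not compiled into this file):
 * __tcp_v4_lookup() above demultiplexes an incoming segment by hashing its
 * connection 4-tuple into a bucket of the established hash table and walking
 * that bucket's chain.  The structures and hash below are simplified and
 * hypothetical - the kernel's real hash function, table layout and locking
 * differ - they only show the shape of the idea:
 */
#if 0
#include <stdint.h>
#include <stddef.h>

struct conn {
        uint32_t saddr, daddr;
        uint16_t sport, dport;
        struct conn *next;                      /* chain within one bucket */
};

#define EHASH_SIZE 1024                         /* power of two for cheap masking */
static struct conn *ehash[EHASH_SIZE];

static unsigned int ehash_bucket(uint32_t saddr, uint16_t sport,
                                 uint32_t daddr, uint16_t dport)
{
        uint32_t h = saddr ^ daddr ^ ((uint32_t)sport << 16 | dport);

        h ^= h >> 16;                           /* any reasonable mixing will do here */
        return h & (EHASH_SIZE - 1);
}

static struct conn *lookup_established(uint32_t saddr, uint16_t sport,
                                       uint32_t daddr, uint16_t dport)
{
        struct conn *c = ehash[ehash_bucket(saddr, sport, daddr, dport)];

        for (; c; c = c->next)
                if (c->saddr == saddr && c->sport == sport &&
                    c->daddr == daddr && c->dport == dport)
                        return c;
        return NULL;                            /* no established socket; try listeners */
}
#endif
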
1866 /* With per-bucket locks this operation is not atomic, so
1867  * this version is no worse.
1868  */
1869 static void __tcp_v4_rehash(struct sock *sk)
1870 {
1871         sk->sk_prot->unhash(sk);
1872         sk->sk_prot->hash(sk);
1873 }
1874
1875 static int tcp_v4_reselect_saddr(struct sock *sk)
1876 {
1877         struct inet_sock *inet = inet_sk(sk);
1878         int err;
1879         struct rtable *rt;
1880         __u32 old_saddr = inet->saddr;
1881         __u32 new_saddr;
1882         __u32 daddr = inet->daddr;
1883
1884         if (inet->opt && inet->opt->srr)
1885                 daddr = inet->opt->faddr;
1886
1887         /* Query new route. */
1888         err = ip_route_connect(&rt, daddr, 0,
1889                                RT_TOS(inet->tos) | sk->sk_localroute,
1890                                sk->sk_bound_dev_if,
1891                                IPPROTO_TCP,
1892                                inet->sport, inet->dport, sk);
1893         if (err)
1894                 return err;
1895
1896         __sk_dst_set(sk, &rt->u.dst);
1897         tcp_v4_setup_caps(sk, &rt->u.dst);
1898         tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1899
1900         new_saddr = rt->rt_src;
1901
1902         if (new_saddr == old_saddr)
1903                 return 0;
1904
1905         if (sysctl_ip_dynaddr > 1) {
1906                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1907                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1908                        NIPQUAD(old_saddr),
1909                        NIPQUAD(new_saddr));
1910         }
1911
1912         inet->saddr = new_saddr;
1913         inet->rcv_saddr = new_saddr;
1914
1915         /* XXX The one ugly spot where we really need to
1916          * XXX change the socket's identity after it has
1917          * XXX entered the hashes. -DaveM
1918          *
1919          * Besides that, it does not check for connection
1920          * uniqueness. Expect trouble.
1921          */
1922         __tcp_v4_rehash(sk);
1923         return 0;
1924 }
1925
1926 int tcp_v4_rebuild_header(struct sock *sk)
1927 {
1928         struct inet_sock *inet = inet_sk(sk);
1929         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1930         u32 daddr;
1931         int err;
1932
1933         /* Route is OK, nothing to do. */
1934         if (rt)
1935                 return 0;
1936
1937         /* Reroute. */
1938         daddr = inet->daddr;
1939         if (inet->opt && inet->opt->srr)
1940                 daddr = inet->opt->faddr;
1941
1942         {
1943                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1944                                     .nl_u = { .ip4_u =
1945                                               { .daddr = daddr,
1946                                                 .saddr = inet->saddr,
1947                                                 .tos = RT_CONN_FLAGS(sk) } },
1948                                     .proto = IPPROTO_TCP,
1949                                     .uli_u = { .ports =
1950                                                { .sport = inet->sport,
1951                                                  .dport = inet->dport } } };
1952                                                 
1953                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1954         }
1955         if (!err) {
1956                 __sk_dst_set(sk, &rt->u.dst);
1957                 tcp_v4_setup_caps(sk, &rt->u.dst);
1958                 tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1959                 return 0;
1960         }
1961
1962         /* Routing failed... */
1963         sk->sk_route_caps = 0;
1964
1965         if (!sysctl_ip_dynaddr ||
1966             sk->sk_state != TCP_SYN_SENT ||
1967             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1968             (err = tcp_v4_reselect_saddr(sk)) != 0)
1969                 sk->sk_err_soft = -err;
1970
1971         return err;
1972 }
1973
1974 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1975 {
1976         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1977         struct inet_sock *inet = inet_sk(sk);
1978
1979         sin->sin_family         = AF_INET;
1980         sin->sin_addr.s_addr    = inet->daddr;
1981         sin->sin_port           = inet->dport;
1982 }
1983
1984 /* VJ's idea. Save the last timestamp seen from this destination
1985  * and hold it at least for the normal timewait interval, to use for
1986  * duplicate segment detection in subsequent connections before they
1987  * enter the synchronized state.
1988  */
1989
1990 int tcp_v4_remember_stamp(struct sock *sk)
1991 {
1992         struct inet_sock *inet = inet_sk(sk);
1993         struct tcp_sock *tp = tcp_sk(sk);
1994         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1995         struct inet_peer *peer = NULL;
1996         int release_it = 0;
1997
1998         if (!rt || rt->rt_dst != inet->daddr) {
1999                 peer = inet_getpeer(inet->daddr, 1);
2000                 release_it = 1;
2001         } else {
2002                 if (!rt->peer)
2003                         rt_bind_peer(rt, 1);
2004                 peer = rt->peer;
2005         }
2006
2007         if (peer) {
2008                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
2009                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2010                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
2011                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
2012                         peer->tcp_ts = tp->rx_opt.ts_recent;
2013                 }
2014                 if (release_it)
2015                         inet_putpeer(peer);
2016                 return 1;
2017         }
2018
2019         return 0;
2020 }
2021
2022 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2023 {
2024         struct inet_peer *peer = NULL;
2025
2026         peer = inet_getpeer(tw->tw_daddr, 1);
2027
2028         if (peer) {
2029                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2030                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2031                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2032                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2033                         peer->tcp_ts = tw->tw_ts_recent;
2034                 }
2035                 inet_putpeer(peer);
2036                 return 1;
2037         }
2038
2039         return 0;
2040 }
2041
2042 struct tcp_func ipv4_specific = {
2043         .queue_xmit     =       ip_queue_xmit,
2044         .send_check     =       tcp_v4_send_check,
2045         .rebuild_header =       tcp_v4_rebuild_header,
2046         .conn_request   =       tcp_v4_conn_request,
2047         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2048         .remember_stamp =       tcp_v4_remember_stamp,
2049         .net_header_len =       sizeof(struct iphdr),
2050         .setsockopt     =       ip_setsockopt,
2051         .getsockopt     =       ip_getsockopt,
2052         .addr2sockaddr  =       v4_addr2sockaddr,
2053         .sockaddr_len   =       sizeof(struct sockaddr_in),
2054 };
2055
2056 /* NOTE: A lot of things are set to zero explicitly by the call to
2057  *       sk_alloc(), so they need not be done here.
2058  */
2059 static int tcp_v4_init_sock(struct sock *sk)
2060 {
2061         struct tcp_sock *tp = tcp_sk(sk);
2062
2063         skb_queue_head_init(&tp->out_of_order_queue);
2064         tcp_init_xmit_timers(sk);
2065         tcp_prequeue_init(tp);
2066
2067         tp->rto  = TCP_TIMEOUT_INIT;
2068         tp->mdev = TCP_TIMEOUT_INIT;
2069
2070         /* So many TCP implementations out there (incorrectly) count the
2071          * initial SYN frame in their delayed-ACK and congestion control
2072          * algorithms that we must have the following bandaid to talk
2073          * efficiently to them.  -DaveM
2074          */
2075         tp->snd_cwnd = 2;
2076
2077         /* See draft-stevens-tcpca-spec-01 for discussion of the
2078          * initialization of these values.
2079          */
2080         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2081         tp->snd_cwnd_clamp = ~0;
2082         tp->mss_cache_std = tp->mss_cache = 536;
2083
2084         tp->reordering = sysctl_tcp_reordering;
2085
2086         sk->sk_state = TCP_CLOSE;
2087
2088         sk->sk_write_space = sk_stream_write_space;
2089         sk->sk_use_write_queue = 1;
2090
2091         tp->af_specific = &ipv4_specific;
2092
2093         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2094         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2095
2096         atomic_inc(&tcp_sockets_allocated);
2097
2098         return 0;
2099 }
2100
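/*
 * Illustrative stand-alone sketch (not compiled into this file): the initial
 * values chosen above (snd_cwnd, snd_ssthresh, rto) can be observed from
 * userspace on a connected TCP socket with the TCP_INFO socket option,
 * assuming a libc that exposes struct tcp_info via <netinet/tcp.h>;
 * dump_tcp_info() is a made-up name:
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static void dump_tcp_info(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("cwnd=%u ssthresh=%u rto=%u us\n",
                       info.tcpi_snd_cwnd, info.tcpi_snd_ssthresh,
                       info.tcpi_rto);
        else
                perror("getsockopt(TCP_INFO)");
}
#endif
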
2101 int tcp_v4_destroy_sock(struct sock *sk)
2102 {
2103         struct tcp_sock *tp = tcp_sk(sk);
2104
2105         tcp_clear_xmit_timers(sk);
2106
2107         /* Clean up the write buffer. */
2108         sk_stream_writequeue_purge(sk);
2109
2110         /* Cleans up our, hopefully empty, out_of_order_queue. */
2111         __skb_queue_purge(&tp->out_of_order_queue);
2112
2113         /* Clean the prequeue; it really should be empty. */
2114         __skb_queue_purge(&tp->ucopy.prequeue);
2115
2116         /* Clean up a referenced TCP bind bucket. */
2117         if (tp->bind_hash)
2118                 tcp_put_port(sk);
2119
2120         /*
2121          * If a sendmsg cached page exists, toss it.
2122          */
2123         if (sk->sk_sndmsg_page) {
2124                 __free_page(sk->sk_sndmsg_page);
2125                 sk->sk_sndmsg_page = NULL;
2126         }
2127
2128         atomic_dec(&tcp_sockets_allocated);
2129
2130         return 0;
2131 }
2132
2133 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2134
2135 #ifdef CONFIG_PROC_FS
2136 /* Proc filesystem TCP sock list dumping. */
2137
2138 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2139 {
2140         return hlist_empty(head) ? NULL :
2141                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2142 }
2143
2144 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2145 {
2146         return tw->tw_node.next ?
2147                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2148 }
2149
2150 static void *listening_get_next(struct seq_file *seq, void *cur)
2151 {
2152         struct tcp_sock *tp;
2153         struct hlist_node *node;
2154         struct sock *sk = cur;
2155         struct tcp_iter_state* st = seq->private;
2156
2157         if (!sk) {
2158                 st->bucket = 0;
2159                 sk = sk_head(&tcp_listening_hash[0]);
2160                 goto get_sk;
2161         }
2162
2163         ++st->num;
2164
2165         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2166                 struct open_request *req = cur;
2167
2168                 tp = tcp_sk(st->syn_wait_sk);
2169                 req = req->dl_next;
2170                 while (1) {
2171                         while (req) {
2172                                 vxdprintk(VXD_CBIT(net, 6),
2173                                         "sk,req: %p [#%d] (from %d)", req->sk,
2174                                         (req->sk)?req->sk->sk_xid:0, vx_current_xid());
2175                                 if (req->sk &&
2176                                         !vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH)) {
2177                                         req = req->dl_next;
                                             continue;
                                     }
2178                                 if (req->class->family == st->family) {
2179                                         cur = req;
2180                                         goto out;
2181                                 }
2182                                 req = req->dl_next;
2183                         }
2184                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2185                                 break;
2186 get_req:
2187                         req = tp->listen_opt->syn_table[st->sbucket];
2188                 }
2189                 sk        = sk_next(st->syn_wait_sk);
2190                 st->state = TCP_SEQ_STATE_LISTENING;
2191                 read_unlock_bh(&tp->syn_wait_lock);
2192         } else {
2193                 tp = tcp_sk(sk);
2194                 read_lock_bh(&tp->syn_wait_lock);
2195                 if (tp->listen_opt && tp->listen_opt->qlen)
2196                         goto start_req;
2197                 read_unlock_bh(&tp->syn_wait_lock);
2198                 sk = sk_next(sk);
2199         }
2200 get_sk:
2201         sk_for_each_from(sk, node) {
2202                 vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
2203                         sk, sk->sk_xid, vx_current_xid());
2204                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2205                         continue;
2206                 if (sk->sk_family == st->family) {
2207                         cur = sk;
2208                         goto out;
2209                 }
2210                 tp = tcp_sk(sk);
2211                 read_lock_bh(&tp->syn_wait_lock);
2212                 if (tp->listen_opt && tp->listen_opt->qlen) {
2213 start_req:
2214                         st->uid         = sock_i_uid(sk);
2215                         st->syn_wait_sk = sk;
2216                         st->state       = TCP_SEQ_STATE_OPENREQ;
2217                         st->sbucket     = 0;
2218                         goto get_req;
2219                 }
2220                 read_unlock_bh(&tp->syn_wait_lock);
2221         }
2222         if (++st->bucket < TCP_LHTABLE_SIZE) {
2223                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2224                 goto get_sk;
2225         }
2226         cur = NULL;
2227 out:
2228         return cur;
2229 }
2230
2231 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2232 {
2233         void *rc = listening_get_next(seq, NULL);
2234
2235         while (rc && *pos) {
2236                 rc = listening_get_next(seq, rc);
2237                 --*pos;
2238         }
2239         return rc;
2240 }
2241
2242 static void *established_get_first(struct seq_file *seq)
2243 {
2244         struct tcp_iter_state* st = seq->private;
2245         void *rc = NULL;
2246
2247         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2248                 struct sock *sk;
2249                 struct hlist_node *node;
2250                 struct tcp_tw_bucket *tw;
2251
2252                 /* We can reschedule _before_ having picked the target: */
2253                 cond_resched_softirq();
2254
2255                 read_lock(&tcp_ehash[st->bucket].lock);
2256                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2257                         vxdprintk(VXD_CBIT(net, 6),
2258                                 "sk,egf: %p [#%d] (from %d)",
2259                                 sk, sk->sk_xid, vx_current_xid());
2260                         if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2261                                 continue;
2262                         if (sk->sk_family != st->family)
2263                                 continue;
2264                         rc = sk;
2265                         goto out;
2266                 }
2267                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2268                 tw_for_each(tw, node,
2269                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2270                         vxdprintk(VXD_CBIT(net, 6),
2271                                 "tw: %p [#%d] (from %d)",
2272                                 tw, tw->tw_xid, vx_current_xid());
2273                         if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
2274                                 continue;
2275                         if (tw->tw_family != st->family)
2276                                 continue;
2277                         rc = tw;
2278                         goto out;
2279                 }
2280                 read_unlock(&tcp_ehash[st->bucket].lock);
2281                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2282         }
2283 out:
2284         return rc;
2285 }
2286
2287 static void *established_get_next(struct seq_file *seq, void *cur)
2288 {
2289         struct sock *sk = cur;
2290         struct tcp_tw_bucket *tw;
2291         struct hlist_node *node;
2292         struct tcp_iter_state* st = seq->private;
2293
2294         ++st->num;
2295
2296         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2297                 tw = cur;
2298                 tw = tw_next(tw);
2299 get_tw:
2300                 while (tw && (tw->tw_family != st->family ||
2301                         !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
2302                         tw = tw_next(tw);
2303                 }
2304                 if (tw) {
2305                         cur = tw;
2306                         goto out;
2307                 }
2308                 read_unlock(&tcp_ehash[st->bucket].lock);
2309                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2310
2311                 /* We can reschedule between buckets: */
2312                 cond_resched_softirq();
2313
2314                 if (++st->bucket < tcp_ehash_size) {
2315                         read_lock(&tcp_ehash[st->bucket].lock);
2316                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2317                 } else {
2318                         cur = NULL;
2319                         goto out;
2320                 }
2321         } else
2322                 sk = sk_next(sk);
2323
2324         sk_for_each_from(sk, node) {
2325                 vxdprintk(VXD_CBIT(net, 6),
2326                         "sk,egn: %p [#%d] (from %d)",
2327                         sk, sk->sk_xid, vx_current_xid());
2328                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2329                         continue;
2330                 if (sk->sk_family == st->family)
2331                         goto found;
2332         }
2333
2334         st->state = TCP_SEQ_STATE_TIME_WAIT;
2335         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2336         goto get_tw;
2337 found:
2338         cur = sk;
2339 out:
2340         return cur;
2341 }
2342
2343 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2344 {
2345         void *rc = established_get_first(seq);
2346
2347         while (rc && pos) {
2348                 rc = established_get_next(seq, rc);
2349                 --pos;
2350         }               
2351         return rc;
2352 }
2353
2354 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2355 {
2356         void *rc;
2357         struct tcp_iter_state* st = seq->private;
2358
2359         tcp_listen_lock();
2360         st->state = TCP_SEQ_STATE_LISTENING;
2361         rc        = listening_get_idx(seq, &pos);
2362
2363         if (!rc) {
2364                 tcp_listen_unlock();
2365                 local_bh_disable();
2366                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2367                 rc        = established_get_idx(seq, pos);
2368         }
2369
2370         return rc;
2371 }
2372
2373 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2374 {
2375         struct tcp_iter_state* st = seq->private;
2376         st->state = TCP_SEQ_STATE_LISTENING;
2377         st->num = 0;
2378         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2379 }
2380
2381 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2382 {
2383         void *rc = NULL;
2384         struct tcp_iter_state* st;
2385
2386         if (v == SEQ_START_TOKEN) {
2387                 rc = tcp_get_idx(seq, 0);
2388                 goto out;
2389         }
2390         st = seq->private;
2391
2392         switch (st->state) {
2393         case TCP_SEQ_STATE_OPENREQ:
2394         case TCP_SEQ_STATE_LISTENING:
2395                 rc = listening_get_next(seq, v);
2396                 if (!rc) {
2397                         tcp_listen_unlock();
2398                         local_bh_disable();
2399                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2400                         rc        = established_get_first(seq);
2401                 }
2402                 break;
2403         case TCP_SEQ_STATE_ESTABLISHED:
2404         case TCP_SEQ_STATE_TIME_WAIT:
2405                 rc = established_get_next(seq, v);
2406                 break;
2407         }
2408 out:
2409         ++*pos;
2410         return rc;
2411 }
2412
2413 static void tcp_seq_stop(struct seq_file *seq, void *v)
2414 {
2415         struct tcp_iter_state* st = seq->private;
2416
2417         switch (st->state) {
2418         case TCP_SEQ_STATE_OPENREQ:
2419                 if (v) {
2420                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2421                         read_unlock_bh(&tp->syn_wait_lock);
2422                 }
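                     /* fall through */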
2423         case TCP_SEQ_STATE_LISTENING:
2424                 if (v != SEQ_START_TOKEN)
2425                         tcp_listen_unlock();
2426                 break;
2427         case TCP_SEQ_STATE_TIME_WAIT:
2428         case TCP_SEQ_STATE_ESTABLISHED:
2429                 if (v)
2430                         read_unlock(&tcp_ehash[st->bucket].lock);
2431                 local_bh_enable();
2432                 break;
2433         }
2434 }
2435
2436 static int tcp_seq_open(struct inode *inode, struct file *file)
2437 {
2438         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2439         struct seq_file *seq;
2440         struct tcp_iter_state *s;
2441         int rc;
2442
2443         if (unlikely(afinfo == NULL))
2444                 return -EINVAL;
2445
2446         s = kmalloc(sizeof(*s), GFP_KERNEL);
2447         if (!s)
2448                 return -ENOMEM;
2449         memset(s, 0, sizeof(*s));
2450         s->family               = afinfo->family;
2451         s->seq_ops.start        = tcp_seq_start;
2452         s->seq_ops.next         = tcp_seq_next;
2453         s->seq_ops.show         = afinfo->seq_show;
2454         s->seq_ops.stop         = tcp_seq_stop;
2455
2456         rc = seq_open(file, &s->seq_ops);
2457         if (rc)
2458                 goto out_kfree;
2459         seq          = file->private_data;
2460         seq->private = s;
2461 out:
2462         return rc;
2463 out_kfree:
2464         kfree(s);
2465         goto out;
2466 }
2467
2468 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2469 {
2470         int rc = 0;
2471         struct proc_dir_entry *p;
2472
2473         if (!afinfo)
2474                 return -EINVAL;
2475         afinfo->seq_fops->owner         = afinfo->owner;
2476         afinfo->seq_fops->open          = tcp_seq_open;
2477         afinfo->seq_fops->read          = seq_read;
2478         afinfo->seq_fops->llseek        = seq_lseek;
2479         afinfo->seq_fops->release       = seq_release_private;
2480         
2481         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2482         if (p)
2483                 p->data = afinfo;
2484         else
2485                 rc = -ENOMEM;
2486         return rc;
2487 }
2488
2489 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2490 {
2491         if (!afinfo)
2492                 return;
2493         proc_net_remove(afinfo->name);
2494         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2495 }
2496
2497 static void get_openreq4(struct sock *sk, struct open_request *req,
2498                          char *tmpbuf, int i, int uid)
2499 {
2500         int ttd = req->expires - jiffies;
2501
2502         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2503                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2504                 i,
2505                 req->af.v4_req.loc_addr,
2506                 ntohs(inet_sk(sk)->sport),
2507                 req->af.v4_req.rmt_addr,
2508                 ntohs(req->rmt_port),
2509                 TCP_SYN_RECV,
2510                 0, 0, /* could print option size, but that is af dependent. */
2511                 1,    /* timers active (only the expire timer) */
2512                 jiffies_to_clock_t(ttd),
2513                 req->retrans,
2514                 uid,
2515                 0,  /* non standard timer */
2516                 0, /* open_requests have no inode */
2517                 atomic_read(&sk->sk_refcnt),
2518                 req);
2519 }
2520
2521 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2522 {
2523         int timer_active;
2524         unsigned long timer_expires;
2525         struct tcp_sock *tp = tcp_sk(sp);
2526         struct inet_sock *inet = inet_sk(sp);
2527         unsigned int dest = inet->daddr;
2528         unsigned int src = inet->rcv_saddr;
2529         __u16 destp = ntohs(inet->dport);
2530         __u16 srcp = ntohs(inet->sport);
2531
2532         if (tp->pending == TCP_TIME_RETRANS) {
2533                 timer_active    = 1;
2534                 timer_expires   = tp->timeout;
2535         } else if (tp->pending == TCP_TIME_PROBE0) {
2536                 timer_active    = 4;
2537                 timer_expires   = tp->timeout;
2538         } else if (timer_pending(&sp->sk_timer)) {
2539                 timer_active    = 2;
2540                 timer_expires   = sp->sk_timer.expires;
2541         } else {
2542                 timer_active    = 0;
2543                 timer_expires = jiffies;
2544         }
2545
2546         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2547                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2548                 i, src, srcp, dest, destp, sp->sk_state,
2549                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2550                 timer_active,
2551                 jiffies_to_clock_t(timer_expires - jiffies),
2552                 tp->retransmits,
2553                 sock_i_uid(sp),
2554                 tp->probes_out,
2555                 sock_i_ino(sp),
2556                 atomic_read(&sp->sk_refcnt), sp,
2557                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2558                 tp->snd_cwnd,
2559                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2560 }
2561
2562 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2563 {
2564         unsigned int dest, src;
2565         __u16 destp, srcp;
2566         int ttd = tw->tw_ttd - jiffies;
2567
2568         if (ttd < 0)
2569                 ttd = 0;
2570
2571         dest  = tw->tw_daddr;
2572         src   = tw->tw_rcv_saddr;
2573         destp = ntohs(tw->tw_dport);
2574         srcp  = ntohs(tw->tw_sport);
2575
2576         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2577                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2578                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2579                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2580                 atomic_read(&tw->tw_refcnt), tw);
2581 }
2582
2583 #define TMPSZ 150
2584
2585 static int tcp4_seq_show(struct seq_file *seq, void *v)
2586 {
2587         struct tcp_iter_state* st;
2588         char tmpbuf[TMPSZ + 1];
2589
2590         if (v == SEQ_START_TOKEN) {
2591                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2592                            "  sl  local_address rem_address   st tx_queue "
2593                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2594                            "inode");
2595                 goto out;
2596         }
2597         st = seq->private;
2598
2599         switch (st->state) {
2600         case TCP_SEQ_STATE_LISTENING:
2601         case TCP_SEQ_STATE_ESTABLISHED:
2602                 get_tcp4_sock(v, tmpbuf, st->num);
2603                 break;
2604         case TCP_SEQ_STATE_OPENREQ:
2605                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2606                 break;
2607         case TCP_SEQ_STATE_TIME_WAIT:
2608                 get_timewait4_sock(v, tmpbuf, st->num);
2609                 break;
2610         }
2611         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2612 out:
2613         return 0;
2614 }
2615
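/*
 * Illustrative stand-alone sketch (not compiled into this file): each line
 * produced by tcp4_seq_show() renders addresses and ports as zero-padded hex
 * ("%08X:%04X").  The address column is the socket's 32-bit address printed
 * with %08X, so parsing it back on the same host and storing it unchanged
 * into struct in_addr yields the right dotted quad.  A minimal reader of
 * /proc/net/tcp:
 */
#if 0
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        FILE *f = fopen("/proc/net/tcp", "r");
        char line[512];
        unsigned int laddr, lport, raddr, rport, state;
        struct in_addr l, r;

        if (!f) {
                perror("/proc/net/tcp");
                return 1;
        }
        if (!fgets(line, sizeof(line), f)) {    /* skip the header line */
                fclose(f);
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
                           &laddr, &lport, &raddr, &rport, &state) != 5)
                        continue;
                l.s_addr = laddr;
                r.s_addr = raddr;
                printf("%s:%u -> ", inet_ntoa(l), lport);
                printf("%s:%u (state %02X)\n", inet_ntoa(r), rport, state);
        }
        fclose(f);
        return 0;
}
#endif
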
2616 static struct file_operations tcp4_seq_fops;
2617 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2618         .owner          = THIS_MODULE,
2619         .name           = "tcp",
2620         .family         = AF_INET,
2621         .seq_show       = tcp4_seq_show,
2622         .seq_fops       = &tcp4_seq_fops,
2623 };
2624
2625 int __init tcp4_proc_init(void)
2626 {
2627         return tcp_proc_register(&tcp4_seq_afinfo);
2628 }
2629
2630 void tcp4_proc_exit(void)
2631 {
2632         tcp_proc_unregister(&tcp4_seq_afinfo);
2633 }
2634 #endif /* CONFIG_PROC_FS */
2635
2636 struct proto tcp_prot = {
2637         .name                   = "TCP",
2638         .owner                  = THIS_MODULE,
2639         .close                  = tcp_close,
2640         .connect                = tcp_v4_connect,
2641         .disconnect             = tcp_disconnect,
2642         .accept                 = tcp_accept,
2643         .ioctl                  = tcp_ioctl,
2644         .init                   = tcp_v4_init_sock,
2645         .destroy                = tcp_v4_destroy_sock,
2646         .shutdown               = tcp_shutdown,
2647         .setsockopt             = tcp_setsockopt,
2648         .getsockopt             = tcp_getsockopt,
2649         .sendmsg                = tcp_sendmsg,
2650         .recvmsg                = tcp_recvmsg,
2651         .backlog_rcv            = tcp_v4_do_rcv,
2652         .hash                   = tcp_v4_hash,
2653         .unhash                 = tcp_unhash,
2654         .get_port               = tcp_v4_get_port,
2655         .enter_memory_pressure  = tcp_enter_memory_pressure,
2656         .sockets_allocated      = &tcp_sockets_allocated,
2657         .memory_allocated       = &tcp_memory_allocated,
2658         .memory_pressure        = &tcp_memory_pressure,
2659         .sysctl_mem             = sysctl_tcp_mem,
2660         .sysctl_wmem            = sysctl_tcp_wmem,
2661         .sysctl_rmem            = sysctl_tcp_rmem,
2662         .max_header             = MAX_TCP_HEADER,
2663         .slab_obj_size          = sizeof(struct tcp_sock),
2664 };
2665
2666
2667
2668 void __init tcp_v4_init(struct net_proto_family *ops)
2669 {
2670         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2671         if (err < 0)
2672                 panic("Failed to create the TCP control socket.\n");
2673         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2674         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2675
2676         /* Unhash it so that IP input processing does not even
2677          * see it; we do not want this socket to receive incoming
2678          * packets.
2679          */
2680         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2681 }
2682
2683 EXPORT_SYMBOL(ipv4_specific);
2684 EXPORT_SYMBOL(tcp_bind_hash);
2685 EXPORT_SYMBOL(tcp_bucket_create);
2686 EXPORT_SYMBOL(tcp_hashinfo);
2687 EXPORT_SYMBOL(tcp_inherit_port);
2688 EXPORT_SYMBOL(tcp_listen_wlock);
2689 EXPORT_SYMBOL(tcp_port_rover);
2690 EXPORT_SYMBOL(tcp_prot);
2691 EXPORT_SYMBOL(tcp_put_port);
2692 EXPORT_SYMBOL(tcp_unhash);
2693 EXPORT_SYMBOL(tcp_v4_conn_request);
2694 EXPORT_SYMBOL(tcp_v4_connect);
2695 EXPORT_SYMBOL(tcp_v4_do_rcv);
2696 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2697 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2698 EXPORT_SYMBOL(tcp_v4_send_check);
2699 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2700
2701 #ifdef CONFIG_PROC_FS
2702 EXPORT_SYMBOL(tcp_proc_register);
2703 EXPORT_SYMBOL(tcp_proc_unregister);
2704 #endif
2705 EXPORT_SYMBOL(sysctl_local_port_range);
2706 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2707 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2708