net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Added support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      open_request handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77
78 extern int sysctl_ip_dynaddr;
79 int sysctl_tcp_tw_reuse;
80 int sysctl_tcp_low_latency;
81
82 /* Check TCP sequence numbers in ICMP packets. */
83 #define ICMP_MIN_LENGTH 8
84
85 /* Socket used for sending RSTs */
86 static struct socket *tcp_socket;
87
88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89                        struct sk_buff *skb);
90
91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
94         .__tcp_lhash_wait
95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
97 };
98
99 /*
100  * This array holds the first and last local port number.
101  * For high-usage systems, use sysctl to change this to
102  * 32768-61000
103  */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
106
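/* Established-hash function: XOR the local/foreign addresses and ports
 * together, fold the upper bits down, and mask with (tcp_ehash_size - 1),
 * which assumes tcp_ehash_size is a power of two.
 */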
107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108                                  __u32 faddr, __u16 fport)
109 {
110         int h = (laddr ^ lport) ^ (faddr ^ fport);
111         h ^= h >> 16;
112         h ^= h >> 8;
113         return h & (tcp_ehash_size - 1);
114 }
115
116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
117 {
118         struct inet_opt *inet = inet_sk(sk);
119         __u32 laddr = inet->rcv_saddr;
120         __u16 lport = inet->num;
121         __u32 faddr = inet->daddr;
122         __u16 fport = inet->dport;
123
124         return tcp_hashfn(laddr, lport, faddr, fport);
125 }
126
127 /* Allocate and initialize a new TCP local port bind bucket.
128  * The bindhash mutex for snum's hash chain must be held here.
129  */
130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131                                           unsigned short snum)
132 {
133         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134                                                       SLAB_ATOMIC);
135         if (tb) {
136                 tb->port = snum;
137                 tb->fastreuse = 0;
138                 INIT_HLIST_HEAD(&tb->owners);
139                 hlist_add_head(&tb->node, &head->chain);
140         }
141         return tb;
142 }
143
144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146 {
147         if (hlist_empty(&tb->owners)) {
148                 __hlist_del(&tb->node);
149                 kmem_cache_free(tcp_bucket_cachep, tb);
150         }
151 }
152
153 /* Caller must disable local BH processing. */
154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155 {
156         struct tcp_bind_hashbucket *head =
157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158         struct tcp_bind_bucket *tb;
159
160         spin_lock(&head->lock);
161         tb = tcp_sk(sk)->bind_hash;
162         sk_add_bind_node(child, &tb->owners);
163         tcp_sk(child)->bind_hash = tb;
164         spin_unlock(&head->lock);
165 }
166
167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168 {
169         local_bh_disable();
170         __tcp_inherit_port(sk, child);
171         local_bh_enable();
172 }
173
174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175                    unsigned short snum)
176 {
177         inet_sk(sk)->num = snum;
178         sk_add_bind_node(sk, &tb->owners);
179         tcp_sk(sk)->bind_hash = tb;
180 }
181
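/* Returns nonzero if binding sk into this bucket would conflict: another
 * owner clashes only if it could receive on the same device and the same
 * (or a wildcard) local address, and the clash is ignored when both
 * sockets set SO_REUSEADDR and the other socket is not listening.
 */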
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183 {
184         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185         struct sock *sk2;
186         struct hlist_node *node;
187         int reuse = sk->sk_reuse;
188
189         sk_for_each_bound(sk2, node, &tb->owners) {
190                 if (sk != sk2 &&
191                     !tcp_v6_ipv6only(sk2) &&
192                     (!sk->sk_bound_dev_if ||
193                      !sk2->sk_bound_dev_if ||
194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195                         if (!reuse || !sk2->sk_reuse ||
196                             sk2->sk_state == TCP_LISTEN) {
197                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199                                     sk2_rcv_saddr == sk_rcv_saddr)
200                                         break;
201                         }
202                 }
203         }
204         return node != NULL;
205 }
206
207 /* Obtain a reference to a local port for the given sock,
208  * if snum is zero it means select any available local port.
209  */
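/* When snum is zero the port rover scans sysctl_local_port_range for a
 * port that has no bind bucket at all; an explicitly requested port may
 * share an existing bucket, subject to the fastreuse and bind-conflict
 * rules below.
 */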
210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211 {
212         struct tcp_bind_hashbucket *head;
213         struct hlist_node *node;
214         struct tcp_bind_bucket *tb;
215         int ret;
216
217         local_bh_disable();
218         if (!snum) {
219                 int low = sysctl_local_port_range[0];
220                 int high = sysctl_local_port_range[1];
221                 int remaining = (high - low) + 1;
222                 int rover;
223
224                 spin_lock(&tcp_portalloc_lock);
225                 rover = tcp_port_rover;
226                 do {
227                         rover++;
228                         if (rover < low || rover > high)
229                                 rover = low;
230                         head = &tcp_bhash[tcp_bhashfn(rover)];
231                         spin_lock(&head->lock);
232                         tb_for_each(tb, node, &head->chain)
233                                 if (tb->port == rover)
234                                         goto next;
235                         break;
236                 next:
237                         spin_unlock(&head->lock);
238                 } while (--remaining > 0);
239                 tcp_port_rover = rover;
240                 spin_unlock(&tcp_portalloc_lock);
241
242                 /* Exhausted local port range during search? */
243                 ret = 1;
244                 if (remaining <= 0)
245                         goto fail;
246
247                 /* OK, here is the one we will use.  HEAD is
248                  * non-NULL and we hold its lock.
249                  */
250                 snum = rover;
251         } else {
252                 head = &tcp_bhash[tcp_bhashfn(snum)];
253                 spin_lock(&head->lock);
254                 tb_for_each(tb, node, &head->chain)
255                         if (tb->port == snum)
256                                 goto tb_found;
257         }
258         tb = NULL;
259         goto tb_not_found;
260 tb_found:
261         if (!hlist_empty(&tb->owners)) {
262                 if (sk->sk_reuse > 1)
263                         goto success;
264                 if (tb->fastreuse > 0 &&
265                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
266                         goto success;
267                 } else {
268                         ret = 1;
269                         if (tcp_bind_conflict(sk, tb))
270                                 goto fail_unlock;
271                 }
272         }
273 tb_not_found:
274         ret = 1;
275         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
276                 goto fail_unlock;
277         if (hlist_empty(&tb->owners)) {
278                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
279                         tb->fastreuse = 1;
280                 else
281                         tb->fastreuse = 0;
282         } else if (tb->fastreuse &&
283                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
284                 tb->fastreuse = 0;
285 success:
286         if (!tcp_sk(sk)->bind_hash)
287                 tcp_bind_hash(sk, tb, snum);
288         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
289         ret = 0;
290
291 fail_unlock:
292         spin_unlock(&head->lock);
293 fail:
294         local_bh_enable();
295         return ret;
296 }
297
298 /* Get rid of any references to a local port held by the
299  * given sock.
300  */
301 static void __tcp_put_port(struct sock *sk)
302 {
303         struct inet_opt *inet = inet_sk(sk);
304         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
305         struct tcp_bind_bucket *tb;
306
307         spin_lock(&head->lock);
308         tb = tcp_sk(sk)->bind_hash;
309         __sk_del_bind_node(sk);
310         tcp_sk(sk)->bind_hash = NULL;
311         inet->num = 0;
312         tcp_bucket_destroy(tb);
313         spin_unlock(&head->lock);
314 }
315
316 void tcp_put_port(struct sock *sk)
317 {
318         local_bh_disable();
319         __tcp_put_port(sk);
320         local_bh_enable();
321 }
322
323 /* Taking this lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP:
324  * when several writers sleep and the reader wakes them up, all but one
325  * immediately hit the write lock and hog all the CPUs. Exclusive sleep solves
326  * this, _but_ remember that it adds useless work on UP machines (a wakeup on
327  * each exclusive lock release). It really should be ifdefed.
328  */
329
330 void tcp_listen_wlock(void)
331 {
332         write_lock(&tcp_lhash_lock);
333
334         if (atomic_read(&tcp_lhash_users)) {
335                 DEFINE_WAIT(wait);
336
337                 for (;;) {
338                         prepare_to_wait_exclusive(&tcp_lhash_wait,
339                                                 &wait, TASK_UNINTERRUPTIBLE);
340                         if (!atomic_read(&tcp_lhash_users))
341                                 break;
342                         write_unlock_bh(&tcp_lhash_lock);
343                         schedule();
344                         write_lock_bh(&tcp_lhash_lock);
345                 }
346
347                 finish_wait(&tcp_lhash_wait, &wait);
348         }
349 }
350
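/* Insert the socket into the listening hash (under the listener write
 * lock) when it is in TCP_LISTEN and listen_possible, otherwise into the
 * established-hash chain selected by tcp_sk_hashfn().
 */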
351 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
352 {
353         struct hlist_head *list;
354         rwlock_t *lock;
355
356         BUG_TRAP(sk_unhashed(sk));
357         if (listen_possible && sk->sk_state == TCP_LISTEN) {
358                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359                 lock = &tcp_lhash_lock;
360                 tcp_listen_wlock();
361         } else {
362                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
363                 lock = &tcp_ehash[sk->sk_hashent].lock;
364                 write_lock(lock);
365         }
366         __sk_add_node(sk, list);
367         sock_prot_inc_use(sk->sk_prot);
368         write_unlock(lock);
369         if (listen_possible && sk->sk_state == TCP_LISTEN)
370                 wake_up(&tcp_lhash_wait);
371 }
372
373 static void tcp_v4_hash(struct sock *sk)
374 {
375         if (sk->sk_state != TCP_CLOSE) {
376                 local_bh_disable();
377                 __tcp_v4_hash(sk, 1);
378                 local_bh_enable();
379         }
380 }
381
382 void tcp_unhash(struct sock *sk)
383 {
384         rwlock_t *lock;
385
386         if (sk_unhashed(sk))
387                 goto ende;
388
389         if (sk->sk_state == TCP_LISTEN) {
390                 local_bh_disable();
391                 tcp_listen_wlock();
392                 lock = &tcp_lhash_lock;
393         } else {
394                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
395                 lock = &head->lock;
396                 write_lock_bh(&head->lock);
397         }
398
399         if (__sk_del_node_init(sk))
400                 sock_prot_dec_use(sk->sk_prot);
401         write_unlock_bh(lock);
402
403  ende:
404         if (sk->sk_state == TCP_LISTEN)
405                 wake_up(&tcp_lhash_wait);
406 }
407
408 /* Don't inline this cruft.  There are some nice properties to
409  * exploit here.  The BSD API does not allow a listening TCP
410  * to specify the remote port or the remote address for the
411  * connection.  So always assume those are both wildcarded
412  * during the search since they can never be otherwise.
413  */
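/* Scoring: being an AF_INET socket earns 1 point, a matching bound
 * address earns 2 and a matching bound device earns 2; a perfect score
 * of 5 ends the walk early, otherwise the highest-scoring listener wins.
 */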
414 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
415                                              unsigned short hnum, int dif)
416 {
417         struct sock *result = NULL, *sk;
418         struct hlist_node *node;
419         int score, hiscore;
420
421         hiscore=-1;
422         sk_for_each(sk, node, head) {
423                 struct inet_opt *inet = inet_sk(sk);
424
425                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
426                         __u32 rcv_saddr = inet->rcv_saddr;
427
428                         score = (sk->sk_family == PF_INET ? 1 : 0);
429                         if (rcv_saddr) {
430                                 if (rcv_saddr != daddr)
431                                         continue;
432                                 score+=2;
433                         }
434                         if (sk->sk_bound_dev_if) {
435                                 if (sk->sk_bound_dev_if != dif)
436                                         continue;
437                                 score+=2;
438                         }
439                         if (score == 5)
440                                 return sk;
441                         if (score > hiscore) {
442                                 hiscore = score;
443                                 result = sk;
444                         }
445                 }
446         }
447         return result;
448 }
449
450 /* Optimize the common listener case. */
451 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
452                                            int dif)
453 {
454         struct sock *sk = NULL;
455         struct hlist_head *head;
456
457         read_lock(&tcp_lhash_lock);
458         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459         if (!hlist_empty(head)) {
460                 struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
461
462                 if (inet->num == hnum && !sk->sk_node.next &&
463                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
464                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
465                     !sk->sk_bound_dev_if)
466                         goto sherry_cache;
467                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
468         }
469         if (sk) {
470 sherry_cache:
471                 sock_hold(sk);
472         }
473         read_unlock(&tcp_lhash_lock);
474         return sk;
475 }
476
477 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
478  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
479  *
480  * Local BH must be disabled here.
481  */
482
483 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
484                                                        u32 daddr, u16 hnum,
485                                                        int dif)
486 {
487         struct tcp_ehash_bucket *head;
488         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
489         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
490         struct sock *sk;
491         struct hlist_node *node;
492         /* Optimize here for direct hit, only listening connections can
493          * have wildcards anyway.
494          */
495         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
496         head = &tcp_ehash[hash];
497         read_lock(&head->lock);
498         sk_for_each(sk, node, &head->chain) {
499                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
500                         goto hit; /* You sunk my battleship! */
501         }
502
503         /* Must check for a TIME_WAIT'er before going to listener hash. */
504         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
505                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
506                         goto hit;
507         }
508         sk = NULL;
509 out:
510         read_unlock(&head->lock);
511         return sk;
512 hit:
513         sock_hold(sk);
514         goto out;
515 }
516
517 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
518                                            u32 daddr, u16 hnum, int dif)
519 {
520         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
521                                                       daddr, hnum, dif);
522
523         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
524 }
525
526 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
527                                   u16 dport, int dif)
528 {
529         struct sock *sk;
530
531         local_bh_disable();
532         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
533         local_bh_enable();
534
535         return sk;
536 }
537
538 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
539 {
540         return secure_tcp_sequence_number(skb->nh.iph->daddr,
541                                           skb->nh.iph->saddr,
542                                           skb->h.th->dest,
543                                           skb->h.th->source);
544 }
545
546 /* called with local bh disabled */
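/* Checks that the chosen (saddr, sport, daddr, lport) 4-tuple is not in
 * use in either the established or the TIME-WAIT half of the ehash.  A
 * matching TIME-WAIT bucket may be reused under the PAWS/tcp_tw_reuse
 * rules below, in which case the new socket inherits its timestamp state
 * and write_seq.
 */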
547 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
548                                       struct tcp_tw_bucket **twp)
549 {
550         struct inet_opt *inet = inet_sk(sk);
551         u32 daddr = inet->rcv_saddr;
552         u32 saddr = inet->daddr;
553         int dif = sk->sk_bound_dev_if;
554         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
555         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
556         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
557         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
558         struct sock *sk2;
559         struct hlist_node *node;
560         struct tcp_tw_bucket *tw;
561
562         write_lock(&head->lock);
563
564         /* Check TIME-WAIT sockets first. */
565         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
566                 tw = (struct tcp_tw_bucket *)sk2;
567
568                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
569                         struct tcp_opt *tp = tcp_sk(sk);
570
571                         /* With PAWS, it is safe from the viewpoint
572                            of data integrity. Even without PAWS it
573                            is safe provided the sequence spaces do not
574                            overlap, i.e. at data rates <= 80 Mbit/sec.
575
576                            Actually, the idea is close to VJ's, only
577                            the timestamp cache is held not per host
578                            but per port pair, and the TW bucket is
579                            used as the state holder.
580
581                            If the TW bucket has already been destroyed,
582                            we fall back to VJ's scheme and use the initial
583                            timestamp retrieved from the peer table.
584                          */
585                         if (tw->tw_ts_recent_stamp &&
586                             (!twp || (sysctl_tcp_tw_reuse &&
587                                       xtime.tv_sec -
588                                       tw->tw_ts_recent_stamp > 1))) {
589                                 if ((tp->write_seq =
590                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
591                                         tp->write_seq = 1;
592                                 tp->ts_recent       = tw->tw_ts_recent;
593                                 tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
594                                 sock_hold(sk2);
595                                 goto unique;
596                         } else
597                                 goto not_unique;
598                 }
599         }
600         tw = NULL;
601
602         /* And established part... */
603         sk_for_each(sk2, node, &head->chain) {
604                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
605                         goto not_unique;
606         }
607
608 unique:
609         /* Must record num and sport now. Otherwise we will see
610          * a socket with a funny identity in the hash table. */
611         inet->num = lport;
612         inet->sport = htons(lport);
613         sk->sk_hashent = hash;
614         BUG_TRAP(sk_unhashed(sk));
615         __sk_add_node(sk, &head->chain);
616         sock_prot_inc_use(sk->sk_prot);
617         write_unlock(&head->lock);
618
619         if (twp) {
620                 *twp = tw;
621                 NET_INC_STATS_BH(TimeWaitRecycled);
622         } else if (tw) {
623                 /* Silly. Should hash-dance instead... */
624                 tcp_tw_deschedule(tw);
625                 NET_INC_STATS_BH(TimeWaitRecycled);
626
627                 tcp_tw_put(tw);
628         }
629
630         return 0;
631
632 not_unique:
633         write_unlock(&head->lock);
634         return -EADDRNOTAVAIL;
635 }
636
637 /*
638  * Bind a port for a connect operation and hash it.
639  */
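/* For an ephemeral port (snum == 0) the rover walks the local port range;
 * buckets created here get fastreuse == -1 so a later bind() will not
 * treat them as fast-reusable, and each candidate port is checked against
 * the established/TIME-WAIT hash before being claimed.
 */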
640 static int tcp_v4_hash_connect(struct sock *sk)
641 {
642         unsigned short snum = inet_sk(sk)->num;
643         struct tcp_bind_hashbucket *head;
644         struct tcp_bind_bucket *tb;
645         int ret;
646
647         if (!snum) {
648                 int rover;
649                 int low = sysctl_local_port_range[0];
650                 int high = sysctl_local_port_range[1];
651                 int remaining = (high - low) + 1;
652                 struct hlist_node *node;
653                 struct tcp_tw_bucket *tw = NULL;
654
655                 local_bh_disable();
656
657                 /* TODO. Actually it is not such a bad idea to remove
658                  * tcp_portalloc_lock before the next submission to Linus.
659                  * As soon as we touch this place at all, it is time to think.
660                  *
661                  * Right now it protects the single _advisory_ variable
662                  * tcp_port_rover, hence it is mostly useless.
663                  * The code will work nicely if we just delete it, but
664                  * I am afraid that in the contended case it will work no better,
665                  * or even worse: another CPU will just hit the same bucket
666                  * and spin there.
667                  * So some per-CPU salt could remove both the contention and
668                  * the memory ping-pong. Any ideas how to do this in a nice way?
669                  */
670                 spin_lock(&tcp_portalloc_lock);
671                 rover = tcp_port_rover;
672
673                 do {
674                         rover++;
675                         if ((rover < low) || (rover > high))
676                                 rover = low;
677                         head = &tcp_bhash[tcp_bhashfn(rover)];
678                         spin_lock(&head->lock);
679
680                         /* We do not bother with rcv_saddr checks
681                          * because the established check is already
682                          * unique enough.
683                          */
684                         tb_for_each(tb, node, &head->chain) {
685                                 if (tb->port == rover) {
686                                         BUG_TRAP(!hlist_empty(&tb->owners));
687                                         if (tb->fastreuse >= 0)
688                                                 goto next_port;
689                                         if (!__tcp_v4_check_established(sk,
690                                                                         rover,
691                                                                         &tw))
692                                                 goto ok;
693                                         goto next_port;
694                                 }
695                         }
696
697                         tb = tcp_bucket_create(head, rover);
698                         if (!tb) {
699                                 spin_unlock(&head->lock);
700                                 break;
701                         }
702                         tb->fastreuse = -1;
703                         goto ok;
704
705                 next_port:
706                         spin_unlock(&head->lock);
707                 } while (--remaining > 0);
708                 tcp_port_rover = rover;
709                 spin_unlock(&tcp_portalloc_lock);
710
711                 local_bh_enable();
712
713                 return -EADDRNOTAVAIL;
714
715 ok:
716                 /* All locks still held and bhs disabled */
717                 tcp_port_rover = rover;
718                 spin_unlock(&tcp_portalloc_lock);
719
720                 tcp_bind_hash(sk, tb, rover);
721                 if (sk_unhashed(sk)) {
722                         inet_sk(sk)->sport = htons(rover);
723                         __tcp_v4_hash(sk, 0);
724                 }
725                 spin_unlock(&head->lock);
726
727                 if (tw) {
728                         tcp_tw_deschedule(tw);
729                         tcp_tw_put(tw);
730                 }
731
732                 ret = 0;
733                 goto out;
734         }
735
736         head  = &tcp_bhash[tcp_bhashfn(snum)];
737         tb  = tcp_sk(sk)->bind_hash;
738         spin_lock_bh(&head->lock);
739         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
740                 __tcp_v4_hash(sk, 0);
741                 spin_unlock_bh(&head->lock);
742                 return 0;
743         } else {
744                 spin_unlock(&head->lock);
745                 /* No definite answer... Walk the established hash table. */
746                 ret = __tcp_v4_check_established(sk, snum, NULL);
747 out:
748                 local_bh_enable();
749                 return ret;
750         }
751 }
752
753 /* This will initiate an outgoing connection. */
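/* Resolves a route to the destination (honouring strict source routes),
 * fills in the local address if the socket is unbound, moves the socket
 * to SYN-SENT, binds and hashes a local port via tcp_v4_hash_connect(),
 * picks the initial sequence number and finally calls tcp_connect().
 */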
754 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
755 {
756         struct inet_opt *inet = inet_sk(sk);
757         struct tcp_opt *tp = tcp_sk(sk);
758         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
759         struct rtable *rt;
760         u32 daddr, nexthop;
761         int tmp;
762         int err;
763
764         if (addr_len < sizeof(struct sockaddr_in))
765                 return -EINVAL;
766
767         if (usin->sin_family != AF_INET)
768                 return -EAFNOSUPPORT;
769
770         nexthop = daddr = usin->sin_addr.s_addr;
771         if (inet->opt && inet->opt->srr) {
772                 if (!daddr)
773                         return -EINVAL;
774                 nexthop = inet->opt->faddr;
775         }
776
777         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
778                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
779                                IPPROTO_TCP,
780                                inet->sport, usin->sin_port, sk);
781         if (tmp < 0)
782                 return tmp;
783
784         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
785                 ip_rt_put(rt);
786                 return -ENETUNREACH;
787         }
788
789         if (!inet->opt || !inet->opt->srr)
790                 daddr = rt->rt_dst;
791
792         if (!inet->saddr)
793                 inet->saddr = rt->rt_src;
794         inet->rcv_saddr = inet->saddr;
795
796         if (tp->ts_recent_stamp && inet->daddr != daddr) {
797                 /* Reset inherited state */
798                 tp->ts_recent       = 0;
799                 tp->ts_recent_stamp = 0;
800                 tp->write_seq       = 0;
801         }
802
803         if (sysctl_tcp_tw_recycle &&
804             !tp->ts_recent_stamp && rt->rt_dst == daddr) {
805                 struct inet_peer *peer = rt_get_peer(rt);
806
807                 /* VJ's idea. We save the last timestamp seen from
808                  * the destination in the peer table when entering TIME-WAIT state,
809                  * and initialize ts_recent from it when trying a new connection.
810                  */
811
812                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
813                         tp->ts_recent_stamp = peer->tcp_ts_stamp;
814                         tp->ts_recent = peer->tcp_ts;
815                 }
816         }
817
818         inet->dport = usin->sin_port;
819         inet->daddr = daddr;
820
821         tp->ext_header_len = 0;
822         if (inet->opt)
823                 tp->ext_header_len = inet->opt->optlen;
824
825         tp->mss_clamp = 536;
826
827         /* Socket identity is still unknown (sport may be zero).
828          * However we set the state to SYN-SENT and, without releasing the
829          * socket lock, select a source port, enter ourselves into the hash
830          * tables and complete the initialization after this.
831          */
832         tcp_set_state(sk, TCP_SYN_SENT);
833         err = tcp_v4_hash_connect(sk);
834         if (err)
835                 goto failure;
836
837         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
838         if (err)
839                 goto failure;
840
841         /* OK, now commit destination to socket.  */
842         __sk_dst_set(sk, &rt->u.dst);
843         tcp_v4_setup_caps(sk, &rt->u.dst);
844         tp->ext2_header_len = rt->u.dst.header_len;
845
846         if (!tp->write_seq)
847                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
848                                                            inet->daddr,
849                                                            inet->sport,
850                                                            usin->sin_port);
851
852         inet->id = tp->write_seq ^ jiffies;
853
854         err = tcp_connect(sk);
855         rt = NULL;
856         if (err)
857                 goto failure;
858
859         return 0;
860
861 failure:
862         /* This unhashes the socket and releases the local port, if necessary. */
863         tcp_set_state(sk, TCP_CLOSE);
864         ip_rt_put(rt);
865         sk->sk_route_caps = 0;
866         inet->dport = 0;
867         return err;
868 }
869
870 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
871 {
872         return ((struct rtable *)skb->dst)->rt_iif;
873 }
874
875 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
876 {
877         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
878 }
879
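/* Look up a pending open_request in the listener's SYN table by remote
 * port, remote address and local address; when found, *prevp is set to
 * the link pointing at it so the caller can unlink the request.
 */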
880 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
881                                               struct open_request ***prevp,
882                                               __u16 rport,
883                                               __u32 raddr, __u32 laddr)
884 {
885         struct tcp_listen_opt *lopt = tp->listen_opt;
886         struct open_request *req, **prev;
887
888         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
889              (req = *prev) != NULL;
890              prev = &req->dl_next) {
891                 if (req->rmt_port == rport &&
892                     req->af.v4_req.rmt_addr == raddr &&
893                     req->af.v4_req.loc_addr == laddr &&
894                     TCP_INET_FAMILY(req->class->family)) {
895                         BUG_TRAP(!req->sk);
896                         *prevp = prev;
897                         break;
898                 }
899         }
900
901         return req;
902 }
903
904 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
905 {
906         struct tcp_opt *tp = tcp_sk(sk);
907         struct tcp_listen_opt *lopt = tp->listen_opt;
908         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
909
910         req->expires = jiffies + TCP_TIMEOUT_INIT;
911         req->retrans = 0;
912         req->sk = NULL;
913         req->dl_next = lopt->syn_table[h];
914
915         write_lock(&tp->syn_wait_lock);
916         lopt->syn_table[h] = req;
917         write_unlock(&tp->syn_wait_lock);
918
919         tcp_synq_added(sk);
920 }
921
922
923 /*
924  * This routine does path mtu discovery as defined in RFC1191.
925  */
926 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
927                                      u32 mtu)
928 {
929         struct dst_entry *dst;
930         struct inet_opt *inet = inet_sk(sk);
931         struct tcp_opt *tp = tcp_sk(sk);
932
933         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
934          * sent out by Linux are always < 576 bytes, so they should go through
935          * unfragmented).
936          */
937         if (sk->sk_state == TCP_LISTEN)
938                 return;
939
940         /* We don't check in the dst entry whether PMTU discovery is forbidden
941          * on this route. We just assume that no packet-too-big packets
942          * are sent back when PMTU discovery is not active.
943          * There is a small race when the user changes this flag in the
944          * route, but I think that's acceptable.
945          */
946         if ((dst = __sk_dst_check(sk, 0)) == NULL)
947                 return;
948
949         dst->ops->update_pmtu(dst, mtu);
950
951         /* Something may be going wrong here... Remember the soft error
952          * in case this connection is not able to recover.
953          */
954         if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
955                 sk->sk_err_soft = EMSGSIZE;
956
957         mtu = dst_pmtu(dst);
958
959         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
960             tp->pmtu_cookie > mtu) {
961                 tcp_sync_mss(sk, mtu);
962
963                 /* Resend the TCP packet because it's
964                  * clear that the old packet has been
965                  * dropped. This is the new "fast" path mtu
966                  * discovery.
967                  */
968                 tcp_simple_retransmit(sk);
969         } /* else let the usual retransmit timer handle it */
970 }
971
972 /*
973  * This routine is called by the ICMP module when it gets some
974  * sort of error condition.  If err < 0 then the socket should
975  * be closed and the error returned to the user.  If err > 0
976  * it's just the icmp type << 8 | icmp code.  After adjustment, the
977  * header points to the first 8 bytes of the tcp header.  We need
978  * to find the appropriate port.
979  *
980  * The locking strategy used here is very "optimistic". When
981  * someone else accesses the socket the ICMP is just dropped
982  * and for some paths there is no check at all.
983  * A more general error queue to queue errors for later handling
984  * is probably better.
985  *
986  */
987
988 void tcp_v4_err(struct sk_buff *skb, u32 info)
989 {
990         struct iphdr *iph = (struct iphdr *)skb->data;
991         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
992         struct tcp_opt *tp;
993         struct inet_opt *inet;
994         int type = skb->h.icmph->type;
995         int code = skb->h.icmph->code;
996         struct sock *sk;
997         __u32 seq;
998         int err;
999
1000         if (skb->len < (iph->ihl << 2) + 8) {
1001                 ICMP_INC_STATS_BH(IcmpInErrors);
1002                 return;
1003         }
1004
1005         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1006                            th->source, tcp_v4_iif(skb));
1007         if (!sk) {
1008                 ICMP_INC_STATS_BH(IcmpInErrors);
1009                 return;
1010         }
1011         if (sk->sk_state == TCP_TIME_WAIT) {
1012                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1013                 return;
1014         }
1015
1016         bh_lock_sock(sk);
1017         /* If too many ICMPs get dropped on busy
1018          * servers this needs to be solved differently.
1019          */
1020         if (sock_owned_by_user(sk))
1021                 NET_INC_STATS_BH(LockDroppedIcmps);
1022
1023         if (sk->sk_state == TCP_CLOSE)
1024                 goto out;
1025
1026         tp = tcp_sk(sk);
1027         seq = ntohl(th->seq);
1028         if (sk->sk_state != TCP_LISTEN &&
1029             !between(seq, tp->snd_una, tp->snd_nxt)) {
1030                 NET_INC_STATS(OutOfWindowIcmps);
1031                 goto out;
1032         }
1033
1034         switch (type) {
1035         case ICMP_SOURCE_QUENCH:
1036                 /* This is deprecated, but if someone generated it,
1037                  * we have no reason to ignore it.
1038                  */
1039                 if (!sock_owned_by_user(sk))
1040                         tcp_enter_cwr(tp);
1041                 goto out;
1042         case ICMP_PARAMETERPROB:
1043                 err = EPROTO;
1044                 break;
1045         case ICMP_DEST_UNREACH:
1046                 if (code > NR_ICMP_UNREACH)
1047                         goto out;
1048
1049                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1050                         if (!sock_owned_by_user(sk))
1051                                 do_pmtu_discovery(sk, iph, info);
1052                         goto out;
1053                 }
1054
1055                 err = icmp_err_convert[code].errno;
1056                 break;
1057         case ICMP_TIME_EXCEEDED:
1058                 err = EHOSTUNREACH;
1059                 break;
1060         default:
1061                 goto out;
1062         }
1063
1064         switch (sk->sk_state) {
1065                 struct open_request *req, **prev;
1066         case TCP_LISTEN:
1067                 if (sock_owned_by_user(sk))
1068                         goto out;
1069
1070                 req = tcp_v4_search_req(tp, &prev, th->dest,
1071                                         iph->daddr, iph->saddr);
1072                 if (!req)
1073                         goto out;
1074
1075                 /* ICMPs are not backlogged, hence we cannot get
1076                    an established socket here.
1077                  */
1078                 BUG_TRAP(!req->sk);
1079
1080                 if (seq != req->snt_isn) {
1081                         NET_INC_STATS_BH(OutOfWindowIcmps);
1082                         goto out;
1083                 }
1084
1085                 /*
1086                  * Still in SYN_RECV, just remove it silently.
1087                  * There is no good way to pass the error to the newly
1088                  * created socket, and POSIX does not want network
1089                  * errors returned from accept().
1090                  */
1091                 tcp_synq_drop(sk, req, prev);
1092                 goto out;
1093
1094         case TCP_SYN_SENT:
1095         case TCP_SYN_RECV:  /* Cannot happen?
1096                                It can, e.g. if the SYNs crossed.
1097                              */
1098                 if (!sock_owned_by_user(sk)) {
1099                         TCP_INC_STATS_BH(TcpAttemptFails);
1100                         sk->sk_err = err;
1101
1102                         sk->sk_error_report(sk);
1103
1104                         tcp_done(sk);
1105                 } else {
1106                         sk->sk_err_soft = err;
1107                 }
1108                 goto out;
1109         }
1110
1111         /* If we've already connected we will keep trying
1112          * until we time out, or the user gives up.
1113          *
1114          * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
1115          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
1116          * but it is obsoleted by pmtu discovery).
1117          *
1118          * Note that in the modern internet, where routing is unreliable
1119          * and broken firewalls sit in every dark corner sending random
1120          * errors ordered by their masters, even these two messages finally lose
1121          * their original sense (even Linux sends invalid PORT_UNREACHs).
1122          *
1123          * Now we are in compliance with RFCs.
1124          *                                                      --ANK (980905)
1125          */
1126
1127         inet = inet_sk(sk);
1128         if (!sock_owned_by_user(sk) && inet->recverr) {
1129                 sk->sk_err = err;
1130                 sk->sk_error_report(sk);
1131         } else  { /* Only an error on timeout */
1132                 sk->sk_err_soft = err;
1133         }
1134
1135 out:
1136         bh_unlock_sock(sk);
1137         sock_put(sk);
1138 }
1139
1140 /* This routine computes an IPv4 TCP checksum. */
1141 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1142                        struct sk_buff *skb)
1143 {
1144         struct inet_opt *inet = inet_sk(sk);
1145
1146         if (skb->ip_summed == CHECKSUM_HW) {
1147                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1148                 skb->csum = offsetof(struct tcphdr, check);
1149         } else {
1150                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1151                                          csum_partial((char *)th,
1152                                                       th->doff << 2,
1153                                                       skb->csum));
1154         }
1155 }
1156
1157 /*
1158  *      This routine will send an RST to the other tcp.
1159  *
1160  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1161  *                    for the reset?
1162  *      Answer: if a packet caused the RST, it is not for a socket
1163  *              existing in our system; if it is matched to a socket,
1164  *              it is just a duplicate segment or a bug in the other side's TCP.
1165  *              So we build the reply based only on the parameters that
1166  *              arrived with the segment.
1167  *      Exception: precedence violation. We do not implement it in any case.
1168  */
1169
1170 static void tcp_v4_send_reset(struct sk_buff *skb)
1171 {
1172         struct tcphdr *th = skb->h.th;
1173         struct tcphdr rth;
1174         struct ip_reply_arg arg;
1175
1176         /* Never send a reset in response to a reset. */
1177         if (th->rst)
1178                 return;
1179
1180         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1181                 return;
1182
1183         /* Swap the send and the receive. */
1184         memset(&rth, 0, sizeof(struct tcphdr));
1185         rth.dest   = th->source;
1186         rth.source = th->dest;
1187         rth.doff   = sizeof(struct tcphdr) / 4;
1188         rth.rst    = 1;
1189
1190         if (th->ack) {
1191                 rth.seq = th->ack_seq;
1192         } else {
1193                 rth.ack = 1;
1194                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1195                                     skb->len - (th->doff << 2));
1196         }
1197
1198         memset(&arg, 0, sizeof arg);
1199         arg.iov[0].iov_base = (unsigned char *)&rth;
1200         arg.iov[0].iov_len  = sizeof rth;
1201         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1202                                       skb->nh.iph->saddr, /*XXX*/
1203                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1204         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1205
1206         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1207
1208         TCP_INC_STATS_BH(TcpOutSegs);
1209         TCP_INC_STATS_BH(TcpOutRsts);
1210 }
1211
1212 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1213    outside of socket context, is certainly ugly. What can I do?
1214  */
1215
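/* Builds a bare ACK, optionally carrying a TCP timestamp option, and
 * sends it through the RST socket with ip_send_reply(), mirroring the
 * addresses and ports of the incoming segment.
 */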
1216 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1217                             u32 win, u32 ts)
1218 {
1219         struct tcphdr *th = skb->h.th;
1220         struct {
1221                 struct tcphdr th;
1222                 u32 tsopt[3];
1223         } rep;
1224         struct ip_reply_arg arg;
1225
1226         memset(&rep.th, 0, sizeof(struct tcphdr));
1227         memset(&arg, 0, sizeof arg);
1228
1229         arg.iov[0].iov_base = (unsigned char *)&rep;
1230         arg.iov[0].iov_len  = sizeof(rep.th);
1231         if (ts) {
1232                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1233                                      (TCPOPT_TIMESTAMP << 8) |
1234                                      TCPOLEN_TIMESTAMP);
1235                 rep.tsopt[1] = htonl(tcp_time_stamp);
1236                 rep.tsopt[2] = htonl(ts);
1237                 arg.iov[0].iov_len = sizeof(rep);
1238         }
1239
1240         /* Swap the send and the receive. */
1241         rep.th.dest    = th->source;
1242         rep.th.source  = th->dest;
1243         rep.th.doff    = arg.iov[0].iov_len / 4;
1244         rep.th.seq     = htonl(seq);
1245         rep.th.ack_seq = htonl(ack);
1246         rep.th.ack     = 1;
1247         rep.th.window  = htons(win);
1248
1249         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1250                                       skb->nh.iph->saddr, /*XXX*/
1251                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1252         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1253
1254         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1255
1256         TCP_INC_STATS_BH(TcpOutSegs);
1257 }
1258
1259 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1260 {
1261         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1262
1263         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1264                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1265
1266         tcp_tw_put(tw);
1267 }
1268
1269 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1270 {
1271         tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1272                         req->ts_recent);
1273 }
1274
1275 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1276                                           struct open_request *req)
1277 {
1278         struct rtable *rt;
1279         struct ip_options *opt = req->af.v4_req.opt;
1280         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1281                             .nl_u = { .ip4_u =
1282                                       { .daddr = ((opt && opt->srr) ?
1283                                                   opt->faddr :
1284                                                   req->af.v4_req.rmt_addr),
1285                                         .saddr = req->af.v4_req.loc_addr,
1286                                         .tos = RT_CONN_FLAGS(sk) } },
1287                             .proto = IPPROTO_TCP,
1288                             .uli_u = { .ports =
1289                                        { .sport = inet_sk(sk)->sport,
1290                                          .dport = req->rmt_port } } };
1291
1292         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1293                 IP_INC_STATS_BH(IpOutNoRoutes);
1294                 return NULL;
1295         }
1296         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1297                 ip_rt_put(rt);
1298                 IP_INC_STATS_BH(IpOutNoRoutes);
1299                 return NULL;
1300         }
1301         return &rt->u.dst;
1302 }
1303
1304 /*
1305  *      Send a SYN-ACK after having received a SYN.
1306  *      This still operates on an open_request only, not on a big
1307  *      socket.
1308  */
1309 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1310                               struct dst_entry *dst)
1311 {
1312         int err = -1;
1313         struct sk_buff * skb;
1314
1315         /* First, grab a route. */
1316         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1317                 goto out;
1318
1319         skb = tcp_make_synack(sk, dst, req);
1320
1321         if (skb) {
1322                 struct tcphdr *th = skb->h.th;
1323
1324                 th->check = tcp_v4_check(th, skb->len,
1325                                          req->af.v4_req.loc_addr,
1326                                          req->af.v4_req.rmt_addr,
1327                                          csum_partial((char *)th, skb->len,
1328                                                       skb->csum));
1329
1330                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1331                                             req->af.v4_req.rmt_addr,
1332                                             req->af.v4_req.opt);
1333                 if (err == NET_XMIT_CN)
1334                         err = 0;
1335         }
1336
1337 out:
1338         dst_release(dst);
1339         return err;
1340 }
1341
1342 /*
1343  *      IPv4 open_request destructor.
1344  */
1345 static void tcp_v4_or_free(struct open_request *req)
1346 {
1347         if (req->af.v4_req.opt)
1348                 kfree(req->af.v4_req.opt);
1349 }
1350
1351 static inline void syn_flood_warning(struct sk_buff *skb)
1352 {
1353         static unsigned long warntime;
1354
1355         if (time_after(jiffies, (warntime + HZ * 60))) {
1356                 warntime = jiffies;
1357                 printk(KERN_INFO
1358                        "possible SYN flooding on port %d. Sending cookies.\n",
1359                        ntohs(skb->h.th->dest));
1360         }
1361 }
1362
1363 /*
1364  * Save and compile IPv4 options into the open_request if needed.
1365  */
1366 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1367                                                      struct sk_buff *skb)
1368 {
1369         struct ip_options *opt = &(IPCB(skb)->opt);
1370         struct ip_options *dopt = NULL;
1371
1372         if (opt && opt->optlen) {
1373                 int opt_size = optlength(opt);
1374                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1375                 if (dopt) {
1376                         if (ip_options_echo(dopt, skb)) {
1377                                 kfree(dopt);
1378                                 dopt = NULL;
1379                         }
1380                 }
1381         }
1382         return dopt;
1383 }
1384
1385 /*
1386  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1387  * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1388  * It would be better to replace it with a global counter for all sockets,
1389  * but then some measure against one socket starving all the other sockets
1390  * would be needed.
1391  *
1392  * It was 128 by default. Experiments with real servers show that
1393  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1394  * of the problems. This value is adjusted to 128 for very small machines
1395  * (<= 32MB of memory) and to 1024 on normal or better ones (>= 256MB).
1396  * Increasing it further requires changing the hash table size.
1397  */
1398 int sysctl_max_syn_backlog = 256;
1399
1400 struct or_calltable or_ipv4 = {
1401         .family         =       PF_INET,
1402         .rtx_syn_ack    =       tcp_v4_send_synack,
1403         .send_ack       =       tcp_v4_or_send_ack,
1404         .destructor     =       tcp_v4_or_free,
1405         .send_reset     =       tcp_v4_send_reset,
1406 };
1407
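/* Handles an incoming SYN on a listening socket: drops SYNs sent to
 * broadcast/multicast addresses, falls back to syncookies (if enabled)
 * when the SYN queue is full, allocates an open_request, parses the TCP
 * options and applies the VJ peer-timestamp check before accepting the
 * request.
 */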
1408 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1409 {
1410         struct tcp_opt tp;
1411         struct open_request *req;
1412         __u32 saddr = skb->nh.iph->saddr;
1413         __u32 daddr = skb->nh.iph->daddr;
1414         __u32 isn = TCP_SKB_CB(skb)->when;
1415         struct dst_entry *dst = NULL;
1416 #ifdef CONFIG_SYN_COOKIES
1417         int want_cookie = 0;
1418 #else
1419 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1420 #endif
1421
1422         /* Never answer SYNs sent to broadcast or multicast addresses. */
1423         if (((struct rtable *)skb->dst)->rt_flags &
1424             (RTCF_BROADCAST | RTCF_MULTICAST))
1425                 goto drop;
1426
1427         /* TW buckets are converted to open requests without
1428          * limitation: they conserve resources and the peer is
1429          * evidently a real one.
1430          */
1431         if (tcp_synq_is_full(sk) && !isn) {
1432 #ifdef CONFIG_SYN_COOKIES
1433                 if (sysctl_tcp_syncookies) {
1434                         want_cookie = 1;
1435                 } else
1436 #endif
1437                 goto drop;
1438         }
1439
1440         /* The accept backlog is full. If we have already queued enough
1441          * warm entries in the syn queue, drop the request. It is better than
1442          * clogging the syn queue with openreqs with exponentially increasing
1443          * timeouts.
1444          */
1445         if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1446                 goto drop;
1447
1448         req = tcp_openreq_alloc();
1449         if (!req)
1450                 goto drop;
1451
1452         tcp_clear_options(&tp);
1453         tp.mss_clamp = 536;
1454         tp.user_mss  = tcp_sk(sk)->user_mss;
1455
1456         tcp_parse_options(skb, &tp, 0);
1457
1458         if (want_cookie) {
1459                 tcp_clear_options(&tp);
1460                 tp.saw_tstamp = 0;
1461         }
1462
1463         if (tp.saw_tstamp && !tp.rcv_tsval) {
1464                 /* Some OSes (unknown ones, but seen hitting web servers
1465                  * of interest only to Windows users) send the timestamp
1466                  * option with a zero timestamp value in the SYN. That is
1467                  * the easy case: we simply do not advertise TS support.
1468                  */
1469                 tp.saw_tstamp = 0;
1470                 tp.tstamp_ok  = 0;
1471         }
1472         tp.tstamp_ok = tp.saw_tstamp;
1473
1474         tcp_openreq_init(req, &tp, skb);
1475
1476         req->af.v4_req.loc_addr = daddr;
1477         req->af.v4_req.rmt_addr = saddr;
1478         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1479         req->class = &or_ipv4;
1480         if (!want_cookie)
1481                 TCP_ECN_create_request(req, skb->h.th);
1482
1483         if (want_cookie) {
1484 #ifdef CONFIG_SYN_COOKIES
1485                 syn_flood_warning(skb);
1486 #endif
1487                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1488         } else if (!isn) {
1489                 struct inet_peer *peer = NULL;
1490
1491                 /* VJ's idea. We save the last timestamp seen from
1492                  * the destination in the peer table when entering
1493                  * TIME-WAIT state, and check against it before
1494                  * accepting a new connection request.
1495                  *
1496                  * If "isn" is not zero, this request hit a live
1497                  * timewait bucket, so all the necessary checks
1498                  * are made in the function processing the timewait state.
1499                  */
1500                 if (tp.saw_tstamp &&
1501                     sysctl_tcp_tw_recycle &&
1502                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1503                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1504                     peer->v4daddr == saddr) {
1505                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1506                             (s32)(peer->tcp_ts - req->ts_recent) >
1507                                                         TCP_PAWS_WINDOW) {
1508                                 NET_INC_STATS_BH(PAWSPassiveRejected);
1509                                 dst_release(dst);
1510                                 goto drop_and_free;
1511                         }
1512                 }
1513                 /* Kill the following clause if you dislike this approach. */
1514                 else if (!sysctl_tcp_syncookies &&
1515                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1516                           (sysctl_max_syn_backlog >> 2)) &&
1517                          (!peer || !peer->tcp_ts_stamp) &&
1518                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1519                         /* Without syncookies the last quarter of
1520                          * the backlog is reserved for destinations
1521                          * proven to be alive.
1522                          * It means that we keep communicating with
1523                          * destinations we had already remembered
1524                          * before the SYN flood began.
1525                          */
1526                         NETDEBUG(if (net_ratelimit()) \
1527                                         printk(KERN_DEBUG "TCP: drop open "
1528                                                           "request from %u.%u."
1529                                                           "%u.%u/%u\n", \
1530                                                NIPQUAD(saddr),
1531                                                ntohs(skb->h.th->source)));
1532                         dst_release(dst);
1533                         goto drop_and_free;
1534                 }
1535
1536                 isn = tcp_v4_init_sequence(sk, skb);
1537         }
1538         req->snt_isn = isn;
1539
1540         if (tcp_v4_send_synack(sk, req, dst))
1541                 goto drop_and_free;
1542
1543         if (want_cookie) {
1544                 tcp_openreq_free(req);
1545         } else {
1546                 tcp_v4_synq_add(sk, req);
1547         }
1548         return 0;
1549
1550 drop_and_free:
1551         tcp_openreq_free(req);
1552 drop:
1553         TCP_INC_STATS_BH(TcpAttemptFails);
1554         return 0;
1555 }
1556
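#if 0   /* Illustration only, not part of the original file and not compiled. */
/*
 * A hedged sketch of the "reserve the last quarter of the backlog" test
 * used in tcp_v4_conn_request() above, with the queue length passed in
 * as a plain integer so the arithmetic stands alone.  Returns non-zero
 * when a request from an unproven destination should be dropped because
 * fewer than a quarter of the SYN backlog slots remain free.
 */
static int example_synq_last_quarter(int max_backlog, int synq_len)
{
        return max_backlog - synq_len < (max_backlog >> 2);
}
#endif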
1557
1558 /*
1559  * The three-way handshake has completed - we got a valid ACK -
1560  * now create the new socket.
1561  */
1562 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1563                                   struct open_request *req,
1564                                   struct dst_entry *dst)
1565 {
1566         struct inet_opt *newinet;
1567         struct tcp_opt *newtp;
1568         struct sock *newsk;
1569
1570         if (tcp_acceptq_is_full(sk))
1571                 goto exit_overflow;
1572
1573         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1574                 goto exit;
1575
1576         newsk = tcp_create_openreq_child(sk, req, skb);
1577         if (!newsk)
1578                 goto exit;
1579
1580         newsk->sk_dst_cache = dst;
1581         tcp_v4_setup_caps(newsk, dst);
1582
1583         newtp                 = tcp_sk(newsk);
1584         newinet               = inet_sk(newsk);
1585         newinet->daddr        = req->af.v4_req.rmt_addr;
1586         newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1587         newinet->saddr        = req->af.v4_req.loc_addr;
1588         newinet->opt          = req->af.v4_req.opt;
1589         req->af.v4_req.opt    = NULL;
1590         newinet->mc_index     = tcp_v4_iif(skb);
1591         newinet->mc_ttl       = skb->nh.iph->ttl;
1592         newtp->ext_header_len = 0;
1593         if (newinet->opt)
1594                 newtp->ext_header_len = newinet->opt->optlen;
1595         newtp->ext2_header_len = dst->header_len;
1596         newinet->id = newtp->write_seq ^ jiffies;
1597
1598         tcp_sync_mss(newsk, dst_pmtu(dst));
1599         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1600         tcp_initialize_rcv_mss(newsk);
1601
1602         __tcp_v4_hash(newsk, 0);
1603         __tcp_inherit_port(sk, newsk);
1604
1605         return newsk;
1606
1607 exit_overflow:
1608         NET_INC_STATS_BH(ListenOverflows);
1609 exit:
1610         NET_INC_STATS_BH(ListenDrops);
1611         dst_release(dst);
1612         return NULL;
1613 }
1614
1615 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1616 {
1617         struct tcphdr *th = skb->h.th;
1618         struct iphdr *iph = skb->nh.iph;
1619         struct tcp_opt *tp = tcp_sk(sk);
1620         struct sock *nsk;
1621         struct open_request **prev;
1622         /* Find possible connection requests. */
1623         struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1624                                                      iph->saddr, iph->daddr);
1625         if (req)
1626                 return tcp_check_req(sk, skb, req, prev);
1627
1628         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1629                                           th->source,
1630                                           skb->nh.iph->daddr,
1631                                           ntohs(th->dest),
1632                                           tcp_v4_iif(skb));
1633
1634         if (nsk) {
1635                 if (nsk->sk_state != TCP_TIME_WAIT) {
1636                         bh_lock_sock(nsk);
1637                         return nsk;
1638                 }
1639                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1640                 return NULL;
1641         }
1642
1643 #ifdef CONFIG_SYN_COOKIES
1644         if (!th->rst && !th->syn && th->ack)
1645                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1646 #endif
1647         return sk;
1648 }
1649
1650 static int tcp_v4_checksum_init(struct sk_buff *skb)
1651 {
1652         if (skb->ip_summed == CHECKSUM_HW) {
1653                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1654                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1655                                   skb->nh.iph->daddr, skb->csum))
1656                         return 0;
1657
1658                 NETDEBUG(if (net_ratelimit())
1659                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1660                 skb->ip_summed = CHECKSUM_NONE;
1661         }
1662         if (skb->len <= 76) {
1663                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1664                                  skb->nh.iph->daddr,
1665                                  skb_checksum(skb, 0, skb->len, 0)))
1666                         return -1;
1667                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1668         } else {
1669                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1670                                           skb->nh.iph->saddr,
1671                                           skb->nh.iph->daddr, 0);
1672         }
1673         return 0;
1674 }
1675
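#if 0   /* Illustration only, not part of the original file and not compiled. */
/*
 * tcp_v4_check() above wraps the standard Internet (ones' complement)
 * checksum together with the TCP pseudo-header.  The plain-C sketch
 * below restates the basic RFC 1071 sum over a byte buffer; it is only
 * meant to show the arithmetic, not the kernel's optimized
 * csum_partial() implementation, and the function name is made up.
 */
static unsigned short example_inet_csum(const unsigned char *data, int len)
{
        unsigned long sum = 0;

        while (len > 1) {               /* sum 16-bit words */
                sum += (data[0] << 8) | data[1];
                data += 2;
                len -= 2;
        }
        if (len)                        /* odd trailing byte */
                sum += data[0] << 8;
        while (sum >> 16)               /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;
}
#endif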
1676
1677 /* The socket must have its spinlock held when we get
1678  * here.
1679  *
1680  * We have a potential double-lock case here, so even when
1681  * doing backlog processing we use the BH locking scheme.
1682  * This is because we cannot sleep with the original spinlock
1683  * held.
1684  */
1685 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1686 {
1687         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1688                 TCP_CHECK_TIMER(sk);
1689                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1690                         goto reset;
1691                 TCP_CHECK_TIMER(sk);
1692                 return 0;
1693         }
1694
1695         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1696                 goto csum_err;
1697
1698         if (sk->sk_state == TCP_LISTEN) {
1699                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1700                 if (!nsk)
1701                         goto discard;
1702
1703                 if (nsk != sk) {
1704                         if (tcp_child_process(sk, nsk, skb))
1705                                 goto reset;
1706                         return 0;
1707                 }
1708         }
1709
1710         TCP_CHECK_TIMER(sk);
1711         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1712                 goto reset;
1713         TCP_CHECK_TIMER(sk);
1714         return 0;
1715
1716 reset:
1717         tcp_v4_send_reset(skb);
1718 discard:
1719         kfree_skb(skb);
1720         /* Be careful here. If this function gets more complicated and
1721          * gcc suffers from register pressure on the x86, sk (in %ebx)
1722          * might be destroyed here. This current version compiles correctly,
1723          * but you have been warned.
1724          */
1725         return 0;
1726
1727 csum_err:
1728         TCP_INC_STATS_BH(TcpInErrs);
1729         goto discard;
1730 }
1731
1732 /*
1733  *      From tcp_input.c
1734  */
1735
1736 int tcp_v4_rcv(struct sk_buff *skb)
1737 {
1738         struct tcphdr *th;
1739         struct sock *sk;
1740         int ret;
1741
1742         if (skb->pkt_type != PACKET_HOST)
1743                 goto discard_it;
1744
1745         /* Count it even if it's bad */
1746         TCP_INC_STATS_BH(TcpInSegs);
1747
1748         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1749                 goto discard_it;
1750
1751         th = skb->h.th;
1752
1753         if (th->doff < sizeof(struct tcphdr) / 4)
1754                 goto bad_packet;
1755         if (!pskb_may_pull(skb, th->doff * 4))
1756                 goto discard_it;
1757
1758         /* An explanation is required here, I think.
1759          * Packet length and doff are validated by header prediction,
1760          * provided the case of th->doff == 0 is eliminated.
1761          * So we defer the checks. */
1762         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1763              tcp_v4_checksum_init(skb) < 0))
1764                 goto bad_packet;
1765
1766         th = skb->h.th;
1767         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1768         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1769                                     skb->len - th->doff * 4);
1770         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1771         TCP_SKB_CB(skb)->when    = 0;
1772         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1773         TCP_SKB_CB(skb)->sacked  = 0;
1774
1775         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1776                              skb->nh.iph->daddr, ntohs(th->dest),
1777                              tcp_v4_iif(skb));
1778
1779         if (!sk)
1780                 goto no_tcp_socket;
1781
1782 process:
1783         if (sk->sk_state == TCP_TIME_WAIT)
1784                 goto do_time_wait;
1785
1786         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1787                 goto discard_and_relse;
1788
1789         if (sk_filter(sk, skb, 0))
1790                 goto discard_and_relse;
1791
1792         skb->dev = NULL;
1793
1794         bh_lock_sock(sk);
1795         ret = 0;
1796         if (!sock_owned_by_user(sk)) {
1797                 if (!tcp_prequeue(sk, skb))
1798                         ret = tcp_v4_do_rcv(sk, skb);
1799         } else
1800                 sk_add_backlog(sk, skb);
1801         bh_unlock_sock(sk);
1802
1803         sock_put(sk);
1804
1805         return ret;
1806
1807 no_tcp_socket:
1808         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1809                 goto discard_it;
1810
1811         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1812 bad_packet:
1813                 TCP_INC_STATS_BH(TcpInErrs);
1814         } else {
1815                 tcp_v4_send_reset(skb);
1816         }
1817
1818 discard_it:
1819         /* Discard frame. */
1820         kfree_skb(skb);
1821         return 0;
1822
1823 discard_and_relse:
1824         sock_put(sk);
1825         goto discard_it;
1826
1827 do_time_wait:
1828         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1829                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1830                 goto discard_it;
1831         }
1832
1833         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1834                 TCP_INC_STATS_BH(TcpInErrs);
1835                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1836                 goto discard_it;
1837         }
1838         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1839                                            skb, th, skb->len)) {
1840         case TCP_TW_SYN: {
1841                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1842                                                           ntohs(th->dest),
1843                                                           tcp_v4_iif(skb));
1844                 if (sk2) {
1845                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1846                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1847                         sk = sk2;
1848                         goto process;
1849                 }
1850                 /* Fall through to ACK */
1851         }
1852         case TCP_TW_ACK:
1853                 tcp_v4_timewait_ack(sk, skb);
1854                 break;
1855         case TCP_TW_RST:
1856                 goto no_tcp_socket;
1857         case TCP_TW_SUCCESS:;
1858         }
1859         goto discard_it;
1860 }
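
#if 0   /* Illustration only, not part of the original file and not compiled. */
/*
 * Sketch of the end_seq computation performed in tcp_v4_rcv() above:
 * SYN and FIN each consume one unit of sequence space, and the payload
 * consumes skb->len minus the TCP header (doff counts 32-bit words).
 */
static __u32 example_end_seq(__u32 seq, int syn, int fin,
                             unsigned int skb_len, unsigned int doff)
{
        return seq + syn + fin + (skb_len - doff * 4);
}
#endif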
1861
1862 /* With per-bucket locks this operation is not atomic, so
1863  * this version is no worse.
1864  */
1865 static void __tcp_v4_rehash(struct sock *sk)
1866 {
1867         sk->sk_prot->unhash(sk);
1868         sk->sk_prot->hash(sk);
1869 }
1870
1871 static int tcp_v4_reselect_saddr(struct sock *sk)
1872 {
1873         struct inet_opt *inet = inet_sk(sk);
1874         int err;
1875         struct rtable *rt;
1876         __u32 old_saddr = inet->saddr;
1877         __u32 new_saddr;
1878         __u32 daddr = inet->daddr;
1879
1880         if (inet->opt && inet->opt->srr)
1881                 daddr = inet->opt->faddr;
1882
1883         /* Query new route. */
1884         err = ip_route_connect(&rt, daddr, 0,
1885                                RT_TOS(inet->tos) | sk->sk_localroute,
1886                                sk->sk_bound_dev_if,
1887                                IPPROTO_TCP,
1888                                inet->sport, inet->dport, sk);
1889         if (err)
1890                 return err;
1891
1892         __sk_dst_set(sk, &rt->u.dst);
1893         tcp_v4_setup_caps(sk, &rt->u.dst);
1894         tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1895
1896         new_saddr = rt->rt_src;
1897
1898         if (new_saddr == old_saddr)
1899                 return 0;
1900
1901         if (sysctl_ip_dynaddr > 1) {
1902                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1903                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1904                        NIPQUAD(old_saddr),
1905                        NIPQUAD(new_saddr));
1906         }
1907
1908         inet->saddr = new_saddr;
1909         inet->rcv_saddr = new_saddr;
1910
1911         /* XXX The only ugly spot where we need to
1912          * XXX really change the socket's identity after
1913          * XXX it has entered the hashes. -DaveM
1914          *
1915          * Besides that, it does not check for connection
1916          * uniqueness. Expect trouble.
1917          */
1918         __tcp_v4_rehash(sk);
1919         return 0;
1920 }
1921
1922 int tcp_v4_rebuild_header(struct sock *sk)
1923 {
1924         struct inet_opt *inet = inet_sk(sk);
1925         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1926         u32 daddr;
1927         int err;
1928
1929         /* Route is OK, nothing to do. */
1930         if (rt)
1931                 return 0;
1932
1933         /* Reroute. */
1934         daddr = inet->daddr;
1935         if (inet->opt && inet->opt->srr)
1936                 daddr = inet->opt->faddr;
1937
1938         {
1939                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1940                                     .nl_u = { .ip4_u =
1941                                               { .daddr = daddr,
1942                                                 .saddr = inet->saddr,
1943                                                 .tos = RT_CONN_FLAGS(sk) } },
1944                                     .proto = IPPROTO_TCP,
1945                                     .uli_u = { .ports =
1946                                                { .sport = inet->sport,
1947                                                  .dport = inet->dport } } };
1948                                                 
1949                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1950         }
1951         if (!err) {
1952                 __sk_dst_set(sk, &rt->u.dst);
1953                 tcp_v4_setup_caps(sk, &rt->u.dst);
1954                 tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1955                 return 0;
1956         }
1957
1958         /* Routing failed... */
1959         sk->sk_route_caps = 0;
1960
1961         if (!sysctl_ip_dynaddr ||
1962             sk->sk_state != TCP_SYN_SENT ||
1963             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1964             (err = tcp_v4_reselect_saddr(sk)) != 0)
1965                 sk->sk_err_soft = -err;
1966
1967         return err;
1968 }
1969
1970 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1971 {
1972         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1973         struct inet_opt *inet = inet_sk(sk);
1974
1975         sin->sin_family         = AF_INET;
1976         sin->sin_addr.s_addr    = inet->daddr;
1977         sin->sin_port           = inet->dport;
1978 }
1979
1980 /* VJ's idea. Save the last timestamp seen from this destination
1981  * and hold it at least for the normal timewait interval, to use for
1982  * duplicate segment detection in subsequent connections before they
1983  * enter the synchronized state.
1984  */
1985
1986 int tcp_v4_remember_stamp(struct sock *sk)
1987 {
1988         struct inet_opt *inet = inet_sk(sk);
1989         struct tcp_opt *tp = tcp_sk(sk);
1990         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1991         struct inet_peer *peer = NULL;
1992         int release_it = 0;
1993
1994         if (!rt || rt->rt_dst != inet->daddr) {
1995                 peer = inet_getpeer(inet->daddr, 1);
1996                 release_it = 1;
1997         } else {
1998                 if (!rt->peer)
1999                         rt_bind_peer(rt, 1);
2000                 peer = rt->peer;
2001         }
2002
2003         if (peer) {
2004                 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2005                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2006                      peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2007                         peer->tcp_ts_stamp = tp->ts_recent_stamp;
2008                         peer->tcp_ts = tp->ts_recent;
2009                 }
2010                 if (release_it)
2011                         inet_putpeer(peer);
2012                 return 1;
2013         }
2014
2015         return 0;
2016 }
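
#if 0   /* Illustration only, not part of the original file and not compiled. */
/*
 * The (s32)(peer->tcp_ts - tp->ts_recent) test above is wrap-safe
 * serial-number arithmetic: the 32-bit timestamps are compared as
 * points on a circle, so a value that has wrapped past 0xffffffff still
 * compares as "newer".  A minimal sketch with a made-up name:
 */
static int example_cached_ts_not_newer(__u32 cached_ts, __u32 new_ts)
{
        return (s32)(cached_ts - new_ts) <= 0;
}
#endif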
2017
2018 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2019 {
2020         struct inet_peer *peer = NULL;
2021
2022         peer = inet_getpeer(tw->tw_daddr, 1);
2023
2024         if (peer) {
2025                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2026                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2027                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2028                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2029                         peer->tcp_ts = tw->tw_ts_recent;
2030                 }
2031                 inet_putpeer(peer);
2032                 return 1;
2033         }
2034
2035         return 0;
2036 }
2037
2038 struct tcp_func ipv4_specific = {
2039         .queue_xmit     =       ip_queue_xmit,
2040         .send_check     =       tcp_v4_send_check,
2041         .rebuild_header =       tcp_v4_rebuild_header,
2042         .conn_request   =       tcp_v4_conn_request,
2043         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2044         .remember_stamp =       tcp_v4_remember_stamp,
2045         .net_header_len =       sizeof(struct iphdr),
2046         .setsockopt     =       ip_setsockopt,
2047         .getsockopt     =       ip_getsockopt,
2048         .addr2sockaddr  =       v4_addr2sockaddr,
2049         .sockaddr_len   =       sizeof(struct sockaddr_in),
2050 };
2051
2052 /* NOTE: A lot of things are set to zero explicitly by the call to
2053  *       sk_alloc(), so they need not be done here.
2054  */
2055 static int tcp_v4_init_sock(struct sock *sk)
2056 {
2057         struct tcp_opt *tp = tcp_sk(sk);
2058
2059         skb_queue_head_init(&tp->out_of_order_queue);
2060         tcp_init_xmit_timers(sk);
2061         tcp_prequeue_init(tp);
2062
2063         tp->rto  = TCP_TIMEOUT_INIT;
2064         tp->mdev = TCP_TIMEOUT_INIT;
2065
2066         /* So many TCP implementations out there (incorrectly) count the
2067          * initial SYN frame in their delayed-ACK and congestion control
2068          * algorithms that we must have the following bandaid to talk
2069          * efficiently to them.  -DaveM
2070          */
2071         tp->snd_cwnd = 2;
2072
2073         /* See draft-stevens-tcpca-spec-01 for discussion of the
2074          * initialization of these values.
2075          */
2076         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2077         tp->snd_cwnd_clamp = ~0;
2078         tp->mss_cache = 536;
2079
2080         tp->reordering = sysctl_tcp_reordering;
2081
2082         sk->sk_state = TCP_CLOSE;
2083
2084         sk->sk_write_space = tcp_write_space;
2085         sk->sk_use_write_queue = 1;
2086
2087         tp->af_specific = &ipv4_specific;
2088
2089         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2090         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2091
2092         atomic_inc(&tcp_sockets_allocated);
2093
2094         return 0;
2095 }
2096
2097 static int tcp_v4_destroy_sock(struct sock *sk)
2098 {
2099         struct tcp_opt *tp = tcp_sk(sk);
2100
2101         tcp_clear_xmit_timers(sk);
2102
2103         /* Clean up the write buffer. */
2104         tcp_writequeue_purge(sk);
2105
2106         /* Clean up our, hopefully empty, out_of_order_queue. */
2107         __skb_queue_purge(&tp->out_of_order_queue);
2108
2109         /* Clean the prequeue; it really must be empty. */
2110         __skb_queue_purge(&tp->ucopy.prequeue);
2111
2112         /* Clean up a referenced TCP bind bucket. */
2113         if (tp->bind_hash)
2114                 tcp_put_port(sk);
2115
2116         /* If sendmsg cached page exists, toss it. */
2117         if (inet_sk(sk)->sndmsg_page)
2118                 __free_page(inet_sk(sk)->sndmsg_page);
2119
2120         atomic_dec(&tcp_sockets_allocated);
2121
2122         return 0;
2123 }
2124
2125 #ifdef CONFIG_PROC_FS
2126 /* Proc filesystem TCP sock list dumping. */
2127
2128 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2129 {
2130         return hlist_empty(head) ? NULL :
2131                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2132 }
2133
2134 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2135 {
2136         return tw->tw_node.next ?
2137                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2138 }
2139
2140 static void *listening_get_next(struct seq_file *seq, void *cur)
2141 {
2142         struct tcp_opt *tp;
2143         struct hlist_node *node;
2144         struct sock *sk = cur;
2145         struct tcp_iter_state* st = seq->private;
2146
2147         if (!sk) {
2148                 st->bucket = 0;
2149                 sk = sk_head(&tcp_listening_hash[0]);
2150                 goto get_sk;
2151         }
2152
2153         ++st->num;
2154
2155         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2156                 struct open_request *req = cur;
2157
2158                 tp = tcp_sk(st->syn_wait_sk);
2159                 req = req->dl_next;
2160                 while (1) {
2161                         while (req) {
2162                                 if (req->class->family == st->family) {
2163                                         cur = req;
2164                                         goto out;
2165                                 }
2166                                 req = req->dl_next;
2167                         }
2168                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2169                                 break;
2170 get_req:
2171                         req = tp->listen_opt->syn_table[st->sbucket];
2172                 }
2173                 sk        = sk_next(st->syn_wait_sk);
2174                 st->state = TCP_SEQ_STATE_LISTENING;
2175                 read_unlock_bh(&tp->syn_wait_lock);
2176         } else
2177                 sk = sk_next(sk);
2178 get_sk:
2179         sk_for_each_from(sk, node) {
2180                 if (sk->sk_family == st->family) {
2181                         cur = sk;
2182                         goto out;
2183                 }
2184                 tp = tcp_sk(sk);
2185                 read_lock_bh(&tp->syn_wait_lock);
2186                 if (tp->listen_opt && tp->listen_opt->qlen) {
2187                         st->uid         = sock_i_uid(sk);
2188                         st->syn_wait_sk = sk;
2189                         st->state       = TCP_SEQ_STATE_OPENREQ;
2190                         st->sbucket     = 0;
2191                         goto get_req;
2192                 }
2193                 read_unlock_bh(&tp->syn_wait_lock);
2194         }
2195         if (++st->bucket < TCP_LHTABLE_SIZE) {
2196                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2197                 goto get_sk;
2198         }
2199         cur = NULL;
2200 out:
2201         return cur;
2202 }
2203
2204 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2205 {
2206         void *rc = listening_get_next(seq, NULL);
2207
2208         while (rc && *pos) {
2209                 rc = listening_get_next(seq, rc);
2210                 --*pos;
2211         }
2212         return rc;
2213 }
2214
2215 static void *established_get_first(struct seq_file *seq)
2216 {
2217         struct tcp_iter_state* st = seq->private;
2218         void *rc = NULL;
2219
2220         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2221                 struct sock *sk;
2222                 struct hlist_node *node;
2223                 struct tcp_tw_bucket *tw;
2224                
2225                 read_lock(&tcp_ehash[st->bucket].lock);
2226                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2227                         if (sk->sk_family != st->family) {
2228                                 continue;
2229                         }
2230                         rc = sk;
2231                         goto out;
2232                 }
2233                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2234                 tw_for_each(tw, node,
2235                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2236                         if (tw->tw_family != st->family) {
2237                                 continue;
2238                         }
2239                         rc = tw;
2240                         goto out;
2241                 }
2242                 read_unlock(&tcp_ehash[st->bucket].lock);
2243                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2244         }
2245 out:
2246         return rc;
2247 }
2248
2249 static void *established_get_next(struct seq_file *seq, void *cur)
2250 {
2251         struct sock *sk = cur;
2252         struct tcp_tw_bucket *tw;
2253         struct hlist_node *node;
2254         struct tcp_iter_state* st = seq->private;
2255
2256         ++st->num;
2257
2258         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2259                 tw = cur;
2260                 tw = tw_next(tw);
2261 get_tw:
2262                 while (tw && tw->tw_family != st->family) {
2263                         tw = tw_next(tw);
2264                 }
2265                 if (tw) {
2266                         cur = tw;
2267                         goto out;
2268                 }
2269                 read_unlock(&tcp_ehash[st->bucket].lock);
2270                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2271                 if (++st->bucket < tcp_ehash_size) {
2272                         read_lock(&tcp_ehash[st->bucket].lock);
2273                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2274                 } else {
2275                         cur = NULL;
2276                         goto out;
2277                 }
2278         } else
2279                 sk = sk_next(sk);
2280
2281         sk_for_each_from(sk, node) {
2282                 if (sk->sk_family == st->family)
2283                         goto found;
2284         }
2285
2286         st->state = TCP_SEQ_STATE_TIME_WAIT;
2287         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2288         goto get_tw;
2289 found:
2290         cur = sk;
2291 out:
2292         return cur;
2293 }
2294
2295 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2296 {
2297         void *rc = established_get_first(seq);
2298
2299         while (rc && pos) {
2300                 rc = established_get_next(seq, rc);
2301                 --pos;
2302         }               
2303         return rc;
2304 }
2305
2306 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2307 {
2308         void *rc;
2309         struct tcp_iter_state* st = seq->private;
2310
2311         tcp_listen_lock();
2312         st->state = TCP_SEQ_STATE_LISTENING;
2313         rc        = listening_get_idx(seq, &pos);
2314
2315         if (!rc) {
2316                 tcp_listen_unlock();
2317                 local_bh_disable();
2318                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2319                 rc        = established_get_idx(seq, pos);
2320         }
2321
2322         return rc;
2323 }
2324
2325 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2326 {
2327         struct tcp_iter_state* st = seq->private;
2328         st->state = TCP_SEQ_STATE_LISTENING;
2329         st->num = 0;
2330         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2331 }
2332
2333 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2334 {
2335         void *rc = NULL;
2336         struct tcp_iter_state* st;
2337
2338         if (v == SEQ_START_TOKEN) {
2339                 rc = tcp_get_idx(seq, 0);
2340                 goto out;
2341         }
2342         st = seq->private;
2343
2344         switch (st->state) {
2345         case TCP_SEQ_STATE_OPENREQ:
2346         case TCP_SEQ_STATE_LISTENING:
2347                 rc = listening_get_next(seq, v);
2348                 if (!rc) {
2349                         tcp_listen_unlock();
2350                         local_bh_disable();
2351                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2352                         rc        = established_get_first(seq);
2353                 }
2354                 break;
2355         case TCP_SEQ_STATE_ESTABLISHED:
2356         case TCP_SEQ_STATE_TIME_WAIT:
2357                 rc = established_get_next(seq, v);
2358                 break;
2359         }
2360 out:
2361         ++*pos;
2362         return rc;
2363 }
2364
2365 static void tcp_seq_stop(struct seq_file *seq, void *v)
2366 {
2367         struct tcp_iter_state* st = seq->private;
2368
2369         switch (st->state) {
2370         case TCP_SEQ_STATE_OPENREQ:
2371                 if (v) {
2372                         struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2373                         read_unlock_bh(&tp->syn_wait_lock);
2374                 }
2375         case TCP_SEQ_STATE_LISTENING:
2376                 if (v != SEQ_START_TOKEN)
2377                         tcp_listen_unlock();
2378                 break;
2379         case TCP_SEQ_STATE_TIME_WAIT:
2380         case TCP_SEQ_STATE_ESTABLISHED:
2381                 if (v)
2382                         read_unlock(&tcp_ehash[st->bucket].lock);
2383                 local_bh_enable();
2384                 break;
2385         }
2386 }
2387
2388 static int tcp_seq_open(struct inode *inode, struct file *file)
2389 {
2390         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2391         struct seq_file *seq;
2392         struct tcp_iter_state *s;
2393         int rc;
2394
2395         if (unlikely(afinfo == NULL))
2396                 return -EINVAL;
2397
2398         s = kmalloc(sizeof(*s), GFP_KERNEL);
2399         if (!s)
2400                 return -ENOMEM;
2401         memset(s, 0, sizeof(*s));
2402         s->family               = afinfo->family;
2403         s->seq_ops.start        = tcp_seq_start;
2404         s->seq_ops.next         = tcp_seq_next;
2405         s->seq_ops.show         = afinfo->seq_show;
2406         s->seq_ops.stop         = tcp_seq_stop;
2407
2408         rc = seq_open(file, &s->seq_ops);
2409         if (rc)
2410                 goto out_kfree;
2411         seq          = file->private_data;
2412         seq->private = s;
2413 out:
2414         return rc;
2415 out_kfree:
2416         kfree(s);
2417         goto out;
2418 }
2419
2420 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2421 {
2422         int rc = 0;
2423         struct proc_dir_entry *p;
2424
2425         if (!afinfo)
2426                 return -EINVAL;
2427         afinfo->seq_fops->owner         = afinfo->owner;
2428         afinfo->seq_fops->open          = tcp_seq_open;
2429         afinfo->seq_fops->read          = seq_read;
2430         afinfo->seq_fops->llseek        = seq_lseek;
2431         afinfo->seq_fops->release       = seq_release_private;
2432         
2433         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2434         if (p)
2435                 p->data = afinfo;
2436         else
2437                 rc = -ENOMEM;
2438         return rc;
2439 }
2440
2441 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2442 {
2443         if (!afinfo)
2444                 return;
2445         proc_net_remove(afinfo->name);
2446         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2447 }
2448
2449 static void get_openreq4(struct sock *sk, struct open_request *req,
2450                          char *tmpbuf, int i, int uid)
2451 {
2452         int ttd = req->expires - jiffies;
2453
2454         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2455                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2456                 i,
2457                 req->af.v4_req.loc_addr,
2458                 ntohs(inet_sk(sk)->sport),
2459                 req->af.v4_req.rmt_addr,
2460                 ntohs(req->rmt_port),
2461                 TCP_SYN_RECV,
2462                 0, 0, /* could print option size, but that is af dependent. */
2463                 1,    /* timers active (only the expire timer) */
2464                 jiffies_to_clock_t(ttd),
2465                 req->retrans,
2466                 uid,
2467                 0,  /* non standard timer */
2468                 0, /* open_requests have no inode */
2469                 atomic_read(&sk->sk_refcnt),
2470                 req);
2471 }
2472
2473 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2474 {
2475         int timer_active;
2476         unsigned long timer_expires;
2477         struct tcp_opt *tp = tcp_sk(sp);
2478         struct inet_opt *inet = inet_sk(sp);
2479         unsigned int dest = inet->daddr;
2480         unsigned int src = inet->rcv_saddr;
2481         __u16 destp = ntohs(inet->dport);
2482         __u16 srcp = ntohs(inet->sport);
2483
2484         if (tp->pending == TCP_TIME_RETRANS) {
2485                 timer_active    = 1;
2486                 timer_expires   = tp->timeout;
2487         } else if (tp->pending == TCP_TIME_PROBE0) {
2488                 timer_active    = 4;
2489                 timer_expires   = tp->timeout;
2490         } else if (timer_pending(&sp->sk_timer)) {
2491                 timer_active    = 2;
2492                 timer_expires   = sp->sk_timer.expires;
2493         } else {
2494                 timer_active    = 0;
2495                 timer_expires = jiffies;
2496         }
2497
2498         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2499                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2500                 i, src, srcp, dest, destp, sp->sk_state,
2501                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2502                 timer_active,
2503                 jiffies_to_clock_t(timer_expires - jiffies),
2504                 tp->retransmits,
2505                 sock_i_uid(sp),
2506                 tp->probes_out,
2507                 sock_i_ino(sp),
2508                 atomic_read(&sp->sk_refcnt), sp,
2509                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2510                 tp->snd_cwnd,
2511                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2512 }
2513
2514 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2515 {
2516         unsigned int dest, src;
2517         __u16 destp, srcp;
2518         int ttd = tw->tw_ttd - jiffies;
2519
2520         if (ttd < 0)
2521                 ttd = 0;
2522
2523         dest  = tw->tw_daddr;
2524         src   = tw->tw_rcv_saddr;
2525         destp = ntohs(tw->tw_dport);
2526         srcp  = ntohs(tw->tw_sport);
2527
2528         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2529                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2530                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2531                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2532                 atomic_read(&tw->tw_refcnt), tw);
2533 }
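
#if 0   /* Illustration only, not part of the original file and not compiled. */
/*
 * The sprintf() calls above print the address words with %08X (the raw
 * 32-bit value, still in network byte order) while the ports go through
 * ntohs().  On a little-endian machine the address hex therefore reads
 * byte-reversed in /proc/net/tcp: 127.0.0.1:80 shows up as
 * "0100007F:0050".  Hypothetical sketch of decoding such a value back
 * into its four address bytes on a little-endian host:
 */
static void example_decode_proc_addr(unsigned long hexval, unsigned char out[4])
{
        out[0] = hexval & 0xff;         /* 0x7F -> 127 */
        out[1] = (hexval >> 8) & 0xff;
        out[2] = (hexval >> 16) & 0xff;
        out[3] = (hexval >> 24) & 0xff;
}
#endif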
2534
2535 #define TMPSZ 150
2536
2537 static int tcp4_seq_show(struct seq_file *seq, void *v)
2538 {
2539         struct tcp_iter_state* st;
2540         char tmpbuf[TMPSZ + 1];
2541
2542         if (v == SEQ_START_TOKEN) {
2543                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2544                            "  sl  local_address rem_address   st tx_queue "
2545                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2546                            "inode");
2547                 goto out;
2548         }
2549         st = seq->private;
2550
2551         switch (st->state) {
2552         case TCP_SEQ_STATE_LISTENING:
2553         case TCP_SEQ_STATE_ESTABLISHED:
2554                 get_tcp4_sock(v, tmpbuf, st->num);
2555                 break;
2556         case TCP_SEQ_STATE_OPENREQ:
2557                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2558                 break;
2559         case TCP_SEQ_STATE_TIME_WAIT:
2560                 get_timewait4_sock(v, tmpbuf, st->num);
2561                 break;
2562         }
2563         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2564 out:
2565         return 0;
2566 }
2567
2568 static struct file_operations tcp4_seq_fops;
2569 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2570         .owner          = THIS_MODULE,
2571         .name           = "tcp",
2572         .family         = AF_INET,
2573         .seq_show       = tcp4_seq_show,
2574         .seq_fops       = &tcp4_seq_fops,
2575 };
2576
2577 int __init tcp4_proc_init(void)
2578 {
2579         return tcp_proc_register(&tcp4_seq_afinfo);
2580 }
2581
2582 void tcp4_proc_exit(void)
2583 {
2584         tcp_proc_unregister(&tcp4_seq_afinfo);
2585 }
2586 #endif /* CONFIG_PROC_FS */
2587
2588 struct proto tcp_prot = {
2589         .name           =       "TCP",
2590         .close          =       tcp_close,
2591         .connect        =       tcp_v4_connect,
2592         .disconnect     =       tcp_disconnect,
2593         .accept         =       tcp_accept,
2594         .ioctl          =       tcp_ioctl,
2595         .init           =       tcp_v4_init_sock,
2596         .destroy        =       tcp_v4_destroy_sock,
2597         .shutdown       =       tcp_shutdown,
2598         .setsockopt     =       tcp_setsockopt,
2599         .getsockopt     =       tcp_getsockopt,
2600         .sendmsg        =       tcp_sendmsg,
2601         .recvmsg        =       tcp_recvmsg,
2602         .backlog_rcv    =       tcp_v4_do_rcv,
2603         .hash           =       tcp_v4_hash,
2604         .unhash         =       tcp_unhash,
2605         .get_port       =       tcp_v4_get_port,
2606 };
2607
2608
2609
2610 void __init tcp_v4_init(struct net_proto_family *ops)
2611 {
2612         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2613         if (err < 0)
2614                 panic("Failed to create the TCP control socket.\n");
2615         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2616         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2617
2618         /* Unhash it so that IP input processing does not even
2619          * see it; we do not want this socket to see incoming
2620          * packets.
2621          */
2622         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2623 }
2624
2625 EXPORT_SYMBOL(ipv4_specific);
2626 EXPORT_SYMBOL(tcp_bind_hash);
2627 EXPORT_SYMBOL(tcp_bucket_create);
2628 EXPORT_SYMBOL(tcp_hashinfo);
2629 EXPORT_SYMBOL(tcp_inherit_port);
2630 EXPORT_SYMBOL(tcp_listen_wlock);
2631 EXPORT_SYMBOL(tcp_port_rover);
2632 EXPORT_SYMBOL(tcp_prot);
2633 EXPORT_SYMBOL(tcp_put_port);
2634 EXPORT_SYMBOL(tcp_unhash);
2635 EXPORT_SYMBOL(tcp_v4_conn_request);
2636 EXPORT_SYMBOL(tcp_v4_connect);
2637 EXPORT_SYMBOL(tcp_v4_do_rcv);
2638 EXPORT_SYMBOL(tcp_v4_lookup_listener);
2639 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2640 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2641 EXPORT_SYMBOL(tcp_v4_send_check);
2642 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2643
2644 #ifdef CONFIG_PROC_FS
2645 EXPORT_SYMBOL(tcp_proc_register);
2646 EXPORT_SYMBOL(tcp_proc_unregister);
2647 #endif
2648 #ifdef CONFIG_SYSCTL
2649 EXPORT_SYMBOL(sysctl_local_port_range);
2650 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2651 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2652 #endif