1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id$
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      open_request handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/vserver/debug.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
94         .__tcp_lhash_users      =       ATOMIC_INIT(0),
95         .__tcp_lhash_wait
96           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
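/* Example: this range is exposed to user space as the ip_local_port_range
 * sysctl, so a high-usage box would typically be tuned with something like
 *
 *	sysctl -w net.ipv4.ip_local_port_range="32768 61000"
 *
 * or by writing both numbers to /proc/sys/net/ipv4/ip_local_port_range.
 */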
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106 int tcp_port_rover = 1024 - 1;
107
108 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
109                                  __u32 faddr, __u16 fport)
110 {
111         int h = (laddr ^ lport) ^ (faddr ^ fport);
112         h ^= h >> 16;
113         h ^= h >> 8;
114         return h & (tcp_ehash_size - 1);
115 }
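/* For illustration, a direct call mirroring tcp_sk_hashfn() below (addresses
 * and the foreign port in network byte order, the local port in host byte
 * order, values made up):
 *
 *	int bucket = tcp_hashfn(htonl(0xc0a80101), 80,
 *				htonl(0x0a000002), htons(33333));
 *
 * The final mask assumes tcp_ehash_size is a power of two.
 */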
116
117 static __inline__ int tcp_sk_hashfn(struct sock *sk)
118 {
119         struct inet_sock *inet = inet_sk(sk);
120         __u32 laddr = inet->rcv_saddr;
121         __u16 lport = inet->num;
122         __u32 faddr = inet->daddr;
123         __u16 fport = inet->dport;
124
125         return tcp_hashfn(laddr, lport, faddr, fport);
126 }
127
128 /* Allocate and initialize a new TCP local port bind bucket.
129  * The bindhash mutex for snum's hash chain must be held here.
130  */
131 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
132                                           unsigned short snum)
133 {
134         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
135                                                       SLAB_ATOMIC);
136         if (tb) {
137                 tb->port = snum;
138                 tb->fastreuse = 0;
139                 INIT_HLIST_HEAD(&tb->owners);
140                 hlist_add_head(&tb->node, &head->chain);
141         }
142         return tb;
143 }
144
145 /* Caller must hold hashbucket lock for this tb with local BH disabled */
146 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
147 {
148         if (hlist_empty(&tb->owners)) {
149                 __hlist_del(&tb->node);
150                 kmem_cache_free(tcp_bucket_cachep, tb);
151         }
152 }
153
154 /* Caller must disable local BH processing. */
155 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
156 {
157         struct tcp_bind_hashbucket *head =
158                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
159         struct tcp_bind_bucket *tb;
160
161         spin_lock(&head->lock);
162         tb = tcp_sk(sk)->bind_hash;
163         sk_add_bind_node(child, &tb->owners);
164         tcp_sk(child)->bind_hash = tb;
165         spin_unlock(&head->lock);
166 }
167
168 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
169 {
170         local_bh_disable();
171         __tcp_inherit_port(sk, child);
172         local_bh_enable();
173 }
174
175 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
176                    unsigned short snum)
177 {
178         inet_sk(sk)->num = snum;
179         sk_add_bind_node(sk, &tb->owners);
180         tcp_sk(sk)->bind_hash = tb;
181 }
182
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 {
185         struct sock *sk2;
186         struct hlist_node *node;
187         int reuse = sk->sk_reuse;
188
189         sk_for_each_bound(sk2, node, &tb->owners) {
190                 if (sk != sk2 &&
191                     !tcp_v6_ipv6only(sk2) &&
192                     (!sk->sk_bound_dev_if ||
193                      !sk2->sk_bound_dev_if ||
194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195                         if (!reuse || !sk2->sk_reuse ||
196                             sk2->sk_state == TCP_LISTEN) {
197                                 if (nx_addr_conflict(sk->sk_nx_info,
198                                         tcp_v4_rcv_saddr(sk), sk2))
199                                         break;
200                         }
201                 }
202         }
203         return node != NULL;
204 }
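/* Example: two sockets binding port 80 on the same device pass through the
 * reuse check above; if either lacks SO_REUSEADDR (or the existing socket is
 * already listening), nx_addr_conflict() decides the case, so binds from
 * vserver contexts with disjoint address sets can still coexist while a
 * genuine address overlap is reported as a conflict.
 */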
205
206 /* Obtain a reference to a local port for the given sock,
207  * if snum is zero, select any available local port.
208  */
209 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
210 {
211         struct tcp_bind_hashbucket *head;
212         struct hlist_node *node;
213         struct tcp_bind_bucket *tb;
214         int ret;
215
216         local_bh_disable();
217         if (!snum) {
218                 int low = sysctl_local_port_range[0];
219                 int high = sysctl_local_port_range[1];
220                 int remaining = (high - low) + 1;
221                 int rover;
222
223                 spin_lock(&tcp_portalloc_lock);
224                 if (tcp_port_rover < low)
225                         rover = low;
226                 else
227                         rover = tcp_port_rover;
228                 do {
229                         rover++;
230                         if (rover > high)
231                                 rover = low;
232                         head = &tcp_bhash[tcp_bhashfn(rover)];
233                         spin_lock(&head->lock);
234                         tb_for_each(tb, node, &head->chain)
235                                 if (tb->port == rover)
236                                         goto next;
237                         break;
238                 next:
239                         spin_unlock(&head->lock);
240                 } while (--remaining > 0);
241                 tcp_port_rover = rover;
242                 spin_unlock(&tcp_portalloc_lock);
243
244                 /* Exhausted local port range during search? */
245                 ret = 1;
246                 if (remaining <= 0)
247                         goto fail;
248
249                 /* OK, here is the one we will use.  HEAD is
250                  * non-NULL and we hold its mutex.
251                  */
252                 snum = rover;
253         } else {
254                 head = &tcp_bhash[tcp_bhashfn(snum)];
255                 spin_lock(&head->lock);
256                 tb_for_each(tb, node, &head->chain)
257                         if (tb->port == snum)
258                                 goto tb_found;
259         }
260         tb = NULL;
261         goto tb_not_found;
262 tb_found:
263         if (!hlist_empty(&tb->owners)) {
264                 if (sk->sk_reuse > 1)
265                         goto success;
266                 if (tb->fastreuse > 0 &&
267                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
268                         goto success;
269                 } else {
270                         ret = 1;
271                         if (tcp_bind_conflict(sk, tb))
272                                 goto fail_unlock;
273                 }
274         }
275 tb_not_found:
276         ret = 1;
277         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
278                 goto fail_unlock;
279         if (hlist_empty(&tb->owners)) {
280                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
281                         tb->fastreuse = 1;
282                 else
283                         tb->fastreuse = 0;
284         } else if (tb->fastreuse &&
285                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
286                 tb->fastreuse = 0;
287 success:
288         if (!tcp_sk(sk)->bind_hash)
289                 tcp_bind_hash(sk, tb, snum);
290         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
291         ret = 0;
292
293 fail_unlock:
294         spin_unlock(&head->lock);
295 fail:
296         local_bh_enable();
297         return ret;
298 }
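/* From user space this path is reached through bind(); sin_port == 0 asks
 * the kernel to pick a free port via the rover logic above.  A minimal
 * sketch with made-up names:
 *
 *	struct sockaddr_in a = {
 *		.sin_family = AF_INET,
 *		.sin_port   = 0,	(zero: let the kernel choose)
 *		.sin_addr   = { .s_addr = htonl(INADDR_ANY) },
 *	};
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 */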
299
300 /* Get rid of any references to a local port held by the
301  * given sock.
302  */
303 static void __tcp_put_port(struct sock *sk)
304 {
305         struct inet_sock *inet = inet_sk(sk);
306         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
307         struct tcp_bind_bucket *tb;
308
309         spin_lock(&head->lock);
310         tb = tcp_sk(sk)->bind_hash;
311         __sk_del_bind_node(sk);
312         tcp_sk(sk)->bind_hash = NULL;
313         inet->num = 0;
314         tcp_bucket_destroy(tb);
315         spin_unlock(&head->lock);
316 }
317
318 void tcp_put_port(struct sock *sk)
319 {
320         local_bh_disable();
321         __tcp_put_port(sk);
322         local_bh_enable();
323 }
324
325 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP, but it can be very bad on SMP.
326  * Look, when several writers sleep and a reader wakes them up, all but one
327  * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
328  * this, _but_ remember, it adds useless work on UP machines (a wakeup on each
329  * exclusive lock release). It really should be ifdefed.
330  */
331
332 void tcp_listen_wlock(void)
333 {
334         write_lock(&tcp_lhash_lock);
335
336         if (atomic_read(&tcp_lhash_users)) {
337                 DEFINE_WAIT(wait);
338
339                 for (;;) {
340                         prepare_to_wait_exclusive(&tcp_lhash_wait,
341                                                 &wait, TASK_UNINTERRUPTIBLE);
342                         if (!atomic_read(&tcp_lhash_users))
343                                 break;
344                         write_unlock_bh(&tcp_lhash_lock);
345                         schedule();
346                         write_lock_bh(&tcp_lhash_lock);
347                 }
348
349                 finish_wait(&tcp_lhash_wait, &wait);
350         }
351 }
352
353 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
354 {
355         struct hlist_head *list;
356         rwlock_t *lock;
357
358         BUG_TRAP(sk_unhashed(sk));
359         if (listen_possible && sk->sk_state == TCP_LISTEN) {
360                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
361                 lock = &tcp_lhash_lock;
362                 tcp_listen_wlock();
363         } else {
364                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
365                 lock = &tcp_ehash[sk->sk_hashent].lock;
366                 write_lock(lock);
367         }
368         __sk_add_node(sk, list);
369         sock_prot_inc_use(sk->sk_prot);
370         write_unlock(lock);
371         if (listen_possible && sk->sk_state == TCP_LISTEN)
372                 wake_up(&tcp_lhash_wait);
373 }
374
375 static void tcp_v4_hash(struct sock *sk)
376 {
377         if (sk->sk_state != TCP_CLOSE) {
378                 local_bh_disable();
379                 __tcp_v4_hash(sk, 1);
380                 local_bh_enable();
381         }
382 }
383
384 void tcp_unhash(struct sock *sk)
385 {
386         rwlock_t *lock;
387
388         if (sk_unhashed(sk))
389                 goto ende;
390
391         if (sk->sk_state == TCP_LISTEN) {
392                 local_bh_disable();
393                 tcp_listen_wlock();
394                 lock = &tcp_lhash_lock;
395         } else {
396                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
397                 lock = &head->lock;
398                 write_lock_bh(&head->lock);
399         }
400
401         if (__sk_del_node_init(sk))
402                 sock_prot_dec_use(sk->sk_prot);
403         write_unlock_bh(lock);
404
405  ende:
406         if (sk->sk_state == TCP_LISTEN)
407                 wake_up(&tcp_lhash_wait);
408 }
409
410
411 /*
412  *      Check if a given address matches for a tcp socket
413  *
414  *      nxi:    the socket's nx_info if any
415  *      addr:   to be verified address
416  *      saddr:  socket addresses
417  */
418 static inline int tcp_addr_match (
419         struct nx_info *nxi,
420         uint32_t addr,
421         uint32_t saddr)
422 {
423         if (addr && (saddr == addr))
424                 return 1;
425         if (!saddr)
426                 return addr_in_nx_info(nxi, addr);
427         return 0;
428 }
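/* For example: a socket bound to 1.2.3.4 matches only that exact daddr,
 * while a socket bound to INADDR_ANY (saddr == 0) delegates the decision
 * to addr_in_nx_info(), so a socket inside a vserver context only matches
 * addresses that context owns.
 */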
429
430 /* Don't inline this cruft.  There are some nice properties to
431  * exploit here.  The BSD API does not allow a listening TCP
432  * to specify the remote port nor the remote address for the
433  * connection.  So always assume those are both wildcarded
434  * during the search since they can never be otherwise.
435  */
436 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
437                                              unsigned short hnum, int dif)
438 {
439         struct sock *result = NULL, *sk;
440         struct hlist_node *node;
441         int score, hiscore;
442
443         hiscore=-1;
444         sk_for_each(sk, node, head) {
445                 struct inet_sock *inet = inet_sk(sk);
446
447                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
448                         __u32 rcv_saddr = inet->rcv_saddr;
449
450                         score = (sk->sk_family == PF_INET ? 1 : 0);
451                         if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr))
452                                 score+=2;
453                         else
454                                 continue;
455                         if (sk->sk_bound_dev_if) {
456                                 if (sk->sk_bound_dev_if != dif)
457                                         continue;
458                                 score+=2;
459                         }
460                         if (score == 5)
461                                 return sk;
462                         if (score > hiscore) {
463                                 hiscore = score;
464                                 result = sk;
465                         }
466                 }
467         }
468         return result;
469 }
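/* Scoring example: an AF_INET listener bound to the packet's destination
 * address and to the incoming interface scores 1 + 2 + 2 = 5 and is
 * returned immediately; an INADDR_ANY listener with no bound device
 * scores 3 and only wins if nothing more specific is found.
 */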
470
471 /* Optimize the common listener case. */
472 struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
473 {
474         struct sock *sk = NULL;
475         struct hlist_head *head;
476
477         read_lock(&tcp_lhash_lock);
478         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
479         if (!hlist_empty(head)) {
480                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
481
482                 if (inet->num == hnum && !sk->sk_node.next &&
483                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
484                     tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) &&
485                     !sk->sk_bound_dev_if)
486                         goto sherry_cache;
487                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
488         }
489         if (sk) {
490 sherry_cache:
491                 sock_hold(sk);
492         }
493         read_unlock(&tcp_lhash_lock);
494         return sk;
495 }
496
497 EXPORT_SYMBOL_GPL(tcp_v4_lookup_listener);
498
499 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
500  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
501  *
502  * Local BH must be disabled here.
503  */
504
505 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
506                                                        u32 daddr, u16 hnum,
507                                                        int dif)
508 {
509         struct tcp_ehash_bucket *head;
510         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
511         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
512         struct sock *sk;
513         struct hlist_node *node;
514         /* Optimize here for direct hit, only listening connections can
515          * have wildcards anyway.
516          */
517         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
518         head = &tcp_ehash[hash];
519         read_lock(&head->lock);
520         sk_for_each(sk, node, &head->chain) {
521                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
522                         goto hit; /* You sunk my battleship! */
523         }
524
525         /* Must check for a TIME_WAIT'er before going to listener hash. */
526         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
527                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
528                         goto hit;
529         }
530         sk = NULL;
531 out:
532         read_unlock(&head->lock);
533         return sk;
534 hit:
535         sock_hold(sk);
536         goto out;
537 }
538
539 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
540                                            u32 daddr, u16 hnum, int dif)
541 {
542         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
543                                                       daddr, hnum, dif);
544
545         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
546 }
547
548 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
549                                   u16 dport, int dif)
550 {
551         struct sock *sk;
552
553         local_bh_disable();
554         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
555         local_bh_enable();
556
557         return sk;
558 }
559
560 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
561
562 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
563 {
564         return secure_tcp_sequence_number(skb->nh.iph->daddr,
565                                           skb->nh.iph->saddr,
566                                           skb->h.th->dest,
567                                           skb->h.th->source);
568 }
569
570 /* called with local bh disabled */
571 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
572                                       struct tcp_tw_bucket **twp)
573 {
574         struct inet_sock *inet = inet_sk(sk);
575         u32 daddr = inet->rcv_saddr;
576         u32 saddr = inet->daddr;
577         int dif = sk->sk_bound_dev_if;
578         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
579         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
580         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
581         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
582         struct sock *sk2;
583         struct hlist_node *node;
584         struct tcp_tw_bucket *tw;
585
586         write_lock(&head->lock);
587
588         /* Check TIME-WAIT sockets first. */
589         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
590                 tw = (struct tcp_tw_bucket *)sk2;
591
592                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
593                         struct tcp_sock *tp = tcp_sk(sk);
594
595                         /* With PAWS, it is safe from the viewpoint
596                            of data integrity. Even without PAWS it
597                            is safe provided sequence spaces do not
598                            overlap i.e. at data rates <= 80Mbit/sec.
599
600                            Actually, the idea is close to VJ's, only the
601                            timestamp cache is held not per host but per
602                            port pair, and the TW bucket is used as the
603                            state holder.
604
605                            If the TW bucket has already been destroyed we
606                            fall back to VJ's scheme and use the initial
607                            timestamp retrieved from the peer table.
608                          */
609                         if (tw->tw_ts_recent_stamp &&
610                             (!twp || (sysctl_tcp_tw_reuse &&
611                                       xtime.tv_sec -
612                                       tw->tw_ts_recent_stamp > 1))) {
613                                 if ((tp->write_seq =
614                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
615                                         tp->write_seq = 1;
616                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
617                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
618                                 sock_hold(sk2);
619                                 goto unique;
620                         } else
621                                 goto not_unique;
622                 }
623         }
624         tw = NULL;
625
626         /* And established part... */
627         sk_for_each(sk2, node, &head->chain) {
628                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
629                         goto not_unique;
630         }
631
632 unique:
633         /* Must record num and sport now. Otherwise we will see a
634          * socket with a funny identity in the hash table. */
635         inet->num = lport;
636         inet->sport = htons(lport);
637         sk->sk_hashent = hash;
638         BUG_TRAP(sk_unhashed(sk));
639         __sk_add_node(sk, &head->chain);
640         sock_prot_inc_use(sk->sk_prot);
641         write_unlock(&head->lock);
642
643         if (twp) {
644                 *twp = tw;
645                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
646         } else if (tw) {
647                 /* Silly. Should hash-dance instead... */
648                 tcp_tw_deschedule(tw);
649                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
650
651                 tcp_tw_put(tw);
652         }
653
654         return 0;
655
656 not_unique:
657         write_unlock(&head->lock);
658         return -EADDRNOTAVAIL;
659 }
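/* The twp reuse above is what the tcp_tw_reuse sysctl enables; roughly,
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * lets a new outgoing connection take over a TIME-WAIT bucket once its last
 * recorded timestamp is more than one second old (the tw_ts_recent_stamp
 * test above).
 */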
660
661 static inline u32 connect_port_offset(const struct sock *sk)
662 {
663         const struct inet_sock *inet = inet_sk(sk);
664
665         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
666                                          inet->dport);
667 }
668
669 /*
670  * Bind a port for a connect operation and hash it.
671  */
672 static inline int tcp_v4_hash_connect(struct sock *sk)
673 {
674         unsigned short snum = inet_sk(sk)->num;
675         struct tcp_bind_hashbucket *head;
676         struct tcp_bind_bucket *tb;
677         int ret;
678
679         if (!snum) {
680                 int low = sysctl_local_port_range[0];
681                 int high = sysctl_local_port_range[1];
682                 int range = high - low;
683                 int i;
684                 int port;
685                 static u32 hint;
686                 u32 offset = hint + connect_port_offset(sk);
687                 struct hlist_node *node;
688                 struct tcp_tw_bucket *tw = NULL;
689
690                 local_bh_disable();
691                 for (i = 1; i <= range; i++) {
692                         port = low + (i + offset) % range;
693                         head = &tcp_bhash[tcp_bhashfn(port)];
694                         spin_lock(&head->lock);
695
696                         /* We do not bother with rcv_saddr checks,
697                          * because the established check is already
698                          * unique enough.
699                          */
700                         tb_for_each(tb, node, &head->chain) {
701                                 if (tb->port == port) {
702                                         BUG_TRAP(!hlist_empty(&tb->owners));
703                                         if (tb->fastreuse >= 0)
704                                                 goto next_port;
705                                         if (!__tcp_v4_check_established(sk,
706                                                                         port,
707                                                                         &tw))
708                                                 goto ok;
709                                         goto next_port;
710                                 }
711                         }
712
713                         tb = tcp_bucket_create(head, port);
714                         if (!tb) {
715                                 spin_unlock(&head->lock);
716                                 break;
717                         }
718                         tb->fastreuse = -1;
719                         goto ok;
720
721                 next_port:
722                         spin_unlock(&head->lock);
723                 }
724                 local_bh_enable();
725
726                 return -EADDRNOTAVAIL;
727
728 ok:
729                 hint += i;
730
731                 /* Head lock still held and bh's disabled */
732                 tcp_bind_hash(sk, tb, port);
733                 if (sk_unhashed(sk)) {
734                         inet_sk(sk)->sport = htons(port);
735                         __tcp_v4_hash(sk, 0);
736                 }
737                 spin_unlock(&head->lock);
738
739                 if (tw) {
740                         tcp_tw_deschedule(tw);
741                         tcp_tw_put(tw);
742                 }
743
744                 ret = 0;
745                 goto out;
746         }
747
748         head  = &tcp_bhash[tcp_bhashfn(snum)];
749         tb  = tcp_sk(sk)->bind_hash;
750         spin_lock_bh(&head->lock);
751         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
752                 __tcp_v4_hash(sk, 0);
753                 spin_unlock_bh(&head->lock);
754                 return 0;
755         } else {
756                 spin_unlock(&head->lock);
757                 /* No definite answer... Walk the established hash table */
758                 ret = __tcp_v4_check_established(sk, snum, NULL);
759 out:
760                 local_bh_enable();
761                 return ret;
762         }
763 }
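/* Ephemeral port selection sketch: the loop above probes ports starting near
 *
 *	low + (hint + secure_tcp_port_ephemeral(rcv_saddr, daddr, dport)) % range
 *
 * so connects to different destinations start in different parts of the
 * range, while the static hint keeps successive connects from rescanning
 * the same ports.
 */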
764
765 /* This will initiate an outgoing connection. */
766 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
767 {
768         struct inet_sock *inet = inet_sk(sk);
769         struct tcp_sock *tp = tcp_sk(sk);
770         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
771         struct rtable *rt;
772         u32 daddr, nexthop;
773         int tmp;
774         int err;
775
776         if (addr_len < sizeof(struct sockaddr_in))
777                 return -EINVAL;
778
779         if (usin->sin_family != AF_INET)
780                 return -EAFNOSUPPORT;
781
782         nexthop = daddr = usin->sin_addr.s_addr;
783         if (inet->opt && inet->opt->srr) {
784                 if (!daddr)
785                         return -EINVAL;
786                 nexthop = inet->opt->faddr;
787         }
788
789         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
790                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
791                                IPPROTO_TCP,
792                                inet->sport, usin->sin_port, sk);
793         if (tmp < 0)
794                 return tmp;
795
796         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
797                 ip_rt_put(rt);
798                 return -ENETUNREACH;
799         }
800
801         if (!inet->opt || !inet->opt->srr)
802                 daddr = rt->rt_dst;
803
804         if (!inet->saddr)
805                 inet->saddr = rt->rt_src;
806         inet->rcv_saddr = inet->saddr;
807
808         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
809                 /* Reset inherited state */
810                 tp->rx_opt.ts_recent       = 0;
811                 tp->rx_opt.ts_recent_stamp = 0;
812                 tp->write_seq              = 0;
813         }
814
815         if (sysctl_tcp_tw_recycle &&
816             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
817                 struct inet_peer *peer = rt_get_peer(rt);
818
819                 /* VJ's idea. We save the last timestamp seen from
820                  * the destination in the peer table when entering TIME-WAIT state,
821                  * and initialize rx_opt.ts_recent from it when trying a new connection.
822                  */
823
824                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
825                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
826                         tp->rx_opt.ts_recent = peer->tcp_ts;
827                 }
828         }
829
830         inet->dport = usin->sin_port;
831         inet->daddr = daddr;
832
833         tp->ext_header_len = 0;
834         if (inet->opt)
835                 tp->ext_header_len = inet->opt->optlen;
836
837         tp->rx_opt.mss_clamp = 536;
838
839         /* Socket identity is still unknown (sport may be zero).
840          * However we set the state to SYN-SENT and, without releasing the socket
841          * lock, select a source port, enter ourselves into the hash tables and
842          * complete the initialization afterwards.
843          */
844         tcp_set_state(sk, TCP_SYN_SENT);
845         err = tcp_v4_hash_connect(sk);
846         if (err)
847                 goto failure;
848
849         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
850         if (err)
851                 goto failure;
852
853         /* OK, now commit destination to socket.  */
854         __sk_dst_set(sk, &rt->u.dst);
855         tcp_v4_setup_caps(sk, &rt->u.dst);
856
857         if (!tp->write_seq)
858                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
859                                                            inet->daddr,
860                                                            inet->sport,
861                                                            usin->sin_port);
862
863         inet->id = tp->write_seq ^ jiffies;
864
865         err = tcp_connect(sk);
866         rt = NULL;
867         if (err)
868                 goto failure;
869
870         return 0;
871
872 failure:
873         /* This unhashes the socket and releases the local port, if necessary. */
874         tcp_set_state(sk, TCP_CLOSE);
875         ip_rt_put(rt);
876         sk->sk_route_caps = 0;
877         inet->dport = 0;
878         return err;
879 }
880
881 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
882 {
883         return ((struct rtable *)skb->dst)->rt_iif;
884 }
885
886 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
887 {
888         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
889 }
890
891 static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
892                                               struct open_request ***prevp,
893                                               __u16 rport,
894                                               __u32 raddr, __u32 laddr)
895 {
896         struct tcp_listen_opt *lopt = tp->listen_opt;
897         struct open_request *req, **prev;
898
899         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
900              (req = *prev) != NULL;
901              prev = &req->dl_next) {
902                 if (req->rmt_port == rport &&
903                     req->af.v4_req.rmt_addr == raddr &&
904                     req->af.v4_req.loc_addr == laddr &&
905                     TCP_INET_FAMILY(req->class->family)) {
906                         BUG_TRAP(!req->sk);
907                         *prevp = prev;
908                         break;
909                 }
910         }
911
912         return req;
913 }
914
915 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
916 {
917         struct tcp_sock *tp = tcp_sk(sk);
918         struct tcp_listen_opt *lopt = tp->listen_opt;
919         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
920
921         req->expires = jiffies + TCP_TIMEOUT_INIT;
922         req->retrans = 0;
923         req->sk = NULL;
924         req->dl_next = lopt->syn_table[h];
925
926         write_lock(&tp->syn_wait_lock);
927         lopt->syn_table[h] = req;
928         write_unlock(&tp->syn_wait_lock);
929
930         tcp_synq_added(sk);
931 }
932
933
934 /*
935  * This routine does path mtu discovery as defined in RFC1191.
936  */
937 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
938                                      u32 mtu)
939 {
940         struct dst_entry *dst;
941         struct inet_sock *inet = inet_sk(sk);
942         struct tcp_sock *tp = tcp_sk(sk);
943
944         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
945          * sent out by Linux are always < 576 bytes, so they should go through
946          * unfragmented).
947          */
948         if (sk->sk_state == TCP_LISTEN)
949                 return;
950
951         /* We don't check in the dst entry whether pmtu discovery is forbidden
952          * on this route. We just assume that no packet-too-big packets
953          * are sent back when pmtu discovery is not active.
954          * There is a small race when the user changes this flag in the
955          * route, but I think that's acceptable.
956          */
957         if ((dst = __sk_dst_check(sk, 0)) == NULL)
958                 return;
959
960         dst->ops->update_pmtu(dst, mtu);
961
962         /* Something is about to go wrong... Remember the soft error
963          * in case this connection is not able to recover.
964          */
965         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
966                 sk->sk_err_soft = EMSGSIZE;
967
968         mtu = dst_mtu(dst);
969
970         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
971             tp->pmtu_cookie > mtu) {
972                 tcp_sync_mss(sk, mtu);
973
974                 /* Resend the TCP packet because it's
975                  * clear that the old packet has been
976                  * dropped. This is the new "fast" path mtu
977                  * discovery.
978                  */
979                 tcp_simple_retransmit(sk);
980         } /* else let the usual retransmit timer handle it */
981 }
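/* Example: a router on the path sends ICMP type 3 code 4 ("fragmentation
 * needed") carrying its next-hop MTU; tcp_v4_err() below passes that value
 * in as mtu, tcp_sync_mss() shrinks the MSS accordingly, and the lost
 * segment is retransmitted at once instead of waiting for the RTO.
 */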
982
983 /*
984  * This routine is called by the ICMP module when it gets some
985  * sort of error condition.  If err < 0 then the socket should
986  * be closed and the error returned to the user.  If err > 0
987  * it's just the icmp type << 8 | icmp code.  After adjustment
988  * the header points to the first 8 bytes of the tcp header.  We need
989  * to find the appropriate port.
990  *
991  * The locking strategy used here is very "optimistic". When
992  * someone else accesses the socket the ICMP is just dropped
993  * and for some paths there is no check at all.
994  * A more general error queue that queues errors for later handling
995  * would probably be better.
996  *
997  */
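/* Example: a "port unreachable" (type ICMP_DEST_UNREACH, code
 * ICMP_PORT_UNREACH) is mapped below through icmp_err_convert[] to
 * ECONNREFUSED, which lands in sk->sk_err or sk->sk_err_soft depending on
 * the socket state, the IP_RECVERR setting and whether the user currently
 * holds the socket.
 */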
998
999 void tcp_v4_err(struct sk_buff *skb, u32 info)
1000 {
1001         struct iphdr *iph = (struct iphdr *)skb->data;
1002         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
1003         struct tcp_sock *tp;
1004         struct inet_sock *inet;
1005         int type = skb->h.icmph->type;
1006         int code = skb->h.icmph->code;
1007         struct sock *sk;
1008         __u32 seq;
1009         int err;
1010
1011         if (skb->len < (iph->ihl << 2) + 8) {
1012                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1013                 return;
1014         }
1015
1016         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1017                            th->source, tcp_v4_iif(skb));
1018         if (!sk) {
1019                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1020                 return;
1021         }
1022         if (sk->sk_state == TCP_TIME_WAIT) {
1023                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1024                 return;
1025         }
1026
1027         bh_lock_sock(sk);
1028         /* If too many ICMPs get dropped on busy
1029          * servers this needs to be solved differently.
1030          */
1031         if (sock_owned_by_user(sk))
1032                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1033
1034         if (sk->sk_state == TCP_CLOSE)
1035                 goto out;
1036
1037         tp = tcp_sk(sk);
1038         seq = ntohl(th->seq);
1039         if (sk->sk_state != TCP_LISTEN &&
1040             !between(seq, tp->snd_una, tp->snd_nxt)) {
1041                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1042                 goto out;
1043         }
1044
1045         switch (type) {
1046         case ICMP_SOURCE_QUENCH:
1047                 /* Just silently ignore these. */
1048                 goto out;
1049         case ICMP_PARAMETERPROB:
1050                 err = EPROTO;
1051                 break;
1052         case ICMP_DEST_UNREACH:
1053                 if (code > NR_ICMP_UNREACH)
1054                         goto out;
1055
1056                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1057                         if (!sock_owned_by_user(sk))
1058                                 do_pmtu_discovery(sk, iph, info);
1059                         goto out;
1060                 }
1061
1062                 err = icmp_err_convert[code].errno;
1063                 break;
1064         case ICMP_TIME_EXCEEDED:
1065                 err = EHOSTUNREACH;
1066                 break;
1067         default:
1068                 goto out;
1069         }
1070
1071         switch (sk->sk_state) {
1072                 struct open_request *req, **prev;
1073         case TCP_LISTEN:
1074                 if (sock_owned_by_user(sk))
1075                         goto out;
1076
1077                 req = tcp_v4_search_req(tp, &prev, th->dest,
1078                                         iph->daddr, iph->saddr);
1079                 if (!req)
1080                         goto out;
1081
1082                 /* ICMPs are not backlogged, hence we cannot get
1083                    an established socket here.
1084                  */
1085                 BUG_TRAP(!req->sk);
1086
1087                 if (seq != req->snt_isn) {
1088                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1089                         goto out;
1090                 }
1091
1092                 /*
1093                  * Still in SYN_RECV, just remove it silently.
1094                  * There is no good way to pass the error to the newly
1095                  * created socket, and POSIX does not want network
1096                  * errors returned from accept().
1097                  */
1098                 tcp_synq_drop(sk, req, prev);
1099                 goto out;
1100
1101         case TCP_SYN_SENT:
1102         case TCP_SYN_RECV:  /* Cannot happen.
1103                                Actually it can, e.g. if SYNs crossed.
1104                              */
1105                 if (!sock_owned_by_user(sk)) {
1106                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1107                         sk->sk_err = err;
1108
1109                         sk->sk_error_report(sk);
1110
1111                         tcp_done(sk);
1112                 } else {
1113                         sk->sk_err_soft = err;
1114                 }
1115                 goto out;
1116         }
1117
1118         /* If we've already connected we will keep trying
1119          * until we time out, or the user gives up.
1120          *
1121          * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
1122          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
1123          * but it is obsoleted by pmtu discovery).
1124          *
1125          * Note that in the modern internet, where routing is unreliable
1126          * and broken firewalls sit in every dark corner, sending random
1127          * errors ordered by their masters, even these two messages finally lose
1128          * their original sense (even Linux sends invalid PORT_UNREACHs).
1129          *
1130          * Now we are in compliance with RFCs.
1131          *                                                      --ANK (980905)
1132          */
1133
1134         inet = inet_sk(sk);
1135         if (!sock_owned_by_user(sk) && inet->recverr) {
1136                 sk->sk_err = err;
1137                 sk->sk_error_report(sk);
1138         } else  { /* Only an error on timeout */
1139                 sk->sk_err_soft = err;
1140         }
1141
1142 out:
1143         bh_unlock_sock(sk);
1144         sock_put(sk);
1145 }
1146
1147 /* This routine computes an IPv4 TCP checksum. */
1148 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1149                        struct sk_buff *skb)
1150 {
1151         struct inet_sock *inet = inet_sk(sk);
1152
1153         if (skb->ip_summed == CHECKSUM_HW) {
1154                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1155                 skb->csum = offsetof(struct tcphdr, check);
1156         } else {
1157                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1158                                          csum_partial((char *)th,
1159                                                       th->doff << 2,
1160                                                       skb->csum));
1161         }
1162 }
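/* With CHECKSUM_HW the device finishes the job: only the (complemented)
 * pseudo-header sum is stored and skb->csum records where the check field
 * lives; otherwise the header is summed here on top of the payload checksum
 * already accumulated in skb->csum and the result is folded in software.
 */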
1163
1164 /*
1165  *      This routine will send an RST to the other tcp.
1166  *
1167  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1168  *                    for the reset?
1169  *      Answer: if a packet caused an RST, it is not for a socket
1170  *              existing in our system; if it is matched to a socket,
1171  *              it is just a duplicate segment or a bug in the other side's TCP.
1172  *              So we build the reply based only on the parameters that
1173  *              arrived with the segment.
1174  *      Exception: precedence violation. We do not implement it in any case.
1175  */
1176
1177 static void tcp_v4_send_reset(struct sk_buff *skb)
1178 {
1179         struct tcphdr *th = skb->h.th;
1180         struct tcphdr rth;
1181         struct ip_reply_arg arg;
1182
1183         /* Never send a reset in response to a reset. */
1184         if (th->rst)
1185                 return;
1186
1187         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1188                 return;
1189
1190         /* Swap the send and the receive. */
1191         memset(&rth, 0, sizeof(struct tcphdr));
1192         rth.dest   = th->source;
1193         rth.source = th->dest;
1194         rth.doff   = sizeof(struct tcphdr) / 4;
1195         rth.rst    = 1;
1196
1197         if (th->ack) {
1198                 rth.seq = th->ack_seq;
1199         } else {
1200                 rth.ack = 1;
1201                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1202                                     skb->len - (th->doff << 2));
1203         }
1204
1205         memset(&arg, 0, sizeof arg);
1206         arg.iov[0].iov_base = (unsigned char *)&rth;
1207         arg.iov[0].iov_len  = sizeof rth;
1208         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1209                                       skb->nh.iph->saddr, /*XXX*/
1210                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1211         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1212
1213         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1214
1215         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1216         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1217 }
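/* Sequence number choice, per RFC 793 reset generation: if the offending
 * segment carried an ACK, the RST reuses its ack_seq as our sequence
 * number; otherwise we send seq 0 with ACK set, acknowledging everything
 * the segment occupied (its data length plus one for SYN and/or FIN).
 */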
1218
1219 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1220    outside of socket context, is certainly ugly. What can I do?
1221  */
1222
1223 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1224                             u32 win, u32 ts)
1225 {
1226         struct tcphdr *th = skb->h.th;
1227         struct {
1228                 struct tcphdr th;
1229                 u32 tsopt[3];
1230         } rep;
1231         struct ip_reply_arg arg;
1232
1233         memset(&rep.th, 0, sizeof(struct tcphdr));
1234         memset(&arg, 0, sizeof arg);
1235
1236         arg.iov[0].iov_base = (unsigned char *)&rep;
1237         arg.iov[0].iov_len  = sizeof(rep.th);
1238         if (ts) {
1239                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1240                                      (TCPOPT_TIMESTAMP << 8) |
1241                                      TCPOLEN_TIMESTAMP);
1242                 rep.tsopt[1] = htonl(tcp_time_stamp);
1243                 rep.tsopt[2] = htonl(ts);
1244                 arg.iov[0].iov_len = sizeof(rep);
1245         }
1246
1247         /* Swap the send and the receive. */
1248         rep.th.dest    = th->source;
1249         rep.th.source  = th->dest;
1250         rep.th.doff    = arg.iov[0].iov_len / 4;
1251         rep.th.seq     = htonl(seq);
1252         rep.th.ack_seq = htonl(ack);
1253         rep.th.ack     = 1;
1254         rep.th.window  = htons(win);
1255
1256         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1257                                       skb->nh.iph->saddr, /*XXX*/
1258                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1259         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1260
1261         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1262
1263         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1264 }
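/* The tsopt[] block above encodes the standard 12-byte timestamp option:
 *
 *	01 01 08 0a  <TSval = tcp_time_stamp>  <TSecr = ts>
 *
 * i.e. two NOPs for alignment, kind 8, length 10, then the two 32-bit
 * timestamps; that is why doff grows by three 32-bit words when ts != 0.
 */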
1265
1266 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1267 {
1268         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1269
1270         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1271                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1272
1273         tcp_tw_put(tw);
1274 }
1275
1276 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1277 {
1278         tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1279                         req->ts_recent);
1280 }
1281
1282 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1283                                           struct open_request *req)
1284 {
1285         struct rtable *rt;
1286         struct ip_options *opt = req->af.v4_req.opt;
1287         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1288                             .nl_u = { .ip4_u =
1289                                       { .daddr = ((opt && opt->srr) ?
1290                                                   opt->faddr :
1291                                                   req->af.v4_req.rmt_addr),
1292                                         .saddr = req->af.v4_req.loc_addr,
1293                                         .tos = RT_CONN_FLAGS(sk) } },
1294                             .proto = IPPROTO_TCP,
1295                             .uli_u = { .ports =
1296                                        { .sport = inet_sk(sk)->sport,
1297                                          .dport = req->rmt_port } } };
1298
1299         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1300                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1301                 return NULL;
1302         }
1303         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1304                 ip_rt_put(rt);
1305                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1306                 return NULL;
1307         }
1308         return &rt->u.dst;
1309 }
1310
1311 /*
1312  *      Send a SYN-ACK after having received an ACK.
1313  *      This still operates on a open_request only, not on a big
1314  *      socket.
1315  */
1316 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1317                               struct dst_entry *dst)
1318 {
1319         int err = -1;
1320         struct sk_buff * skb;
1321
1322         /* First, grab a route. */
1323         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1324                 goto out;
1325
1326         skb = tcp_make_synack(sk, dst, req);
1327
1328         if (skb) {
1329                 struct tcphdr *th = skb->h.th;
1330
1331                 th->check = tcp_v4_check(th, skb->len,
1332                                          req->af.v4_req.loc_addr,
1333                                          req->af.v4_req.rmt_addr,
1334                                          csum_partial((char *)th, skb->len,
1335                                                       skb->csum));
1336
1337                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1338                                             req->af.v4_req.rmt_addr,
1339                                             req->af.v4_req.opt);
1340                 if (err == NET_XMIT_CN)
1341                         err = 0;
1342         }
1343
1344 out:
1345         dst_release(dst);
1346         return err;
1347 }
1348
1349 /*
1350  *      IPv4 open_request destructor.
1351  */
1352 static void tcp_v4_or_free(struct open_request *req)
1353 {
1354         if (req->af.v4_req.opt)
1355                 kfree(req->af.v4_req.opt);
1356 }
1357
1358 static inline void syn_flood_warning(struct sk_buff *skb)
1359 {
1360         static unsigned long warntime;
1361
1362         if (time_after(jiffies, (warntime + HZ * 60))) {
1363                 warntime = jiffies;
1364                 printk(KERN_INFO
1365                        "possible SYN flooding on port %d. Sending cookies.\n",
1366                        ntohs(skb->h.th->dest));
1367         }
1368 }
1369
1370 /*
1371  * Save and compile IPv4 options into the open_request if needed.
1372  */
1373 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1374                                                      struct sk_buff *skb)
1375 {
1376         struct ip_options *opt = &(IPCB(skb)->opt);
1377         struct ip_options *dopt = NULL;
1378
1379         if (opt && opt->optlen) {
1380                 int opt_size = optlength(opt);
1381                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1382                 if (dopt) {
1383                         if (ip_options_echo(dopt, skb)) {
1384                                 kfree(dopt);
1385                                 dopt = NULL;
1386                         }
1387                 }
1388         }
1389         return dopt;
1390 }
1391
1392 /*
1393  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1394  * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1395  * It would be better to replace it with a global counter for all sockets
1396  * but then some measure against one socket starving all other sockets
1397  * would be needed.
1398  *
1399  * It was 128 by default. Experiments with real servers show that
1400  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1401  * of the problems. This value is adjusted to 128 for very small machines
1402  * (<= 32MB of memory) and to 1024 on normal or better ones (>= 256MB).
1403  * Increasing it further requires changing the hash table size.
1404  */
1405 int sysctl_max_syn_backlog = 256;
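/* This is the value behind the tcp_max_syn_backlog sysctl; a loaded server
 * would typically be tuned with something like
 *
 *	sysctl -w net.ipv4.tcp_max_syn_backlog=1024
 *
 * usually together with net.ipv4.tcp_syncookies=1 as a safety net.
 */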
1406
1407 struct or_calltable or_ipv4 = {
1408         .family         =       PF_INET,
1409         .rtx_syn_ack    =       tcp_v4_send_synack,
1410         .send_ack       =       tcp_v4_or_send_ack,
1411         .destructor     =       tcp_v4_or_free,
1412         .send_reset     =       tcp_v4_send_reset,
1413 };
1414
1415 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1416 {
1417         struct tcp_options_received tmp_opt;
1418         struct open_request *req;
1419         __u32 saddr = skb->nh.iph->saddr;
1420         __u32 daddr = skb->nh.iph->daddr;
1421         __u32 isn = TCP_SKB_CB(skb)->when;
1422         struct dst_entry *dst = NULL;
1423 #ifdef CONFIG_SYN_COOKIES
1424         int want_cookie = 0;
1425 #else
1426 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1427 #endif
1428
1429         /* Never answer SYNs sent to broadcast or multicast */
1430         if (((struct rtable *)skb->dst)->rt_flags &
1431             (RTCF_BROADCAST | RTCF_MULTICAST))
1432                 goto drop;
1433
1434         /* TW buckets are converted to open requests without
1435          * limitations; they conserve resources and the peer is
1436          * evidently a real one.
1437          */
1438         if (tcp_synq_is_full(sk) && !isn) {
1439 #ifdef CONFIG_SYN_COOKIES
1440                 if (sysctl_tcp_syncookies) {
1441                         want_cookie = 1;
1442                 } else
1443 #endif
1444                 goto drop;
1445         }
1446
1447         /* Accept backlog is full. If we have already queued enough
1448          * warm entries in the syn queue, drop the request. That is better than
1449          * clogging the syn queue with openreqs with exponentially increasing
1450          * timeouts.
1451          */
1452         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1453                 goto drop;
1454
1455         req = tcp_openreq_alloc();
1456         if (!req)
1457                 goto drop;
1458
1459         tcp_clear_options(&tmp_opt);
1460         tmp_opt.mss_clamp = 536;
1461         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1462
1463         tcp_parse_options(skb, &tmp_opt, 0);
1464
1465         if (want_cookie) {
1466                 tcp_clear_options(&tmp_opt);
1467                 tmp_opt.saw_tstamp = 0;
1468         }
1469
1470         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1471                 /* Some OSes (unknown ones, but I see them on a web server which
1472                  * contains information interesting only for windows'
1473                  * users) do not send their stamp in the SYN. It is an easy case.
1474                  * We simply do not advertise TS support.
1475                  */
1476                 tmp_opt.saw_tstamp = 0;
1477                 tmp_opt.tstamp_ok  = 0;
1478         }
1479         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1480
1481         tcp_openreq_init(req, &tmp_opt, skb);
1482
1483         req->af.v4_req.loc_addr = daddr;
1484         req->af.v4_req.rmt_addr = saddr;
1485         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1486         req->class = &or_ipv4;
1487         if (!want_cookie)
1488                 TCP_ECN_create_request(req, skb->h.th);
1489
1490         if (want_cookie) {
1491 #ifdef CONFIG_SYN_COOKIES
1492                 syn_flood_warning(skb);
1493 #endif
1494                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1495         } else if (!isn) {
1496                 struct inet_peer *peer = NULL;
1497
1498                 /* VJ's idea. We save the last timestamp seen
1499                  * from the destination in the peer table when entering
1500                  * TIME-WAIT state, and check against it before
1501                  * accepting a new connection request.
1502                  *
1503                  * If "isn" is not zero, this request hit a live
1504                  * timewait bucket, so all the necessary checks
1505                  * are made in the function processing the timewait state.
1506                  */
1507                 if (tmp_opt.saw_tstamp &&
1508                     sysctl_tcp_tw_recycle &&
1509                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1510                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1511                     peer->v4daddr == saddr) {
1512                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1513                             (s32)(peer->tcp_ts - req->ts_recent) >
1514                                                         TCP_PAWS_WINDOW) {
1515                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1516                                 dst_release(dst);
1517                                 goto drop_and_free;
1518                         }
1519                 }
1520                 /* Kill the following clause if you dislike this approach. */
1521                 else if (!sysctl_tcp_syncookies &&
1522                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1523                           (sysctl_max_syn_backlog >> 2)) &&
1524                          (!peer || !peer->tcp_ts_stamp) &&
1525                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1526                         /* Without syncookies, the last quarter of
1527                          * the backlog is reserved for destinations
1528                          * proven to be alive.
1529                          * It means that under a SYN flood we keep
1530                          * communicating with destinations we already
1531                          * remembered before the flood started.
1532                          */
1533                         NETDEBUG(if (net_ratelimit()) \
1534                                         printk(KERN_DEBUG "TCP: drop open "
1535                                                           "request from %u.%u."
1536                                                           "%u.%u/%u\n", \
1537                                                NIPQUAD(saddr),
1538                                                ntohs(skb->h.th->source)));
1539                         dst_release(dst);
1540                         goto drop_and_free;
1541                 }
1542
1543                 isn = tcp_v4_init_sequence(sk, skb);
1544         }
1545         req->snt_isn = isn;
1546
1547         if (tcp_v4_send_synack(sk, req, dst))
1548                 goto drop_and_free;
1549
1550         if (want_cookie) {
1551                 tcp_openreq_free(req);
1552         } else {
1553                 tcp_v4_synq_add(sk, req);
1554         }
1555         return 0;
1556
1557 drop_and_free:
1558         tcp_openreq_free(req);
1559 drop:
1560         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1561         return 0;
1562 }
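/*
 * In short, tcp_v4_conn_request() drops a SYN when (a) the SYN queue is full,
 * the segment did not come from a recycled TIME_WAIT bucket, and syncookies
 * are unavailable, or (b) the accept queue is full and more than one young
 * (not yet retransmitted) entry is already queued.  Otherwise it allocates an
 * open_request, picks an ISN (from a syncookie, the TIME_WAIT recycling path,
 * or tcp_v4_init_sequence()), sends the SYN-ACK and, unless a cookie was
 * used, adds the request to the listener's SYN queue.
 */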
1563
1564
1565 /*
1566  * The three-way handshake has completed - we got a valid ACK -
1567  * now create the new socket.
1568  */
1569 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1570                                   struct open_request *req,
1571                                   struct dst_entry *dst)
1572 {
1573         struct inet_sock *newinet;
1574         struct tcp_sock *newtp;
1575         struct sock *newsk;
1576
1577         if (sk_acceptq_is_full(sk))
1578                 goto exit_overflow;
1579
1580         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1581                 goto exit;
1582
1583         newsk = tcp_create_openreq_child(sk, req, skb);
1584         if (!newsk)
1585                 goto exit;
1586
1587         newsk->sk_dst_cache = dst;
1588         tcp_v4_setup_caps(newsk, dst);
1589
1590         newtp                 = tcp_sk(newsk);
1591         newinet               = inet_sk(newsk);
1592         newinet->daddr        = req->af.v4_req.rmt_addr;
1593         newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1594         newinet->saddr        = req->af.v4_req.loc_addr;
1595         newinet->opt          = req->af.v4_req.opt;
1596         req->af.v4_req.opt    = NULL;
1597         newinet->mc_index     = tcp_v4_iif(skb);
1598         newinet->mc_ttl       = skb->nh.iph->ttl;
1599         newtp->ext_header_len = 0;
1600         if (newinet->opt)
1601                 newtp->ext_header_len = newinet->opt->optlen;
1602         newinet->id = newtp->write_seq ^ jiffies;
1603
1604         tcp_sync_mss(newsk, dst_mtu(dst));
1605         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1606         tcp_initialize_rcv_mss(newsk);
1607
1608         __tcp_v4_hash(newsk, 0);
1609         __tcp_inherit_port(sk, newsk);
1610
1611         return newsk;
1612
1613 exit_overflow:
1614         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1615 exit:
1616         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1617         dst_release(dst);
1618         return NULL;
1619 }
1620
1621 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1622 {
1623         struct tcphdr *th = skb->h.th;
1624         struct iphdr *iph = skb->nh.iph;
1625         struct tcp_sock *tp = tcp_sk(sk);
1626         struct sock *nsk;
1627         struct open_request **prev;
1628         /* Find possible connection requests. */
1629         struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1630                                                      iph->saddr, iph->daddr);
1631         if (req)
1632                 return tcp_check_req(sk, skb, req, prev);
1633
1634         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1635                                           th->source,
1636                                           skb->nh.iph->daddr,
1637                                           ntohs(th->dest),
1638                                           tcp_v4_iif(skb));
1639
1640         if (nsk) {
1641                 if (nsk->sk_state != TCP_TIME_WAIT) {
1642                         bh_lock_sock(nsk);
1643                         return nsk;
1644                 }
1645                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1646                 return NULL;
1647         }
1648
1649 #ifdef CONFIG_SYN_COOKIES
1650         if (!th->rst && !th->syn && th->ack)
1651                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1652 #endif
1653         return sk;
1654 }
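/*
 * Lookup order used above: the listener's SYN table first (a pending
 * open_request means this is most likely the final ACK of a handshake), then
 * the established/TIME_WAIT hash (the segment may belong to a child created
 * meanwhile), and finally, when syncookies are enabled, a bare ACK is handed
 * to cookie_v4_check() in case it carries a valid cookie.
 */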
1655
1656 static int tcp_v4_checksum_init(struct sk_buff *skb)
1657 {
1658         if (skb->ip_summed == CHECKSUM_HW) {
1659                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1660                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1661                                   skb->nh.iph->daddr, skb->csum))
1662                         return 0;
1663
1664                 NETDEBUG(if (net_ratelimit())
1665                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1666                 skb->ip_summed = CHECKSUM_NONE;
1667         }
1668         if (skb->len <= 76) {
1669                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1670                                  skb->nh.iph->daddr,
1671                                  skb_checksum(skb, 0, skb->len, 0)))
1672                         return -1;
1673                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1674         } else {
1675                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1676                                           skb->nh.iph->saddr,
1677                                           skb->nh.iph->daddr, 0);
1678         }
1679         return 0;
1680 }
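/*
 * Checksum strategy above: a hardware-computed sum is accepted once the
 * pseudo-header check passes; short packets (<= 76 bytes) are cheap enough to
 * checksum completely right away; for anything larger only the pseudo-header
 * sum is precomputed here and full verification is deferred (e.g. folded into
 * the copy-to-user path).  The TCP/IPv4 pseudo-header folded in by
 * tcp_v4_check() covers, roughly:
 *
 *	saddr (4) | daddr (4) | zero (1) | IPPROTO_TCP (1) | TCP length (2)
 */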
1681
1682
1683 /* The socket must have its spinlock held when we get
1684  * here.
1685  *
1686  * We have a potential double-lock case here, so even when
1687  * doing backlog processing we use the BH locking scheme.
1688  * This is because we cannot sleep with the original spinlock
1689  * held.
1690  */
1691 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1692 {
1693         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1694                 TCP_CHECK_TIMER(sk);
1695                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1696                         goto reset;
1697                 TCP_CHECK_TIMER(sk);
1698                 return 0;
1699         }
1700
1701         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1702                 goto csum_err;
1703
1704         if (sk->sk_state == TCP_LISTEN) {
1705                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1706                 if (!nsk)
1707                         goto discard;
1708
1709                 if (nsk != sk) {
1710                         if (tcp_child_process(sk, nsk, skb))
1711                                 goto reset;
1712                         return 0;
1713                 }
1714         }
1715
1716         TCP_CHECK_TIMER(sk);
1717         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1718                 goto reset;
1719         TCP_CHECK_TIMER(sk);
1720         return 0;
1721
1722 reset:
1723         tcp_v4_send_reset(skb);
1724 discard:
1725         kfree_skb(skb);
1726         /* Be careful here. If this function gets more complicated and
1727          * gcc suffers from register pressure on the x86, sk (in %ebx)
1728          * might be destroyed here. This current version compiles correctly,
1729          * but you have been warned.
1730          */
1731         return 0;
1732
1733 csum_err:
1734         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1735         goto discard;
1736 }
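/*
 * tcp_v4_do_rcv() is the per-socket dispatch point: ESTABLISHED sockets take
 * the header-prediction fast path in tcp_rcv_established(); LISTEN sockets
 * first try to turn the segment into a child socket via tcp_v4_hnd_req(); the
 * remaining states (and any freshly created child) go through the generic
 * state machine in tcp_rcv_state_process().  A non-zero return from any of
 * these makes us answer with a reset.
 */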
1737
1738 /*
1739  *      From tcp_input.c
1740  */
1741
1742 int tcp_v4_rcv(struct sk_buff *skb)
1743 {
1744         struct tcphdr *th;
1745         struct sock *sk;
1746         int ret;
1747
1748         if (skb->pkt_type != PACKET_HOST)
1749                 goto discard_it;
1750
1751         /* Count it even if it's bad */
1752         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1753
1754         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1755                 goto discard_it;
1756
1757         th = skb->h.th;
1758
1759         if (th->doff < sizeof(struct tcphdr) / 4)
1760                 goto bad_packet;
1761         if (!pskb_may_pull(skb, th->doff * 4))
1762                 goto discard_it;
1763
1764         /* An explanation is required here, I think.
1765          * Packet length and doff are validated by header prediction,
1766          * provided the case of th->doff == 0 is eliminated.
1767          * So, we defer the checks. */
1768         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1769              tcp_v4_checksum_init(skb) < 0))
1770                 goto bad_packet;
1771
1772         th = skb->h.th;
1773         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1774         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1775                                     skb->len - th->doff * 4);
1776         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1777         TCP_SKB_CB(skb)->when    = 0;
1778         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1779         TCP_SKB_CB(skb)->sacked  = 0;
1780
1781         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1782                              skb->nh.iph->daddr, ntohs(th->dest),
1783                              tcp_v4_iif(skb));
1784
1785         if (!sk)
1786                 goto no_tcp_socket;
1787
1788 process:
1789 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
1790         /* Silently drop if VNET is active and the context is not
1791          * entitled to read the packet.
1792          */
1793         if (vnet_active) {
1794                 /* Transfer ownership of reusable TIME_WAIT buckets to
1795                  * whoever VNET decided should own the packet.
1796                  */
1797                 if (sk->sk_state == TCP_TIME_WAIT)
1798                         sk->sk_xid = skb->xid;
1799
1800                 if ((int) sk->sk_xid > 0 && sk->sk_xid != skb->xid)
1801                         goto discard_it;
1802         }
1803 #endif
1804
1805         if (sk->sk_state == TCP_TIME_WAIT)
1806                 goto do_time_wait;
1807
1808         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1809                 goto discard_and_relse;
1810
1811         if (sk_filter(sk, skb, 0))
1812                 goto discard_and_relse;
1813
1814         skb->dev = NULL;
1815
1816         bh_lock_sock(sk);
1817         ret = 0;
1818         if (!sock_owned_by_user(sk)) {
1819                 if (!tcp_prequeue(sk, skb))
1820                         ret = tcp_v4_do_rcv(sk, skb);
1821         } else
1822                 sk_add_backlog(sk, skb);
1823         bh_unlock_sock(sk);
1824
1825         sock_put(sk);
1826
1827         return ret;
1828
1829 no_tcp_socket:
1830         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1831                 goto discard_it;
1832
1833         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1834 bad_packet:
1835                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1836 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
1837         } else if (vnet_active && skb->sk) {
1838                 /* VNET: Suppress RST if the port was bound to a (presumably raw) socket */
1839 #endif
1840         } else {
1841                 tcp_v4_send_reset(skb);
1842         }
1843
1844 discard_it:
1845         /* Discard frame. */
1846         kfree_skb(skb);
1847         return 0;
1848
1849 discard_and_relse:
1850         sock_put(sk);
1851         goto discard_it;
1852
1853 do_time_wait:
1854         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1855                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1856                 goto discard_it;
1857         }
1858
1859         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1860                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1861                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1862                 goto discard_it;
1863         }
1864         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1865                                            skb, th, skb->len)) {
1866         case TCP_TW_SYN: {
1867                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1868                                                           ntohs(th->dest),
1869                                                           tcp_v4_iif(skb));
1870                 if (sk2) {
1871                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1872                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1873                         sk = sk2;
1874                         goto process;
1875                 }
1876                 /* Fall through to ACK */
1877         }
1878         case TCP_TW_ACK:
1879                 tcp_v4_timewait_ack(sk, skb);
1880                 break;
1881         case TCP_TW_RST:
1882                 goto no_tcp_socket;
1883         case TCP_TW_SUCCESS:;
1884         }
1885         goto discard_it;
1886 }
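/*
 * The TIME_WAIT leg above also implements the classic reuse trick: a SYN that
 * hits a TIME_WAIT bucket and passes tcp_timewait_state_process() is handed to
 * a matching listener (the bucket is descheduled and dropped) and treated as a
 * brand-new connection request; otherwise the segment is acknowledged,
 * answered with a reset, or silently ignored, as that function decides.
 */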
1887
1888 /* With per-bucket locks this operation is not atomic, so
1889  * this version is no worse.
1890  */
1891 static void __tcp_v4_rehash(struct sock *sk)
1892 {
1893         sk->sk_prot->unhash(sk);
1894         sk->sk_prot->hash(sk);
1895 }
1896
1897 static int tcp_v4_reselect_saddr(struct sock *sk)
1898 {
1899         struct inet_sock *inet = inet_sk(sk);
1900         int err;
1901         struct rtable *rt;
1902         __u32 old_saddr = inet->saddr;
1903         __u32 new_saddr;
1904         __u32 daddr = inet->daddr;
1905
1906         if (inet->opt && inet->opt->srr)
1907                 daddr = inet->opt->faddr;
1908
1909         /* Query new route. */
1910         err = ip_route_connect(&rt, daddr, 0,
1911                                RT_CONN_FLAGS(sk),
1912                                sk->sk_bound_dev_if,
1913                                IPPROTO_TCP,
1914                                inet->sport, inet->dport, sk);
1915         if (err)
1916                 return err;
1917
1918         __sk_dst_set(sk, &rt->u.dst);
1919         tcp_v4_setup_caps(sk, &rt->u.dst);
1920
1921         new_saddr = rt->rt_src;
1922
1923         if (new_saddr == old_saddr)
1924                 return 0;
1925
1926         if (sysctl_ip_dynaddr > 1) {
1927                 printk(KERN_INFO "tcp_v4_reselect_saddr(): shifting inet->"
1928                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1929                        NIPQUAD(old_saddr),
1930                        NIPQUAD(new_saddr));
1931         }
1932
1933         inet->saddr = new_saddr;
1934         inet->rcv_saddr = new_saddr;
1935
1936         /* XXX The one ugly spot where we need to
1937          * XXX really change the socket's identity after
1938          * XXX it has entered the hashes. -DaveM
1939          *
1940          * Besides that, it does not check for connection
1941          * uniqueness. Wait for troubles.
1942          */
1943         __tcp_v4_rehash(sk);
1944         return 0;
1945 }
1946
1947 int tcp_v4_rebuild_header(struct sock *sk)
1948 {
1949         struct inet_sock *inet = inet_sk(sk);
1950         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1951         u32 daddr;
1952         int err;
1953
1954         /* Route is OK, nothing to do. */
1955         if (rt)
1956                 return 0;
1957
1958         /* Reroute. */
1959         daddr = inet->daddr;
1960         if (inet->opt && inet->opt->srr)
1961                 daddr = inet->opt->faddr;
1962
1963         {
1964                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1965                                     .nl_u = { .ip4_u =
1966                                               { .daddr = daddr,
1967                                                 .saddr = inet->saddr,
1968                                                 .tos = RT_CONN_FLAGS(sk) } },
1969                                     .proto = IPPROTO_TCP,
1970                                     .uli_u = { .ports =
1971                                                { .sport = inet->sport,
1972                                                  .dport = inet->dport } } };
1973                                                 
1974                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1975         }
1976         if (!err) {
1977                 __sk_dst_set(sk, &rt->u.dst);
1978                 tcp_v4_setup_caps(sk, &rt->u.dst);
1979                 return 0;
1980         }
1981
1982         /* Routing failed... */
1983         sk->sk_route_caps = 0;
1984
1985         if (!sysctl_ip_dynaddr ||
1986             sk->sk_state != TCP_SYN_SENT ||
1987             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1988             (err = tcp_v4_reselect_saddr(sk)) != 0)
1989                 sk->sk_err_soft = -err;
1990
1991         return err;
1992 }
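/*
 * Note on the fallback above: when rerouting with the current source address
 * fails, a new source address is selected only if ip_dynaddr is enabled, the
 * connection is still in SYN_SENT and the user has not locked the local
 * address with an explicit bind; with ip_dynaddr greater than 1 the address
 * change is additionally logged by tcp_v4_reselect_saddr().
 */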
1993
1994 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1995 {
1996         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1997         struct inet_sock *inet = inet_sk(sk);
1998
1999         sin->sin_family         = AF_INET;
2000         sin->sin_addr.s_addr    = inet->daddr;
2001         sin->sin_port           = inet->dport;
2002 }
2003
2004 /* VJ's idea. Save the last timestamp seen from this destination
2005  * and hold it at least for the normal timewait interval, to use for
2006  * duplicate segment detection in subsequent connections before they
2007  * enter the synchronized state.
2008  */
2009
2010 int tcp_v4_remember_stamp(struct sock *sk)
2011 {
2012         struct inet_sock *inet = inet_sk(sk);
2013         struct tcp_sock *tp = tcp_sk(sk);
2014         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
2015         struct inet_peer *peer = NULL;
2016         int release_it = 0;
2017
2018         if (!rt || rt->rt_dst != inet->daddr) {
2019                 peer = inet_getpeer(inet->daddr, 1);
2020                 release_it = 1;
2021         } else {
2022                 if (!rt->peer)
2023                         rt_bind_peer(rt, 1);
2024                 peer = rt->peer;
2025         }
2026
2027         if (peer) {
2028                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
2029                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2030                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
2031                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
2032                         peer->tcp_ts = tp->rx_opt.ts_recent;
2033                 }
2034                 if (release_it)
2035                         inet_putpeer(peer);
2036                 return 1;
2037         }
2038
2039         return 0;
2040 }
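/*
 * The cached peer timestamp is only overwritten above when our value is at
 * least as recent ((s32)(peer->tcp_ts - ts_recent) <= 0), or when the cached
 * entry is both older than TCP_PAWS_MSL seconds and not newer than what this
 * connection saw; that way a stale cache entry cannot shadow a fresher
 * timestamp learned by another connection.
 */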
2041
2042 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2043 {
2044         struct inet_peer *peer = NULL;
2045
2046         peer = inet_getpeer(tw->tw_daddr, 1);
2047
2048         if (peer) {
2049                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2050                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2051                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2052                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2053                         peer->tcp_ts = tw->tw_ts_recent;
2054                 }
2055                 inet_putpeer(peer);
2056                 return 1;
2057         }
2058
2059         return 0;
2060 }
2061
2062 struct tcp_func ipv4_specific = {
2063         .queue_xmit     =       ip_queue_xmit,
2064         .send_check     =       tcp_v4_send_check,
2065         .rebuild_header =       tcp_v4_rebuild_header,
2066         .conn_request   =       tcp_v4_conn_request,
2067         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2068         .remember_stamp =       tcp_v4_remember_stamp,
2069         .net_header_len =       sizeof(struct iphdr),
2070         .setsockopt     =       ip_setsockopt,
2071         .getsockopt     =       ip_getsockopt,
2072         .addr2sockaddr  =       v4_addr2sockaddr,
2073         .sockaddr_len   =       sizeof(struct sockaddr_in),
2074 };
2075
2076 /* NOTE: A lot of things are set to zero explicitly by the call to
2077  *       sk_alloc(), so they need not be done here.
2078  */
2079 static int tcp_v4_init_sock(struct sock *sk)
2080 {
2081         struct tcp_sock *tp = tcp_sk(sk);
2082
2083         skb_queue_head_init(&tp->out_of_order_queue);
2084         tcp_init_xmit_timers(sk);
2085         tcp_prequeue_init(tp);
2086
2087         tp->rto  = TCP_TIMEOUT_INIT;
2088         tp->mdev = TCP_TIMEOUT_INIT;
2089
2090         /* So many TCP implementations out there (incorrectly) count the
2091          * initial SYN frame in their delayed-ACK and congestion control
2092          * algorithms that we must have the following bandaid to talk
2093          * efficiently to them.  -DaveM
2094          */
2095         tp->snd_cwnd = 2;
2096
2097         /* See draft-stevens-tcpca-spec-01 for discussion of the
2098          * initialization of these values.
2099          */
2100         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2101         tp->snd_cwnd_clamp = ~0;
2102         tp->mss_cache_std = tp->mss_cache = 536;
2103
2104         tp->reordering = sysctl_tcp_reordering;
2105
2106         sk->sk_state = TCP_CLOSE;
2107
2108         sk->sk_write_space = sk_stream_write_space;
2109         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2110
2111         tp->af_specific = &ipv4_specific;
2112
2113         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2114         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2115
2116         atomic_inc(&tcp_sockets_allocated);
2117
2118         return 0;
2119 }
2120
2121 int tcp_v4_destroy_sock(struct sock *sk)
2122 {
2123         struct tcp_sock *tp = tcp_sk(sk);
2124
2125         tcp_clear_xmit_timers(sk);
2126
2127         /* Clean up the write buffer. */
2128         sk_stream_writequeue_purge(sk);
2129
2130         /* Cleans up our, hopefully empty, out_of_order_queue. */
2131         __skb_queue_purge(&tp->out_of_order_queue);
2132
2133         /* Clean the prequeue; it really must be empty. */
2134         __skb_queue_purge(&tp->ucopy.prequeue);
2135
2136         /* Clean up a referenced TCP bind bucket. */
2137         if (tp->bind_hash)
2138                 tcp_put_port(sk);
2139
2140         /*
2141          * If a cached sendmsg page exists, toss it.
2142          */
2143         if (sk->sk_sndmsg_page) {
2144                 __free_page(sk->sk_sndmsg_page);
2145                 sk->sk_sndmsg_page = NULL;
2146         }
2147
2148         atomic_dec(&tcp_sockets_allocated);
2149
2150         return 0;
2151 }
2152
2153 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2154
2155 #ifdef CONFIG_PROC_FS
2156 /* Proc filesystem TCP sock list dumping. */
2157
2158 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2159 {
2160         return hlist_empty(head) ? NULL :
2161                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2162 }
2163
2164 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2165 {
2166         return tw->tw_node.next ?
2167                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2168 }
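/*
 * The /proc iterator below first walks the listening hash, diving into each
 * listener's SYN table (state OPENREQ) as it goes, and then walks every ehash
 * bucket's established chain followed by the corresponding TIME_WAIT chain in
 * the upper half of the ehash.  st->bucket and st->sbucket record the current
 * hash buckets, and st->num is the running index printed in the "sl" column.
 */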
2169
2170 static void *listening_get_next(struct seq_file *seq, void *cur)
2171 {
2172         struct tcp_sock *tp;
2173         struct hlist_node *node;
2174         struct sock *sk = cur;
2175         struct tcp_iter_state* st = seq->private;
2176
2177         if (!sk) {
2178                 st->bucket = 0;
2179                 sk = sk_head(&tcp_listening_hash[0]);
2180                 goto get_sk;
2181         }
2182
2183         ++st->num;
2184
2185         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2186                 struct open_request *req = cur;
2187
2188                 tp = tcp_sk(st->syn_wait_sk);
2189                 req = req->dl_next;
2190                 while (1) {
2191                         while (req) {
2192                                 vxdprintk(VXD_CBIT(net, 6),
2193                                         "sk,req: %p [#%d] (from %d)", req->sk,
2194                                         (req->sk)?req->sk->sk_xid:0, vx_current_xid());
2195                                 if (req->sk && !vx_check(req->sk->sk_xid,
2196                                                 VX_IDENT|VX_WATCH)) {
2197                                         /* advance before skipping, or we spin
                                                 * on the same hidden request forever */
                                                req = req->dl_next;
                                                continue;
                                        }
2198                                 if (req->class->family == st->family) {
2199                                         cur = req;
2200                                         goto out;
2201                                 }
2202                                 req = req->dl_next;
2203                         }
2204                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2205                                 break;
2206 get_req:
2207                         req = tp->listen_opt->syn_table[st->sbucket];
2208                 }
2209                 sk        = sk_next(st->syn_wait_sk);
2210                 st->state = TCP_SEQ_STATE_LISTENING;
2211                 read_unlock_bh(&tp->syn_wait_lock);
2212         } else {
2213                 tp = tcp_sk(sk);
2214                 read_lock_bh(&tp->syn_wait_lock);
2215                 if (tp->listen_opt && tp->listen_opt->qlen)
2216                         goto start_req;
2217                 read_unlock_bh(&tp->syn_wait_lock);
2218                 sk = sk_next(sk);
2219         }
2220 get_sk:
2221         sk_for_each_from(sk, node) {
2222                 vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
2223                         sk, sk->sk_xid, vx_current_xid());
2224                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2225                         continue;
2226                 if (sk->sk_family == st->family) {
2227                         cur = sk;
2228                         goto out;
2229                 }
2230                 tp = tcp_sk(sk);
2231                 read_lock_bh(&tp->syn_wait_lock);
2232                 if (tp->listen_opt && tp->listen_opt->qlen) {
2233 start_req:
2234                         st->uid         = sock_i_uid(sk);
2235                         st->syn_wait_sk = sk;
2236                         st->state       = TCP_SEQ_STATE_OPENREQ;
2237                         st->sbucket     = 0;
2238                         goto get_req;
2239                 }
2240                 read_unlock_bh(&tp->syn_wait_lock);
2241         }
2242         if (++st->bucket < TCP_LHTABLE_SIZE) {
2243                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2244                 goto get_sk;
2245         }
2246         cur = NULL;
2247 out:
2248         return cur;
2249 }
2250
2251 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2252 {
2253         void *rc = listening_get_next(seq, NULL);
2254
2255         while (rc && *pos) {
2256                 rc = listening_get_next(seq, rc);
2257                 --*pos;
2258         }
2259         return rc;
2260 }
2261
2262 static void *established_get_first(struct seq_file *seq)
2263 {
2264         struct tcp_iter_state* st = seq->private;
2265         void *rc = NULL;
2266
2267         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2268                 struct sock *sk;
2269                 struct hlist_node *node;
2270                 struct tcp_tw_bucket *tw;
2271
2272                 /* We can reschedule _before_ having picked the target: */
2273                 cond_resched_softirq();
2274
2275                 read_lock(&tcp_ehash[st->bucket].lock);
2276                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2277                         vxdprintk(VXD_CBIT(net, 6),
2278                                 "sk,egf: %p [#%d] (from %d)",
2279                                 sk, sk->sk_xid, vx_current_xid());
2280                         if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2281                                 continue;
2282                         if (sk->sk_family != st->family)
2283                                 continue;
2284                         rc = sk;
2285                         goto out;
2286                 }
2287                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2288                 tw_for_each(tw, node,
2289                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2290                         vxdprintk(VXD_CBIT(net, 6),
2291                                 "tw: %p [#%d] (from %d)",
2292                                 tw, tw->tw_xid, vx_current_xid());
2293                         if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
2294                                 continue;
2295                         if (tw->tw_family != st->family)
2296                                 continue;
2297                         rc = tw;
2298                         goto out;
2299                 }
2300                 read_unlock(&tcp_ehash[st->bucket].lock);
2301                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2302         }
2303 out:
2304         return rc;
2305 }
2306
2307 static void *established_get_next(struct seq_file *seq, void *cur)
2308 {
2309         struct sock *sk = cur;
2310         struct tcp_tw_bucket *tw;
2311         struct hlist_node *node;
2312         struct tcp_iter_state* st = seq->private;
2313
2314         ++st->num;
2315
2316         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2317                 tw = cur;
2318                 tw = tw_next(tw);
2319 get_tw:
2320                 while (tw && (tw->tw_family != st->family ||
2321                         !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
2322                         tw = tw_next(tw);
2323                 }
2324                 if (tw) {
2325                         cur = tw;
2326                         goto out;
2327                 }
2328                 read_unlock(&tcp_ehash[st->bucket].lock);
2329                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2330
2331                 /* We can reschedule between buckets: */
2332                 cond_resched_softirq();
2333
2334                 if (++st->bucket < tcp_ehash_size) {
2335                         read_lock(&tcp_ehash[st->bucket].lock);
2336                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2337                 } else {
2338                         cur = NULL;
2339                         goto out;
2340                 }
2341         } else
2342                 sk = sk_next(sk);
2343
2344         sk_for_each_from(sk, node) {
2345                 vxdprintk(VXD_CBIT(net, 6),
2346                         "sk,egn: %p [#%d] (from %d)",
2347                         sk, sk->sk_xid, vx_current_xid());
2348                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2349                         continue;
2350                 if (sk->sk_family == st->family)
2351                         goto found;
2352         }
2353
2354         st->state = TCP_SEQ_STATE_TIME_WAIT;
2355         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2356         goto get_tw;
2357 found:
2358         cur = sk;
2359 out:
2360         return cur;
2361 }
2362
2363 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2364 {
2365         void *rc = established_get_first(seq);
2366
2367         while (rc && pos) {
2368                 rc = established_get_next(seq, rc);
2369                 --pos;
2370         }               
2371         return rc;
2372 }
2373
2374 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2375 {
2376         void *rc;
2377         struct tcp_iter_state* st = seq->private;
2378
2379         tcp_listen_lock();
2380         st->state = TCP_SEQ_STATE_LISTENING;
2381         rc        = listening_get_idx(seq, &pos);
2382
2383         if (!rc) {
2384                 tcp_listen_unlock();
2385                 local_bh_disable();
2386                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2387                 rc        = established_get_idx(seq, pos);
2388         }
2389
2390         return rc;
2391 }
2392
2393 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2394 {
2395         struct tcp_iter_state* st = seq->private;
2396         st->state = TCP_SEQ_STATE_LISTENING;
2397         st->num = 0;
2398         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2399 }
2400
2401 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2402 {
2403         void *rc = NULL;
2404         struct tcp_iter_state* st;
2405
2406         if (v == SEQ_START_TOKEN) {
2407                 rc = tcp_get_idx(seq, 0);
2408                 goto out;
2409         }
2410         st = seq->private;
2411
2412         switch (st->state) {
2413         case TCP_SEQ_STATE_OPENREQ:
2414         case TCP_SEQ_STATE_LISTENING:
2415                 rc = listening_get_next(seq, v);
2416                 if (!rc) {
2417                         tcp_listen_unlock();
2418                         local_bh_disable();
2419                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2420                         rc        = established_get_first(seq);
2421                 }
2422                 break;
2423         case TCP_SEQ_STATE_ESTABLISHED:
2424         case TCP_SEQ_STATE_TIME_WAIT:
2425                 rc = established_get_next(seq, v);
2426                 break;
2427         }
2428 out:
2429         ++*pos;
2430         return rc;
2431 }
2432
2433 static void tcp_seq_stop(struct seq_file *seq, void *v)
2434 {
2435         struct tcp_iter_state* st = seq->private;
2436
2437         switch (st->state) {
2438         case TCP_SEQ_STATE_OPENREQ:
2439                 if (v) {
2440                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2441                         read_unlock_bh(&tp->syn_wait_lock);
2442                 }
2443         case TCP_SEQ_STATE_LISTENING:
2444                 if (v != SEQ_START_TOKEN)
2445                         tcp_listen_unlock();
2446                 break;
2447         case TCP_SEQ_STATE_TIME_WAIT:
2448         case TCP_SEQ_STATE_ESTABLISHED:
2449                 if (v)
2450                         read_unlock(&tcp_ehash[st->bucket].lock);
2451                 local_bh_enable();
2452                 break;
2453         }
2454 }
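/*
 * Locking in the iterator: the listening walk runs under tcp_listen_lock()
 * (plus the per-listener syn_wait_lock while in OPENREQ state), while the
 * established/TIME_WAIT walk runs with bottom halves disabled and the current
 * ehash bucket read-locked.  tcp_seq_stop() releases whichever of these the
 * last ->start/->next left held, keyed off st->state.
 */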
2455
2456 static int tcp_seq_open(struct inode *inode, struct file *file)
2457 {
2458         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2459         struct seq_file *seq;
2460         struct tcp_iter_state *s;
2461         int rc;
2462
2463         if (unlikely(afinfo == NULL))
2464                 return -EINVAL;
2465
2466         s = kmalloc(sizeof(*s), GFP_KERNEL);
2467         if (!s)
2468                 return -ENOMEM;
2469         memset(s, 0, sizeof(*s));
2470         s->family               = afinfo->family;
2471         s->seq_ops.start        = tcp_seq_start;
2472         s->seq_ops.next         = tcp_seq_next;
2473         s->seq_ops.show         = afinfo->seq_show;
2474         s->seq_ops.stop         = tcp_seq_stop;
2475
2476         rc = seq_open(file, &s->seq_ops);
2477         if (rc)
2478                 goto out_kfree;
2479         seq          = file->private_data;
2480         seq->private = s;
2481 out:
2482         return rc;
2483 out_kfree:
2484         kfree(s);
2485         goto out;
2486 }
2487
2488 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2489 {
2490         int rc = 0;
2491         struct proc_dir_entry *p;
2492
2493         if (!afinfo)
2494                 return -EINVAL;
2495         afinfo->seq_fops->owner         = afinfo->owner;
2496         afinfo->seq_fops->open          = tcp_seq_open;
2497         afinfo->seq_fops->read          = seq_read;
2498         afinfo->seq_fops->llseek        = seq_lseek;
2499         afinfo->seq_fops->release       = seq_release_private;
2500         
2501         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2502         if (p)
2503                 p->data = afinfo;
2504         else
2505                 rc = -ENOMEM;
2506         return rc;
2507 }
2508
2509 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2510 {
2511         if (!afinfo)
2512                 return;
2513         proc_net_remove(afinfo->name);
2514         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2515 }
2516
2517 static void get_openreq4(struct sock *sk, struct open_request *req,
2518                          char *tmpbuf, int i, int uid)
2519 {
2520         int ttd = req->expires - jiffies;
2521
2522         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2523                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2524                 i,
2525                 req->af.v4_req.loc_addr,
2526                 ntohs(inet_sk(sk)->sport),
2527                 req->af.v4_req.rmt_addr,
2528                 ntohs(req->rmt_port),
2529                 TCP_SYN_RECV,
2530                 0, 0, /* could print option size, but that is af dependent. */
2531                 1,    /* timers active (only the expire timer) */
2532                 jiffies_to_clock_t(ttd),
2533                 req->retrans,
2534                 uid,
2535                 0,  /* non standard timer */
2536                 0, /* open_requests have no inode */
2537                 atomic_read(&sk->sk_refcnt),
2538                 req);
2539 }
2540
2541 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2542 {
2543         int timer_active;
2544         unsigned long timer_expires;
2545         struct tcp_sock *tp = tcp_sk(sp);
2546         struct inet_sock *inet = inet_sk(sp);
2547         unsigned int dest = inet->daddr;
2548         unsigned int src = inet->rcv_saddr;
2549         __u16 destp = ntohs(inet->dport);
2550         __u16 srcp = ntohs(inet->sport);
2551
2552         if (tp->pending == TCP_TIME_RETRANS) {
2553                 timer_active    = 1;
2554                 timer_expires   = tp->timeout;
2555         } else if (tp->pending == TCP_TIME_PROBE0) {
2556                 timer_active    = 4;
2557                 timer_expires   = tp->timeout;
2558         } else if (timer_pending(&sp->sk_timer)) {
2559                 timer_active    = 2;
2560                 timer_expires   = sp->sk_timer.expires;
2561         } else {
2562                 timer_active    = 0;
2563                 timer_expires = jiffies;
2564         }
2565
2566         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2567                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2568                 i, src, srcp, dest, destp, sp->sk_state,
2569                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2570                 timer_active,
2571                 jiffies_to_clock_t(timer_expires - jiffies),
2572                 tp->retransmits,
2573                 sock_i_uid(sp),
2574                 tp->probes_out,
2575                 sock_i_ino(sp),
2576                 atomic_read(&sp->sk_refcnt), sp,
2577                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2578                 tp->snd_cwnd,
2579                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2580 }
2581
2582 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2583 {
2584         unsigned int dest, src;
2585         __u16 destp, srcp;
2586         int ttd = tw->tw_ttd - jiffies;
2587
2588         if (ttd < 0)
2589                 ttd = 0;
2590
2591         dest  = tw->tw_daddr;
2592         src   = tw->tw_rcv_saddr;
2593         destp = ntohs(tw->tw_dport);
2594         srcp  = ntohs(tw->tw_sport);
2595
2596         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2597                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2598                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2599                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2600                 atomic_read(&tw->tw_refcnt), tw);
2601 }
2602
2603 #define TMPSZ 150
2604
2605 static int tcp4_seq_show(struct seq_file *seq, void *v)
2606 {
2607         struct tcp_iter_state* st;
2608         char tmpbuf[TMPSZ + 1];
2609
2610         if (v == SEQ_START_TOKEN) {
2611                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2612                            "  sl  local_address rem_address   st tx_queue "
2613                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2614                            "inode");
2615                 goto out;
2616         }
2617         st = seq->private;
2618
2619         switch (st->state) {
2620         case TCP_SEQ_STATE_LISTENING:
2621         case TCP_SEQ_STATE_ESTABLISHED:
2622                 get_tcp4_sock(v, tmpbuf, st->num);
2623                 break;
2624         case TCP_SEQ_STATE_OPENREQ:
2625                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2626                 break;
2627         case TCP_SEQ_STATE_TIME_WAIT:
2628                 get_timewait4_sock(v, tmpbuf, st->num);
2629                 break;
2630         }
2631         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2632 out:
2633         return 0;
2634 }
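/*
 * Each line of /proc/net/tcp is therefore a fixed-width record matching the
 * header printed above.  Addresses and ports are hexadecimal, with the
 * address in network byte order, so on a little-endian machine a hypothetical
 * listener on 127.0.0.1:22 would begin roughly like:
 *
 *	0: 0100007F:0016 00000000:0000 0A ...
 *
 * where 0A is TCP_LISTEN; tools such as netstat parse exactly this layout.
 */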
2635
2636 static struct file_operations tcp4_seq_fops;
2637 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2638         .owner          = THIS_MODULE,
2639         .name           = "tcp",
2640         .family         = AF_INET,
2641         .seq_show       = tcp4_seq_show,
2642         .seq_fops       = &tcp4_seq_fops,
2643 };
2644
2645 int __init tcp4_proc_init(void)
2646 {
2647         return tcp_proc_register(&tcp4_seq_afinfo);
2648 }
2649
2650 void tcp4_proc_exit(void)
2651 {
2652         tcp_proc_unregister(&tcp4_seq_afinfo);
2653 }
2654 #endif /* CONFIG_PROC_FS */
2655
2656 struct proto tcp_prot = {
2657         .name                   = "TCP",
2658         .owner                  = THIS_MODULE,
2659         .close                  = tcp_close,
2660         .connect                = tcp_v4_connect,
2661         .disconnect             = tcp_disconnect,
2662         .accept                 = tcp_accept,
2663         .ioctl                  = tcp_ioctl,
2664         .init                   = tcp_v4_init_sock,
2665         .destroy                = tcp_v4_destroy_sock,
2666         .shutdown               = tcp_shutdown,
2667         .setsockopt             = tcp_setsockopt,
2668         .getsockopt             = tcp_getsockopt,
2669         .sendmsg                = tcp_sendmsg,
2670         .recvmsg                = tcp_recvmsg,
2671         .backlog_rcv            = tcp_v4_do_rcv,
2672         .hash                   = tcp_v4_hash,
2673         .unhash                 = tcp_unhash,
2674         .get_port               = tcp_v4_get_port,
2675         .enter_memory_pressure  = tcp_enter_memory_pressure,
2676         .sockets_allocated      = &tcp_sockets_allocated,
2677         .memory_allocated       = &tcp_memory_allocated,
2678         .memory_pressure        = &tcp_memory_pressure,
2679         .sysctl_mem             = sysctl_tcp_mem,
2680         .sysctl_wmem            = sysctl_tcp_wmem,
2681         .sysctl_rmem            = sysctl_tcp_rmem,
2682         .max_header             = MAX_TCP_HEADER,
2683         .obj_size               = sizeof(struct tcp_sock),
2684 };
2685
2686
2687
2688 void __init tcp_v4_init(struct net_proto_family *ops)
2689 {
2690         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2691         if (err < 0)
2692                 panic("Failed to create the TCP control socket.\n");
2693         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2694         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2695
2696         /* Unhash it so that IP input processing does not even
2697          * see it; we do not wish this socket to see incoming
2698          * packets.
2699          */
2700         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2701 }
2702
2703 EXPORT_SYMBOL(ipv4_specific);
2704 EXPORT_SYMBOL(tcp_bind_hash);
2705 EXPORT_SYMBOL(tcp_bucket_create);
2706 EXPORT_SYMBOL(tcp_hashinfo);
2707 EXPORT_SYMBOL(tcp_inherit_port);
2708 EXPORT_SYMBOL(tcp_listen_wlock);
2709 EXPORT_SYMBOL(tcp_port_rover);
2710 EXPORT_SYMBOL(tcp_prot);
2711 EXPORT_SYMBOL(tcp_put_port);
2712 EXPORT_SYMBOL(tcp_unhash);
2713 EXPORT_SYMBOL(tcp_v4_conn_request);
2714 EXPORT_SYMBOL(tcp_v4_connect);
2715 EXPORT_SYMBOL(tcp_v4_do_rcv);
2716 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2717 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2718 EXPORT_SYMBOL(tcp_v4_send_check);
2719 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2720
2721 #ifdef CONFIG_PROC_FS
2722 EXPORT_SYMBOL(tcp_proc_register);
2723 EXPORT_SYMBOL(tcp_proc_unregister);
2724 #endif
2725 EXPORT_SYMBOL(sysctl_local_port_range);
2726 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2727 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2728 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2729