net/ipv4/tcp_minisocks.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22
  23 #include <linux/config.h>
  24 #include <linux/mm.h>
  25 #include <linux/module.h>
  26 #include <linux/sysctl.h>
  27 #include <linux/workqueue.h>
  28 #include <net/tcp.h>
  29 #include <net/inet_common.h>
  30 #include <net/xfrm.h>
  31
   32 #ifdef CONFIG_SYSCTL
   33 #define SYNC_INIT 0 /* let the user enable it */
   34 #else
      /* Presumably no sysctl interface exists in this configuration, so the
       * default is switched on at build time -- confirm against Kconfig.
       */
   35 #define SYNC_INIT 1
   36 #endif
   37
      /* Non-zero gates the remember_stamp() calls made by tcp_time_wait() and
       * tcp_timewait_state_process(), i.e. the shortened, timestamp-based
       * TIME-WAIT lifetime.
       */
   38 int sysctl_tcp_tw_recycle;
      /* tcp_time_wait() allocates a new bucket only while tcp_tw_count is
       * below this limit.
       */
   39 int sysctl_tcp_max_tw_buckets = NR_FILE*2;
   40
      /* The two variables below are not referenced in this part of the file;
       * their consumers live elsewhere.
       */
   41 int sysctl_tcp_syncookies = SYNC_INIT;
   42 int sysctl_tcp_abort_on_overflow;
  43
  44 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
  45 {
  46         if (seq == s_win)
  47                 return 1;
  48         if (after(end_seq, s_win) && before(seq, e_win))
  49                 return 1;
  50         return (seq == e_win && seq == end_seq);
  51 }
  52
   53 /* New-style handling of TIME_WAIT sockets. */
   54
      /* Number of buckets currently queued on a kill list.  It is incremented
       * by tcp_tw_schedule() and decremented when a bucket leaves a list
       * (tcp_tw_deschedule(), tcp_do_twkill_work(), tcp_twcal_tick()); those
       * updates happen under tw_death_lock.  tcp_time_wait() reads it without
       * the lock.
       */
   55 int tcp_tw_count;
  56
  57
   58 /* Must be called with locally disabled BHs.
       *
       * Take a TIME_WAIT bucket out of the established hash and detach it from
       * its bind bucket, then drop one reference.  If the bucket is found
       * already unhashed, the function returns early and touches neither the
       * bind hash nor the reference count.
       */
   59 static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
   60 {
   61         struct tcp_ehash_bucket *ehead;
   62         struct tcp_bind_hashbucket *bhead;
   63         struct tcp_bind_bucket *tb;
   64
   65         /* Unlink from established hashes. */
   66         ehead = &tcp_ehash[tw->tw_hashent];
   67         write_lock(&ehead->lock);
              /* Already unhashed: presumably another path got here first. */
   68         if (hlist_unhashed(&tw->tw_node)) {
   69                 write_unlock(&ehead->lock);
   70                 return;
   71         }
   72         __hlist_del(&tw->tw_node);
              /* Re-initialise the node so a later hlist_unhashed() check sees it
               * as unhashed.
               */
   73         sk_node_init(&tw->tw_node);
   74         write_unlock(&ehead->lock);
   75
   76         /* Disassociate with bind bucket. */
   77         bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
   78         spin_lock(&bhead->lock);
   79         tb = tw->tw_tb;
   80         __hlist_del(&tw->tw_bind_node);
   81         tw->tw_tb = NULL;
              /* NOTE(review): tcp_bucket_destroy() is defined elsewhere;
               * presumably it frees tb only once it has no owners -- confirm.
               */
   82         tcp_bucket_destroy(tb);
   83         spin_unlock(&bhead->lock);
   84
   85 #ifdef INET_REFCNT_DEBUG
              /* Debug aid only: report a count other than 1 just before the put. */
   86         if (atomic_read(&tw->tw_refcnt) != 1) {
   87                 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
   88                        atomic_read(&tw->tw_refcnt));
   89         }
   90 #endif
              /* Presumably pairs with the atomic_inc() done in
               * __tcp_tw_hashdance() when the bucket was hashed.
               */
   91         tcp_tw_put(tw);
   92 }
  93
  94 /*
  95  * * Main purpose of TIME-WAIT state is to close connection gracefully,
  96  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
  97  *   (and, probably, tail of data) and one or more our ACKs are lost.
  98  * * What is TIME-WAIT timeout? It is associated with maximal packet
  99  *   lifetime in the internet, which results in wrong conclusion, that
 100  *   it is set to catch "old duplicate segments" wandering out of their path.
 101  *   It is not quite correct. This timeout is calculated so that it exceeds
 102  *   maximal retransmission timeout enough to allow to lose one (or more)
 103  *   segments sent by peer and our ACKs. This time may be calculated from RTO.
 104  * * When TIME-WAIT socket receives RST, it means that another end
 105  *   finally closed and we are allowed to kill TIME-WAIT too.
 106  * * Second purpose of TIME-WAIT is catching old duplicate segments.
 107  *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 108  *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 109  * * If we invented some more clever way to catch duplicates
 110  *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 111  *
 112  * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 113  * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 114  * from the very beginning.
 115  *
 116  * NOTE. With recycling (and later with fin-wait-2) TW bucket
 117  * is _not_ stateless. It means, that strictly speaking we must
 118  * spinlock it. I do not want! Well, probability of misbehaviour
 119  * is ridiculously low and, seems, we could use some mb() tricks
 120  * to avoid misread sequence numbers, states etc.  --ANK
 121  */
  122 enum tcp_tw_status
  123 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
  124                            struct tcphdr *th, unsigned len)
  125 {
              /* Handle a segment that matched a TIME_WAIT bucket, which is
               * either a dead FIN-WAIT-2 socket (tw_substate == TCP_FIN_WAIT2)
               * or a real TIME-WAIT one, and tell the caller what to do:
               *
               *   TCP_TW_SUCCESS - nothing more to do; the bucket has already
               *                    been released here with tcp_tw_put().
               *   TCP_TW_RST     - bucket descheduled and put here; presumably
               *                    the caller answers with a reset.
               *   TCP_TW_ACK     - the bucket is NOT put here, the caller
               *                    releases it.
               *   TCP_TW_SYN     - acceptable new SYN; the sequence number
               *                    chosen for the new connection is handed
               *                    back in TCP_SKB_CB(skb)->when and the bucket
               *                    is not put here.
               *
               * 'len' is not referenced by this function.
               */
  126         struct tcp_opt tp;
  127         int paws_reject = 0;
  128
              /* tp is a scratch structure: only the timestamp fields set below
               * are meaningful.  Options are parsed only if the header carries
               * any and the bucket has a recorded timestamp to compare with.
               */
  129         tp.saw_tstamp = 0;
  130         if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
  131                 tcp_parse_options(skb, &tp, 0);
  132
  133                 if (tp.saw_tstamp) {
  134                         tp.ts_recent       = tw->tw_ts_recent;
  135                         tp.ts_recent_stamp = tw->tw_ts_recent_stamp;
  136                         paws_reject = tcp_paws_check(&tp, th->rst);
  137                 }
  138         }
  139
  140         if (tw->tw_substate == TCP_FIN_WAIT2) {
  141                 /* Just repeat all the checks of tcp_rcv_state_process() */
  142
  143                 /* Out of window, send ACK */
  144                 if (paws_reject ||
  145                     !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
  146                                    tw->tw_rcv_nxt,
  147                                    tw->tw_rcv_nxt + tw->tw_rcv_wnd))
  148                         return TCP_TW_ACK;
  149
                      /* "kill" lives inside the real TIME-WAIT branch further
                       * down: deschedule, put, return TCP_TW_SUCCESS.
                       */
  150                 if (th->rst)
  151                         goto kill;
  152
  153                 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
  154                         goto kill_with_rst;
  155
  156                 /* Dup ACK? */
  157                 if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
  158                     TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
  159                         tcp_tw_put(tw);
  160                         return TCP_TW_SUCCESS;
  161                 }
  162
  163                 /* New data or FIN. If new data arrive after half-duplex close,
  164                  * reset.
  165                  */
                      /* Only a FIN that ends exactly one sequence number past
                       * rcv_nxt is accepted; anything else is reset.
                       */
  166                 if (!th->fin ||
  167                     TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
  168 kill_with_rst:
  169                         tcp_tw_deschedule(tw);
  170                         tcp_tw_put(tw);
  171                         return TCP_TW_RST;
  172                 }
  173
  174                 /* FIN arrived, enter true time-wait state. */
  175                 tw->tw_substate = TCP_TIME_WAIT;
  176                 tw->tw_rcv_nxt  = TCP_SKB_CB(skb)->end_seq;
  177                 if (tp.saw_tstamp) {
  178                         tw->tw_ts_recent_stamp  = xtime.tv_sec;
  179                         tw->tw_ts_recent        = tp.rcv_tsval;
  180                 }
  181
  182                 /* I am shamed, but failed to make it more elegant.
  183                  * Yes, it is direct reference to IP, which is impossible
  184                  * to generalize to IPv6. Taking into account that IPv6
  185                  * do not understand recycling in any case, it not
  186                  * a big problem in practice. --ANK */
                      /* With recycling usable the bucket lives for tw_timeout
                       * (set in tcp_time_wait()), otherwise for the full
                       * TCP_TIMEWAIT_LEN.
                       */
  187                 if (tw->tw_family == AF_INET &&
  188                     sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
  189                     tcp_v4_tw_remember_stamp(tw))
  190                         tcp_tw_schedule(tw, tw->tw_timeout);
  191                 else
  192                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
  193                 return TCP_TW_ACK;
  194         }
  195
  196         /*
  197          *      Now real TIME-WAIT state.
  198          *
  199          *      RFC 1122:
  200          *      "When a connection is [...] on TIME-WAIT state [...]
  201          *      [a TCP] MAY accept a new SYN from the remote TCP to
  202          *      reopen the connection directly, if it:
  203          *
  204          *      (1)  assigns its initial sequence number for the new
  205          *      connection to be larger than the largest sequence
  206          *      number it used on the previous connection incarnation,
  207          *      and
  208          *
  209          *      (2)  returns to TIME-WAIT state if the SYN turns out
  210          *      to be an old duplicate".
  211          */
  212
  213         if (!paws_reject &&
  214             (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
  215              (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
  216                 /* In window segment, it may be only reset or bare ack. */
  217
  218                 if (th->rst) {
  219                         /* This is TIME_WAIT assassination, in two flavors.
  220                          * Oh well... nobody has a sufficient solution to this
  221                          * protocol bug yet.
  222                          */
                              /* With sysctl_tcp_rfc1337 set, the RST does not kill
                               * the bucket; it falls through and only restarts the
                               * timer below.
                               */
  223                         if (sysctl_tcp_rfc1337 == 0) {
  224 kill:
  225                                 tcp_tw_deschedule(tw);
  226                                 tcp_tw_put(tw);
  227                                 return TCP_TW_SUCCESS;
  228                         }
  229                 }
  230                 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
  231
  232                 if (tp.saw_tstamp) {
  233                         tw->tw_ts_recent        = tp.rcv_tsval;
  234                         tw->tw_ts_recent_stamp  = xtime.tv_sec;
  235                 }
  236
  237                 tcp_tw_put(tw);
  238                 return TCP_TW_SUCCESS;
  239         }
  240
  241         /* Out of window segment.
  242
  243            All the segments are ACKed immediately.
  244
  245            The only exception is new SYN. We accept it, if it is
  246            not old duplicate and we are not in danger to be killed
  247            by delayed old duplicates. RFC check is that it has
  248            newer sequence number works at rates <40Mbit/sec.
  249            However, if paws works, it is reliable AND even more,
  250            we even may relax silly seq space cutoff.
  251
  252            RED-PEN: we violate main RFC requirement, if this SYN will appear
  253            old duplicate (i.e. we receive RST in reply to SYN-ACK),
  254            we must return socket to time-wait state. It is not good,
  255            but not fatal yet.
  256          */
  257
  258         if (th->syn && !th->rst && !th->ack && !paws_reject &&
  259             (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
  260              (tp.saw_tstamp && (s32)(tw->tw_ts_recent - tp.rcv_tsval) < 0))) {
                      /* New sequence number is placed 65535 + 2 past the old
                       * snd_nxt.  Zero is avoided; presumably the caller treats
                       * 0 as "no value chosen" -- confirm at the call site.
                       */
  261                 u32 isn = tw->tw_snd_nxt + 65535 + 2;
  262                 if (isn == 0)
  263                         isn++;
  264                 TCP_SKB_CB(skb)->when = isn;
  265                 return TCP_TW_SYN;
  266         }
  267
  268         if (paws_reject)
  269                 NET_INC_STATS_BH(PAWSEstabRejected);
  270
  271         if(!th->rst) {
  272                 /* In this case we must reset the TIMEWAIT timer.
  273                  *
  274                  * If it is ACKless SYN it may be both old duplicate
  275                  * and new good SYN with random sequence number <rcv_nxt.
  276                  * Do not reschedule in the last case.
  277                  */
  278                 if (paws_reject || th->ack)
  279                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
  280
  281                 /* Send ACK. Note, we do not put the bucket,
  282                  * it will be released by caller.
  283                  */
  284                 return TCP_TW_ACK;
  285         }
              /* Out-of-window RST: dropped without a reply. */
  286         tcp_tw_put(tw);
  287         return TCP_TW_SUCCESS;
  288 }
 289
  290 /* Enter the time wait state.  This is called with locally disabled BH.
  291  * Essentially we whip up a timewait bucket, copy the
  292  * relevant info into it from the SK, and mess with hash chains
  293  * and list linkage.
       *
       * This helper performs only the hash part: it links @tw into the bind
       * bucket of @sk, removes @sk from the established hash and inserts @tw
       * into the TIME_WAIT half of the same hash slot, taking one extra
       * reference on @tw.  The bind-hash lock is released before the
       * established-hash lock is taken; the two are never held together here.
  294  */
  295 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
  296 {
  297         struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
  298         struct tcp_bind_hashbucket *bhead;
  299
  300         /* Step 1: Put TW into bind hash. Original socket stays there too.
  301            Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
  302            binding cache, even if it is closed.
  303          */
  304         bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
  305         spin_lock(&bhead->lock);
  306         tw->tw_tb = tcp_sk(sk)->bind_hash;
              /* A socket reaching this point is expected to be bound. */
  307         BUG_TRAP(tcp_sk(sk)->bind_hash);
  308         tw_add_bind_node(tw, &tw->tw_tb->owners);
  309         spin_unlock(&bhead->lock);
  310
  311         write_lock(&ehead->lock);
  312
  313         /* Step 2: Remove SK from established hash. */
              /* The protocol use count is dropped only if sk was really hashed. */
  314         if (__sk_del_node_init(sk))
  315                 sock_prot_dec_use(sk->sk_prot);
  316
  317         /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
              /* The TIME_WAIT chain for a slot sits tcp_ehash_size entries after
               * the established chain of the same slot.
               */
  318         tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
              /* Reference held on behalf of the hash table; presumably the one
               * released by tcp_timewait_kill().
               */
  319         atomic_inc(&tw->tw_refcnt);
  320
  321         write_unlock(&ehead->lock);
  322 }
 323
 324 /*
 325  * Move a socket to time-wait or dead fin-wait-2 state.
 326  */
 327 void tcp_time_wait(struct sock *sk, int state, int timeo)
 328 {
 329         struct tcp_tw_bucket *tw = NULL;
 330         struct tcp_opt *tp = tcp_sk(sk);
 331         int recycle_ok = 0;
 332
 333         if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
 334                 recycle_ok = tp->af_specific->remember_stamp(sk);
 335
 336         if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
 337                 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
 338
 339         if(tw != NULL) {
 340                 struct inet_opt *inet = inet_sk(sk);
 341                 int rto = (tp->rto<<2) - (tp->rto>>1);
 342
 343                 /* Give us an identity. */
 344                 tw->tw_daddr            = inet->daddr;
 345                 tw->tw_rcv_saddr        = inet->rcv_saddr;
 346                 tw->tw_bound_dev_if     = sk->sk_bound_dev_if;
 347                 tw->tw_num              = inet->num;
 348                 tw->tw_state            = TCP_TIME_WAIT;
 349                 tw->tw_substate         = state;
 350                 tw->tw_sport            = inet->sport;
 351                 tw->tw_dport            = inet->dport;
 352                 tw->tw_family           = sk->sk_family;
 353                 tw->tw_reuse            = sk->sk_reuse;
 354                 tw->tw_rcv_wscale       = tp->rcv_wscale;
 355                 atomic_set(&tw->tw_refcnt, 1);
 356
 357                 tw->tw_hashent          = sk->sk_hashent;
 358                 tw->tw_rcv_nxt          = tp->rcv_nxt;
 359                 tw->tw_snd_nxt          = tp->snd_nxt;
 360                 tw->tw_rcv_wnd          = tcp_receive_window(tp);
 361                 tw->tw_ts_recent        = tp->ts_recent;
 362                 tw->tw_ts_recent_stamp  = tp->ts_recent_stamp;
 363                 tw_dead_node_init(tw);
 364
 365                 tw->tw_xid              = sk->sk_xid;
 366                 tw->tw_vx_info          = NULL;
 367                 tw->tw_nid              = sk->sk_nid;
 368                 tw->tw_nx_info          = NULL;
 369
 370 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 371                 if (tw->tw_family == PF_INET6) {
 372                         struct ipv6_pinfo *np = inet6_sk(sk);
 373
 374                         ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
 375                         ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
 376                         tw->tw_v6_ipv6only = np->ipv6only;
 377                 } else {
 378                         memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
 379                         memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
 380                         tw->tw_v6_ipv6only = 0;
 381                 }
 382 #endif
 383                 /* Linkage updates. */
 384                 __tcp_tw_hashdance(sk, tw);
 385
 386                 /* Get the TIME_WAIT timeout firing. */
 387                 if (timeo < rto)
 388                         timeo = rto;
 389
 390                 if (recycle_ok) {
 391                         tw->tw_timeout = rto;
 392                 } else {
 393                         tw->tw_timeout = TCP_TIMEWAIT_LEN;
 394                         if (state == TCP_TIME_WAIT)
 395                                 timeo = TCP_TIMEWAIT_LEN;
 396                 }
 397
 398                 tcp_tw_schedule(tw, timeo);
 399                 tcp_tw_put(tw);
 400         } else {
 401                 /* Sorry, if we're out of memory, just CLOSE this
 402                  * socket up.  We've got bigger problems than
 403                  * non-graceful socket closings.
 404                  */
 405                 if (net_ratelimit())
 406                         printk(KERN_INFO "TCP: time wait bucket table overflow\n");
 407         }
 408
 409         tcp_update_metrics(sk);
 410         tcp_done(sk);
 411 }
 412
  413 /* Kill off TIME_WAIT sockets once their lifetime has expired. */
      /* Index of the death-row slot that the next tcp_twkill() run will reap;
       * tcp_tw_schedule() places new buckets relative to it.
       */
  414 static int tcp_tw_death_row_slot;
  415
  416 static void tcp_twkill(unsigned long);
  417
  418 /* TIME_WAIT reaping mechanism. */
      /* Slot indices are wrapped with "& (TCP_TWKILL_SLOTS - 1)", hence the
       * power-of-2 requirement.
       */
  419 #define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
      /* Interval between tcp_twkill() runs: one slot's share of TCP_TIMEWAIT_LEN. */
  420 #define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
  421
      /* Quota handed to tcp_do_twkill_work() for a single pass over a slot. */
  422 #define TCP_TWKILL_QUOTA        100
  423
  424 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
      /* Taken around every access to the death row, the short-time calendar
       * and updates of tcp_tw_count in this file.
       */
  425 static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
  426 static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
  427 static void twkill_work(void *);
  428 static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
      /* Bitmask, one bit per death-row slot, of slots that tcp_twkill() could
       * not finish within its quota and left for twkill_work().
       */
  429 static u32 twkill_thread_slots;
 430
  431 /* Returns non-zero if quota exceeded.  
       *
       * Kill buckets queued in death-row slot @slot.  Both callers in this file
       * hold tw_death_lock on entry; the lock is dropped around each
       * tcp_timewait_kill() call and held again on return.  The walk stops
       * once more than @quota buckets have been killed, so a single call may
       * kill up to @quota + 1 of them.  tcp_tw_count and the TimeWaited
       * statistic are adjusted by the number killed.
       */
  432 static int tcp_do_twkill_work(int slot, unsigned int quota)
  433 {
  434         struct tcp_tw_bucket *tw;
  435         struct hlist_node *node;
  436         unsigned int killed;
  437         int ret;
  438
  439         /* NOTE: compare this to previous version where lock
  440          * was released after detaching chain. It was racy,
  441          * because tw buckets are scheduled in not serialized context
  442          * in 2.3 (with netfilter), and with softnet it is common, because
  443          * soft irqs are not sequenced.
  444          */
  445         killed = 0;
  446         ret = 0;
  447 rescan:
              /* Only the first element is ever processed per pass: the body ends
               * in either "break" or "goto rescan".
               */
  448         tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
  449                 __tw_del_dead_node(tw);
  450                 spin_unlock(&tw_death_lock);
  451                 tcp_timewait_kill(tw);
                      /* Presumably the reference tcp_tw_schedule() took when the
                       * bucket was first queued.
                       */
  452                 tcp_tw_put(tw);
  453                 killed++;
  454                 spin_lock(&tw_death_lock);
  455                 if (killed > quota) {
  456                         ret = 1;
  457                         break;
  458                 }
  459
  460                 /* While we dropped tw_death_lock, another cpu may have
  461                  * killed off the next TW bucket in the list, therefore
  462                  * do a fresh re-read of the hlist head node with the
  463                  * lock reacquired.  We still use the hlist traversal
  464                  * macro in order to get the prefetches.
  465                  */
  466                 goto rescan;
  467         }
  468
  469         tcp_tw_count -= killed;
  470         NET_ADD_STATS_BH(TimeWaited, killed);
  471
  472         return ret;
  473 }
 474
 475 static void tcp_twkill(unsigned long dummy)
 476 {
 477         int need_timer, ret;
 478
 479         spin_lock(&tw_death_lock);
 480
 481         if (tcp_tw_count == 0)
 482                 goto out;
 483
 484         need_timer = 0;
 485         ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
 486         if (ret) {
 487                 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
 488                 mb();
 489                 schedule_work(&tcp_twkill_work);
 490                 need_timer = 1;
 491         } else {
 492                 /* We purged the entire slot, anything left?  */
 493                 if (tcp_tw_count)
 494                         need_timer = 1;
 495         }
 496         tcp_tw_death_row_slot =
 497                 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
 498         if (need_timer)
 499                 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
 500 out:
 501         spin_unlock(&tw_death_lock);
 502 }
 503
 504 extern void twkill_slots_invalid(void);
 505
 506 static void twkill_work(void *dummy)
 507 {
 508         int i;
 509
 510         if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
 511                 twkill_slots_invalid();
 512
 513         while (twkill_thread_slots) {
 514                 spin_lock_bh(&tw_death_lock);
 515                 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
 516                         if (!(twkill_thread_slots & (1 << i)))
 517                                 continue;
 518
 519                         while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
 520                                 if (need_resched()) {
 521                                         spin_unlock_bh(&tw_death_lock);
 522                                         schedule();
 523                                         spin_lock_bh(&tw_death_lock);
 524                                 }
 525                         }
 526
 527                         twkill_thread_slots &= ~(1 << i);
 528                 }
 529                 spin_unlock_bh(&tw_death_lock);
 530         }
 531 }
 532
 533 /* These are always called from BH context.  See callers in
 534  * tcp_input.c to verify this.
 535  */
 536
 537 /* This is for handling early-kills of TIME_WAIT sockets. */
 538 void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
 539 {
 540         spin_lock(&tw_death_lock);
 541         if (tw_del_dead_node(tw)) {
 542                 tcp_tw_put(tw);
 543                 if (--tcp_tw_count == 0)
 544                         del_timer(&tcp_tw_timer);
 545         }
 546         spin_unlock(&tw_death_lock);
 547         tcp_timewait_kill(tw);
 548 }
 549
  550 /* Short-time timewait calendar
       *
       * tcp_tw_schedule() queues a bucket here when its timeout, rounded up to
       * whole ticks of (1 << TCP_TW_RECYCLE_TICK) jiffies, is below
       * TCP_TW_RECYCLE_SLOTS ticks.  tcp_twcal_tick() reaps the rows.
       */
  551
      /* Row index that corresponds to time tcp_twcal_jiffie; -1 means the
       * calendar is idle.
       */
  552 static int tcp_twcal_hand = -1;
      /* jiffies value that belongs to row tcp_twcal_hand. */
  553 static int tcp_twcal_jiffie;
  554 static void tcp_twcal_tick(unsigned long);
  555 static struct timer_list tcp_twcal_timer =
  556                 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
  557 static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
 558
  559 void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
  560 {
              /* (Re)queue @tw so that it is killed about @timeo jiffies from
               * now.  Short timeouts go to the calendar (tcp_twcal_row), longer
               * ones to the slow death row (tcp_tw_death_row).  A bucket that
               * was already queued is moved; one that was not gains a reference
               * for the list.  Takes tw_death_lock itself.
               */
  561         struct hlist_head *list;
  562         int slot;
  563
  564         /* timeout := RTO * 3.5
  565          *
  566          * 3.5 = 1+2+0.5 to wait for two retransmits.
  567          *
  568          * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
  569          * our ACK acking that FIN can be lost. If N subsequent retransmitted
  570          * FINs (or previous segments) are lost (probability of such event
  571          * is p^(N+1), where p is probability to lose single packet and
  572          * time to detect the loss is about RTO*(2^N - 1) with exponential
  573          * backoff). Normal timewait length is calculated so, that we
  574          * waited at least for one retransmitted FIN (maximal RTO is 120sec).
  575          * [ BTW Linux. following BSD, violates this requirement waiting
  576          *   only for 60sec, we should wait at least for 240 secs.
  577          *   Well, 240 consumes too much of resources 8)
  578          * ]
  579          * This interval is not reduced to catch old duplicate and
  580          * responses to our wandering segments living for two MSLs.
  581          * However, if we use PAWS to detect
  582          * old duplicates, we can reduce the interval to bounds required
  583          * by RTO, rather than MSL. So, if peer understands PAWS, we
  584          * kill tw bucket after 3.5*RTO (it is important that this number
  585          * is greater than TS tick!) and detect old duplicates with help
  586          * of PAWS.
  587          */
              /* timeo in calendar ticks, rounded up. */
  588         slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
  589
  590         spin_lock(&tw_death_lock);
  591
  592         /* Unlink it, if it was scheduled */
              /* The count is decremented here and incremented again at the
               * bottom, so a re-queue leaves it unchanged overall.
               */
  593         if (tw_del_dead_node(tw))
  594                 tcp_tw_count--;
  595         else
  596                 atomic_inc(&tw->tw_refcnt);
  597
  598         if (slot >= TCP_TW_RECYCLE_SLOTS) {
  599                 /* Schedule to slow timer */
                      /* Number of death-row periods to wait, rounded up and
                       * clamped to the last slot.
                       */
  600                 if (timeo >= TCP_TIMEWAIT_LEN) {
  601                         slot = TCP_TWKILL_SLOTS-1;
  602                 } else {
  603                         slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
  604                         if (slot >= TCP_TWKILL_SLOTS)
  605                                 slot = TCP_TWKILL_SLOTS-1;
  606                 }
  607                 tw->tw_ttd = jiffies + timeo;
                      /* Make the slot relative to the one reaped next. */
  608                 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
  609                 list = &tcp_tw_death_row[slot];
  610         } else {
  611                 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
  612
  613                 if (tcp_twcal_hand < 0) {
                              /* Calendar was idle: restart it with row 0 standing
                               * for "now", so slot is used as a direct row index.
                               */
  614                         tcp_twcal_hand = 0;
  615                         tcp_twcal_jiffie = jiffies;
  616                         tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
  617                         add_timer(&tcp_twcal_timer);
  618                 } else {
                              /* Pull the timer forward if this bucket expires
                               * before the currently programmed expiry.
                               */
  619                         if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
  620                                 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
  621                         slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
  622                 }
  623                 list = &tcp_twcal_row[slot];
  624         }
  625
  626         hlist_add_head(&tw->tw_death_node, list);
  627
              /* The first queued bucket arms the slow timer, whichever list the
               * bucket was put on.
               */
  628         if (tcp_tw_count++ == 0)
  629                 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
  630         spin_unlock(&tw_death_lock);
  631 }
 632
  633 void tcp_twcal_tick(unsigned long dummy)
  634 {
              /* Handler of tcp_twcal_timer.  Walks the calendar rows starting at
               * the hand, kills every bucket in rows whose time has come, moves
               * the hand to the first row that is still in the future and
               * re-arms the timer for the first such row that is not empty.  If
               * no bucket remains in the calendar, the hand is set back to -1
               * (idle).  @dummy is unused.
               */
  635         int n, slot;
  636         unsigned long j;
  637         unsigned long now = jiffies;
  638         int killed = 0;
  639         int adv = 0;
  640
  641         spin_lock(&tw_death_lock);
  642         if (tcp_twcal_hand < 0)
  643                 goto out;
  644
  645         slot = tcp_twcal_hand;
  646         j = tcp_twcal_jiffie;
  647
  648         for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
  649                 if (time_before_eq(j, now)) {
  650                         struct hlist_node *node, *safe;
  651                         struct tcp_tw_bucket *tw;
  652
                              /* Unlike tcp_do_twkill_work(), tw_death_lock stays
                               * held across tcp_timewait_kill() here.
                               */
  653                         tw_for_each_inmate_safe(tw, node, safe,
  654                                            &tcp_twcal_row[slot]) {
  655                                 __tw_del_dead_node(tw);
  656                                 tcp_timewait_kill(tw);
  657                                 tcp_tw_put(tw);
  658                                 killed++;
  659                         }
  660                 } else {
                              /* First row still in the future becomes the new
                               * hand position and time base.
                               */
  661                         if (!adv) {
  662                                 adv = 1;
  663                                 tcp_twcal_jiffie = j;
  664                                 tcp_twcal_hand = slot;
  665                         }
  666
  667                         if (!hlist_empty(&tcp_twcal_row[slot])) {
  668                                 mod_timer(&tcp_twcal_timer, j);
  669                                 goto out;
  670                         }
  671                 }
  672                 j += (1<<TCP_TW_RECYCLE_TICK);
  673                 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
  674         }
              /* Whole wheel scanned without finding a pending bucket. */
  675         tcp_twcal_hand = -1;
  676
  677 out:
              /* With nothing left queued at all, the slow timer is stopped too. */
  678         if ((tcp_tw_count -= killed) == 0)
  679                 del_timer(&tcp_tw_timer);
  680         NET_ADD_STATS_BH(TimeWaitKilled, killed);
  681         spin_unlock(&tw_death_lock);
  682 }
 683
 684 /* This is not only more efficient than what we used to do, it eliminates
 685  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 686  *
  687  * Actually, we could save lots of memory writes here. tp of listening
 688  * socket contains all necessary default parameters.
 689  */
 690 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
 691 {
 692         /* allocate the newsk from the same slab of the master sock,
 693          * if not, at sk_free time we'll try to free it from the wrong
 694          * slabcache (i.e. is it TCPv4 or v6?) -acme */
 695         struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0, sk->sk_slab);
 696
 697         if(newsk != NULL) {
 698                 struct tcp_opt *newtp;
 699                 struct sk_filter *filter;
 700
 701                 memcpy(newsk, sk, sizeof(struct tcp_sock));
 702                 newsk->sk_state = TCP_SYN_RECV;
 703
 704                 /* SANITY */
 705                 sock_vx_init(newsk);
 706                 sock_nx_init(newsk);
 707                 sk_node_init(&newsk->sk_node);
 708                 tcp_sk(newsk)->bind_hash = NULL;
 709
 710                 /* Clone the TCP header template */
 711                 inet_sk(newsk)->dport = req->rmt_port;
 712
 713                 sock_lock_init(newsk);
 714                 bh_lock_sock(newsk);
 715
 716                 newsk->sk_dst_lock = RW_LOCK_UNLOCKED;
 717                 atomic_set(&newsk->sk_rmem_alloc, 0);
 718                 skb_queue_head_init(&newsk->sk_receive_queue);
 719                 atomic_set(&newsk->sk_wmem_alloc, 0);
 720                 skb_queue_head_init(&newsk->sk_write_queue);
 721                 atomic_set(&newsk->sk_omem_alloc, 0);
 722                 newsk->sk_wmem_queued = 0;
 723                 newsk->sk_forward_alloc = 0;
 724
 725                 sock_reset_flag(newsk, SOCK_DONE);
 726                 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
 727                 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
 728                 newsk->sk_callback_lock = RW_LOCK_UNLOCKED;
 729                 skb_queue_head_init(&newsk->sk_error_queue);
 730                 newsk->sk_write_space = tcp_write_space;
 731
 732                 if ((filter = newsk->sk_filter) != NULL)
 733                         sk_filter_charge(newsk, filter);
 734
 735                 if (sk->sk_create_child)
 736                         sk->sk_create_child(sk, newsk);
 737
 738                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
 739                         /* It is still raw copy of parent, so invalidate
 740                          * destructor and make plain sk_free() */
 741                         newsk->sk_destruct = NULL;
 742                         sk_free(newsk);
 743                         return NULL;
 744                 }
 745
 746                 /* Now setup tcp_opt */
 747                 newtp = tcp_sk(newsk);
 748                 newtp->pred_flags = 0;
 749                 newtp->rcv_nxt = req->rcv_isn + 1;
 750                 newtp->snd_nxt = req->snt_isn + 1;
 751                 newtp->snd_una = req->snt_isn + 1;
 752                 newtp->snd_sml = req->snt_isn + 1;
 753
 754                 tcp_prequeue_init(newtp);
 755
 756                 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
 757
 758                 newtp->retransmits = 0;
 759                 newtp->backoff = 0;
 760                 newtp->srtt = 0;
 761                 newtp->mdev = TCP_TIMEOUT_INIT;
 762                 newtp->rto = TCP_TIMEOUT_INIT;
 763
 764                 newtp->packets_out = 0;
 765                 newtp->left_out = 0;
 766                 newtp->retrans_out = 0;
 767                 newtp->sacked_out = 0;
 768                 newtp->fackets_out = 0;
 769                 newtp->snd_ssthresh = 0x7fffffff;
 770
 771                 /* So many TCP implementations out there (incorrectly) count the
 772                  * initial SYN frame in their delayed-ACK and congestion control
 773                  * algorithms that we must have the following bandaid to talk
 774                  * efficiently to them.  -DaveM
 775                  */
 776                 newtp->snd_cwnd = 2;
 777                 newtp->snd_cwnd_cnt = 0;
 778
 779                 newtp->bictcp.cnt = 0;
 780                 newtp->bictcp.last_max_cwnd = newtp->bictcp.last_cwnd = 0;
 781
 782                 newtp->frto_counter = 0;
 783                 newtp->frto_highmark = 0;
 784
 785                 tcp_set_ca_state(newtp, TCP_CA_Open);
 786                 tcp_init_xmit_timers(newsk);
 787                 skb_queue_head_init(&newtp->out_of_order_queue);
 788                 newtp->send_head = NULL;
 789                 newtp->rcv_wup = req->rcv_isn + 1;
 790                 newtp->write_seq = req->snt_isn + 1;
 791                 newtp->pushed_seq = newtp->write_seq;
 792                 newtp->copied_seq = req->rcv_isn + 1;
 793
 794                 newtp->saw_tstamp = 0;
 795
 796                 newtp->dsack = 0;
 797                 newtp->eff_sacks = 0;
 798
 799                 newtp->probes_out = 0;
 800                 newtp->num_sacks = 0;
 801                 newtp->urg_data = 0;
 802                 newtp->listen_opt = NULL;
 803                 newtp->accept_queue = newtp->accept_queue_tail = NULL;
 804                 /* Deinitialize syn_wait_lock to trap illegal accesses. */
 805                 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
 806
 807                 /* Back to base struct sock members. */
 808                 newsk->sk_err = 0;
 809                 newsk->sk_priority = 0;
 810                 atomic_set(&newsk->sk_refcnt, 2);
 811
 812                 /* hmm, maybe from socket? */
 813                 set_vx_info(&newsk->sk_vx_info, current->vx_info);
 814                 set_nx_info(&newsk->sk_nx_info, current->nx_info);
 815 #ifdef INET_REFCNT_DEBUG
 816                 atomic_inc(&inet_sock_nr);
 817 #endif
 818                 atomic_inc(&tcp_sockets_allocated);
 819
 820                 if (sock_flag(newsk, SOCK_KEEPOPEN))
 821                         tcp_reset_keepalive_timer(newsk,
 822                                                   keepalive_time_when(newtp));
 823                 newsk->sk_socket = NULL;
 824                 newsk->sk_sleep = NULL;
 825                 newsk->sk_owner = NULL;
 826
 827                 newtp->tstamp_ok = req->tstamp_ok;
 828                 if((newtp->sack_ok = req->sack_ok) != 0) {
 829                         if (sysctl_tcp_fack)
 830                                 newtp->sack_ok |= 2;
 831                 }
 832                 newtp->window_clamp = req->window_clamp;
 833                 newtp->rcv_ssthresh = req->rcv_wnd;
 834                 newtp->rcv_wnd = req->rcv_wnd;
 835                 newtp->wscale_ok = req->wscale_ok;
 836                 if (newtp->wscale_ok) {
 837                         newtp->snd_wscale = req->snd_wscale;
 838                         newtp->rcv_wscale = req->rcv_wscale;
 839                 } else {
 840                         newtp->snd_wscale = newtp->rcv_wscale = 0;
 841                         newtp->window_clamp = min(newtp->window_clamp, 65535U);
 842                 }
 843                 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale;
 844                 newtp->max_window = newtp->snd_wnd;
 845
 846                 if (newtp->tstamp_ok) {
 847                         newtp->ts_recent = req->ts_recent;
 848                         newtp->ts_recent_stamp = xtime.tv_sec;
 849                         newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 850                 } else {
 851                         newtp->ts_recent_stamp = 0;
 852                         newtp->tcp_header_len = sizeof(struct tcphdr);
 853                 }
 854                 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
 855                         newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
 856                 newtp->mss_clamp = req->mss;
 857                 TCP_ECN_openreq_child(newtp, req);
 858                 if (newtp->ecn_flags&TCP_ECN_OK)
 859                         newsk->sk_no_largesend = 1;
 860
 861                 tcp_vegas_init(newtp);
 862                 TCP_INC_STATS_BH(TcpPassiveOpens);
 863         }
 864         return newsk;
 865 }
 866
 867 /*
 868  *      Process an incoming packet for SYN_RECV sockets represented
 869  *      as an open_request.
 870  */
 871
 872 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 873                            struct open_request *req,
 874                            struct open_request **prev)
 875 {
 876         struct tcphdr *th = skb->h.th;
 877         struct tcp_opt *tp = tcp_sk(sk);
 878         u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
 879         int paws_reject = 0;
 880         struct tcp_opt ttp;
 881         struct sock *child;
 882
 883         ttp.saw_tstamp = 0;
 884         if (th->doff > (sizeof(struct tcphdr)>>2)) {
 885                 tcp_parse_options(skb, &ttp, 0);
 886
 887                 if (ttp.saw_tstamp) {
 888                         ttp.ts_recent = req->ts_recent;
 889                         /* We do not store true stamp, but it is not required,
 890                          * it can be estimated (approximately)
 891                          * from another data.
 892                          */
 893                         ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
 894                         paws_reject = tcp_paws_check(&ttp, th->rst);
 895                 }
 896         }
 897
 898         /* Check for pure retransmitted SYN. */
 899         if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
 900             flg == TCP_FLAG_SYN &&
 901             !paws_reject) {
 902                 /*
 903                  * RFC793 draws (Incorrectly! It was fixed in RFC1122)
 904                  * this case on figure 6 and figure 8, but formal
 905                  * protocol description says NOTHING.
 906                  * To be more exact, it says that we should send ACK,
 907                  * because this segment (at least, if it has no data)
 908                  * is out of window.
 909                  *
 910                  *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
 911                  *  describe SYN-RECV state. All the description
 912                  *  is wrong, we cannot believe to it and should
 913                  *  rely only on common sense and implementation
 914                  *  experience.
 915                  *
 916                  * Enforce "SYN-ACK" according to figure 8, figure 6
 917                  * of RFC793, fixed by RFC1122.
 918                  */
 919                 req->class->rtx_syn_ack(sk, req, NULL);
 920                 return NULL;
 921         }
 922
 923         /* Further reproduces section "SEGMENT ARRIVES"
 924            for state SYN-RECEIVED of RFC793.
 925            It is broken, however, it does not work only
 926            when SYNs are crossed.
 927
 928            You would think that SYN crossing is impossible here, since
 929            we should have a SYN_SENT socket (from connect()) on our end,
 930            but this is not true if the crossed SYNs were sent to both
 931            ends by a malicious third party.  We must defend against this,
 932            and to do that we first verify the ACK (as per RFC793, page
 933            36) and reset if it is invalid.  Is this a true full defense?
 934            To convince ourselves, let us consider a way in which the ACK
 935            test can still pass in this 'malicious crossed SYNs' case.
 936            Malicious sender sends identical SYNs (and thus identical sequence
 937            numbers) to both A and B:
 938
 939                 A: gets SYN, seq=7
 940                 B: gets SYN, seq=7
 941
 942            By our good fortune, both A and B select the same initial
 943            send sequence number of seven :-)
 944
 945                 A: sends SYN|ACK, seq=7, ack_seq=8
 946                 B: sends SYN|ACK, seq=7, ack_seq=8
 947
 948            So we are now A eating this SYN|ACK, ACK test passes.  So
 949            does sequence test, SYN is truncated, and thus we consider
 950            it a bare ACK.
 951
 952            If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
 953            we create an established connection.  Both ends (listening sockets)
 954            accept the new incoming connection and try to talk to each other. 8-)
 955
 956            Note: This case is both harmless, and rare.  Possibility is about the
 957            same as us discovering intelligent life on another plant tomorrow.
 958
 959            But generally, we should (RFC lies!) to accept ACK
 960            from SYNACK both here and in tcp_rcv_state_process().
 961            tcp_rcv_state_process() does not, hence, we do not too.
 962
 963            Note that the case is absolutely generic:
 964            we cannot optimize anything here without
 965            violating protocol. All the checks must be made
 966            before attempt to create socket.
 967          */
 968
 969         /* RFC793 page 36: "If the connection is in any non-synchronized state ...
 970          *                  and the incoming segment acknowledges something not yet
 971          *                  sent (the segment carries an unaccaptable ACK) ...
 972          *                  a reset is sent."
 973          *
 974          * Invalid ACK: reset will be sent by listening socket
 975          */
 976         if ((flg & TCP_FLAG_ACK) &&
 977             (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
 978                 return sk;
 979
 980         /* Also, it would be not so bad idea to check rcv_tsecr, which
 981          * is essentially ACK extension and too early or too late values
 982          * should cause reset in unsynchronized states.
 983          */
 984
 985         /* RFC793: "first check sequence number". */
 986
 987         if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
 988                                           req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
 989                 /* Out of window: send ACK and drop. */
 990                 if (!(flg & TCP_FLAG_RST))
 991                         req->class->send_ack(skb, req);
 992                 if (paws_reject)
 993                         NET_INC_STATS_BH(PAWSEstabRejected);
 994                 return NULL;
 995         }
 996
 997         /* In sequence, PAWS is OK. */
 998
 999         if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
1000                 req->ts_recent = ttp.rcv_tsval;
1001
1002         if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
1003                 /* Truncate SYN, it is out of window starting
1004                    at req->rcv_isn+1. */
1005                 flg &= ~TCP_FLAG_SYN;
1006         }
1007
1008         /* RFC793: "second check the RST bit" and
1009          *         "fourth, check the SYN bit"
1010          */
1011         if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
1012                 goto embryonic_reset;
1013
1014         /* ACK sequence verified above, just make sure ACK is
1015          * set.  If ACK not set, just silently drop the packet.
1016          */
1017         if (!(flg & TCP_FLAG_ACK))
1018                 return NULL;
1019
1020         /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1021         if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
1022                 req->acked = 1;
1023                 return NULL;
1024         }
1025
1026         /* OK, ACK is valid, create big socket and
1027          * feed this segment to it. It will repeat all
1028          * the tests. THIS SEGMENT MUST MOVE SOCKET TO
1029          * ESTABLISHED STATE. If it will be dropped after
1030          * socket is created, wait for troubles.
1031          */
1032         child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1033         if (child == NULL)
1034                 goto listen_overflow;
1035
1036         sk_set_owner(child, sk->sk_owner);
1037         tcp_synq_unlink(tp, req, prev);
1038         tcp_synq_removed(sk, req);
1039
1040         tcp_acceptq_queue(sk, req, child);
1041         return child;
1042
1043 listen_overflow:
1044         if (!sysctl_tcp_abort_on_overflow) {
1045                 req->acked = 1;
1046                 return NULL;
1047         }
1048
1049 embryonic_reset:
1050         NET_INC_STATS_BH(EmbryonicRsts);
1051         if (!(flg & TCP_FLAG_RST))
1052                 req->class->send_reset(skb);
1053
1054         tcp_synq_drop(sk, req, prev);
1055         return NULL;
1056 }
1057
1058 /*
1059  * Queue segment on the new socket if the new socket is active,
1060  * otherwise we just shortcircuit this and continue with
1061  * the new socket.
1062  */
1063
1064 int tcp_child_process(struct sock *parent, struct sock *child,
1065                       struct sk_buff *skb)
1066 {
1067         int ret = 0;
1068         int state = child->sk_state;
1069
1070         if (!sock_owned_by_user(child)) {
1071                 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
1072
1073                 /* Wakeup parent, send SIGIO */
1074                 if (state == TCP_SYN_RECV && child->sk_state != state)
1075                         parent->sk_data_ready(parent, 0);
1076         } else {
1077                 /* Alas, it is possible again, because we do lookup
1078                  * in main socket hash table and lock on listening
1079                  * socket does not protect us more.
1080                  */
1081                 sk_add_backlog(child, skb);
1082         }
1083
1084         bh_unlock_sock(child);
1085         sock_put(child);
1086         return ret;
1087 }
1088
/*
 * Symbols from this file made available through EXPORT_SYMBOL().
 * sysctl_tcp_tw_recycle is exported only when CONFIG_SYSCTL is defined.
 */
1089 EXPORT_SYMBOL(tcp_check_req);
1090 EXPORT_SYMBOL(tcp_child_process);
1091 EXPORT_SYMBOL(tcp_create_openreq_child);
1092 EXPORT_SYMBOL(tcp_timewait_state_process);
1093 EXPORT_SYMBOL(tcp_tw_deschedule);
1094
1095 #ifdef CONFIG_SYSCTL
1096 EXPORT_SYMBOL(sysctl_tcp_tw_recycle);
1097 #endif