patch-2.6.6-vs1.9.0
[linux-2.6.git] / net / ipv4 / tcp_minisocks.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  */
22
23 #include <linux/config.h>
24 #include <linux/mm.h>
25 #include <linux/module.h>
26 #include <linux/sysctl.h>
27 #include <linux/workqueue.h>
28 #include <net/tcp.h>
29 #include <net/inet_common.h>
30 #include <net/xfrm.h>
31
32 #ifdef CONFIG_SYSCTL
33 #define SYNC_INIT 0 /* let the user enable it */
34 #else
35 #define SYNC_INIT 1
36 #endif
37
38 int sysctl_tcp_tw_recycle;
39 int sysctl_tcp_max_tw_buckets = NR_FILE*2;
40
41 int sysctl_tcp_syncookies = SYNC_INIT; 
42 int sysctl_tcp_abort_on_overflow;
43
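/* Returns non-zero when any part of the segment [seq, end_seq) overlaps the
 * receive window [s_win, e_win).  The first and last checks accept the
 * degenerate cases of zero-length segments and zero-size windows sitting
 * exactly at the window edges.
 */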
44 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
45 {
46         if (seq == s_win)
47                 return 1;
48         if (after(end_seq, s_win) && before(seq, e_win))
49                 return 1;
50         return (seq == e_win && seq == end_seq);
51 }
52
53 /* New-style handling of TIME_WAIT sockets. */
54
55 int tcp_tw_count;
56
57
58 /* Must be called with locally disabled BHs. */
59 static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
60 {
61         struct tcp_ehash_bucket *ehead;
62         struct tcp_bind_hashbucket *bhead;
63         struct tcp_bind_bucket *tb;
64
65         /* Unlink from established hashes. */
66         ehead = &tcp_ehash[tw->tw_hashent];
67         write_lock(&ehead->lock);
68         if (hlist_unhashed(&tw->tw_node)) {
69                 write_unlock(&ehead->lock);
70                 return;
71         }
72         __hlist_del(&tw->tw_node);
73         sk_node_init(&tw->tw_node);
74         write_unlock(&ehead->lock);
75
76         /* Disassociate with bind bucket. */
77         bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
78         spin_lock(&bhead->lock);
79         tb = tw->tw_tb;
80         __hlist_del(&tw->tw_bind_node);
81         tw->tw_tb = NULL;
82         tcp_bucket_destroy(tb);
83         spin_unlock(&bhead->lock);
84
85 #ifdef INET_REFCNT_DEBUG
86         if (atomic_read(&tw->tw_refcnt) != 1) {
87                 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
88                        atomic_read(&tw->tw_refcnt));
89         }
90 #endif
91         tcp_tw_put(tw);
92 }
93
94 /* 
95  * * The main purpose of TIME-WAIT state is to close a connection gracefully
96  *   when one of the ends sits in LAST-ACK or CLOSING retransmitting its FIN
97  *   (and, probably, a tail of data) and one or more of our ACKs are lost.
98  * * What is the TIME-WAIT timeout? It is associated with the maximal packet
99  *   lifetime in the internet, which leads to the wrong conclusion that
100  *   it is set to catch "old duplicate segments" wandering out of their path.
101  *   That is not quite correct. This timeout is calculated so that it exceeds
102  *   the maximal retransmission timeout by enough to allow the loss of one (or more)
103  *   segments sent by the peer and of our ACKs. This time may be derived from the RTO.
104  * * When a TIME-WAIT socket receives an RST, it means that the other end
105  *   has finally closed and we are allowed to kill TIME-WAIT too.
106  * * The second purpose of TIME-WAIT is catching old duplicate segments.
107  *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
108  *   with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
109  * * If we invented some more clever way to catch duplicates
110  *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
111  *
112  * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
113  * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
114  * from the very beginning.
115  *
116  * NOTE. With recycling (and later with fin-wait-2) the TW bucket
117  * is _not_ stateless. This means that, strictly speaking, we must
118  * spinlock it. I do not want to! Well, the probability of misbehaviour
119  * is ridiculously low and, it seems, we could use some mb() tricks
120  * to avoid misreading sequence numbers, states etc.  --ANK
121  */
122 enum tcp_tw_status
123 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
124                            struct tcphdr *th, unsigned len)
125 {
126         struct tcp_opt tp;
127         int paws_reject = 0;
128
129         tp.saw_tstamp = 0;
130         if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
131                 tcp_parse_options(skb, &tp, 0);
132
133                 if (tp.saw_tstamp) {
134                         tp.ts_recent       = tw->tw_ts_recent;
135                         tp.ts_recent_stamp = tw->tw_ts_recent_stamp;
136                         paws_reject = tcp_paws_check(&tp, th->rst);
137                 }
138         }
139
140         if (tw->tw_substate == TCP_FIN_WAIT2) {
141                 /* Just repeat all the checks of tcp_rcv_state_process() */
142
143                 /* Out of window, send ACK */
144                 if (paws_reject ||
145                     !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
146                                    tw->tw_rcv_nxt,
147                                    tw->tw_rcv_nxt + tw->tw_rcv_wnd))
148                         return TCP_TW_ACK;
149
150                 if (th->rst)
151                         goto kill;
152
153                 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
154                         goto kill_with_rst;
155
156                 /* Dup ACK? */
157                 if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
158                     TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
159                         tcp_tw_put(tw);
160                         return TCP_TW_SUCCESS;
161                 }
162
163                 /* New data or FIN. If new data arrive after half-duplex close,
164                  * reset.
165                  */
166                 if (!th->fin ||
167                     TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
168 kill_with_rst:
169                         tcp_tw_deschedule(tw);
170                         tcp_tw_put(tw);
171                         return TCP_TW_RST;
172                 }
173
174                 /* FIN arrived, enter true time-wait state. */
175                 tw->tw_substate = TCP_TIME_WAIT;
176                 tw->tw_rcv_nxt  = TCP_SKB_CB(skb)->end_seq;
177                 if (tp.saw_tstamp) {
178                         tw->tw_ts_recent_stamp  = xtime.tv_sec;
179                         tw->tw_ts_recent        = tp.rcv_tsval;
180                 }
181
182                 /* I am ashamed, but I failed to make it more elegant.
183                  * Yes, it is a direct reference to IP, which is impossible
184                  * to generalize to IPv6. Taking into account that IPv6
185                  * does not understand recycling in any case, it is not
186                  * a big problem in practice. --ANK */
187                 if (tw->tw_family == AF_INET &&
188                     sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
189                     tcp_v4_tw_remember_stamp(tw))
190                         tcp_tw_schedule(tw, tw->tw_timeout);
191                 else
192                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
193                 return TCP_TW_ACK;
194         }
195
196         /*
197          *      Now real TIME-WAIT state.
198          *
199          *      RFC 1122:
200          *      "When a connection is [...] in TIME-WAIT state [...]
201          *      [a TCP] MAY accept a new SYN from the remote TCP to
202          *      reopen the connection directly, if it:
203          *      
204          *      (1)  assigns its initial sequence number for the new
205          *      connection to be larger than the largest sequence
206          *      number it used on the previous connection incarnation,
207          *      and
208          *
209          *      (2)  returns to TIME-WAIT state if the SYN turns out 
210          *      to be an old duplicate".
211          */
212
213         if (!paws_reject &&
214             (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
215              (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
216                 /* An in-window segment may only be a reset or a bare ack. */
217
218                 if (th->rst) {
219                         /* This is TIME_WAIT assassination, in two flavors.
220                          * Oh well... nobody has a sufficient solution to this
221                          * protocol bug yet.
222                          */
223                         if (sysctl_tcp_rfc1337 == 0) {
224 kill:
225                                 tcp_tw_deschedule(tw);
226                                 tcp_tw_put(tw);
227                                 return TCP_TW_SUCCESS;
228                         }
229                 }
230                 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
231
232                 if (tp.saw_tstamp) {
233                         tw->tw_ts_recent        = tp.rcv_tsval;
234                         tw->tw_ts_recent_stamp  = xtime.tv_sec;
235                 }
236
237                 tcp_tw_put(tw);
238                 return TCP_TW_SUCCESS;
239         }
240
241         /* Out of window segment.
242
243            All the segments are ACKed immediately.
244
245            The only exception is a new SYN. We accept it if it is
246            not an old duplicate and we are not in danger of being killed
247            by delayed old duplicates. The RFC check (that the SYN carries a
248            newer sequence number) works at rates <40Mbit/sec.
249            However, if PAWS works, it is reliable and, even more,
250            we may relax the silly seq space cutoff.
251
252            RED-PEN: we violate the main RFC requirement: if this SYN turns out
253            to be an old duplicate (i.e. we receive a RST in reply to our SYN-ACK),
254            we must return the socket to time-wait state. That is not good,
255            but not fatal yet.
256          */
257
258         if (th->syn && !th->rst && !th->ack && !paws_reject &&
259             (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
260              (tp.saw_tstamp && (s32)(tw->tw_ts_recent - tp.rcv_tsval) < 0))) {
261                 u32 isn = tw->tw_snd_nxt + 65535 + 2;
262                 if (isn == 0)
263                         isn++;
264                 TCP_SKB_CB(skb)->when = isn;
265                 return TCP_TW_SYN;
266         }
267
268         if (paws_reject)
269                 NET_INC_STATS_BH(PAWSEstabRejected);
270
271         if(!th->rst) {
272                 /* In this case we must reset the TIMEWAIT timer.
273                  *
274                  * If it is an ACKless SYN, it may be either an old duplicate
275                  * or a new good SYN with a random sequence number < rcv_nxt.
276                  * Do not reschedule in the latter case.
277                  */
278                 if (paws_reject || th->ack)
279                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
280
281                 /* Send ACK. Note, we do not put the bucket,
282                  * it will be released by caller.
283                  */
284                 return TCP_TW_ACK;
285         }
286         tcp_tw_put(tw);
287         return TCP_TW_SUCCESS;
288 }
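/*
 * Illustrative sketch, not part of the original file: how a caller might
 * dispatch on the verdict returned by tcp_timewait_state_process() above.
 * The helper name and the returned strings are hypothetical; the real
 * callers in the IPv4/IPv6 receive paths perform the corresponding
 * protocol actions instead of returning text.
 */
static inline const char *tcp_tw_verdict(enum tcp_tw_status status)
{
        switch (status) {
        case TCP_TW_SYN:        /* acceptable new SYN: hand it to a listener */
                return "pass SYN to a listening socket";
        case TCP_TW_ACK:        /* out-of-window segment or FIN handled: ACK it */
                return "send ACK";
        case TCP_TW_RST:        /* new data after half-duplex close */
                return "send RST";
        case TCP_TW_SUCCESS:    /* duplicate/benign segment, nothing more to do */
        default:
                return "drop segment";
        }
}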
289
290 /* Enter the time wait state.  This is called with locally disabled BH.
291  * Essentially we whip up a timewait bucket, copy the
292  * relevant info into it from the SK, and mess with hash chains
293  * and list linkage.
294  */
295 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
296 {
297         struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
298         struct tcp_bind_hashbucket *bhead;
299
300         /* Step 1: Put TW into bind hash. Original socket stays there too.
301            Note that any socket with inet_sk(sk)->num != 0 MUST be bound in
302            the binding cache, even if it is closed.
303          */
304         bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
305         spin_lock(&bhead->lock);
306         tw->tw_tb = tcp_sk(sk)->bind_hash;
307         BUG_TRAP(tcp_sk(sk)->bind_hash);
308         tw_add_bind_node(tw, &tw->tw_tb->owners);
309         spin_unlock(&bhead->lock);
310
311         write_lock(&ehead->lock);
312
313         /* Step 2: Remove SK from established hash. */
314         if (__sk_del_node_init(sk))
315                 sock_prot_dec_use(sk->sk_prot);
316
317         /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
318         tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
319         atomic_inc(&tw->tw_refcnt);
320
321         write_unlock(&ehead->lock);
322 }
323
324 /* 
325  * Move a socket to time-wait or dead fin-wait-2 state.
326  */ 
327 void tcp_time_wait(struct sock *sk, int state, int timeo)
328 {
329         struct tcp_tw_bucket *tw = NULL;
330         struct tcp_opt *tp = tcp_sk(sk);
331         int recycle_ok = 0;
332
333         if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
334                 recycle_ok = tp->af_specific->remember_stamp(sk);
335
336         if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
337                 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
338
339         if(tw != NULL) {
340                 struct inet_opt *inet = inet_sk(sk);
341                 int rto = (tp->rto<<2) - (tp->rto>>1);
342
343                 /* Give us an identity. */
344                 tw->tw_daddr            = inet->daddr;
345                 tw->tw_rcv_saddr        = inet->rcv_saddr;
346                 tw->tw_bound_dev_if     = sk->sk_bound_dev_if;
347                 tw->tw_num              = inet->num;
348                 tw->tw_state            = TCP_TIME_WAIT;
349                 tw->tw_substate         = state;
350                 tw->tw_sport            = inet->sport;
351                 tw->tw_dport            = inet->dport;
352                 tw->tw_family           = sk->sk_family;
353                 tw->tw_reuse            = sk->sk_reuse;
354                 tw->tw_rcv_wscale       = tp->rcv_wscale;
355                 atomic_set(&tw->tw_refcnt, 1);
356
357                 tw->tw_hashent          = sk->sk_hashent;
358                 tw->tw_rcv_nxt          = tp->rcv_nxt;
359                 tw->tw_snd_nxt          = tp->snd_nxt;
360                 tw->tw_rcv_wnd          = tcp_receive_window(tp);
361                 tw->tw_ts_recent        = tp->ts_recent;
362                 tw->tw_ts_recent_stamp  = tp->ts_recent_stamp;
363                 tw_dead_node_init(tw);
364
365                 tw->tw_xid              = sk->sk_xid;
366                 tw->tw_vx_info          = NULL;
367                 tw->tw_nid              = sk->sk_nid;
368                 tw->tw_nx_info          = NULL;
369                 
370 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
371                 if (tw->tw_family == PF_INET6) {
372                         struct ipv6_pinfo *np = inet6_sk(sk);
373
374                         ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
375                         ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
376                         tw->tw_v6_ipv6only = np->ipv6only;
377                 } else {
378                         memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
379                         memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
380                         tw->tw_v6_ipv6only = 0;
381                 }
382 #endif
383                 /* Linkage updates. */
384                 __tcp_tw_hashdance(sk, tw);
385
386                 /* Get the TIME_WAIT timeout firing. */
387                 if (timeo < rto)
388                         timeo = rto;
389
390                 if (recycle_ok) {
391                         tw->tw_timeout = rto;
392                 } else {
393                         tw->tw_timeout = TCP_TIMEWAIT_LEN;
394                         if (state == TCP_TIME_WAIT)
395                                 timeo = TCP_TIMEWAIT_LEN;
396                 }
397
398                 tcp_tw_schedule(tw, timeo);
399                 tcp_tw_put(tw);
400         } else {
401                 /* Sorry, if we're out of memory, just CLOSE this
402                  * socket up.  We've got bigger problems than
403                  * non-graceful socket closings.
404                  */
405                 if (net_ratelimit())
406                         printk(KERN_INFO "TCP: time wait bucket table overflow\n");
407         }
408
409         tcp_update_metrics(sk);
410         tcp_done(sk);
411 }
412
413 /* Kill off TIME_WAIT sockets once their lifetime has expired. */
414 static int tcp_tw_death_row_slot;
415
416 static void tcp_twkill(unsigned long);
417
418 /* TIME_WAIT reaping mechanism. */
419 #define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
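/* The slot index is advanced with "& (TCP_TWKILL_SLOTS - 1)" in tcp_twkill()
 * and tcp_tw_schedule() below, which only wraps correctly when the slot
 * count is a power of two.
 */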
420 #define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
421
422 #define TCP_TWKILL_QUOTA        100
423
424 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
425 static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
426 static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
427 static void twkill_work(void *);
428 static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
429 static u32 twkill_thread_slots;
430
431 /* Returns non-zero if quota exceeded.  */
432 static int tcp_do_twkill_work(int slot, unsigned int quota)
433 {
434         struct tcp_tw_bucket *tw;
435         struct hlist_node *node;
436         unsigned int killed;
437         int ret;
438
439         /* NOTE: compare this to the previous version, where the lock
440          * was released after detaching the chain. It was racy,
441          * because tw buckets are scheduled in a non-serialized context
442          * in 2.3 (with netfilter), and with softnet this is common, because
443          * soft irqs are not sequenced.
444          */
445         killed = 0;
446         ret = 0;
447 rescan:
448         tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
449                 __tw_del_dead_node(tw);
450                 spin_unlock(&tw_death_lock);
451                 tcp_timewait_kill(tw);
452                 tcp_tw_put(tw);
453                 killed++;
454                 spin_lock(&tw_death_lock);
455                 if (killed > quota) {
456                         ret = 1;
457                         break;
458                 }
459
460                 /* While we dropped tw_death_lock, another cpu may have
461                  * killed off the next TW bucket in the list, therefore
462                  * do a fresh re-read of the hlist head node with the
463                  * lock reacquired.  We still use the hlist traversal
464                  * macro in order to get the prefetches.
465                  */
466                 goto rescan;
467         }
468
469         tcp_tw_count -= killed;
470         NET_ADD_STATS_BH(TimeWaited, killed);
471
472         return ret;
473 }
474
475 static void tcp_twkill(unsigned long dummy)
476 {
477         int need_timer, ret;
478
479         spin_lock(&tw_death_lock);
480
481         if (tcp_tw_count == 0)
482                 goto out;
483
484         need_timer = 0;
485         ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
486         if (ret) {
487                 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
488                 mb();
489                 schedule_work(&tcp_twkill_work);
490                 need_timer = 1;
491         } else {
492                 /* We purged the entire slot, anything left?  */
493                 if (tcp_tw_count)
494                         need_timer = 1;
495         }
496         tcp_tw_death_row_slot =
497                 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
498         if (need_timer)
499                 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
500 out:
501         spin_unlock(&tw_death_lock);
502 }
503
504 extern void twkill_slots_invalid(void);
505
506 static void twkill_work(void *dummy)
507 {
508         int i;
509
510         if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
511                 twkill_slots_invalid();
512
513         while (twkill_thread_slots) {
514                 spin_lock_bh(&tw_death_lock);
515                 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
516                         if (!(twkill_thread_slots & (1 << i)))
517                                 continue;
518
519                         while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
520                                 if (need_resched()) {
521                                         spin_unlock_bh(&tw_death_lock);
522                                         schedule();
523                                         spin_lock_bh(&tw_death_lock);
524                                 }
525                         }
526
527                         twkill_thread_slots &= ~(1 << i);
528                 }
529                 spin_unlock_bh(&tw_death_lock);
530         }
531 }
532
533 /* These are always called from BH context.  See callers in
534  * tcp_input.c to verify this.
535  */
536
537 /* This is for handling early-kills of TIME_WAIT sockets. */
538 void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
539 {
540         spin_lock(&tw_death_lock);
541         if (tw_del_dead_node(tw)) {
542                 tcp_tw_put(tw);
543                 if (--tcp_tw_count == 0)
544                         del_timer(&tcp_tw_timer);
545         }
546         spin_unlock(&tw_death_lock);
547         tcp_timewait_kill(tw);
548 }
549
550 /* Short-time timewait calendar */
551
552 static int tcp_twcal_hand = -1;
553 static int tcp_twcal_jiffie;
554 static void tcp_twcal_tick(unsigned long);
555 static struct timer_list tcp_twcal_timer =
556                 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
557 static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
558
559 void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
560 {
561         struct hlist_head *list;
562         int slot;
563
564         /* timeout := RTO * 3.5
565          *
566          * 3.5 = 1+2+0.5, i.e. enough to wait for two retransmits.
567          *
568          * RATIONALE: if a FIN arrived and we entered TIME-WAIT state,
569          * our ACK acking that FIN can be lost. If N subsequent retransmitted
570          * FINs (or previous segments) are lost, the probability of such an
571          * event is p^(N+1), where p is the probability of losing a single
572          * packet, and the time to detect the loss is about RTO*(2^N - 1)
573          * with exponential backoff. The normal timewait length is chosen so
574          * that we wait at least for one retransmitted FIN (maximal RTO is 120sec).
575          * [ BTW Linux, following BSD, violates this requirement by waiting
576          *   only 60sec; we should wait at least 240 secs.
577          *   Well, 240 consumes too many resources 8)
578          * ]
579          * This interval is not reduced to catch old duplicates and
580          * responses to our wandering segments living for two MSLs.
581          * However, if we use PAWS to detect
582          * old duplicates, we can reduce the interval to the bounds required
583          * by RTO, rather than by MSL. So, if the peer understands PAWS, we
584          * kill the tw bucket after 3.5*RTO (it is important that this number
585          * is greater than the TS tick!) and detect old duplicates with the
586          * help of PAWS. (A worked example follows this function.)
587          */
588         slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
589
590         spin_lock(&tw_death_lock);
591
592         /* Unlink it, if it was scheduled */
593         if (tw_del_dead_node(tw))
594                 tcp_tw_count--;
595         else
596                 atomic_inc(&tw->tw_refcnt);
597
598         if (slot >= TCP_TW_RECYCLE_SLOTS) {
599                 /* Schedule to slow timer */
600                 if (timeo >= TCP_TIMEWAIT_LEN) {
601                         slot = TCP_TWKILL_SLOTS-1;
602                 } else {
603                         slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
604                         if (slot >= TCP_TWKILL_SLOTS)
605                                 slot = TCP_TWKILL_SLOTS-1;
606                 }
607                 tw->tw_ttd = jiffies + timeo;
608                 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
609                 list = &tcp_tw_death_row[slot];
610         } else {
611                 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
612
613                 if (tcp_twcal_hand < 0) {
614                         tcp_twcal_hand = 0;
615                         tcp_twcal_jiffie = jiffies;
616                         tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
617                         add_timer(&tcp_twcal_timer);
618                 } else {
619                         if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
620                                 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
621                         slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
622                 }
623                 list = &tcp_twcal_row[slot];
624         }
625
626         hlist_add_head(&tw->tw_death_node, list);
627
628         if (tcp_tw_count++ == 0)
629                 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
630         spin_unlock(&tw_death_lock);
631 }
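/*
 * Worked example, not part of the original file: the timeout that
 * tcp_time_wait() passes in for a recycled bucket is 3.5*RTO computed
 * with shifts, (rto << 2) - (rto >> 1) = 4*RTO - RTO/2, and the slot
 * rounding in tcp_tw_schedule() above lifts it to the next multiple of
 * the recycle tick.  The helper name is hypothetical.
 */
static inline int tcp_tw_example_slot(int rto)
{
        int timeo = (rto << 2) - (rto >> 1);    /* 3.5 * RTO */

        /* same rounding as at the top of tcp_tw_schedule() */
        return (timeo + (1 << TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
}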
632
633 void tcp_twcal_tick(unsigned long dummy)
634 {
635         int n, slot;
636         unsigned long j;
637         unsigned long now = jiffies;
638         int killed = 0;
639         int adv = 0;
640
641         spin_lock(&tw_death_lock);
642         if (tcp_twcal_hand < 0)
643                 goto out;
644
645         slot = tcp_twcal_hand;
646         j = tcp_twcal_jiffie;
647
648         for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
649                 if (time_before_eq(j, now)) {
650                         struct hlist_node *node, *safe;
651                         struct tcp_tw_bucket *tw;
652
653                         tw_for_each_inmate_safe(tw, node, safe,
654                                            &tcp_twcal_row[slot]) {
655                                 __tw_del_dead_node(tw);
656                                 tcp_timewait_kill(tw);
657                                 tcp_tw_put(tw);
658                                 killed++;
659                         }
660                 } else {
661                         if (!adv) {
662                                 adv = 1;
663                                 tcp_twcal_jiffie = j;
664                                 tcp_twcal_hand = slot;
665                         }
666
667                         if (!hlist_empty(&tcp_twcal_row[slot])) {
668                                 mod_timer(&tcp_twcal_timer, j);
669                                 goto out;
670                         }
671                 }
672                 j += (1<<TCP_TW_RECYCLE_TICK);
673                 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
674         }
675         tcp_twcal_hand = -1;
676
677 out:
678         if ((tcp_tw_count -= killed) == 0)
679                 del_timer(&tcp_tw_timer);
680         NET_ADD_STATS_BH(TimeWaitKilled, killed);
681         spin_unlock(&tw_death_lock);
682 }
683
684 /* This is not only more efficient than what we used to do, it eliminates
685  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
686  *
687  * Actually, we could save a lot of memory writes here: the tp of the
688  * listening socket already contains all the necessary default parameters.
689  */
690 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
691 {
692         /* allocate the newsk from the same slab as the master sock;
693          * otherwise, at sk_free time we'd try to free it from the wrong
694          * slabcache (i.e. is it TCPv4 or v6?) -acme */
695         struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0, sk->sk_slab);
696
697         if(newsk != NULL) {
698                 struct tcp_opt *newtp;
699                 struct sk_filter *filter;
700
701                 memcpy(newsk, sk, sizeof(struct tcp_sock));
702                 newsk->sk_state = TCP_SYN_RECV;
703
704                 /* SANITY */
705                 sock_vx_init(newsk);
706                 sock_nx_init(newsk);
707                 sk_node_init(&newsk->sk_node);
708                 tcp_sk(newsk)->bind_hash = NULL;
709
710                 /* Clone the TCP header template */
711                 inet_sk(newsk)->dport = req->rmt_port;
712
713                 sock_lock_init(newsk);
714                 bh_lock_sock(newsk);
715
716                 newsk->sk_dst_lock = RW_LOCK_UNLOCKED;
717                 atomic_set(&newsk->sk_rmem_alloc, 0);
718                 skb_queue_head_init(&newsk->sk_receive_queue);
719                 atomic_set(&newsk->sk_wmem_alloc, 0);
720                 skb_queue_head_init(&newsk->sk_write_queue);
721                 atomic_set(&newsk->sk_omem_alloc, 0);
722                 newsk->sk_wmem_queued = 0;
723                 newsk->sk_forward_alloc = 0;
724
725                 sock_reset_flag(newsk, SOCK_DONE);
726                 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
727                 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
728                 newsk->sk_callback_lock = RW_LOCK_UNLOCKED;
729                 skb_queue_head_init(&newsk->sk_error_queue);
730                 newsk->sk_write_space = tcp_write_space;
731
732                 if ((filter = newsk->sk_filter) != NULL)
733                         sk_filter_charge(newsk, filter);
734
735                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
736                         /* It is still a raw copy of the parent, so invalidate
737                          * the destructor and do a plain sk_free() */
738                         newsk->sk_destruct = NULL;
739                         sk_free(newsk);
740                         return NULL;
741                 }
742
743                 /* Now setup tcp_opt */
744                 newtp = tcp_sk(newsk);
745                 newtp->pred_flags = 0;
746                 newtp->rcv_nxt = req->rcv_isn + 1;
747                 newtp->snd_nxt = req->snt_isn + 1;
748                 newtp->snd_una = req->snt_isn + 1;
749                 newtp->snd_sml = req->snt_isn + 1;
750
751                 tcp_prequeue_init(newtp);
752
753                 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
754
755                 newtp->retransmits = 0;
756                 newtp->backoff = 0;
757                 newtp->srtt = 0;
758                 newtp->mdev = TCP_TIMEOUT_INIT;
759                 newtp->rto = TCP_TIMEOUT_INIT;
760
761                 newtp->packets_out = 0;
762                 newtp->left_out = 0;
763                 newtp->retrans_out = 0;
764                 newtp->sacked_out = 0;
765                 newtp->fackets_out = 0;
766                 newtp->snd_ssthresh = 0x7fffffff;
767
768                 /* So many TCP implementations out there (incorrectly) count the
769                  * initial SYN frame in their delayed-ACK and congestion control
770                  * algorithms that we must have the following bandaid to talk
771                  * efficiently to them.  -DaveM
772                  */
773                 newtp->snd_cwnd = 2;
774                 newtp->snd_cwnd_cnt = 0;
775
776                 newtp->bictcp.cnt = 0;
777                 newtp->bictcp.last_max_cwnd = newtp->bictcp.last_cwnd = 0;
778
779                 newtp->frto_counter = 0;
780                 newtp->frto_highmark = 0;
781
782                 tcp_set_ca_state(newtp, TCP_CA_Open);
783                 tcp_init_xmit_timers(newsk);
784                 skb_queue_head_init(&newtp->out_of_order_queue);
785                 newtp->send_head = NULL;
786                 newtp->rcv_wup = req->rcv_isn + 1;
787                 newtp->write_seq = req->snt_isn + 1;
788                 newtp->pushed_seq = newtp->write_seq;
789                 newtp->copied_seq = req->rcv_isn + 1;
790
791                 newtp->saw_tstamp = 0;
792
793                 newtp->dsack = 0;
794                 newtp->eff_sacks = 0;
795
796                 newtp->probes_out = 0;
797                 newtp->num_sacks = 0;
798                 newtp->urg_data = 0;
799                 newtp->listen_opt = NULL;
800                 newtp->accept_queue = newtp->accept_queue_tail = NULL;
801                 /* Deinitialize syn_wait_lock to trap illegal accesses. */
802                 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
803
804                 /* Back to base struct sock members. */
805                 newsk->sk_err = 0;
806                 newsk->sk_priority = 0;
807                 atomic_set(&newsk->sk_refcnt, 2);
808
809                 /* hmm, maybe from socket? */
810                 set_vx_info(&newsk->sk_vx_info, current->vx_info);
811                 set_nx_info(&newsk->sk_nx_info, current->nx_info);
812 #ifdef INET_REFCNT_DEBUG
813                 atomic_inc(&inet_sock_nr);
814 #endif
815                 atomic_inc(&tcp_sockets_allocated);
816
817                 if (sock_flag(newsk, SOCK_KEEPOPEN))
818                         tcp_reset_keepalive_timer(newsk,
819                                                   keepalive_time_when(newtp));
820                 newsk->sk_socket = NULL;
821                 newsk->sk_sleep = NULL;
822                 newsk->sk_owner = NULL;
823
824                 newtp->tstamp_ok = req->tstamp_ok;
825                 if((newtp->sack_ok = req->sack_ok) != 0) {
826                         if (sysctl_tcp_fack)
827                                 newtp->sack_ok |= 2;
828                 }
829                 newtp->window_clamp = req->window_clamp;
830                 newtp->rcv_ssthresh = req->rcv_wnd;
831                 newtp->rcv_wnd = req->rcv_wnd;
832                 newtp->wscale_ok = req->wscale_ok;
833                 if (newtp->wscale_ok) {
834                         newtp->snd_wscale = req->snd_wscale;
835                         newtp->rcv_wscale = req->rcv_wscale;
836                 } else {
837                         newtp->snd_wscale = newtp->rcv_wscale = 0;
838                         newtp->window_clamp = min(newtp->window_clamp, 65535U);
839                 }
840                 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale;
841                 newtp->max_window = newtp->snd_wnd;
842
843                 if (newtp->tstamp_ok) {
844                         newtp->ts_recent = req->ts_recent;
845                         newtp->ts_recent_stamp = xtime.tv_sec;
846                         newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
847                 } else {
848                         newtp->ts_recent_stamp = 0;
849                         newtp->tcp_header_len = sizeof(struct tcphdr);
850                 }
851                 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
852                         newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
853                 newtp->mss_clamp = req->mss;
854                 TCP_ECN_openreq_child(newtp, req);
855                 if (newtp->ecn_flags&TCP_ECN_OK)
856                         newsk->sk_no_largesend = 1;
857
858                 tcp_vegas_init(newtp);
859                 TCP_INC_STATS_BH(TcpPassiveOpens);
860         }
861         return newsk;
862 }
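/*
 * Illustrative sketch, not part of the original file: the initial send
 * window derived in tcp_create_openreq_child() above.  When the peer
 * negotiated window scaling, the raw 16-bit window from the header is
 * left-shifted by its advertised scale; otherwise the scale is zero and
 * window_clamp is held below 65536.  The helper name is hypothetical.
 */
static inline u32 tcp_example_initial_snd_wnd(u16 window, u8 snd_wscale)
{
        /* window is the header's window field already converted with ntohs() */
        return (u32)window << snd_wscale;
}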
863
864 /* 
865  *      Process an incoming packet for SYN_RECV sockets represented
866  *      as an open_request.
867  */
868
869 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
870                            struct open_request *req,
871                            struct open_request **prev)
872 {
873         struct tcphdr *th = skb->h.th;
874         struct tcp_opt *tp = tcp_sk(sk);
875         u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
876         int paws_reject = 0;
877         struct tcp_opt ttp;
878         struct sock *child;
879
880         ttp.saw_tstamp = 0;
881         if (th->doff > (sizeof(struct tcphdr)>>2)) {
882                 tcp_parse_options(skb, &ttp, 0);
883
884                 if (ttp.saw_tstamp) {
885                         ttp.ts_recent = req->ts_recent;
886                         /* We do not store the true stamp, but it is not required;
887                          * it can be estimated (approximately)
888                          * from other data.
889                          */
890                         ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
891                         paws_reject = tcp_paws_check(&ttp, th->rst);
892                 }
893         }
894
895         /* Check for pure retransmitted SYN. */
896         if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
897             flg == TCP_FLAG_SYN &&
898             !paws_reject) {
899                 /*
900                  * RFC793 draws (incorrectly! It was fixed in RFC1122)
901                  * this case in figure 6 and figure 8, but the formal
902                  * protocol description says NOTHING.
903                  * To be more exact, it says that we should send an ACK,
904                  * because this segment (at least, if it has no data)
905                  * is out of window.
906                  *
907                  *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
908                  *  describe SYN-RECV state. All the description
909                  *  is wrong; we cannot believe it and should
910                  *  rely only on common sense and implementation
911                  *  experience.
912                  *
913                  * Enforce "SYN-ACK" according to figure 8, figure 6
914                  * of RFC793, fixed by RFC1122.
915                  */
916                 req->class->rtx_syn_ack(sk, req, NULL);
917                 return NULL;
918         }
919
920         /* Further reproduces section "SEGMENT ARRIVES"
921            for state SYN-RECEIVED of RFC793.
922            It is broken, however: it fails only
923            when SYNs are crossed.
924
925            You would think that SYN crossing is impossible here, since
926            we should have a SYN_SENT socket (from connect()) on our end,
927            but this is not true if the crossed SYNs were sent to both
928            ends by a malicious third party.  We must defend against this,
929            and to do that we first verify the ACK (as per RFC793, page
930            36) and reset if it is invalid.  Is this a true full defense?
931            To convince ourselves, let us consider a way in which the ACK
932            test can still pass in this 'malicious crossed SYNs' case.
933            Malicious sender sends identical SYNs (and thus identical sequence
934            numbers) to both A and B:
935
936                 A: gets SYN, seq=7
937                 B: gets SYN, seq=7
938
939            By our good fortune, both A and B select the same initial
940            send sequence number of seven :-)
941
942                 A: sends SYN|ACK, seq=7, ack_seq=8
943                 B: sends SYN|ACK, seq=7, ack_seq=8
944
945            So we are now A eating this SYN|ACK, ACK test passes.  So
946            does sequence test, SYN is truncated, and thus we consider
947            it a bare ACK.
948
949            If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
950            we create an established connection.  Both ends (listening sockets)
951            accept the new incoming connection and try to talk to each other. 8-)
952
953            Note: This case is both harmless and rare.  The probability is about the
954            same as us discovering intelligent life on another planet tomorrow.
955
956            But generally, we should (the RFC lies!) accept an ACK
957            on the SYNACK both here and in tcp_rcv_state_process().
958            tcp_rcv_state_process() does not, hence we do not either.
959
960            Note that the case is absolutely generic:
961            we cannot optimize anything here without
962            violating protocol. All the checks must be made
963            before attempt to create socket.
964          */
965
966         /* RFC793 page 36: "If the connection is in any non-synchronized state ...
967          *                  and the incoming segment acknowledges something not yet
968          *                  sent (the segment carries an unacceptable ACK) ...
969          *                  a reset is sent."
970          *
971          * Invalid ACK: a reset will be sent by the listening socket.
972          */
973         if ((flg & TCP_FLAG_ACK) &&
974             (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
975                 return sk;
976
977         /* Also, it would not be such a bad idea to check rcv_tsecr, which
978          * is essentially an ACK extension; too-early or too-late values
979          * should cause a reset in unsynchronized states.
980          */
981
982         /* RFC793: "first check sequence number". */
983
984         if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
985                                           req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
986                 /* Out of window: send ACK and drop. */
987                 if (!(flg & TCP_FLAG_RST))
988                         req->class->send_ack(skb, req);
989                 if (paws_reject)
990                         NET_INC_STATS_BH(PAWSEstabRejected);
991                 return NULL;
992         }
993
994         /* In sequence, PAWS is OK. */
995
996         if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
997                 req->ts_recent = ttp.rcv_tsval;
998
999         if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
1000                 /* Truncate SYN, it is out of window starting
1001                    at req->rcv_isn+1. */
1002                 flg &= ~TCP_FLAG_SYN;
1003         }
1004
1005         /* RFC793: "second check the RST bit" and
1006          *         "fourth, check the SYN bit"
1007          */
1008         if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
1009                 goto embryonic_reset;
1010
1011         /* ACK sequence verified above, just make sure ACK is
1012          * set.  If ACK not set, just silently drop the packet.
1013          */
1014         if (!(flg & TCP_FLAG_ACK))
1015                 return NULL;
1016
1017         /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1018         if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
1019                 req->acked = 1;
1020                 return NULL;
1021         }
1022
1023         /* OK, the ACK is valid, create the big socket and
1024          * feed this segment to it. It will repeat all
1025          * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
1026          * ESTABLISHED STATE. If it is dropped after the
1027          * socket is created, expect trouble.
1028          */
1029         child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1030         if (child == NULL)
1031                 goto listen_overflow;
1032
1033         sk_set_owner(child, sk->sk_owner);
1034         tcp_synq_unlink(tp, req, prev);
1035         tcp_synq_removed(sk, req);
1036
1037         tcp_acceptq_queue(sk, req, child);
1038         return child;
1039
1040 listen_overflow:
1041         if (!sysctl_tcp_abort_on_overflow) {
1042                 req->acked = 1;
1043                 return NULL;
1044         }
1045
1046 embryonic_reset:
1047         NET_INC_STATS_BH(EmbryonicRsts);
1048         if (!(flg & TCP_FLAG_RST))
1049                 req->class->send_reset(skb);
1050
1051         tcp_synq_drop(sk, req, prev);
1052         return NULL;
1053 }
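/*
 * Illustrative sketch, not part of the original file: the three possible
 * outcomes of tcp_check_req() above from a caller's point of view.  The
 * helper name and strings are hypothetical; the real callers act on the
 * returned socket pointer directly.
 */
static inline const char *tcp_check_req_outcome(struct sock *listener,
                                                struct sock *ret)
{
        if (ret == NULL)                /* dropped, retransmitted SYN-ACK, or deferred */
                return "segment consumed, no new socket";
        if (ret == listener)            /* invalid ACK: the listener will send the reset */
                return "hand back to the listening socket";
        return "child socket created";  /* handshake completed, feed the segment to the child */
}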
1054
1055 /*
1056  * Queue segment on the new socket if the new socket is active,
1057  * otherwise we just shortcircuit this and continue with
1058  * the new socket.
1059  */
1060
1061 int tcp_child_process(struct sock *parent, struct sock *child,
1062                       struct sk_buff *skb)
1063 {
1064         int ret = 0;
1065         int state = child->sk_state;
1066
1067         if (!sock_owned_by_user(child)) {
1068                 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
1069
1070                 /* Wakeup parent, send SIGIO */
1071                 if (state == TCP_SYN_RECV && child->sk_state != state)
1072                         parent->sk_data_ready(parent, 0);
1073         } else {
1074                 /* Alas, it is possible again, because we do the lookup
1075                  * in the main socket hash table and the lock on the listening
1076                  * socket no longer protects us.
1077                  */
1078                 sk_add_backlog(child, skb);
1079         }
1080
1081         bh_unlock_sock(child);
1082         sock_put(child);
1083         return ret;
1084 }
1085
1086 EXPORT_SYMBOL(tcp_check_req);
1087 EXPORT_SYMBOL(tcp_child_process);
1088 EXPORT_SYMBOL(tcp_create_openreq_child);
1089 EXPORT_SYMBOL(tcp_timewait_state_process);
1090 EXPORT_SYMBOL(tcp_tw_deschedule);
1091
1092 #ifdef CONFIG_SYSCTL
1093 EXPORT_SYMBOL(sysctl_tcp_tw_recycle);
1094 #endif