/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id$
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen);
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        return inet_csk_get_port(&tcp_hashinfo, sk, snum,
                                 inet_csk_bind_conflict);
}

static void tcp_v4_hash(struct sock *sk)
{
        inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
        inet_unhash(&tcp_hashinfo, sk);
}

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(skb->nh.iph->daddr,
                                          skb->nh.iph->saddr,
                                          skb->h.th->dest,
                                          skb->h.th->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.

           Actually, the idea is close to VJ's: only the timestamp cache is
           held not per host but per port pair, and the TW bucket is used
           as the state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer
           table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
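
/* A hedged illustration (not built, and not part of the original file) of
 * the write_seq bump above: the reused connection's first sequence number
 * is pushed past anything the old TIME-WAIT connection could still have in
 * flight -- 65535 for the largest unscaled window, plus 2 for the SYN and
 * FIN sequence slots.
 */
#if 0
static u32 twsk_reuse_isn_sketch(u32 tw_snd_nxt)
{
        u32 isn = tw_snd_nxt + 65535 + 2;

        return isn ? isn : 1;   /* 0 would mean "pick a random ISN" */
}
#endif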

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table, when entering
                 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
                 * when trying a new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the hash
         * tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}
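
/* For orientation, a hedged user-space sketch (illustration only, not part
 * of this file): the function above is what a plain connect() on a TCP
 * socket eventually reaches via sys_connect() and inet_stream_connect().
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int connect_example(void)
{
        struct sockaddr_in sin = { .sin_family = AF_INET };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        sin.sin_port = htons(80);                       /* peer port */
        sin.sin_addr.s_addr = inet_addr("192.0.2.1");   /* peer address */
        /* Kernel side: sys_connect -> inet_stream_connect -> tcp_v4_connect */
        return connect(fd, (struct sockaddr *)&sin, sizeof(sin));
}
#endif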

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst entry if pmtu discovery is forbidden
         * on this route. We just assume that no packet-too-big packets
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
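
/* A rough sketch (illustration only, under the usual no-options assumption)
 * of the MSS that tcp_sync_mss() ends up deriving from the new path MTU:
 * the MTU minus the fixed IPv4 and TCP headers.
 */
#if 0
static int approx_mss_from_pmtu_sketch(u32 pmtu)
{
        /* e.g. 1500 - 20 - 20 = 1460 on plain Ethernet */
        return pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
}
#endif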

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
                         th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               Well, it can, for example if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows considering as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors at their masters' orders, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
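
/* Hedged illustration (not built) of the "icmp type << 8 | icmp code"
 * packing that the comment above tcp_v4_err() refers to for err > 0.
 */
#if 0
static inline int icmp_err_pack_sketch(u8 type, u8 code)
{
        return (type << 8) | code;      /* e.g. ICMP_DEST_UNREACH / code */
}
#endif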

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = skb->h.th;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(th, len,
                                          inet->saddr, inet->daddr, 0);
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
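
/* A hedged sketch (illustration only) of the 16-bit one's-complement fold
 * underlying these checksum helpers: fold the 32-bit running sum back into
 * 16 bits, then complement. This mirrors the idea behind csum_fold(), not
 * its arch-specific implementation.
 */
#if 0
static u16 csum_fold_sketch(u32 sum)
{
        sum = (sum & 0xffff) + (sum >> 16);     /* fold carries once */
        sum = (sum & 0xffff) + (sum >> 16);     /* and any new carry */
        return (u16)~sum;
}
#endif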

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = skb->nh.iph;
        th = skb->h.th;

        th->check = 0;
        th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
        skb->csum_offset = offsetof(struct tcphdr, check);
        skb->ip_summed = CHECKSUM_PARTIAL;
        return 0;
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset?
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
                                        key,
                                        skb->nh.iph->daddr,
                                        skb->nh.iph->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /* XXX */
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
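
/* Worked example (hedged, not built): the ack_seq chosen above when
 * resetting a non-ACK segment acknowledges everything that segment
 * occupied -- SYN and FIN each consume one sequence number, plus the
 * payload, which is the skb length minus the TCP header (doff counts
 * 32-bit words).
 */
#if 0
static u32 rst_ack_seq_sketch(const struct tcphdr *th, unsigned int skb_len)
{
        return ntohl(th->seq) + th->syn + th->fin +
               skb_len - (th->doff << 2);
}
#endif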

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        struct tcp_md5sig_key tw_key;
#endif

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        /*
         * The SKB holds an incoming packet, but may not have a valid ->sk
         * pointer. This is especially the case when we're dealing with a
         * TIME_WAIT ack, because the sk structure is long gone, and only
         * the tcp_timewait_sock remains. So the md5 key is stashed in that
         * structure, and we use it in preference.  I believe that (twsk ||
         * skb->sk) holds true, but we program defensively.
         */
        if (!twsk && skb->sk) {
                key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr);
        } else if (twsk && twsk->tw_md5_keylen) {
                tw_key.key = twsk->tw_md5_key;
                tw_key.keylen = twsk->tw_md5_keylen;
                key = &tw_key;
        } else
                key = NULL;

        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
                                        key,
                                        skb->nh.iph->daddr,
                                        skb->nh.iph->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                              struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff * skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = skb->h.th;

                th->check = tcp_v4_check(th, skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

out:
        dst_release(dst);
        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(skb->h.th->dest));
        }
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return (struct tcp_md5sig_key *)
                                                &tp->md5sig_info->keys4[i];
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add Key to the list */
        struct tcp4_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                }
                if (tcp_alloc_md5sig_pool() == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }
                md5sig = tp->md5sig_info;

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free old key list, and reference new one */
                        if (md5sig->keys4)
                                kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
                md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
        }
        return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                                tp->md5sig_info->alloced4 = 0;
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Shift the remaining entries down
                                 * over the deleted one.
                                 */
                                memcpy(&tp->md5sig_info->keys4[i],
                                       &tp->md5sig_info->keys4[i+1],
                                       (tp->md5sig_info->entries4 - i) *
                                        sizeof(struct tcp4_md5sig_key));
                        }
                        tcp_free_md5sig_pool();
                        return 0;
                }
        }
        return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the set of keys,
         * the crypto element, and then decrement our
         * hold on the last resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4  = 0;
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

                if (!p)
                        return -ENOMEM;

                tp->md5sig_info = p;

        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}
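
/* A hedged user-space illustration (not built, not part of this file) of
 * the setsockopt(TCP_MD5SIG) call that the parser above services; the
 * struct tcp_md5sig layout is the one copy_from_user() reads.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int set_md5_key_example(int fd, const struct sockaddr_in *peer,
                               const char *secret)
{
        struct tcp_md5sig md5;

        memset(&md5, 0, sizeof(md5));
        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
        md5.tcpm_keylen = strlen(secret);
        memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
        /* a keylen of 0 would delete the key for this peer instead */
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif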

static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen)
{
        struct scatterlist sg[4];
        __u16 data_len;
        int block = 0;
        __sum16 old_checksum;
        struct tcp_md5sig_pool *hp;
        struct tcp4_pseudohdr *bp;
        struct hash_desc *desc;
        int err;
        unsigned int nbytes = 0;

        /*
         * Okay, so RFC2385 is turned on for this connection,
         * so we need to generate the MD5 hash for the packet now.
         */

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;

        bp = &hp->md5_blk.ip4;
        desc = &hp->md5_desc;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = protocol;
        bp->len = htons(tcplen);
        sg_set_buf(&sg[block++], bp, sizeof(*bp));
        nbytes += sizeof(*bp);

        /* 2. the TCP header, excluding options, and assuming a
         * checksum of zero.
         */
        old_checksum = th->check;
        th->check = 0;
        sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
        nbytes += sizeof(struct tcphdr);

        /* 3. the TCP segment data (if any) */
        data_len = tcplen - (th->doff << 2);
        if (data_len > 0) {
                unsigned char *data = (unsigned char *)th + (th->doff << 2);
                sg_set_buf(&sg[block++], data, data_len);
                nbytes += data_len;
        }

        /* 4. an independently-specified key or password, known to both
         * TCPs and presumably connection-specific
         */
        sg_set_buf(&sg[block++], key->key, key->keylen);
        nbytes += key->keylen;

        /* Now store the Hash into the packet */
        err = crypto_hash_init(desc);
        if (err)
                goto clear_hash;
        err = crypto_hash_update(desc, sg, nbytes);
        if (err)
                goto clear_hash;
        err = crypto_hash_final(desc, md5_hash);
        if (err)
                goto clear_hash;

        /* Reset header, and free up the crypto */
        tcp_put_md5sig_pool();
        th->check = old_checksum;

out:
        return 0;
clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        goto out;
}

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                         struct sock *sk,
                         struct dst_entry *dst,
                         struct request_sock *req,
                         struct tcphdr *th, int protocol,
                         int tcplen)
{
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->saddr;
                daddr = inet_sk(sk)->daddr;
        } else {
                struct rtable *rt = (struct rtable *)dst;
                BUG_ON(!rt);
                saddr = rt->rt_src;
                daddr = rt->rt_dst;
        }
        return tcp_v4_do_calc_md5_hash(md5_hash, key,
                                       saddr, daddr,
                                       th, protocol, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives,
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        struct iphdr *iph = skb->nh.iph;
        struct tcphdr *th = skb->h.th;
        int length = (th->doff << 2) - sizeof(struct tcphdr);
        int genhash;
        unsigned char *ptr;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);

        /*
         * If the TCP option length is less than the TCP_MD5SIG
         * option length, then we can shortcut
         */
        if (length < TCPOLEN_MD5SIG) {
                if (hash_expected)
                        return 1;
                else
                        return 0;
        }

        /* Okay, we can't shortcut - we have to grub through the options */
        ptr = (unsigned char *)(th + 1);
        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        goto done_opts;
                case TCPOPT_NOP:
                        length--;
                        continue;
                default:
                        opsize = *ptr++;
                        if (opsize < 2)
                                goto done_opts;
                        if (opsize > length)
                                goto done_opts;

                        if (opcode == TCPOPT_MD5SIG) {
                                hash_location = ptr;
                                goto done_opts;
                        }
                }
                ptr += opsize-2;
                length -= opsize;
        }
done_opts:
        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        if (!hash_expected && hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_do_calc_md5_hash(newhash,
                                          hash_expected,
                                          iph->saddr, iph->daddr,
                                          th, sk->sk_protocol,
                                          skb->len);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
                }
                return 1;
        }
        return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_send_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor= tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = skb->nh.iph->saddr;
        __be32 daddr = skb->nh.iph->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations: they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* The accept backlog is full. If we have already queued enough
         * warm entries in the syn queue, drop the request. That is better
         * than clogging the syn queue with openreqs with exponentially
         * increasing timeouts.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on web servers,
                 * which contain information interesting only for windows'
                 * users) do not send their timestamp in the SYN. It is an
                 * easy case: we simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, skb->h.th);

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies, the last quarter of the
                         * backlog is reserved for destinations proven to
                         * be alive. It means that we keep communicating
                         * with destinations that were already known at
                         * the moment the synflood started.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from %u.%u.%u.%u/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(skb->h.th->source));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                reqsk_free(req);
        } else {
                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
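
/* A hedged restatement (not built) of the backlog-pressure test above:
 * with syncookies off, drop requests from unproven peers once fewer than
 * a quarter of the SYN backlog slots remain free.
 */
#if 0
static int syn_backlog_pressured_sketch(int max_backlog, int queue_len)
{
        return max_backlog - queue_len < (max_backlog >> 2);
}
#endif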


/*
 * The three way handshake has completed - we got a valid ACK - now
 * create the new socket.
 */
1408 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1409                                   struct request_sock *req,
1410                                   struct dst_entry *dst)
1411 {
1412         struct inet_request_sock *ireq;
1413         struct inet_sock *newinet;
1414         struct tcp_sock *newtp;
1415         struct sock *newsk;
1416 #ifdef CONFIG_TCP_MD5SIG
1417         struct tcp_md5sig_key *key;
1418 #endif
1419
1420         if (sk_acceptq_is_full(sk))
1421                 goto exit_overflow;
1422
1423         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1424                 goto exit;
1425
1426         newsk = tcp_create_openreq_child(sk, req, skb);
1427         if (!newsk)
1428                 goto exit;
1429
1430         newsk->sk_gso_type = SKB_GSO_TCPV4;
1431         sk_setup_caps(newsk, dst);
1432
1433         newtp                 = tcp_sk(newsk);
1434         newinet               = inet_sk(newsk);
1435         ireq                  = inet_rsk(req);
1436         newinet->daddr        = ireq->rmt_addr;
1437         newinet->rcv_saddr    = ireq->loc_addr;
1438         newinet->saddr        = ireq->loc_addr;
1439         newinet->opt          = ireq->opt;
1440         ireq->opt             = NULL;
1441         newinet->mc_index     = inet_iif(skb);
1442         newinet->mc_ttl       = skb->nh.iph->ttl;
1443         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444         if (newinet->opt)
1445                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1446         newinet->id = newtp->write_seq ^ jiffies;
1447
1448         tcp_mtup_init(newsk);
1449         tcp_sync_mss(newsk, dst_mtu(dst));
1450         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451         tcp_initialize_rcv_mss(newsk);
1452
1453 #ifdef CONFIG_TCP_MD5SIG
1454         /* Copy over the MD5 key from the original socket */
1455         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1456                 /*
1457                  * We're using one, so create a matching key
1458                  * on the newsk structure. If we fail to get
1459                  * memory, then we end up not copying the key
1460                  * across. Shucks.
1461                  */
1462                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1463                 if (newkey != NULL)
1464                         tcp_v4_md5_do_add(newsk, newinet->daddr,
1465                                           newkey, key->keylen);
1466         }
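        /* Note that the kmemdup() above must use GFP_ATOMIC: this path
         * runs in softirq context (see tcp_v4_rcv() below), where the
         * allocation is not allowed to sleep.
         */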
1467 #endif
1468
1469         __inet_hash(&tcp_hashinfo, newsk, 0);
1470         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1471
1472         return newsk;
1473
1474 exit_overflow:
1475         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1476 exit:
1477         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1478         dst_release(dst);
1479         return NULL;
1480 }
1481
1482 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1483 {
1484         struct tcphdr *th = skb->h.th;
1485         struct iphdr *iph = skb->nh.iph;
1486         struct sock *nsk;
1487         struct request_sock **prev;
1488         /* Find possible connection requests. */
1489         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1490                                                        iph->saddr, iph->daddr);
1491         if (req)
1492                 return tcp_check_req(sk, skb, req, prev);
1493
1494         nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1495                                       th->source, skb->nh.iph->daddr,
1496                                       th->dest, inet_iif(skb));
1497
1498         if (nsk) {
1499                 if (nsk->sk_state != TCP_TIME_WAIT) {
1500                         bh_lock_sock(nsk);
1501                         return nsk;
1502                 }
1503                 inet_twsk_put(inet_twsk(nsk));
1504                 return NULL;
1505         }
1506
1507 #ifdef CONFIG_SYN_COOKIES
1508         if (!th->rst && !th->syn && th->ack)
1509                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1510 #endif
1511         return sk;
1512 }
1513
1514 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1515 {
1516         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1517                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1518                                   skb->nh.iph->daddr, skb->csum)) {
1519                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1520                         return 0;
1521                 }
1522         }
1523
1524         skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1525                                        skb->len, IPPROTO_TCP, 0);
1526
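        /* Short segments are cheap to verify immediately; longer ones
         * keep the pseudo-header sum seeded above and are verified
         * later via tcp_checksum_complete(), ideally while the data is
         * being copied anyway.  The 76-byte cutoff is the stack's usual
         * small-packet copybreak heuristic.
         */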
1527         if (skb->len <= 76) {
1528                 return __skb_checksum_complete(skb);
1529         }
1530         return 0;
1531 }
1532
1533
1534 /* The socket must have its spinlock held when we get
1535  * here.
1536  *
1537  * We have a potential double-lock case here, so even when
1538  * doing backlog processing we use the BH locking scheme.
1539  * This is because we cannot sleep with the original spinlock
1540  * held.
1541  */
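/* As a rough sketch, the softirq caller (tcp_v4_rcv() below) does:
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);	(possibly via the prequeue)
 *	else
 *		sk_add_backlog(sk, skb);	(replayed once the owner
 *						 releases the lock)
 *	bh_unlock_sock(sk);
 */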
1542 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1543 {
1544         struct sock *rsk;
1545 #ifdef CONFIG_TCP_MD5SIG
1546         /*
1547          * We really want to reject the packet as early as possible
1548          * if:
1549          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1550          *  o There is an MD5 option and we're not expecting one
1551          */
1552         if (tcp_v4_inbound_md5_hash(sk, skb))
1553                 goto discard;
1554 #endif
1555
1556         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1557                 TCP_CHECK_TIMER(sk);
1558                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) {
1559                         rsk = sk;
1560                         goto reset;
1561                 }
1562                 TCP_CHECK_TIMER(sk);
1563                 return 0;
1564         }
1565
1566         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1567                 goto csum_err;
1568
1569         if (sk->sk_state == TCP_LISTEN) {
1570                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1571                 if (!nsk)
1572                         goto discard;
1573
1574                 if (nsk != sk) {
1575                         if (tcp_child_process(sk, nsk, skb)) {
1576                                 rsk = nsk;
1577                                 goto reset;
1578                         }
1579                         return 0;
1580                 }
1581         }
1582
1583         TCP_CHECK_TIMER(sk);
1584         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) {
1585                 rsk = sk;
1586                 goto reset;
1587         }
1588         TCP_CHECK_TIMER(sk);
1589         return 0;
1590
1591 reset:
1592         tcp_v4_send_reset(rsk, skb);
1593 discard:
1594         kfree_skb(skb);
1595         /* Be careful here. If this function gets more complicated and
1596          * gcc suffers from register pressure on the x86, sk (in %ebx)
1597          * might be destroyed here. This current version compiles correctly,
1598          * but you have been warned.
1599          */
1600         return 0;
1601
1602 csum_err:
1603         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1604         goto discard;
1605 }
1606
1607 /*
1608  *      From tcp_input.c
1609  */
1610
1611 int tcp_v4_rcv(struct sk_buff *skb)
1612 {
1613         struct tcphdr *th;
1614         struct sock *sk;
1615         int ret;
1616
1617         if (skb->pkt_type != PACKET_HOST)
1618                 goto discard_it;
1619
1620         /* Count it even if it's bad */
1621         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1622
1623         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1624                 goto discard_it;
1625
1626         th = skb->h.th;
1627
1628         if (th->doff < sizeof(struct tcphdr) / 4)
1629                 goto bad_packet;
1630         if (!pskb_may_pull(skb, th->doff * 4))
1631                 goto discard_it;
1632
1633         /* An explanation is required here, I think.
1634          * Packet length and doff are validated by header prediction,
1635          * provided the case of th->doff == 0 is eliminated.
1636          * So, we defer the checks. */
1637         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1638              tcp_v4_checksum_init(skb)))
1639                 goto bad_packet;
1640
1641         th = skb->h.th;
1642         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1643         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1644                                     skb->len - th->doff * 4);
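        /* Note that end_seq counts the SYN and FIN flags as well: each
         * consumes one unit of sequence space on top of the payload.
         */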
1645         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1646         TCP_SKB_CB(skb)->when    = 0;
1647         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1648         TCP_SKB_CB(skb)->sacked  = 0;
1649
1650         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1651                            skb->nh.iph->daddr, th->dest,
1652                            inet_iif(skb));
1653
1654         if (!sk)
1655                 goto no_tcp_socket;
1656
1657 process:
1658
1659         if (sk->sk_state == TCP_TIME_WAIT)
1660                 goto do_time_wait;
1661
1662         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1663                 goto discard_and_relse;
1664         nf_reset(skb);
1665
1666         if (sk_filter(sk, skb))
1667                 goto discard_and_relse;
1668
1669         skb->dev = NULL;
1670
1671         bh_lock_sock_nested(sk);
1672         ret = 0;
1673         if (!sock_owned_by_user(sk)) {
1674 #ifdef CONFIG_NET_DMA
1675                 struct tcp_sock *tp = tcp_sk(sk);
1676                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1677                         tp->ucopy.dma_chan = get_softnet_dma();
1678                 if (tp->ucopy.dma_chan)
1679                         ret = tcp_v4_do_rcv(sk, skb);
1680                 else
1681 #endif
1682                 {
1683                         if (!tcp_prequeue(sk, skb))
1684                                 ret = tcp_v4_do_rcv(sk, skb);
1685                 }
1686         } else
1687                 sk_add_backlog(sk, skb);
1688         bh_unlock_sock(sk);
1689
1690         sock_put(sk);
1691
1692         return ret;
1693
1694 no_tcp_socket:
1695         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1696                 goto discard_it;
1697
1698         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1699 bad_packet:
1700                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1701         } else {
1702                 tcp_v4_send_reset(NULL, skb);
1703         }
1704
1705 discard_it:
1706         /* Discard frame. */
1707         kfree_skb(skb);
1708         return 0;
1709
1710 discard_and_relse:
1711         sock_put(sk);
1712         goto discard_it;
1713
1714 do_time_wait:
1715         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1716                 inet_twsk_put(inet_twsk(sk));
1717                 goto discard_it;
1718         }
1719
1720         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1721                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1722                 inet_twsk_put(inet_twsk(sk));
1723                 goto discard_it;
1724         }
1725         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1726         case TCP_TW_SYN: {
1727                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1728                                                         skb->nh.iph->daddr,
1729                                                         th->dest,
1730                                                         inet_iif(skb));
1731                 if (sk2) {
1732                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1733                         inet_twsk_put(inet_twsk(sk));
1734                         sk = sk2;
1735                         goto process;
1736                 }
1737                 /* Fall through to ACK */
1738         }
1739         case TCP_TW_ACK:
1740                 tcp_v4_timewait_ack(sk, skb);
1741                 break;
1742         case TCP_TW_RST:
1743                 goto no_tcp_socket;
1744         case TCP_TW_SUCCESS:;
1745         }
1746         goto discard_it;
1747 }
1748
1749 /* VJ's idea: save the last timestamp seen from this destination and
1750  * hold it for at least the normal timewait interval, so it can be used
1751  * for duplicate segment detection in subsequent connections before
1752  * they enter the synchronized state.
1753  */
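/* In both helpers below the cached stamp is only overwritten when doing
 * so cannot break PAWS for anyone: either our timestamp is at least as
 * new as the cached one, or the cached entry has idled past TCP_PAWS_MSL
 * and is no newer than what we remember.
 */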
1754
1755 int tcp_v4_remember_stamp(struct sock *sk)
1756 {
1757         struct inet_sock *inet = inet_sk(sk);
1758         struct tcp_sock *tp = tcp_sk(sk);
1759         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1760         struct inet_peer *peer = NULL;
1761         int release_it = 0;
1762
1763         if (!rt || rt->rt_dst != inet->daddr) {
1764                 peer = inet_getpeer(inet->daddr, 1);
1765                 release_it = 1;
1766         } else {
1767                 if (!rt->peer)
1768                         rt_bind_peer(rt, 1);
1769                 peer = rt->peer;
1770         }
1771
1772         if (peer) {
1773                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1774                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1775                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1776                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1777                         peer->tcp_ts = tp->rx_opt.ts_recent;
1778                 }
1779                 if (release_it)
1780                         inet_putpeer(peer);
1781                 return 1;
1782         }
1783
1784         return 0;
1785 }
1786
1787 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1788 {
1789         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1790
1791         if (peer) {
1792                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1793
1794                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1795                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1796                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1797                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1798                         peer->tcp_ts       = tcptw->tw_ts_recent;
1799                 }
1800                 inet_putpeer(peer);
1801                 return 1;
1802         }
1803
1804         return 0;
1805 }
1806
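/* The AF_INET operations vector: tcp_v4_init_sock() below installs it
 * in icsk->icsk_af_ops, letting the af-independent TCP code reach the
 * IPv4 transmit, header and sockopt routines through one indirection.
 */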
1807 struct inet_connection_sock_af_ops ipv4_specific = {
1808         .queue_xmit        = ip_queue_xmit,
1809         .send_check        = tcp_v4_send_check,
1810         .rebuild_header    = inet_sk_rebuild_header,
1811         .conn_request      = tcp_v4_conn_request,
1812         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1813         .remember_stamp    = tcp_v4_remember_stamp,
1814         .net_header_len    = sizeof(struct iphdr),
1815         .setsockopt        = ip_setsockopt,
1816         .getsockopt        = ip_getsockopt,
1817         .addr2sockaddr     = inet_csk_addr2sockaddr,
1818         .sockaddr_len      = sizeof(struct sockaddr_in),
1819 #ifdef CONFIG_COMPAT
1820         .compat_setsockopt = compat_ip_setsockopt,
1821         .compat_getsockopt = compat_ip_getsockopt,
1822 #endif
1823 };
1824
1825 #ifdef CONFIG_TCP_MD5SIG
1826 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1827         .md5_lookup             = tcp_v4_md5_lookup,
1828         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1829         .md5_add                = tcp_v4_md5_add_func,
1830         .md5_parse              = tcp_v4_parse_md5_keys,
1831 };
1832 #endif
1833
1834 /* NOTE: Many fields are already zeroed for us by the
1835  *       sk_alloc() call, so they need not be set here.
1836  */
1837 static int tcp_v4_init_sock(struct sock *sk)
1838 {
1839         struct inet_connection_sock *icsk = inet_csk(sk);
1840         struct tcp_sock *tp = tcp_sk(sk);
1841
1842         skb_queue_head_init(&tp->out_of_order_queue);
1843         tcp_init_xmit_timers(sk);
1844         tcp_prequeue_init(tp);
1845
1846         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1847         tp->mdev = TCP_TIMEOUT_INIT;
1848
1849         /* So many TCP implementations out there (incorrectly) count the
1850          * initial SYN frame in their delayed-ACK and congestion control
1851          * algorithms that we must have the following bandaid to talk
1852          * efficiently to them.  -DaveM
1853          */
1854         tp->snd_cwnd = 2;
1855
1856         /* See draft-stevens-tcpca-spec-01 for discussion of the
1857          * initialization of these values.
1858          */
1859         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1860         tp->snd_cwnd_clamp = ~0;
1861         tp->mss_cache = 536;
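        /* 536 is the RFC 1122 default MSS: the 576-byte minimum IPv4
         * reassembly size minus 40 bytes of IP and TCP headers.
         */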
1862
1863         tp->reordering = sysctl_tcp_reordering;
1864         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1865
1866         sk->sk_state = TCP_CLOSE;
1867
1868         sk->sk_write_space = sk_stream_write_space;
1869         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1870
1871         icsk->icsk_af_ops = &ipv4_specific;
1872         icsk->icsk_sync_mss = tcp_sync_mss;
1873 #ifdef CONFIG_TCP_MD5SIG
1874         tp->af_specific = &tcp_sock_ipv4_specific;
1875 #endif
1876
1877         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1878         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1879
1880         atomic_inc(&tcp_sockets_allocated);
1881
1882         return 0;
1883 }
1884
1885 int tcp_v4_destroy_sock(struct sock *sk)
1886 {
1887         struct tcp_sock *tp = tcp_sk(sk);
1888
1889         tcp_clear_xmit_timers(sk);
1890
1891         tcp_cleanup_congestion_control(sk);
1892
1893         /* Clean up the write buffer. */
1894         sk_stream_writequeue_purge(sk);
1895
1896         /* Cleans up our, hopefully empty, out_of_order_queue. */
1897         __skb_queue_purge(&tp->out_of_order_queue);
1898
1899 #ifdef CONFIG_TCP_MD5SIG
1900         /* Clean up the MD5 key list, if any */
1901         if (tp->md5sig_info) {
1902                 tcp_v4_clear_md5_list(sk);
1903                 kfree(tp->md5sig_info);
1904                 tp->md5sig_info = NULL;
1905         }
1906 #endif
1907
1908 #ifdef CONFIG_NET_DMA
1909         /* Cleans up our sk_async_wait_queue */
1910         __skb_queue_purge(&sk->sk_async_wait_queue);
1911 #endif
1912
1913         /* Clean up the prequeue; it really should be empty by now. */
1914         __skb_queue_purge(&tp->ucopy.prequeue);
1915
1916         /* Clean up a referenced TCP bind bucket. */
1917         if (inet_csk(sk)->icsk_bind_hash)
1918                 inet_put_port(&tcp_hashinfo, sk);
1919
1920         /*
1921          * If sendmsg cached page exists, toss it.
1922          */
1923         if (sk->sk_sndmsg_page) {
1924                 __free_page(sk->sk_sndmsg_page);
1925                 sk->sk_sndmsg_page = NULL;
1926         }
1927
1928         atomic_dec(&tcp_sockets_allocated);
1929
1930         return 0;
1931 }
1932
1933 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1934
1935 #ifdef CONFIG_PROC_FS
1936 /* Proc filesystem TCP sock list dumping. */
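/* The iterator below visits the tables in a fixed order - listening
 * sockets first (descending into each listener's SYN_RECV request
 * queue), then the established hash, then its TIME_WAIT half - and
 * records its position in tcp_iter_state so that each seq_file call
 * can resume where the previous one stopped.
 */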
1937
1938 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1939 {
1940         return hlist_empty(head) ? NULL :
1941                 hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1942 }
1943
1944 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1945 {
1946         return tw->tw_node.next ?
1947                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1948 }
1949
1950 static void *listening_get_next(struct seq_file *seq, void *cur)
1951 {
1952         struct inet_connection_sock *icsk;
1953         struct hlist_node *node;
1954         struct sock *sk = cur;
1955         struct tcp_iter_state* st = seq->private;
1956
1957         if (!sk) {
1958                 st->bucket = 0;
1959                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1960                 goto get_sk;
1961         }
1962
1963         ++st->num;
1964
1965         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1966                 struct request_sock *req = cur;
1967
1968                 icsk = inet_csk(st->syn_wait_sk);
1969                 req = req->dl_next;
1970                 while (1) {
1971                         while (req) {
1972                                 vxdprintk(VXD_CBIT(net, 6),
1973                                         "sk,req: %p [#%d] (from %d)", req->sk,
1974                                         (req->sk)?req->sk->sk_nid:0, nx_current_nid());
1975                                 if ((!req->sk ||
1976                                      nx_check(req->sk->sk_nid,
1977                                               VS_WATCH_P|VS_IDENT)) &&
1978                                     req->rsk_ops->family == st->family) {
1979                                         cur = req;
1980                                         goto out;
1981                                 }
1982                                 req = req->dl_next;
1983                         }
1984                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1985                                 break;
1986 get_req:
1987                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1988                 }
1989                 sk        = sk_next(st->syn_wait_sk);
1990                 st->state = TCP_SEQ_STATE_LISTENING;
1991                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1992         } else {
1993                 icsk = inet_csk(sk);
1994                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1995                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1996                         goto start_req;
1997                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1998                 sk = sk_next(sk);
1999         }
2000 get_sk:
2001         sk_for_each_from(sk, node) {
2002                 vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
2003                         sk, sk->sk_nid, nx_current_nid());
2004                 if (!nx_check(sk->sk_nid, VS_WATCH_P|VS_IDENT))
2005                         continue;
2006                 if (sk->sk_family == st->family) {
2007                         cur = sk;
2008                         goto out;
2009                 }
2010                 icsk = inet_csk(sk);
2011                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2013 start_req:
2014                         st->uid         = sock_i_uid(sk);
2015                         st->syn_wait_sk = sk;
2016                         st->state       = TCP_SEQ_STATE_OPENREQ;
2017                         st->sbucket     = 0;
2018                         goto get_req;
2019                 }
2020                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021         }
2022         if (++st->bucket < INET_LHTABLE_SIZE) {
2023                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2024                 goto get_sk;
2025         }
2026         cur = NULL;
2027 out:
2028         return cur;
2029 }
2030
2031 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2032 {
2033         void *rc = listening_get_next(seq, NULL);
2034
2035         while (rc && *pos) {
2036                 rc = listening_get_next(seq, rc);
2037                 --*pos;
2038         }
2039         return rc;
2040 }
2041
2042 static void *established_get_first(struct seq_file *seq)
2043 {
2044         struct tcp_iter_state* st = seq->private;
2045         void *rc = NULL;
2046
2047         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2048                 struct sock *sk;
2049                 struct hlist_node *node;
2050                 struct inet_timewait_sock *tw;
2051
2052                 /* We can reschedule _before_ having picked the target: */
2053                 cond_resched_softirq();
2054
2055                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2056                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2057                         vxdprintk(VXD_CBIT(net, 6),
2058                                 "sk,egf: %p [#%d] (from %d)",
2059                                 sk, sk->sk_nid, nx_current_nid());
2060                         if (!nx_check(sk->sk_nid, VS_WATCH_P|VS_IDENT))
2061                                 continue;
2062                         if (sk->sk_family != st->family)
2063                                 continue;
2064                         rc = sk;
2065                         goto out;
2066                 }
2067                 st->state = TCP_SEQ_STATE_TIME_WAIT;
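                /* TIME_WAIT buckets live in the second half of the
                 * ehash table, at bucket + ehash_size, and share the
                 * lock of the matching established bucket.
                 */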
2068                 inet_twsk_for_each(tw, node,
2069                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
2070                         vxdprintk(VXD_CBIT(net, 6),
2071                                 "tw: %p [#%d] (from %d)",
2072                                 tw, tw->tw_nid, nx_current_nid());
2073                         if (!nx_check(tw->tw_nid, VS_WATCH_P|VS_IDENT))
2074                                 continue;
2075                         if (tw->tw_family != st->family)
2076                                 continue;
2077                         rc = tw;
2078                         goto out;
2079                 }
2080                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2081                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2082         }
2083 out:
2084         return rc;
2085 }
2086
2087 static void *established_get_next(struct seq_file *seq, void *cur)
2088 {
2089         struct sock *sk = cur;
2090         struct inet_timewait_sock *tw;
2091         struct hlist_node *node;
2092         struct tcp_iter_state* st = seq->private;
2093
2094         ++st->num;
2095
2096         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2097                 tw = cur;
2098                 tw = tw_next(tw);
2099 get_tw:
2100                 while (tw && (tw->tw_family != st->family ||
2101                         !nx_check(tw->tw_nid, VS_WATCH_P|VS_IDENT))) {
2102                         tw = tw_next(tw);
2103                 }
2104                 if (tw) {
2105                         cur = tw;
2106                         goto out;
2107                 }
2108                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2109                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2110
2111                 /* We can reschedule between buckets: */
2112                 cond_resched_softirq();
2113
2114                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2115                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2116                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2117                 } else {
2118                         cur = NULL;
2119                         goto out;
2120                 }
2121         } else
2122                 sk = sk_next(sk);
2123
2124         sk_for_each_from(sk, node) {
2125                 vxdprintk(VXD_CBIT(net, 6),
2126                         "sk,egn: %p [#%d] (from %d)",
2127                         sk, sk->sk_nid, nx_current_nid());
2128                 if (!nx_check(sk->sk_nid, VS_WATCH_P|VS_IDENT))
2129                         continue;
2130                 if (sk->sk_family == st->family)
2131                         goto found;
2132         }
2133
2134         st->state = TCP_SEQ_STATE_TIME_WAIT;
2135         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2136         goto get_tw;
2137 found:
2138         cur = sk;
2139 out:
2140         return cur;
2141 }
2142
2143 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2144 {
2145         void *rc = established_get_first(seq);
2146
2147         while (rc && pos) {
2148                 rc = established_get_next(seq, rc);
2149                 --pos;
2150         }
2151         return rc;
2152 }
2153
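/* Locking for the walk: inet_listen_lock() covers the whole listening
 * pass (with the per-listener syn_wait_lock taken while inside a
 * request queue); the established pass instead runs with BHs disabled
 * and the per-bucket read lock held.  tcp_seq_stop() drops whichever
 * of these is still held when the iteration ends.
 */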
2154 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2155 {
2156         void *rc;
2157         struct tcp_iter_state* st = seq->private;
2158
2159         inet_listen_lock(&tcp_hashinfo);
2160         st->state = TCP_SEQ_STATE_LISTENING;
2161         rc        = listening_get_idx(seq, &pos);
2162
2163         if (!rc) {
2164                 inet_listen_unlock(&tcp_hashinfo);
2165                 local_bh_disable();
2166                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2167                 rc        = established_get_idx(seq, pos);
2168         }
2169
2170         return rc;
2171 }
2172
2173 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2174 {
2175         struct tcp_iter_state* st = seq->private;
2176         st->state = TCP_SEQ_STATE_LISTENING;
2177         st->num = 0;
2178         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2179 }
2180
2181 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2182 {
2183         void *rc = NULL;
2184         struct tcp_iter_state* st;
2185
2186         if (v == SEQ_START_TOKEN) {
2187                 rc = tcp_get_idx(seq, 0);
2188                 goto out;
2189         }
2190         st = seq->private;
2191
2192         switch (st->state) {
2193         case TCP_SEQ_STATE_OPENREQ:
2194         case TCP_SEQ_STATE_LISTENING:
2195                 rc = listening_get_next(seq, v);
2196                 if (!rc) {
2197                         inet_listen_unlock(&tcp_hashinfo);
2198                         local_bh_disable();
2199                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2200                         rc        = established_get_first(seq);
2201                 }
2202                 break;
2203         case TCP_SEQ_STATE_ESTABLISHED:
2204         case TCP_SEQ_STATE_TIME_WAIT:
2205                 rc = established_get_next(seq, v);
2206                 break;
2207         }
2208 out:
2209         ++*pos;
2210         return rc;
2211 }
2212
2213 static void tcp_seq_stop(struct seq_file *seq, void *v)
2214 {
2215         struct tcp_iter_state* st = seq->private;
2216
2217         switch (st->state) {
2218         case TCP_SEQ_STATE_OPENREQ:
2219                 if (v) {
2220                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2221                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2222                 }
2223         case TCP_SEQ_STATE_LISTENING:
2224                 if (v != SEQ_START_TOKEN)
2225                         inet_listen_unlock(&tcp_hashinfo);
2226                 break;
2227         case TCP_SEQ_STATE_TIME_WAIT:
2228         case TCP_SEQ_STATE_ESTABLISHED:
2229                 if (v)
2230                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2231                 local_bh_enable();
2232                 break;
2233         }
2234 }
2235
2236 static int tcp_seq_open(struct inode *inode, struct file *file)
2237 {
2238         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2239         struct seq_file *seq;
2240         struct tcp_iter_state *s;
2241         int rc;
2242
2243         if (unlikely(afinfo == NULL))
2244                 return -EINVAL;
2245
2246         s = kzalloc(sizeof(*s), GFP_KERNEL);
2247         if (!s)
2248                 return -ENOMEM;
2249         s->family               = afinfo->family;
2250         s->seq_ops.start        = tcp_seq_start;
2251         s->seq_ops.next         = tcp_seq_next;
2252         s->seq_ops.show         = afinfo->seq_show;
2253         s->seq_ops.stop         = tcp_seq_stop;
2254
2255         rc = seq_open(file, &s->seq_ops);
2256         if (rc)
2257                 goto out_kfree;
2258         seq          = file->private_data;
2259         seq->private = s;
2260 out:
2261         return rc;
2262 out_kfree:
2263         kfree(s);
2264         goto out;
2265 }
2266
2267 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2268 {
2269         int rc = 0;
2270         struct proc_dir_entry *p;
2271
2272         if (!afinfo)
2273                 return -EINVAL;
2274         afinfo->seq_fops->owner         = afinfo->owner;
2275         afinfo->seq_fops->open          = tcp_seq_open;
2276         afinfo->seq_fops->read          = seq_read;
2277         afinfo->seq_fops->llseek        = seq_lseek;
2278         afinfo->seq_fops->release       = seq_release_private;
2279
2280         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2281         if (p)
2282                 p->data = afinfo;
2283         else
2284                 rc = -ENOMEM;
2285         return rc;
2286 }
2287
2288 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2289 {
2290         if (!afinfo)
2291                 return;
2292         proc_net_remove(afinfo->name);
2293         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2294 }
2295
2296 static void get_openreq4(struct sock *sk, struct request_sock *req,
2297                          char *tmpbuf, int i, int uid)
2298 {
2299         const struct inet_request_sock *ireq = inet_rsk(req);
2300         int ttd = req->expires - jiffies;
2301
2302         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2303                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2304                 i,
2305                 ireq->loc_addr,
2306                 ntohs(inet_sk(sk)->sport),
2307                 ireq->rmt_addr,
2308                 ntohs(ireq->rmt_port),
2309                 TCP_SYN_RECV,
2310                 0, 0, /* could print option size, but that is af dependent. */
2311                 1,    /* timers active (only the expire timer) */
2312                 jiffies_to_clock_t(ttd),
2313                 req->retrans,
2314                 uid,
2315                 0,  /* non standard timer */
2316                 0, /* open_requests have no inode */
2317                 atomic_read(&sk->sk_refcnt),
2318                 req);
2319 }
2320
2321 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2322 {
2323         int timer_active;
2324         unsigned long timer_expires;
2325         struct tcp_sock *tp = tcp_sk(sp);
2326         const struct inet_connection_sock *icsk = inet_csk(sp);
2327         struct inet_sock *inet = inet_sk(sp);
2328         __be32 dest = inet->daddr;
2329         __be32 src = inet->rcv_saddr;
2330         __u16 destp = ntohs(inet->dport);
2331         __u16 srcp = ntohs(inet->sport);
2332
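        /* timer_active is reported in the "tr" column of /proc/net/tcp:
         * 0 none, 1 retransmit, 2 keepalive (sk_timer), 3 TIME_WAIT
         * (see get_timewait4_sock()), 4 zero window probe.
         */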
2333         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2334                 timer_active    = 1;
2335                 timer_expires   = icsk->icsk_timeout;
2336         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2337                 timer_active    = 4;
2338                 timer_expires   = icsk->icsk_timeout;
2339         } else if (timer_pending(&sp->sk_timer)) {
2340                 timer_active    = 2;
2341                 timer_expires   = sp->sk_timer.expires;
2342         } else {
2343                 timer_active    = 0;
2344                 timer_expires   = jiffies;
2345         }
2346
2347         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2348                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2349                 i, src, srcp, dest, destp, sp->sk_state,
2350                 tp->write_seq - tp->snd_una,
2351                 sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog :
2352                                              (tp->rcv_nxt - tp->copied_seq),
2353                 timer_active,
2354                 jiffies_to_clock_t(timer_expires - jiffies),
2355                 icsk->icsk_retransmits,
2356                 sock_i_uid(sp),
2357                 icsk->icsk_probes_out,
2358                 sock_i_ino(sp),
2359                 atomic_read(&sp->sk_refcnt), sp,
2360                 icsk->icsk_rto,
2361                 icsk->icsk_ack.ato,
2362                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2363                 tp->snd_cwnd,
2364                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2365 }
2366
2367 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2368                                char *tmpbuf, int i)
2369 {
2370         __be32 dest, src;
2371         __u16 destp, srcp;
2372         int ttd = tw->tw_ttd - jiffies;
2373
2374         if (ttd < 0)
2375                 ttd = 0;
2376
2377         dest  = tw->tw_daddr;
2378         src   = tw->tw_rcv_saddr;
2379         destp = ntohs(tw->tw_dport);
2380         srcp  = ntohs(tw->tw_sport);
2381
2382         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2383                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2384                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2385                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2386                 atomic_read(&tw->tw_refcnt), tw);
2387 }
2388
2389 #define TMPSZ 150
2390
2391 static int tcp4_seq_show(struct seq_file *seq, void *v)
2392 {
2393         struct tcp_iter_state* st;
2394         char tmpbuf[TMPSZ + 1];
2395
2396         if (v == SEQ_START_TOKEN) {
2397                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2398                            "  sl  local_address rem_address   st tx_queue "
2399                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2400                            "inode");
2401                 goto out;
2402         }
2403         st = seq->private;
2404
2405         switch (st->state) {
2406         case TCP_SEQ_STATE_LISTENING:
2407         case TCP_SEQ_STATE_ESTABLISHED:
2408                 get_tcp4_sock(v, tmpbuf, st->num);
2409                 break;
2410         case TCP_SEQ_STATE_OPENREQ:
2411                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2412                 break;
2413         case TCP_SEQ_STATE_TIME_WAIT:
2414                 get_timewait4_sock(v, tmpbuf, st->num);
2415                 break;
2416         }
2417         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2418 out:
2419         return 0;
2420 }
2421
2422 static struct file_operations tcp4_seq_fops;
2423 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2424         .owner          = THIS_MODULE,
2425         .name           = "tcp",
2426         .family         = AF_INET,
2427         .seq_show       = tcp4_seq_show,
2428         .seq_fops       = &tcp4_seq_fops,
2429 };
2430
2431 int __init tcp4_proc_init(void)
2432 {
2433         return tcp_proc_register(&tcp4_seq_afinfo);
2434 }
2435
2436 void tcp4_proc_exit(void)
2437 {
2438         tcp_proc_unregister(&tcp4_seq_afinfo);
2439 }
2440 #endif /* CONFIG_PROC_FS */
2441
2442 struct proto tcp_prot = {
2443         .name                   = "TCP",
2444         .owner                  = THIS_MODULE,
2445         .close                  = tcp_close,
2446         .connect                = tcp_v4_connect,
2447         .disconnect             = tcp_disconnect,
2448         .accept                 = inet_csk_accept,
2449         .ioctl                  = tcp_ioctl,
2450         .init                   = tcp_v4_init_sock,
2451         .destroy                = tcp_v4_destroy_sock,
2452         .shutdown               = tcp_shutdown,
2453         .setsockopt             = tcp_setsockopt,
2454         .getsockopt             = tcp_getsockopt,
2455         .sendmsg                = tcp_sendmsg,
2456         .recvmsg                = tcp_recvmsg,
2457         .backlog_rcv            = tcp_v4_do_rcv,
2458         .hash                   = tcp_v4_hash,
2459         .unhash                 = tcp_unhash,
2460         .get_port               = tcp_v4_get_port,
2461         .enter_memory_pressure  = tcp_enter_memory_pressure,
2462         .sockets_allocated      = &tcp_sockets_allocated,
2463         .orphan_count           = &tcp_orphan_count,
2464         .memory_allocated       = &tcp_memory_allocated,
2465         .memory_pressure        = &tcp_memory_pressure,
2466         .sysctl_mem             = sysctl_tcp_mem,
2467         .sysctl_wmem            = sysctl_tcp_wmem,
2468         .sysctl_rmem            = sysctl_tcp_rmem,
2469         .max_header             = MAX_TCP_HEADER,
2470         .obj_size               = sizeof(struct tcp_sock),
2471         .twsk_prot              = &tcp_timewait_sock_ops,
2472         .rsk_prot               = &tcp_request_sock_ops,
2473 #ifdef CONFIG_COMPAT
2474         .compat_setsockopt      = compat_tcp_setsockopt,
2475         .compat_getsockopt      = compat_tcp_getsockopt,
2476 #endif
2477 };
2478
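/* tcp_socket is a kernel-internal control socket; it chiefly gives the
 * reset and timewait-ACK transmit paths a socket to send from when a
 * segment matches no real socket.
 */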
2479 void __init tcp_v4_init(struct net_proto_family *ops)
2480 {
2481         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2482                                      IPPROTO_TCP) < 0)
2483                 panic("Failed to create the TCP control socket.\n");
2484 }
2485
2486 EXPORT_SYMBOL(ipv4_specific);
2487 EXPORT_SYMBOL(tcp_hashinfo);
2488 EXPORT_SYMBOL(tcp_prot);
2489 EXPORT_SYMBOL(tcp_unhash);
2490 EXPORT_SYMBOL(tcp_v4_conn_request);
2491 EXPORT_SYMBOL(tcp_v4_connect);
2492 EXPORT_SYMBOL(tcp_v4_do_rcv);
2493 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2494 EXPORT_SYMBOL(tcp_v4_send_check);
2495 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2496
2497 #ifdef CONFIG_PROC_FS
2498 EXPORT_SYMBOL(tcp_proc_register);
2499 EXPORT_SYMBOL(tcp_proc_unregister);
2500 #endif
2501 EXPORT_SYMBOL(sysctl_local_port_range);
2502 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2503