fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / include / linux / tcp.h
index 39e2d22..3cc70d1 100644 (file)
 
 #include <linux/types.h>
 #include <asm/byteorder.h>
+#include <linux/socket.h>
 
 struct tcphdr {
-       __u16   source;
-       __u16   dest;
-       __u32   seq;
-       __u32   ack_seq;
+       __be16  source;
+       __be16  dest;
+       __be32  seq;
+       __be32  ack_seq;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        __u16   res1:4,
                doff:4,
@@ -50,43 +51,9 @@ struct tcphdr {
 #else
 #error "Adjust your <asm/byteorder.h> defines"
 #endif 
-       __u16   window;
-       __u16   check;
-       __u16   urg_ptr;
-};
-
-
-enum {
-  TCP_ESTABLISHED = 1,
-  TCP_SYN_SENT,
-  TCP_SYN_RECV,
-  TCP_FIN_WAIT1,
-  TCP_FIN_WAIT2,
-  TCP_TIME_WAIT,
-  TCP_CLOSE,
-  TCP_CLOSE_WAIT,
-  TCP_LAST_ACK,
-  TCP_LISTEN,
-  TCP_CLOSING,  /* now a valid state */
-
-  TCP_MAX_STATES /* Leave at the end! */
-};
-
-#define TCP_STATE_MASK 0xF
-#define TCP_ACTION_FIN (1 << 7)
-
-enum {
-  TCPF_ESTABLISHED = (1 << 1),
-  TCPF_SYN_SENT  = (1 << 2),
-  TCPF_SYN_RECV  = (1 << 3),
-  TCPF_FIN_WAIT1 = (1 << 4),
-  TCPF_FIN_WAIT2 = (1 << 5),
-  TCPF_TIME_WAIT = (1 << 6),
-  TCPF_CLOSE     = (1 << 7),
-  TCPF_CLOSE_WAIT = (1 << 8),
-  TCPF_LAST_ACK  = (1 << 9),
-  TCPF_LISTEN    = (1 << 10),
-  TCPF_CLOSING   = (1 << 11) 
+       __be16  window;
+       __sum16 check;
+       __be16  urg_ptr;
 };
 
 /*
@@ -96,7 +63,7 @@ enum {
  */
 union tcp_word_hdr { 
        struct tcphdr hdr;
-       __u32             words[5];
+       __be32            words[5];
 }; 
 
 #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) 
@@ -127,6 +94,8 @@ enum {
 #define TCP_WINDOW_CLAMP       10      /* Bound advertised window */
 #define TCP_INFO               11      /* Information about this connection. */
 #define TCP_QUICKACK           12      /* Block/reenable quick acks */
+#define TCP_CONGESTION         13      /* Congestion control algorithm */
+#define TCP_MD5SIG             14      /* TCP MD5 Signature (RFC2385) */
 
 #define TCPI_OPT_TIMESTAMPS    1
 #define TCPI_OPT_SACK          2
@@ -186,55 +155,99 @@ struct tcp_info
 
        __u32   tcpi_rcv_rtt;
        __u32   tcpi_rcv_space;
+
+       __u32   tcpi_total_retrans;
+};
+
+/* for TCP_MD5SIG socket option */
+#define TCP_MD5SIG_MAXKEYLEN   80
+
+struct tcp_md5sig {
+       struct __kernel_sockaddr_storage tcpm_addr;     /* address associated */
+       __u16   __tcpm_pad1;                            /* zero */
+       __u16   tcpm_keylen;                            /* key length */
+       __u32   __tcpm_pad2;                            /* zero */
+       __u8    tcpm_key[TCP_MD5SIG_MAXKEYLEN];         /* key (binary) */
 };
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/skbuff.h>
-#include <linux/ip.h>
+#include <linux/dmaengine.h>
 #include <net/sock.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_timewait_sock.h>
 
 /* This defines a selective acknowledgement block. */
+struct tcp_sack_block_wire {
+       __be32  start_seq;
+       __be32  end_seq;
+};
+
 struct tcp_sack_block {
-       __u32   start_seq;
-       __u32   end_seq;
+       u32     start_seq;
+       u32     end_seq;
+};
+
+struct tcp_options_received {
+/*     PAWS/RTTM data  */
+       long    ts_recent_stamp;/* Time we stored ts_recent (for aging) */
+       u32     ts_recent;      /* Time stamp to echo next              */
+       u32     rcv_tsval;      /* Time stamp value                     */
+       u32     rcv_tsecr;      /* Time stamp echo reply                */
+       u16     saw_tstamp : 1, /* Saw TIMESTAMP on last packet         */
+               tstamp_ok : 1,  /* TIMESTAMP seen on SYN packet         */
+               dsack : 1,      /* D-SACK is scheduled                  */
+               wscale_ok : 1,  /* Wscale seen on SYN packet            */
+               sack_ok : 4,    /* SACK seen on SYN packet              */
+               snd_wscale : 4, /* Window scaling received from sender  */
+               rcv_wscale : 4; /* Window scaling to send to receiver   */
+/*     SACKs data      */
+       u8      eff_sacks;      /* Size of SACK array to send with next packet */
+       u8      num_sacks;      /* Number of SACK blocks                */
+       u16     user_mss;       /* mss requested by user in ioctl */
+       u16     mss_clamp;      /* Maximal mss, negotiated at connection setup */
 };
 
-struct tcp_opt {
-       int     tcp_header_len; /* Bytes of tcp header to send          */
+struct tcp_request_sock {
+       struct inet_request_sock        req;
+#ifdef CONFIG_TCP_MD5SIG
+       /* Only used by TCP MD5 Signature so far. */
+       struct tcp_request_sock_ops     *af_specific;
+#endif
+       u32                             rcv_isn;
+       u32                             snt_isn;
+};
+
+static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
+{
+       return (struct tcp_request_sock *)req;
+}
+
+struct tcp_sock {
+       /* inet_connection_sock has to be the first member of tcp_sock */
+       struct inet_connection_sock     inet_conn;
+       u16     tcp_header_len; /* Bytes of tcp header to send          */
+       u16     xmit_size_goal; /* Goal for segmenting output packets   */
 
 /*
  *     Header prediction flags
  *     0x5?10 << 16 + snd_wnd in net byte order
  */
-       __u32   pred_flags;
+       __be32  pred_flags;
 
 /*
  *     RFC793 variables by their proper names. This means you can
  *     read the code and the spec side by side (and laugh ...)
  *     See RFC793 and RFC1122. The RFC writes these in capitals.
  */
-       __u32   rcv_nxt;        /* What we want to receive next         */
-       __u32   snd_nxt;        /* Next sequence we send                */
-
-       __u32   snd_una;        /* First byte we want an ack for        */
-       __u32   snd_sml;        /* Last byte of the most recently transmitted small packet */
-       __u32   rcv_tstamp;     /* timestamp of last received ACK (for keepalives) */
-       __u32   lsndtime;       /* timestamp of last sent data packet (for restart window) */
-       struct tcp_bind_bucket *bind_hash;
-       /* Delayed ACK control data */
-       struct {
-               __u8    pending;        /* ACK is pending */
-               __u8    quick;          /* Scheduled number of quick acks       */
-               __u8    pingpong;       /* The session is interactive           */
-               __u8    blocked;        /* Delayed ACK was blocked by socket lock*/
-               __u32   ato;            /* Predicted tick of soft clock         */
-               unsigned long timeout;  /* Currently scheduled timeout          */
-               __u32   lrcvtime;       /* timestamp of last received data packet*/
-               __u16   last_seg_size;  /* Size of last incoming segment        */
-               __u16   rcv_mss;        /* MSS used for delayed ACK decisions   */ 
-       } ack;
+       u32     rcv_nxt;        /* What we want to receive next         */
+       u32     snd_nxt;        /* Next sequence we send                */
+
+       u32     snd_una;        /* First byte we want an ack for        */
+       u32     snd_sml;        /* Last byte of the most recently transmitted small packet */
+       u32     rcv_tstamp;     /* timestamp of last received ACK (for keepalives) */
+       u32     lsndtime;       /* timestamp of last sent data packet (for restart window) */
 
        /* Data for direct copy to user */
        struct {
@@ -243,135 +256,101 @@ struct tcp_opt {
                struct iovec            *iov;
                int                     memory;
                int                     len;
+#ifdef CONFIG_NET_DMA
+               /* members for async copy */
+               struct dma_chan         *dma_chan;
+               int                     wakeup;
+               struct dma_pinned_list  *pinned_list;
+               dma_cookie_t            dma_cookie;
+#endif
        } ucopy;
 
-       __u32   snd_wl1;        /* Sequence for window update           */
-       __u32   snd_wnd;        /* The window we expect to receive      */
-       __u32   max_window;     /* Maximal window ever seen from peer   */
-       __u32   pmtu_cookie;    /* Last pmtu seen by socket             */
-       __u32   mss_cache;      /* Cached effective mss, not including SACKS */
-       __u16   mss_cache_std;  /* Like mss_cache, but without TSO */
-       __u16   mss_clamp;      /* Maximal mss, negotiated at connection setup */
-       __u16   ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
-       __u16   ext2_header_len;/* Options depending on route */
-       __u8    ca_state;       /* State of fast-retransmit machine     */
-       __u8    retransmits;    /* Number of unrecovered RTO timeouts.  */
-
-       __u8    reordering;     /* Packet reordering metric.            */
-       __u8    frto_counter;   /* Number of new acks after RTO */
-       __u32   frto_highmark;  /* snd_nxt when RTO occurred */
-
-       __u8    unused_pad;
-       __u8    queue_shrunk;   /* Write queue has been shrunk recently.*/
-       __u8    defer_accept;   /* User waits for some data after accept() */
+       u32     snd_wl1;        /* Sequence for window update           */
+       u32     snd_wnd;        /* The window we expect to receive      */
+       u32     max_window;     /* Maximal window ever seen from peer   */
+       u32     mss_cache;      /* Cached effective mss, not including SACKS */
 
-/* RTT measurement */
-       __u8    backoff;        /* backoff                              */
-       __u32   srtt;           /* smothed round trip time << 3         */
-       __u32   mdev;           /* medium deviation                     */
-       __u32   mdev_max;       /* maximal mdev for the last rtt period */
-       __u32   rttvar;         /* smoothed mdev_max                    */
-       __u32   rtt_seq;        /* sequence number to update rttvar     */
-       __u32   rto;            /* retransmit timeout                   */
+       u32     window_clamp;   /* Maximal window to advertise          */
+       u32     rcv_ssthresh;   /* Current window clamp                 */
 
-       __u32   packets_out;    /* Packets which are "in flight"        */
-       __u32   left_out;       /* Packets which leaved network         */
-       __u32   retrans_out;    /* Retransmitted packets out            */
+       u32     frto_highmark;  /* snd_nxt when RTO occurred */
+       u8      reordering;     /* Packet reordering metric.            */
+       u8      frto_counter;   /* Number of new acks after RTO */
+       u8      nonagle;        /* Disable Nagle algorithm?             */
+       u8      keepalive_probes; /* num of allowed keep alive probes   */
 
+/* RTT measurement */
+       u32     srtt;           /* smoothed round trip time << 3        */
+       u32     mdev;           /* medium deviation                     */
+       u32     mdev_max;       /* maximal mdev for the last rtt period */
+       u32     rttvar;         /* smoothed mdev_max                    */
+       u32     rtt_seq;        /* sequence number to update rttvar     */
+
+       u32     packets_out;    /* Packets which are "in flight"        */
+       u32     left_out;       /* Packets which left the network */
+       u32     retrans_out;    /* Retransmitted packets out            */
+/*
+ *      Options received (usually on last packet, some only on SYN packets).
+ */
+       struct tcp_options_received rx_opt;
 
 /*
  *     Slow start and congestion control (see also Nagle, and Karn & Partridge)
  */
-       __u32   snd_ssthresh;   /* Slow start size threshold            */
-       __u32   snd_cwnd;       /* Sending congestion window            */
-       __u16   snd_cwnd_cnt;   /* Linear increase counter              */
-       __u16   snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
-       __u32   snd_cwnd_used;
-       __u32   snd_cwnd_stamp;
-
-       /* Two commonly used timers in both sender and receiver paths. */
-       unsigned long           timeout;
-       struct timer_list       retransmit_timer;       /* Resend (no ack)      */
-       struct timer_list       delack_timer;           /* Ack delay            */
+       u32     snd_ssthresh;   /* Slow start size threshold            */
+       u32     snd_cwnd;       /* Sending congestion window            */
+       u16     snd_cwnd_cnt;   /* Linear increase counter              */
+       u16     snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
+       u32     snd_cwnd_used;
+       u32     snd_cwnd_stamp;
 
        struct sk_buff_head     out_of_order_queue; /* Out of order segments go here */
 
-       struct tcp_func         *af_specific;   /* Operations which are AF_INET{4,6} specific   */
-       struct sk_buff          *send_head;     /* Front of stuff to transmit                   */
-
-       __u32   rcv_wnd;        /* Current receiver window              */
-       __u32   rcv_wup;        /* rcv_nxt on last window update sent   */
-       __u32   write_seq;      /* Tail(+1) of data held in tcp send buffer */
-       __u32   pushed_seq;     /* Last pushed seq, required to talk to windows */
-       __u32   copied_seq;     /* Head of yet unread data              */
-/*
- *      Options received (usually on last packet, some only on SYN packets).
- */
-       char    tstamp_ok,      /* TIMESTAMP seen on SYN packet         */
-               wscale_ok,      /* Wscale seen on SYN packet            */
-               sack_ok;        /* SACK seen on SYN packet              */
-       char    saw_tstamp;     /* Saw TIMESTAMP on last packet         */
-        __u8   snd_wscale;     /* Window scaling received from sender  */
-        __u8   rcv_wscale;     /* Window scaling to send to receiver   */
-       __u8    nonagle;        /* Disable Nagle algorithm?             */
-       __u8    keepalive_probes; /* num of allowed keep alive probes   */
-
-/*     PAWS/RTTM data  */
-        __u32  rcv_tsval;      /* Time stamp value                     */
-        __u32  rcv_tsecr;      /* Time stamp echo reply                */
-        __u32  ts_recent;      /* Time stamp to echo next              */
-        long   ts_recent_stamp;/* Time we stored ts_recent (for aging) */
+       u32     rcv_wnd;        /* Current receiver window              */
+       u32     rcv_wup;        /* rcv_nxt on last window update sent   */
+       u32     write_seq;      /* Tail(+1) of data held in tcp send buffer */
+       u32     pushed_seq;     /* Last pushed seq, required to talk to windows */
+       u32     copied_seq;     /* Head of yet unread data              */
 
 /*     SACKs data      */
-       __u16   user_mss;       /* mss requested by user in ioctl */
-       __u8    dsack;          /* D-SACK is scheduled                  */
-       __u8    eff_sacks;      /* Size of SACK array to send with next packet */
        struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
        struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
-       __u32   window_clamp;   /* Maximal window to advertise          */
-       __u32   rcv_ssthresh;   /* Current window clamp                 */
-       __u8    probes_out;     /* unanswered 0 window probes           */
-       __u8    num_sacks;      /* Number of SACK blocks                */
-       __u16   advmss;         /* Advertised MSS                       */
-
-       __u8    syn_retries;    /* num of allowed syn retries */
-       __u8    ecn_flags;      /* ECN status bits.                     */
-       __u16   prior_ssthresh; /* ssthresh saved at recovery start     */
-       __u32   lost_out;       /* Lost packets                         */
-       __u32   sacked_out;     /* SACK'd packets                       */
-       __u32   fackets_out;    /* FACK'd packets                       */
-       __u32   high_seq;       /* snd_nxt at onset of congestion       */
-
-       __u32   retrans_stamp;  /* Timestamp of the last retransmit,
+       struct tcp_sack_block recv_sack_cache[4];
+
+       /* from STCP, retrans queue hinting */
+       struct sk_buff* lost_skb_hint;
+
+       struct sk_buff *scoreboard_skb_hint;
+       struct sk_buff *retransmit_skb_hint;
+       struct sk_buff *forward_skb_hint;
+       struct sk_buff *fastpath_skb_hint;
+
+       int     fastpath_cnt_hint;
+       int     lost_cnt_hint;
+       int     retransmit_cnt_hint;
+       int     forward_cnt_hint;
+
+       u16     advmss;         /* Advertised MSS                       */
+       u16     prior_ssthresh; /* ssthresh saved at recovery start     */
+       u32     lost_out;       /* Lost packets                 */
+       u32     sacked_out;     /* SACK'd packets                       */
+       u32     fackets_out;    /* FACK'd packets                       */
+       u32     high_seq;       /* snd_nxt at onset of congestion       */
+
+       u32     retrans_stamp;  /* Timestamp of the last retransmit,
                                 * also used in SYN-SENT to remember stamp of
                                 * the first SYN. */
-       __u32   undo_marker;    /* tracking retrans started here. */
+       u32     undo_marker;    /* tracking retrans started here. */
        int     undo_retrans;   /* number of undoable retransmissions. */
-       __u32   urg_seq;        /* Seq of received urgent pointer */
-       __u16   urg_data;       /* Saved octet of OOB data and control flags */
-       __u8    pending;        /* Scheduled timer event        */
-       __u8    urg_mode;       /* In urgent mode               */
-       __u32   snd_up;         /* Urgent pointer               */
-
-       /* The syn_wait_lock is necessary only to avoid proc interface having
-        * to grab the main lock sock while browsing the listening hash
-        * (otherwise it's deadlock prone).
-        * This lock is acquired in read mode only from listening_get_next()
-        * and it's acquired in write mode _only_ from code that is actively
-        * changing the syn_wait_queue. All readers that are holding
-        * the master sock lock don't need to grab this lock in read mode
-        * too as the syn_wait_queue writes are always protected from
-        * the main sock lock.
-        */
-       rwlock_t                syn_wait_lock;
-       struct tcp_listen_opt   *listen_opt;
-
-       /* FIFO of established children */
-       struct open_request     *accept_queue;
-       struct open_request     *accept_queue_tail;
-
-       int                     write_pending;  /* A write to socket waits to start. */
+       u32     urg_seq;        /* Seq of received urgent pointer */
+       u16     urg_data;       /* Saved octet of OOB data and control flags */
+       u8      urg_mode;       /* In urgent mode               */
+       u8      ecn_flags;      /* ECN status bits.                     */
+       u32     snd_up;         /* Urgent pointer               */
+
+       u32     total_retrans;  /* Total retransmits for entire connection */
+       u32     bytes_acked;    /* Appropriate Byte Counting - RFC3465 */
 
        unsigned int            keepalive_time;   /* time before keep alive takes place */
        unsigned int            keepalive_intvl;  /* time interval between keep alive probes */
@@ -379,66 +358,58 @@ struct tcp_opt {
 
        unsigned long last_synq_overflow; 
 
+       u32     tso_deferred;
+
 /* Receiver side RTT estimation */
        struct {
-               __u32   rtt;
-               __u32   seq;
-               __u32   time;
+               u32     rtt;
+               u32     seq;
+               u32     time;
        } rcv_rtt_est;
 
 /* Receiver queue space */
        struct {
                int     space;
-               __u32   seq;
-               __u32   time;
+               u32     seq;
+               u32     time;
        } rcvq_space;
 
-/* TCP Westwood structure */
-        struct {
-                __u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
-                __u32    bw_est;           /* bandwidth estimate */
-                __u32    rtt_win_sx;       /* here starts a new evaluation... */
-                __u32    bk;
-                __u32    snd_una;          /* used for evaluating the number of acked bytes */
-                __u32    cumul_ack;
-                __u32    accounted;
-                __u32    rtt;
-                __u32    rtt_min;          /* minimum observed RTT */
-        } westwood;
-
-/* Vegas variables */
-       struct {
-               __u32   beg_snd_nxt;    /* right edge during last RTT */
-               __u32   beg_snd_una;    /* left edge  during last RTT */
-               __u32   beg_snd_cwnd;   /* saves the size of the cwnd */
-               __u8    do_vegas;       /* do vegas for this connection */
-               __u8    doing_vegas_now;/* if true, do vegas for this RTT */
-               __u16   cntRTT;         /* # of RTTs measured within last RTT */
-               __u32   minRTT;         /* min of RTTs measured within last RTT (in usec) */
-               __u32   baseRTT;        /* the min of all Vegas RTT measurements seen (in usec) */
-       } vegas;
-
-       /* BI TCP Parameters */
+/* TCP-specific MTU probe information. */
        struct {
-               __u32   cnt;            /* increase cwnd by 1 after this number of ACKs */
-               __u32   last_max_cwnd;  /* last maximium snd_cwnd */
-               __u32   last_cwnd;      /* the last snd_cwnd */
-       } bictcp;
+               u32               probe_seq_start;
+               u32               probe_seq_end;
+       } mtu_probe;
+
+#ifdef CONFIG_TCP_MD5SIG
+/* TCP AF-Specific parts; only used by MD5 Signature support so far */
+       struct tcp_sock_af_ops  *af_specific;
+
+/* TCP MD5 Signature Option information */
+       struct tcp_md5sig_info  *md5sig_info;
+#endif
 };
 
-/* WARNING: don't change the layout of the members in tcp_sock! */
-struct tcp_sock {
-       struct sock       sk;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       struct ipv6_pinfo *pinet6;
+static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+       return (struct tcp_sock *)sk;
+}
+
+struct tcp_timewait_sock {
+       struct inet_timewait_sock tw_sk;
+       u32                       tw_rcv_nxt;
+       u32                       tw_snd_nxt;
+       u32                       tw_rcv_wnd;
+       u32                       tw_ts_recent;
+       long                      tw_ts_recent_stamp;
+#ifdef CONFIG_TCP_MD5SIG
+       u16                       tw_md5_keylen;
+       u8                        tw_md5_key[TCP_MD5SIG_MAXKEYLEN];
 #endif
-       struct inet_opt   inet;
-       struct tcp_opt    tcp;
 };
 
-static inline struct tcp_opt * tcp_sk(const struct sock *__sk)
+static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
 {
-       return &((struct tcp_sock *)__sk)->tcp;
+       return (struct tcp_timewait_sock *)sk;
 }
 
 #endif