datapath: Add tunnel header caching.
author     Jesse Gross <jesse@nicira.com>
           Fri, 27 Aug 2010 20:55:02 +0000 (13:55 -0700)
committer  Jesse Gross <jesse@nicira.com>
           Wed, 22 Sep 2010 20:43:02 +0000 (13:43 -0700)
On the transmit path we generate essentially the same tunnel header
for every packet to a given destination.  However, for each packet we
must assemble the header in pieces, look up the destination in the
routing table, and look up the flow in OVS.  This patch avoids that
extra work by caching all of the header and output path information
and rebuilding it only when something actually changes.

This optimization reduces CPU load on transmit by approximately 13%.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Reviewed-by: Ben Pfaff <blp@nicira.com>
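
Conceptually, the fast path added by this patch keeps a per-port cache of the
fully built Ethernet/IP/tunnel header together with the route (and, when the
output device is an OVS internal device, the datapath flow), validates that
cache cheaply on every transmitted packet, and rebuilds it only when the port
configuration, routing table, or ARP entry changes.  A minimal sketch of that
validity check, assuming a simplified structure and ignoring locking and RCU
(the real implementation is check_cache_valid()/build_cache() in the diff
below):

    #include <linux/types.h>
    #include <linux/jiffies.h>

    /* Hypothetical, simplified stand-in for struct tnl_cache (see tunnel.h below). */
    struct hdr_cache_sketch {
            unsigned mutable_seq;      /* config generation the cache was built from */
            unsigned long expiration;  /* jiffies timeout on kernels without change detection */
            int len;                   /* length of the prebuilt Ethernet + IP + tunnel header */
            /* prebuilt header bytes follow the structure */
    };

    static bool sketch_cache_valid(const struct hdr_cache_sketch *c, unsigned cfg_seq)
    {
            /* Usable only if built from the current config and not yet expired. */
            return c && c->mutable_seq == cfg_seq &&
                   time_before(jiffies, c->expiration);
    }

When the cache is valid, tnl_send() simply copies the prebuilt header in front
of the payload (the likely(cache) branch); otherwise it falls back to
ip_route_output_key() and create_tunnel_header() and repopulates the cache for
subsequent packets.
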
datapath/tunnel.c
datapath/tunnel.h
datapath/vport-capwap.c
datapath/vport-gre.c
include/openvswitch/tunnel.h

diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index 6fa369b..77f976f 100644
@@ -15,6 +15,7 @@
 #include <linux/jhash.h>
 #include <linux/kernel.h>
 #include <linux/version.h>
+#include <linux/workqueue.h>
 
 #include <net/dsfield.h>
 #include <net/dst.h>
 #include "tunnel.h"
 #include "vport.h"
 #include "vport-generic.h"
+#include "vport-internal_dev.h"
+
+#ifdef NEED_CACHE_TIMEOUT
+/*
+ * On kernels where we can't quickly detect changes in the rest of the system
+ * we use an expiration time to invalidate the cache.  A shorter expiration
+ * reduces the length of time that we may potentially blackhole packets, while
+ * a longer time improves performance by reducing how often the cache needs to
+ * be rebuilt.  A variety of factors may cause the cache to be
+ * invalidated before the expiration time but this is the maximum.  The time
+ * is expressed in jiffies.
+ */
+#define MAX_CACHE_EXP HZ
+#endif
+
+/*
+ * Interval to check for and remove caches that are no longer valid.  Caches
+ * are checked for validity before they are used for packet encapsulation and
+ * old caches are removed at that time.  However, if no packets are sent through
+ * the tunnel then the cache will never be destroyed.  Since it holds
+ * references to a number of system objects, the cache will continue to use
+ * system resources by not allowing those objects to be destroyed.  The cache
+ * cleaner is periodically run to free invalid caches.  It does not
+ * significantly affect system performance.  A lower interval will release
+ * resources faster but will itself consume resources by requiring more frequent
+ * checks.  A longer interval may result in messages being printed to the kernel
+ * message buffer about unreleased resources.  The interval is expressed in
+ * jiffies.
+ */
+#define CACHE_CLEANER_INTERVAL (5 * HZ)
+
+#define CACHE_DATA_ALIGN 16
 
 /* Protected by RCU. */
 static struct tbl *port_table;
 
+static void cache_cleaner(struct work_struct *work);
+DECLARE_DELAYED_WORK(cache_cleaner_wq, cache_cleaner);
+
 /*
  * These are just used as an optimization: they don't require any kind of
  * synchronization because we could have just as easily read the value before
@@ -63,22 +99,54 @@ static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node
        return container_of(node, struct tnl_vport, tbl_node);
 }
 
-/* RCU callback. */
-static void free_config(struct rcu_head *rcu)
+static inline void schedule_cache_cleaner(void)
+{
+       schedule_delayed_work(&cache_cleaner_wq, CACHE_CLEANER_INTERVAL);
+}
+
+static void free_cache(struct tnl_cache *cache)
+{
+       if (!cache)
+               return;
+
+       flow_put(cache->flow);
+       ip_rt_put(cache->rt);
+       kfree(cache);
+}
+
+static void free_config_rcu(struct rcu_head *rcu)
 {
        struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
        kfree(c);
 }
 
+static void free_cache_rcu(struct rcu_head *rcu)
+{
+       struct tnl_cache *c = container_of(rcu, struct tnl_cache, rcu);
+       free_cache(c);
+}
+
 static void assign_config_rcu(struct vport *vport,
                              struct tnl_mutable_config *new_config)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *old_config;
 
-       old_config = rcu_dereference(tnl_vport->mutable);
+       old_config = tnl_vport->mutable;
        rcu_assign_pointer(tnl_vport->mutable, new_config);
-       call_rcu(&old_config->rcu, free_config);
+       call_rcu(&old_config->rcu, free_config_rcu);
+}
+
+static void assign_cache_rcu(struct vport *vport, struct tnl_cache *new_cache)
+{
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct tnl_cache *old_cache;
+
+       old_cache = tnl_vport->cache;
+       rcu_assign_pointer(tnl_vport->cache, new_cache);
+
+       if (old_cache)
+               call_rcu(&old_cache->rcu, free_cache_rcu);
 }
 
 static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
@@ -130,10 +198,32 @@ static u32 port_hash(struct port_lookup_key *lookup)
        return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
 }
 
+static u32 mutable_hash(const struct tnl_mutable_config *mutable)
+{
+       struct port_lookup_key lookup;
+
+       lookup.vals[LOOKUP_SADDR] = mutable->port_config.saddr;
+       lookup.vals[LOOKUP_DADDR] = mutable->port_config.daddr;
+       lookup.vals[LOOKUP_KEY] = mutable->port_config.in_key;
+       lookup.vals[LOOKUP_TUNNEL_TYPE] = mutable->tunnel_type;
+
+       return port_hash(&lookup);
+}
+
+static void check_table_empty(void)
+{
+       if (tbl_count(port_table) == 0) {
+               struct tbl *old_table = port_table;
+
+               cancel_delayed_work_sync(&cache_cleaner_wq);
+               rcu_assign_pointer(port_table, NULL);
+               tbl_deferred_destroy(old_table, NULL);
+       }
+}
+
 static int add_port(struct vport *vport)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-       struct port_lookup_key lookup;
        int err;
 
        if (!port_table) {
@@ -144,6 +234,7 @@ static int add_port(struct vport *vport)
                        return -ENOMEM;
 
                rcu_assign_pointer(port_table, new_table);
+               schedule_cache_cleaner();
 
        } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
                struct tbl *old_table = port_table;
@@ -157,16 +248,44 @@ static int add_port(struct vport *vport)
                tbl_deferred_destroy(old_table, NULL);
        }
 
-       lookup.vals[LOOKUP_SADDR] = tnl_vport->mutable->port_config.saddr;
-       lookup.vals[LOOKUP_DADDR] = tnl_vport->mutable->port_config.daddr;
-       lookup.vals[LOOKUP_KEY] = tnl_vport->mutable->port_config.in_key;
-       lookup.vals[LOOKUP_TUNNEL_TYPE] = tnl_vport->mutable->tunnel_type;
+       err = tbl_insert(port_table, &tnl_vport->tbl_node, mutable_hash(tnl_vport->mutable));
+       if (err) {
+               check_table_empty();
+               return err;
+       }
+
+       (*find_port_pool(tnl_vport->mutable))++;
+
+       return 0;
+}
+
+static int move_port(struct vport *vport, struct tnl_mutable_config *new_mutable)
+{
+       int err;
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       u32 hash;
+
+       hash = mutable_hash(new_mutable);
+       if (hash == tnl_vport->tbl_node.hash)
+               goto table_updated;
 
-       err = tbl_insert(port_table, &tnl_vport->tbl_node, port_hash(&lookup));
+       /*
+        * Ideally we should make this move atomic to avoid having gaps in
+        * finding tunnels or the possibility of failure.  However, if we do
+        * find a tunnel it will always be consistent.
+        */
+       err = tbl_remove(port_table, &tnl_vport->tbl_node);
        if (err)
                return err;
 
-       (*find_port_pool(tnl_vport->mutable))++;
+       err = tbl_insert(port_table, &tnl_vport->tbl_node, hash);
+       if (err) {
+               check_table_empty();
+               return err;
+       }
+
+table_updated:
+       assign_config_rcu(vport, new_mutable);
 
        return 0;
 }
@@ -180,6 +299,7 @@ static int del_port(struct vport *vport)
        if (err)
                return err;
 
+       check_table_empty();
        (*find_port_pool(tnl_vport->mutable))--;
 
        return 0;
@@ -193,7 +313,7 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key,
        struct tbl *table = rcu_dereference(port_table);
        struct tbl_node *tbl_node;
 
-       if (!table)
+       if (unlikely(!table))
                return NULL;
 
        lookup.vals[LOOKUP_SADDR] = saddr;
@@ -246,6 +366,60 @@ found:
        return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node));
 }
 
+static inline void ecn_decapsulate(struct sk_buff *skb)
+{
+       u8 tos = ip_hdr(skb)->tos;
+
+       if (INET_ECN_is_ce(tos)) {
+               __be16 protocol = skb->protocol;
+               unsigned int nw_header = skb_network_offset(skb);
+
+               if (skb->protocol == htons(ETH_P_8021Q)) {
+                       if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+                               return;
+
+                       protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+                       nw_header += VLAN_HLEN;
+               }
+
+               if (protocol == htons(ETH_P_IP)) {
+                       if (unlikely(!pskb_may_pull(skb, nw_header
+                           + sizeof(struct iphdr))))
+                               return;
+
+                       IP_ECN_set_ce((struct iphdr *)(skb->data + nw_header));
+               }
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+               else if (protocol == htons(ETH_P_IPV6)) {
+                       if (unlikely(!pskb_may_pull(skb, nw_header
+                           + sizeof(struct ipv6hdr))))
+                               return;
+
+                       IP6_ECN_set_ce((struct ipv6hdr *)(skb->data + nw_header));
+               }
+#endif
+       }
+}
+
+/* Called with rcu_read_lock. */
+void tnl_rcv(struct vport *vport, struct sk_buff *skb)
+{
+       skb->pkt_type = PACKET_HOST;
+       skb->protocol = eth_type_trans(skb, skb->dev);
+
+       skb_dst_drop(skb);
+       nf_reset(skb);
+       secpath_reset(skb);
+       skb_reset_network_header(skb);
+
+       ecn_decapsulate(skb);
+
+       skb_push(skb, ETH_HLEN);
+       compute_ip_summed(skb, false);
+
+       vport_receive(vport, skb);
+}
+
 static bool check_ipv4_address(__be32 addr)
 {
        if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
@@ -514,179 +688,412 @@ bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutab
        return true;
 }
 
-static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
+static bool check_mtu(struct sk_buff *skb,
+                     struct vport *vport,
+                     const struct tnl_mutable_config *mutable,
+                     const struct rtable *rt, __be16 *frag_offp)
 {
-       if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
-               struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
-               if (unlikely(!nskb)) {
-                       kfree_skb(skb);
-                       return ERR_PTR(-ENOMEM);
+       int mtu;
+       __be16 frag_off;
+
+       frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0;
+       if (frag_off)
+               mtu = dst_mtu(&rt_dst(rt))
+                       - ETH_HLEN
+                       - mutable->tunnel_hlen
+                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
+       else
+               mtu = mutable->mtu;
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               struct iphdr *old_iph = ip_hdr(skb);
+
+               frag_off |= old_iph->frag_off & htons(IP_DF);
+               mtu = max(mtu, IP_MIN_MTU);
+
+               if ((old_iph->frag_off & htons(IP_DF)) &&
+                   mtu < ntohs(old_iph->tot_len)) {
+                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
+                               goto drop;
                }
+       }
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       else if (skb->protocol == htons(ETH_P_IPV6)) {
+               unsigned int packet_length = skb->len - ETH_HLEN
+                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
 
-               set_skb_csum_bits(skb, nskb);
+               mtu = max(mtu, IPV6_MIN_MTU);
 
-               if (skb->sk)
-                       skb_set_owner_w(nskb, skb->sk);
+               /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
+               if (packet_length > IPV6_MIN_MTU)
+                       frag_off = htons(IP_DF);
 
-               dev_kfree_skb(skb);
-               return nskb;
+               if (mtu < packet_length) {
+                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
+                               goto drop;
+               }
        }
+#endif
 
-       return skb;
+       *frag_offp = frag_off;
+       return true;
+
+drop:
+       *frag_offp = 0;
+       return false;
 }
 
-static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb)
+static void create_tunnel_header(const struct vport *vport,
+                                const struct tnl_mutable_config *mutable,
+                                const struct rtable *rt, void *header)
 {
-       u8 inner;
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct iphdr *iph = header;
+
+       iph->version    = 4;
+       iph->ihl        = sizeof(struct iphdr) >> 2;
+       iph->frag_off   = htons(IP_DF);
+       iph->protocol   = tnl_vport->tnl_ops->ipproto;
+       iph->tos        = mutable->port_config.tos;
+       iph->daddr      = rt->rt_dst;
+       iph->saddr      = rt->rt_src;
+       iph->ttl        = mutable->port_config.ttl;
+       if (!iph->ttl)
+               iph->ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);
+
+       tnl_vport->tnl_ops->build_header(vport, mutable, iph + 1);
+}
 
-       if (skb->protocol == htons(ETH_P_IP))
-               inner = ((struct iphdr *)skb_network_header(skb))->tos;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       else if (skb->protocol == htons(ETH_P_IPV6))
-               inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
-#endif
-       else
-               inner = 0;
+static inline void *get_cached_header(const struct tnl_cache *cache)
+{
+       return (void *)cache + ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN);
+}
 
-       return INET_ECN_encapsulate(tos, inner);
+static inline bool check_cache_valid(const struct tnl_cache *cache,
+                                    const struct tnl_mutable_config *mutable)
+{
+       return cache &&
+#ifdef NEED_CACHE_TIMEOUT
+               time_before(jiffies, cache->expiration) &&
+#endif
+#ifdef HAVE_RT_GENID
+               atomic_read(&init_net.ipv4.rt_genid) == cache->rt->rt_genid &&
+#endif
+#ifdef HAVE_HH_SEQ
+               rt_dst(cache->rt).hh->hh_lock.sequence == cache->hh_seq &&
+#endif
+               mutable->seq == cache->mutable_seq &&
+               (!is_internal_dev(rt_dst(cache->rt).dev) ||
+               (cache->flow && !cache->flow->dead));
 }
 
-static inline void ecn_decapsulate(struct sk_buff *skb)
+static int cache_cleaner_cb(struct tbl_node *tbl_node, void *aux)
 {
-       u8 tos = ip_hdr(skb)->tos;
+       struct tnl_vport *tnl_vport = tnl_vport_table_cast(tbl_node);
+       const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
+       const struct tnl_cache *cache = rcu_dereference(tnl_vport->cache);
 
-       if (INET_ECN_is_ce(tos)) {
-               __be16 protocol = skb->protocol;
-               unsigned int nw_header = skb_network_header(skb) - skb->data;
+       if (cache && !check_cache_valid(cache, mutable) &&
+           spin_trylock_bh(&tnl_vport->cache_lock)) {
+               assign_cache_rcu(tnl_vport_to_vport(tnl_vport), NULL);
+               spin_unlock_bh(&tnl_vport->cache_lock);
+       }
 
-               if (skb->protocol == htons(ETH_P_8021Q)) {
-                       if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
-                               return;
+       return 0;
+}
 
-                       protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
-                       nw_header += VLAN_HLEN;
-               }
+static void cache_cleaner(struct work_struct *work)
+{
+       schedule_cache_cleaner();
 
-               if (protocol == htons(ETH_P_IP)) {
-                       if (unlikely(!pskb_may_pull(skb, nw_header
-                           + sizeof(struct iphdr))))
-                               return;
+       rcu_read_lock();
+       tbl_foreach(port_table, cache_cleaner_cb, NULL);
+       rcu_read_unlock();
+}
 
-                       IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
-               }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-               else if (protocol == htons(ETH_P_IPV6)) {
-                       if (unlikely(!pskb_may_pull(skb, nw_header
-                           + sizeof(struct ipv6hdr))))
-                               return;
+static inline void create_eth_hdr(struct tnl_cache *cache,
+                                 const struct rtable *rt)
+{
+       void *cache_data = get_cached_header(cache);
+       int hh_len = rt_dst(rt).hh->hh_len;
+       int hh_off = HH_DATA_ALIGN(rt_dst(rt).hh->hh_len) - hh_len;
 
-                       IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
-                                                         + skb->data));
-               }
+#ifdef HAVE_HH_SEQ
+       unsigned hh_seq;
+
+       do {
+               hh_seq = read_seqbegin(&rt_dst(rt).hh->hh_lock);
+               memcpy(cache_data, (void *)rt_dst(rt).hh->hh_data + hh_off, hh_len);
+       } while (read_seqretry(&rt_dst(rt).hh->hh_lock, hh_seq));
+
+       cache->hh_seq = hh_seq;
+#else
+       read_lock_bh(&rt_dst(rt).hh->hh_lock);
+       memcpy(cache_data, (void *)rt_dst(rt).hh->hh_data + hh_off, hh_len);
+       read_unlock_bh(&rt_dst(rt).hh->hh_lock);
 #endif
-       }
 }
 
-static struct sk_buff *handle_gso(struct sk_buff *skb)
+static struct tnl_cache *build_cache(struct vport *vport,
+                                    const struct tnl_mutable_config *mutable,
+                                    struct rtable *rt)
 {
-       if (skb_is_gso(skb)) {
-               struct sk_buff *nskb = skb_gso_segment(skb, 0);
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct tnl_cache *cache;
+       void *cache_data;
+       int cache_len;
 
-               dev_kfree_skb(skb);
-               return nskb;
+       if (!(mutable->port_config.flags & TNL_F_HDR_CACHE))
+               return NULL;
+
+       /*
+        * If there is no entry in the ARP cache or if this device does not
+        * support hard header caching, just fall back to the IP stack.
+        */
+       if (!rt_dst(rt).hh)
+               return NULL;
+
+       /*
+        * If the lock is contended, fall back to building the header directly.
+        * We're not going to help performance by sitting here spinning.
+        */
+       if (!spin_trylock_bh(&tnl_vport->cache_lock))
+               return NULL;
+
+       cache = tnl_vport->cache;
+       if (check_cache_valid(cache, mutable))
+               goto unlock;
+       else
+               cache = NULL;
+
+       cache_len = rt_dst(rt).hh->hh_len + mutable->tunnel_hlen;
+
+       cache = kzalloc(ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN) +
+                       cache_len, GFP_ATOMIC);
+       if (!cache)
+               goto unlock;
+
+       cache->len = cache_len;
+
+       create_eth_hdr(cache, rt);
+       cache_data = get_cached_header(cache) + rt_dst(rt).hh->hh_len;
+
+       create_tunnel_header(vport, mutable, rt, cache_data);
+
+       cache->mutable_seq = mutable->seq;
+       cache->rt = rt;
+#ifdef NEED_CACHE_TIMEOUT
+       cache->expiration = jiffies + tnl_vport->cache_exp_interval;
+#endif
+
+       if (is_internal_dev(rt_dst(rt).dev)) {
+               int err;
+               struct vport *vport;
+               struct dp_port *dp_port;
+               struct sk_buff *skb;
+               bool is_frag;
+               struct odp_flow_key flow_key;
+               struct tbl_node *flow_node;
+
+               vport = internal_dev_get_vport(rt_dst(rt).dev);
+               if (!vport)
+                       goto done;
+
+               dp_port = vport_get_dp_port(vport);
+               if (!dp_port)
+                       goto done;
+
+               skb = alloc_skb(cache->len, GFP_ATOMIC);
+               if (!skb)
+                       goto done;
+
+               __skb_put(skb, cache->len);
+               memcpy(skb->data, get_cached_header(cache), cache->len);
+
+               err = flow_extract(skb, dp_port->port_no, &flow_key, &is_frag);
+
+               kfree_skb(skb);
+               if (err || is_frag)
+                       goto done;
+
+               flow_node = tbl_lookup(rcu_dereference(dp_port->dp->table),
+                                      &flow_key, flow_hash(&flow_key),
+                                      flow_cmp);
+               if (flow_node) {
+                       struct sw_flow *flow = flow_cast(flow_node);
+
+                       cache->flow = flow;
+                       flow_hold(flow);
+               }
        }
 
-       return skb;
+done:
+       assign_cache_rcu(vport, cache);
+
+unlock:
+       spin_unlock_bh(&tnl_vport->cache_lock);
+
+       return cache;
 }
 
-static int handle_csum_offload(struct sk_buff *skb)
+static struct rtable *find_route(struct vport *vport,
+                                const struct tnl_mutable_config *mutable,
+                                u8 tos, struct tnl_cache **cache)
 {
-       if (skb->ip_summed == CHECKSUM_PARTIAL)
-               return skb_checksum_help(skb);
-       else {
-               skb->ip_summed = CHECKSUM_NONE;
-               return 0;
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache);
+
+       *cache = NULL;
+       tos = RT_TOS(tos);
+
+       if (likely(tos == mutable->port_config.tos &&
+                  check_cache_valid(cur_cache, mutable))) {
+               *cache = cur_cache;
+               return cur_cache->rt;
+       } else {
+               struct rtable *rt;
+               struct flowi fl = { .nl_u = { .ip4_u =
+                                             { .daddr = mutable->port_config.daddr,
+                                               .saddr = mutable->port_config.saddr,
+                                               .tos = tos } },
+                                   .proto = tnl_vport->tnl_ops->ipproto };
+
+               if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
+                       return NULL;
+
+               if (likely(tos == mutable->port_config.tos))
+                       *cache = build_cache(vport, mutable, rt);
+
+               return rt;
        }
 }
 
-/* Called with rcu_read_lock. */
-void tnl_rcv(struct vport *vport, struct sk_buff *skb)
+static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
 {
-       skb->pkt_type = PACKET_HOST;
-       skb->protocol = eth_type_trans(skb, skb->dev);
+       if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
+               struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
+               if (unlikely(!nskb)) {
+                       kfree_skb(skb);
+                       return ERR_PTR(-ENOMEM);
+               }
 
-       skb_dst_drop(skb);
-       nf_reset(skb);
-       secpath_reset(skb);
-       skb_reset_network_header(skb);
+               set_skb_csum_bits(skb, nskb);
 
-       ecn_decapsulate(skb);
+               if (skb->sk)
+                       skb_set_owner_w(nskb, skb->sk);
 
-       skb_push(skb, ETH_HLEN);
-       compute_ip_summed(skb, false);
+               kfree_skb(skb);
+               return nskb;
+       }
 
-       vport_receive(vport, skb);
+       return skb;
 }
 
-static int build_packet(struct vport *vport, const struct tnl_mutable_config *mutable,
-                       struct iphdr *iph, struct rtable *rt, int max_headroom,
-                       int mtu, struct sk_buff *skb)
+static inline bool need_linearize(const struct sk_buff *skb)
 {
-       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       int i;
+
+       if (unlikely(skb_shinfo(skb)->frag_list))
+               return true;
+
+       /*
+        * Generally speaking we should linearize if there are paged frags.
+        * However, if all of the refcounts are 1 we know nobody else can
+        * change them from underneath us and we can skip the linearization.
+        */
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+               if (unlikely(page_count(skb_shinfo(skb)->frags[i].page) > 1))
+                       return true;
+
+       return false;
+}
+
+static struct sk_buff *handle_offloads(struct sk_buff *skb,
+                                      const struct tnl_mutable_config *mutable,
+                                      const struct rtable *rt)
+{
+       int min_headroom;
        int err;
-       struct iphdr *new_iph;
-       int orig_len = skb->len;
-       __be16 frag_off = iph->frag_off;
 
-       skb = check_headroom(skb, max_headroom);
-       if (unlikely(IS_ERR(skb)))
-               goto error;
+       forward_ip_summed(skb);
 
-       err = handle_csum_offload(skb);
+       err = vswitch_skb_checksum_setup(skb);
        if (unlikely(err))
                goto error_free;
 
-       if (skb->protocol == htons(ETH_P_IP)) {
-               struct iphdr *old_iph = ip_hdr(skb);
+       min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+                       + mutable->tunnel_hlen;
 
-               if ((old_iph->frag_off & htons(IP_DF)) &&
-                   mtu < ntohs(old_iph->tot_len)) {
-                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
-                               goto error_free;
+       if (skb_is_gso(skb)) {
+               struct sk_buff *nskb;
+
+               /*
+                * If we are doing GSO on a pskb it is better to make sure that
+                * the headroom is correct now.  We will only have to copy the
+                * portion in the linear data area and GSO will preserve
+                * headroom when it creates the segments.  This is particularly
+                * beneficial on Xen where we get a lot of GSO pskbs.
+                * Conversely, we avoid copying if it is just to get our own
+                * writable clone because GSO will do the copy for us.
+                */
+               if (skb_headroom(skb) < min_headroom) {
+                       skb = check_headroom(skb, min_headroom);
+                       if (unlikely(IS_ERR(skb))) {
+                               err = PTR_ERR(skb);
+                               goto error;
+                       }
                }
 
-       }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       else if (skb->protocol == htons(ETH_P_IPV6)) {
-               unsigned int packet_length = skb->len - ETH_HLEN
-                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
+               nskb = skb_gso_segment(skb, 0);
+               kfree_skb(skb);
+               if (unlikely(IS_ERR(nskb))) {
+                       err = PTR_ERR(nskb);
+                       goto error;
+               }
 
-               /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
-               if (packet_length > IPV6_MIN_MTU)
-                       frag_off = htons(IP_DF);
+               skb = nskb;
+       } else {
+               skb = check_headroom(skb, min_headroom);
+               if (unlikely(IS_ERR(skb))) {
+                       err = PTR_ERR(skb);
+                       goto error;
+               }
 
-               if (mtu < packet_length) {
-                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
+               if (skb->ip_summed == CHECKSUM_PARTIAL) {
+                       /*
+                        * Pages aren't locked and could change at any time.
+                        * If this happens after we compute the checksum, the
+                        * checksum will be wrong.  We linearize now to avoid
+                        * this problem.
+                        */
+                       if (unlikely(need_linearize(skb))) {
+                               err = __skb_linearize(skb);
+                               if (unlikely(err))
+                                       goto error_free;
+                       }
+
+                       err = skb_checksum_help(skb);
+                       if (unlikely(err))
                                goto error_free;
-               }
+               } else if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->ip_summed = CHECKSUM_NONE;
        }
-#endif
 
-       new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
-       skb_reset_network_header(skb);
-       skb_set_transport_header(skb, sizeof(struct iphdr));
-
-       memcpy(new_iph, iph, sizeof(struct iphdr));
-       new_iph->frag_off = frag_off;
-       ip_select_ident(new_iph, &rt_dst(rt), NULL);
+       return skb;
 
-       memset(&IPCB(skb)->opt, 0, sizeof(IPCB(skb)->opt));
-       IPCB(skb)->flags = 0;
+error_free:
+       kfree_skb(skb);
+error:
+       return ERR_PTR(err);
+}
 
-       skb = tnl_vport->tnl_ops->build_header(skb, vport, mutable, &rt_dst(rt));
-       if (unlikely(!skb))
-               goto error;
+static int send_frags(struct sk_buff *skb,
+                     const struct tnl_mutable_config *mutable)
+{
+       int sent_len;
+       int err;
 
+       sent_len = 0;
        while (skb) {
                struct sk_buff *next = skb->next;
                int frag_len = skb->len - mutable->tunnel_hlen;
@@ -694,34 +1101,26 @@ static int build_packet(struct vport *vport, const struct tnl_mutable_config *mu
                skb->next = NULL;
 
                err = ip_local_out(skb);
-               if (unlikely(net_xmit_eval(err) != 0)) {
-                       orig_len -= frag_len;
+               if (likely(net_xmit_eval(err) == 0))
+                       sent_len += frag_len;
+               else {
                        skb = next;
                        goto free_frags;
                }
 
                skb = next;
-       };
+       }
 
-       return orig_len;
+       return sent_len;
 
-error_free:
-       kfree_skb(skb);
-error:
-       return 0;
 free_frags:
        /*
         * There's no point in continuing to send fragments once one has been
         * dropped so just free the rest.  This may help improve the congestion
         * that caused the first packet to be dropped.
         */
-       while (skb) {
-               struct sk_buff *next = skb->next;
-               orig_len -= skb->len - mutable->tunnel_hlen;
-               kfree_skb(skb);
-               skb = next;
-       };
-       return orig_len;
+       tnl_free_linked_skbs(skb);
+       return sent_len;
 }
 
 int tnl_send(struct vport *vport, struct sk_buff *skb)
@@ -729,12 +1128,15 @@ int tnl_send(struct vport *vport, struct sk_buff *skb)
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
 
-       struct iphdr *old_iph;
-       int orig_len;
-       struct iphdr iph;
+       enum vport_err_type err = VPORT_E_TX_ERROR;
        struct rtable *rt;
-       int max_headroom;
-       int mtu;
+       struct dst_entry *unattached_dst = NULL;
+       struct tnl_cache *cache;
+       int sent_len = 0;
+       __be16 frag_off;
+       u8 ttl;
+       u8 inner_tos;
+       u8 tos;
 
        /* Validate the protocol headers before we try to use them. */
        if (skb->protocol == htons(ETH_P_8021Q)) {
@@ -746,147 +1148,164 @@ int tnl_send(struct vport *vport, struct sk_buff *skb)
        }
 
        if (skb->protocol == htons(ETH_P_IP)) {
-               if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
-                   + sizeof(struct iphdr) - skb->data)))
+               if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
+                   + sizeof(struct iphdr))))
                        skb->protocol = 0;
        }
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
-               if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
-                   + sizeof(struct ipv6hdr) - skb->data)))
+               if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
+                   + sizeof(struct ipv6hdr))))
                        skb->protocol = 0;
        }
 #endif
-       old_iph = ip_hdr(skb);
 
-       iph.tos = mutable->port_config.tos;
-       if (mutable->port_config.flags & TNL_F_TOS_INHERIT) {
-               if (skb->protocol == htons(ETH_P_IP))
-                       iph.tos = old_iph->tos;
+       /* ToS */
+       if (skb->protocol == htons(ETH_P_IP))
+               inner_tos = ip_hdr(skb)->tos;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-               else if (skb->protocol == htons(ETH_P_IPV6))
-                       iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               inner_tos = ipv6_get_dsfield(ipv6_hdr(skb));
 #endif
-       }
-       iph.tos = ecn_encapsulate(iph.tos, skb);
+       else
+               inner_tos = 0;
 
-       {
-               struct flowi fl = { .nl_u = { .ip4_u =
-                                             { .daddr = mutable->port_config.daddr,
-                                               .saddr = mutable->port_config.saddr,
-                                               .tos = RT_TOS(iph.tos) } },
-                                   .proto = tnl_vport->tnl_ops->ipproto };
+       if (mutable->port_config.flags & TNL_F_TOS_INHERIT)
+               tos = inner_tos;
+       else
+               tos = mutable->port_config.tos;
 
-               if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
-                       goto error_free;
+       tos = INET_ECN_encapsulate(tos, inner_tos);
+
+       /* Route lookup */
+       rt = find_route(vport, mutable, tos, &cache);
+       if (unlikely(!rt))
+               goto error_free;
+       if (unlikely(!cache))
+               unattached_dst = &rt_dst(rt);
+
+       /* Reset SKB */
+       nf_reset(skb);
+       secpath_reset(skb);
+       skb_dst_drop(skb);
+
+       /* Offloading */
+       skb = handle_offloads(skb, mutable, rt);
+       if (unlikely(IS_ERR(skb)))
+               goto error;
+
+       /* MTU */
+       if (unlikely(!check_mtu(skb, vport, mutable, rt, &frag_off))) {
+               err = VPORT_E_TX_DROPPED;
+               goto error_free;
        }
 
-       iph.ttl = mutable->port_config.ttl;
+       /*
+        * If we are over the MTU, allow the IP stack to handle fragmentation.
+        * Fragmentation is a slow path anyway.
+        */
+       if (unlikely(skb->len + mutable->tunnel_hlen > dst_mtu(&rt_dst(rt)) &&
+                    cache)) {
+               unattached_dst = &rt_dst(rt);
+               dst_hold(unattached_dst);
+               cache = NULL;
+       }
+
+       /* TTL */
+       ttl = mutable->port_config.ttl;
+       if (!ttl)
+               ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);
+
        if (mutable->port_config.flags & TNL_F_TTL_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
-                       iph.ttl = old_iph->ttl;
+                       ttl = ip_hdr(skb)->ttl;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
-                       iph.ttl = ipv6_hdr(skb)->hop_limit;
+                       ttl = ipv6_hdr(skb)->hop_limit;
 #endif
        }
-       if (!iph.ttl)
-               iph.ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);
 
-       iph.frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0;
-       if (iph.frag_off)
-               mtu = dst_mtu(&rt_dst(rt))
-                       - ETH_HLEN
-                       - mutable->tunnel_hlen
-                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
-       else
-               mtu = mutable->mtu;
+       while (skb) {
+               struct iphdr *iph;
+               struct sk_buff *next_skb = skb->next;
+               skb->next = NULL;
 
-       if (skb->protocol == htons(ETH_P_IP)) {
-               iph.frag_off |= old_iph->frag_off & htons(IP_DF);
-               mtu = max(mtu, IP_MIN_MTU);
-       }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       else if (skb->protocol == htons(ETH_P_IPV6))
-               mtu = max(mtu, IPV6_MIN_MTU);
-#endif
+               if (likely(cache)) {
+                       skb_push(skb, cache->len);
+                       memcpy(skb->data, get_cached_header(cache), cache->len);
+                       skb_reset_mac_header(skb);
+                       skb_set_network_header(skb, rt_dst(rt).hh->hh_len);
 
-       iph.version = 4;
-       iph.ihl = sizeof(struct iphdr) >> 2;
-       iph.protocol = tnl_vport->tnl_ops->ipproto;
-       iph.daddr = rt->rt_dst;
-       iph.saddr = rt->rt_src;
+               } else {
+                       skb_push(skb, mutable->tunnel_hlen);
+                       create_tunnel_header(vport, mutable, rt, skb->data);
+                       skb_reset_network_header(skb);
 
-       nf_reset(skb);
-       secpath_reset(skb);
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt_dst(rt));
+                       if (next_skb)
+                               skb_dst_set(skb, dst_clone(unattached_dst));
+                       else {
+                               skb_dst_set(skb, unattached_dst);
+                               unattached_dst = NULL;
+                       }
 
-       /*
-        * If we are doing GSO on a pskb it is better to make sure that the
-        * headroom is correct now.  We will only have to copy the portion in
-        * the linear data area and GSO will preserve headroom when it creates
-        * the segments.  This is particularly beneficial on Xen where we get
-        * lots of GSO pskbs.  Conversely, we delay copying if it is just to
-        * get our own writable clone because GSO may do the copy for us.
-        */
-       max_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
-                       + mutable->tunnel_hlen;
 
-       if (skb_headroom(skb) < max_headroom) {
-               skb = check_headroom(skb, max_headroom);
-               if (unlikely(IS_ERR(skb))) {
-                       vport_record_error(vport, VPORT_E_TX_DROPPED);
-                       goto error;
+                       memset(&IPCB(skb)->opt, 0, sizeof(IPCB(skb)->opt));
+                       IPCB(skb)->flags = 0;
                }
-       }
+               skb_set_transport_header(skb, skb_network_offset(skb) + sizeof(struct iphdr));
 
-       forward_ip_summed(skb);
+               iph = ip_hdr(skb);
+               iph->tos = tos;
+               iph->ttl = ttl;
+               iph->frag_off = frag_off;
+               ip_select_ident(iph, &rt_dst(rt), NULL);
 
-       if (unlikely(vswitch_skb_checksum_setup(skb)))
-               goto error_free;
+               skb = tnl_vport->tnl_ops->update_header(vport, mutable, &rt_dst(rt), skb);
+               if (unlikely(!skb))
+                       goto next;
 
-       skb = handle_gso(skb);
-       if (unlikely(IS_ERR(skb))) {
-               vport_record_error(vport, VPORT_E_TX_DROPPED);
-               goto error;
-       }
+               if (likely(cache)) {
+                       int orig_len = skb->len - cache->len;
+                       struct vport *cache_vport = internal_dev_get_vport(rt_dst(rt).dev);
 
-       /*
-        * Process GSO segments.  Try to do any work for the entire packet that
-        * doesn't involve actually writing to it before this point.
-        */
-       orig_len = 0;
-       do {
-               struct sk_buff *next_skb = skb->next;
-               skb->next = NULL;
+                       skb->protocol = htons(ETH_P_IP);
+
+                       iph->tot_len = htons(skb->len - skb_network_offset(skb));
+                       ip_send_check(iph);
 
-               orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);
+                       if (likely(cache_vport)) {
+                               OVS_CB(skb)->flow = cache->flow;
+                               compute_ip_summed(skb, true);
+                               vport_receive(cache_vport, skb);
+                               sent_len += orig_len;
+                       } else {
+                               int err;
 
+                               skb->dev = rt_dst(rt).dev;
+                               err = dev_queue_xmit(skb);
+
+                               if (likely(net_xmit_eval(err) == 0))
+                                       sent_len += orig_len;
+                       }
+               } else
+                       sent_len += send_frags(skb, mutable);
+
+next:
                skb = next_skb;
-       } while (skb);
+       }
 
-       if (unlikely(orig_len == 0))
+       if (unlikely(sent_len == 0))
                vport_record_error(vport, VPORT_E_TX_DROPPED);
 
-       return orig_len;
+       goto out;
 
 error_free:
-       kfree_skb(skb);
-       vport_record_error(vport, VPORT_E_TX_ERROR);
+       tnl_free_linked_skbs(skb);
 error:
-       return 0;
-}
-
-int tnl_init(void)
-{
-       return 0;
-}
-
-void tnl_exit(void)
-{
-       tbl_destroy(port_table, NULL);
-       port_table = NULL;
+       dst_release(unattached_dst);
+       vport_record_error(vport, err);
+out:
+       return sent_len;
 }
 
 static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops,
@@ -899,15 +1318,18 @@ static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops,
        if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct tnl_port_config)))
                return -EFAULT;
 
+       if (mutable->port_config.daddr == 0)
+               return -EINVAL;
+
+       if (mutable->port_config.tos != RT_TOS(mutable->port_config.tos))
+               return -EINVAL;
+
        mutable->tunnel_hlen = tnl_ops->hdr_len(&mutable->port_config);
        if (mutable->tunnel_hlen < 0)
                return mutable->tunnel_hlen;
 
        mutable->tunnel_hlen += sizeof(struct iphdr);
 
-       if (mutable->port_config.daddr == 0)
-               return -EINVAL;
-
        mutable->tunnel_type = tnl_ops->tunnel_type;
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                mutable->tunnel_type |= TNL_T_KEY_MATCH;
@@ -950,7 +1372,7 @@ struct vport *tnl_create(const char *name, const void __user *config,
        strcpy(tnl_vport->name, name);
        tnl_vport->tnl_ops = tnl_ops;
 
-       tnl_vport->mutable = kmalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
+       tnl_vport->mutable = kzalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!tnl_vport->mutable) {
                err = -ENOMEM;
                goto error_free_vport;
@@ -966,6 +1388,13 @@ struct vport *tnl_create(const char *name, const void __user *config,
        if (err)
                goto error_free_mutable;
 
+       spin_lock_init(&tnl_vport->cache_lock);
+
+#ifdef NEED_CACHE_TIMEOUT
+       tnl_vport->cache_exp_interval = MAX_CACHE_EXP -
+                                       (net_random() % (MAX_CACHE_EXP / 2));
+#endif
+
        err = add_port(vport);
        if (err)
                goto error_free_mutable;
@@ -985,7 +1414,6 @@ int tnl_modify(struct vport *vport, const void __user *config)
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;
        int err;
-       bool update_hash = false;
 
        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable) {
@@ -997,35 +1425,11 @@ int tnl_modify(struct vport *vport, const void __user *config)
        if (err)
                goto error_free;
 
-       /*
-        * Only remove the port from the hash table if something that would
-        * affect the lookup has changed.
-        */
-       if (tnl_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
-           tnl_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
-           tnl_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
-           (tnl_vport->mutable->port_config.flags & TNL_F_IN_KEY_MATCH) !=
-           (mutable->port_config.flags & TNL_F_IN_KEY_MATCH))
-               update_hash = true;
-
-
-       /*
-        * This update is not atomic but the lookup uses the config, which
-        * serves as an inherent double check.
-        */
-       if (update_hash) {
-               err = del_port(vport);
-               if (err)
-                       goto error_free;
-       }
-
-       assign_config_rcu(vport, mutable);
+       mutable->seq++;
 
-       if (update_hash) {
-               err = add_port(vport);
-               if (err)
-                       goto error_free;
-       }
+       err = move_port(vport, mutable);
+       if (err)
+               goto error_free;
 
        return 0;
 
@@ -1035,10 +1439,14 @@ error:
        return err;
 }
 
-static void free_port(struct rcu_head *rcu)
+static void free_port_rcu(struct rcu_head *rcu)
 {
        struct tnl_vport *tnl_vport = container_of(rcu, struct tnl_vport, rcu);
 
+       spin_lock_bh(&tnl_vport->cache_lock);
+       free_cache(tnl_vport->cache);
+       spin_unlock_bh(&tnl_vport->cache_lock);
+
        kfree(tnl_vport->mutable);
        vport_free(tnl_vport_to_vport(tnl_vport));
 }
@@ -1055,7 +1463,7 @@ int tnl_destroy(struct vport *vport)
            &old_mutable))
                del_port(vport);
 
-       call_rcu(&tnl_vport->rcu, free_port);
+       call_rcu(&tnl_vport->rcu, free_port_rcu);
 
        return 0;
 }
@@ -1090,7 +1498,6 @@ int tnl_set_addr(struct vport *vport, const unsigned char *addr)
        return 0;
 }
 
-
 const char *tnl_get_name(const struct vport *vport)
 {
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
@@ -1108,3 +1515,15 @@ int tnl_get_mtu(const struct vport *vport)
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->mtu;
 }
+
+void tnl_free_linked_skbs(struct sk_buff *skb)
+{
+       if (unlikely(!skb))
+               return;
+
+       while (skb) {
+               struct sk_buff *next = skb->next;
+               kfree_skb(skb);
+               skb = next;
+       }
+}
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index 37874c5..8ffb7bf 100644
@@ -9,6 +9,9 @@
 #ifndef TUNNEL_H
 #define TUNNEL_H 1
 
+#include <linux/version.h>
+
+#include "flow.h"
 #include "openvswitch/tunnel.h"
 #include "table.h"
 #include "vport.h"
 #define IP_MIN_MTU 68
 
 /*
- * One of these goes in your struct tnl_ops and in tnl_find_port().
+ * One of these goes in struct tnl_ops and in tnl_find_port().
  * These values are in the same namespace as other TNL_T_* values, so
- * you have only the first 10 bits to define protocol identifiers.
+ * only the least significant 10 bits are available to define protocol
+ * identifiers.
  */
 #define TNL_T_PROTO_GRE                0
 #define TNL_T_PROTO_CAPWAP     1
 
-/* You only need these flags when you are calling tnl_find_port(). */
+/* These flags are only needed when calling tnl_find_port(). */
 #define TNL_T_KEY_EXACT                (1 << 10)
 #define TNL_T_KEY_MATCH                (1 << 11)
 #define TNL_T_KEY_EITHER       (TNL_T_KEY_EXACT | TNL_T_KEY_MATCH)
 struct tnl_mutable_config {
        struct rcu_head rcu;
 
-       unsigned char eth_addr[ETH_ALEN];
-       unsigned int mtu;
-       struct tnl_port_config port_config;
+       unsigned seq;           /* Sequence number to identify this config. */
 
-       /* Set of TNL_T_* flags that define the category for lookup. */
-       u32 tunnel_type;
+       u32 tunnel_type;        /* Set of TNL_T_* flags that define lookup. */
+       unsigned tunnel_hlen;   /* Tunnel header length. */
+
+       unsigned char eth_addr[ETH_ALEN];
+       unsigned mtu;
 
-       int tunnel_hlen; /* Tunnel header length. */
+       struct tnl_port_config port_config;
 };
 
 struct tnl_ops {
-       /* Put your TNL_T_PROTO_* type in here. */
-       u32 tunnel_type;
-       u8 ipproto;
+       u32 tunnel_type;        /* Put the TNL_T_PROTO_* type in here. */
+       u8 ipproto;             /* The IP protocol for the tunnel. */
 
        /*
-        * Returns the length of the tunnel header you will add in
+        * Returns the length of the tunnel header that will be added in
         * build_header() (i.e. excludes the IP header).  Returns a negative
         * error code if the configuration is invalid.
         */
        int (*hdr_len)(const struct tnl_port_config *);
 
        /*
-        * Returns a linked list of SKBs with tunnel headers (multiple
-        * packets may be generated in the event of fragmentation).  Space
-        * will have already been allocated at the start of the packet equal
-        * to sizeof(struct iphdr) + value returned by hdr_len().  The IP
-        * header will have already been constructed.
+        * Builds the static portion of the tunnel header, which is stored in
+        * the header cache.  In general the performance of this function is
+        * not too important, as we try to call it only when building the cache,
+        * so it is preferable to shift as much work as possible here.  However,
+        * in some circumstances caching is disabled and this function will be
+        * called for every packet, so try not to make it too slow.
+        */
+       void (*build_header)(const struct vport *,
+                            const struct tnl_mutable_config *, void *header);
+
+       /*
+        * Updates the cached header of a packet to match the actual packet
+        * data.  Typical things that might need to be updated are length,
+        * checksum, etc.  The IP header will have already been updated and this
+        * is the final step before transmission.  Returns a linked list of
+        * completed SKBs (multiple packets may be generated in the event
+        * of fragmentation).
+        */
+       struct sk_buff *(*update_header)(const struct vport *,
+                                        const struct tnl_mutable_config *,
+                                        struct dst_entry *, struct sk_buff *);
+};
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+/*
+ * On these kernels we have a fast mechanism to tell if the ARP cache for a
+ * particular destination has changed.
+ */
+#define HAVE_HH_SEQ
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
+/*
+ * On these kernels we have a fast mechanism to tell if the routing table
+ * has changed.
+ */
+#define HAVE_RT_GENID
+#endif
+#if !defined(HAVE_HH_SEQ) || !defined(HAVE_RT_GENID)
+/* If we can't detect all system changes directly we need to use a timeout. */
+#define NEED_CACHE_TIMEOUT
+#endif
+struct tnl_cache {
+       struct rcu_head rcu;
+
+       int len;                /* Length of data to be memcpy'd from cache. */
+
+       /* Sequence number of mutable->seq from which this cache was generated. */
+       unsigned mutable_seq;
+
+#ifdef HAVE_HH_SEQ
+       /*
+        * The sequence number from the seqlock protecting the hardware header
+        * cache (in the ARP cache).  Since every write increments the counter
+        * this gives us an easy way to tell if it has changed.
+        */
+       unsigned hh_seq;
+#endif
+
+#ifdef NEED_CACHE_TIMEOUT
+       /*
+        * If we don't have direct mechanisms to detect all important changes in
+        * the system, fall back to an expiration time.  This expiration time
+        * can be relatively short since at high rates there will be millions of
+        * packets per second, so we'll still get plenty of benefit from the
+        * cache.  Note that if something changes we may blackhole packets
+        * until the expiration time (depending on what changed and the kernel
+        * version we may be able to detect the change sooner).  Expiration is
+        * expressed as a time in jiffies.
         */
-       struct sk_buff *(*build_header)(struct sk_buff *,
-                                       const struct vport *,
-                                       const struct tnl_mutable_config *,
-                                       struct dst_entry *);
+       unsigned long expiration;
+#endif
+
+       /*
+        * The routing table entry that is the result of looking up the tunnel
+        * endpoints.  It also contains a sequence number (called a generation
+        * ID) that can be compared to a global sequence to tell if the routing
+        * table has changed (and therefore there is a potential that this
+        * cached route has been invalidated).
+        */
+       struct rtable *rt;
+
+       /*
+        * If the output device for tunnel traffic is an OVS internal device,
+        * the flow of that datapath.  Since all tunnel traffic will have the
+        * same headers this allows us to cache the flow lookup.  NULL if the
+        * output device is not OVS or if there is no flow installed.
+        */
+       struct sw_flow *flow;
+
+       /* The cached header follows after padding for alignment. */
 };
 
 struct tnl_vport {
@@ -77,14 +161,29 @@ struct tnl_vport {
        char name[IFNAMSIZ];
        const struct tnl_ops *tnl_ops;
 
-       /* Protected by RCU. */
-       struct tnl_mutable_config *mutable;
+       struct tnl_mutable_config *mutable;     /* Protected by RCU. */
 
+       /*
+        * ID of the last fragment sent (for tunnel protocols with direct
+        * support for fragmentation).  If the protocol relies on IP
+        * fragmentation then this is not needed.
+        */
        atomic_t frag_id;
+
+       spinlock_t cache_lock;
+       struct tnl_cache *cache;                /* Protected by RCU/cache_lock. */
+
+#ifdef NEED_CACHE_TIMEOUT
+       /*
+        * If we must rely on expiration time to invalidate the cache, this is
+        * the interval.  It is randomized within a range (defined by
+        * MAX_CACHE_EXP in tunnel.c) to avoid synchronized expirations caused
+        * by the creation of a large number of tunnels at one time.
+        */
+       unsigned long cache_exp_interval;
+#endif
 };
 
-int tnl_init(void);
-void tnl_exit(void);
 struct vport *tnl_create(const char *name, const void __user *config,
                         const struct vport_ops *,
                         const struct tnl_ops *);
@@ -104,10 +203,12 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key,
 bool tnl_frag_needed(struct vport *vport,
                     const struct tnl_mutable_config *mutable,
                     struct sk_buff *skb, unsigned int mtu, __be32 flow_key);
+void tnl_free_linked_skbs(struct sk_buff *skb);
 
 static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport)
 {
        return vport_priv(vport);
 }
 
+
 #endif /* tunnel.h */
diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
index 7ae3790..bf1465f 100644
@@ -128,24 +128,32 @@ static int capwap_hdr_len(const struct tnl_port_config *port_config)
        return CAPWAP_HLEN;
 }
 
-static struct sk_buff *capwap_build_header(struct sk_buff *skb,
-                                          const struct vport *vport,
-                                          const struct tnl_mutable_config *mutable,
-                                          struct dst_entry *dst)
+static void capwap_build_header(const struct vport *vport,
+                               const struct tnl_mutable_config *mutable,
+                               void *header)
 {
-       struct udphdr *udph = udp_hdr(skb);
-       struct capwaphdr *cwh = capwap_hdr(skb);
+       struct udphdr *udph = header;
+       struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
 
        udph->source = htons(CAPWAP_SRC_PORT);
        udph->dest = htons(CAPWAP_DST_PORT);
-       udph->len = htons(skb->len - sizeof(struct iphdr));
        udph->check = 0;
 
        cwh->begin = NO_FRAG_HDR;
        cwh->frag_id = 0;
        cwh->frag_off = 0;
+}
+
+static struct sk_buff *capwap_update_header(const struct vport *vport,
+                                           const struct tnl_mutable_config *mutable,
+                                           struct dst_entry *dst,
+                                           struct sk_buff *skb)
+{
+       struct udphdr *udph = udp_hdr(skb);
 
-       if (unlikely(skb->len > dst_mtu(dst)))
+       udph->len = htons(skb->len - skb_transport_offset(skb));
+
+       if (unlikely(skb->len - skb_network_offset(skb) > dst_mtu(dst)))
                skb = fragment(skb, vport, dst);
 
        return skb;
@@ -209,6 +217,7 @@ struct tnl_ops capwap_tnl_ops = {
        .ipproto        = IPPROTO_UDP,
        .hdr_len        = capwap_hdr_len,
        .build_header   = capwap_build_header,
+       .update_header  = capwap_update_header,
 };
 
 static struct vport *capwap_create(const char *name, const void __user *config)
@@ -241,7 +250,7 @@ static int capwap_init(void)
 
        defrag_init();
 
-       return tnl_init();
+       return 0;
 
 error_sock:
        sock_release(capwap_rcv_socket);
@@ -252,7 +261,6 @@ error:
 
 static void capwap_exit(void)
 {
-       tnl_exit();
        defrag_exit();
        sock_release(capwap_rcv_socket);
 }
@@ -282,17 +290,19 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
                                struct dst_entry *dst)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-       unsigned int hlen = sizeof(struct iphdr) + CAPWAP_HLEN;
-       unsigned int headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len;
+       unsigned int hlen = skb_transport_offset(skb) + CAPWAP_HLEN;
+       unsigned int headroom;
+       unsigned int max_frame_len = dst_mtu(dst) + skb_network_offset(skb);
        struct sk_buff *result = NULL, *list_cur = NULL;
        unsigned int remaining;
        unsigned int offset;
        __be16 frag_id;
 
-       if (hlen + ~FRAG_OFF_MASK + 1 > dst_mtu(dst)) {
+       if (hlen + ~FRAG_OFF_MASK + 1 > max_frame_len) {
                if (net_ratelimit())
                        pr_warn("capwap link mtu (%d) is less than minimum packet (%d)\n",
-                               dst_mtu(dst), hlen + ~FRAG_OFF_MASK + 1);
+                               dst_mtu(dst),
+                               hlen - skb_network_offset(skb) + ~FRAG_OFF_MASK + 1);
                goto error;
        }
 
@@ -300,14 +310,17 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
        offset = 0;
        frag_id = htons(atomic_inc_return(&tnl_vport->frag_id));
 
+       headroom = dst->header_len + 16;
+       if (!skb_network_offset(skb))
+               headroom += LL_RESERVED_SPACE(dst->dev);
+
        while (remaining) {
                struct sk_buff *skb2;
                int frag_size;
-               struct iphdr *iph;
                struct udphdr *udph;
                struct capwaphdr *cwh;
 
-               frag_size = min(remaining, dst_mtu(dst) - hlen);
+               frag_size = min(remaining, max_frame_len - hlen);
                if (remaining > frag_size)
                        frag_size &= FRAG_OFF_MASK;
 
@@ -317,23 +330,22 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
 
                skb_reserve(skb2, headroom);
                __skb_put(skb2, hlen + frag_size);
-               skb_reset_network_header(skb2);
-               skb_set_transport_header(skb2, sizeof(struct iphdr));
 
-               /* Copy IP/UDP/CAPWAP header. */
+               if (skb_network_offset(skb))
+                       skb_reset_mac_header(skb2);
+               skb_set_network_header(skb2, skb_network_offset(skb));
+               skb_set_transport_header(skb2, skb_transport_offset(skb));
+
+               /* Copy (Ethernet)/IP/UDP/CAPWAP header. */
                copy_skb_metadata(skb, skb2);
-               skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
+               skb_copy_from_linear_data(skb, skb2->data, hlen);
 
                /* Copy this data chunk. */
                if (skb_copy_bits(skb, hlen + offset, skb2->data + hlen, frag_size))
                        BUG();
 
-               iph = ip_hdr(skb2);
-               iph->tot_len = hlen + frag_size;
-               ip_send_check(iph);
-
                udph = udp_hdr(skb2);
-               udph->len = htons(skb2->len - sizeof(struct iphdr));
+               udph->len = htons(skb2->len - skb_transport_offset(skb2));
 
                cwh = capwap_hdr(skb2);
                if (remaining > frag_size)
@@ -356,11 +368,7 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
        goto out;
 
 error:
-       while (result) {
-               list_cur = result->next;
-               kfree_skb(result);
-               result = list_cur;
-       }
+       tnl_free_linked_skbs(result);
 out:
        kfree_skb(skb);
        return result;
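
The CAPWAP changes above split header construction in two: capwap_build_header()
now writes only the fields that are identical for every packet (UDP ports, zero
checksum, fragment header) into a caller-supplied buffer that can be cached, while
the new capwap_update_header() fixes up the per-packet UDP length and fragments at
transmit time.  A rough sketch of how a transmit path could consume the pair,
assuming the cached bytes begin with the outer IP header and enough headroom has
already been reserved (encap_from_cache, cached_hdr and hdr_len are illustrative
names, not the patch's own):

static struct sk_buff *encap_from_cache(const struct vport *vport,
					const struct tnl_mutable_config *mutable,
					struct dst_entry *dst,
					struct sk_buff *skb,
					const void *cached_hdr, int hdr_len,
					const struct tnl_ops *ops)
{
	/* Prepend the prebuilt outer headers in a single copy. */
	memcpy(skb_push(skb, hdr_len), cached_hdr, hdr_len);
	skb_reset_network_header(skb);
	skb_set_transport_header(skb, sizeof(struct iphdr));

	/* Fix up the per-packet fields (lengths, checksums) and fragment if
	 * the result exceeds the path MTU. */
	return ops->update_header(vport, mutable, dst, skb);
}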
index 0a7092f..be8fb53 100644 (file)
@@ -50,41 +50,49 @@ static int gre_hdr_len(const struct tnl_port_config *port_config)
        return len;
 }
 
-static struct sk_buff *gre_build_header(struct sk_buff *skb,
-                                       const struct vport *vport,
-                                       const struct tnl_mutable_config *mutable,
-                                       struct dst_entry *dst)
+static void gre_build_header(const struct vport *vport,
+                            const struct tnl_mutable_config *mutable,
+                            void *header)
 {
-       struct gre_base_hdr *greh = (struct gre_base_hdr *)skb_transport_header(skb);
-       __be32 *options = (__be32 *)(skb_network_header(skb) + mutable->tunnel_hlen
-                                              - GRE_HEADER_SECTION);
+       struct gre_base_hdr *greh = header;
+       __be32 *options = (__be32 *)(greh + 1);
 
        greh->protocol = htons(ETH_P_TEB);
        greh->flags = 0;
 
-       /* Work backwards over the options so the checksum is last. */
+       if (mutable->port_config.flags & TNL_F_CSUM) {
+               greh->flags |= GRE_CSUM;
+               *options = 0;
+               options++;
+       }
+
        if (mutable->port_config.out_key ||
-           mutable->port_config.flags & TNL_F_OUT_KEY_ACTION) {
+           mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)
                greh->flags |= GRE_KEY;
 
-               if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)
-                       *options = OVS_CB(skb)->tun_id;
-               else
-                       *options = mutable->port_config.out_key;
+       if (mutable->port_config.out_key)
+               *options = mutable->port_config.out_key;
+}
+
+static struct sk_buff *gre_update_header(const struct vport *vport,
+                                        const struct tnl_mutable_config *mutable,
+                                        struct dst_entry *dst,
+                                        struct sk_buff *skb)
+{
+       __be32 *options = (__be32 *)(skb_network_header(skb) + mutable->tunnel_hlen
+                                              - GRE_HEADER_SECTION);
 
+       /* Work backwards over the options so the checksum is last. */
+       if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION) {
+               *options = OVS_CB(skb)->tun_id;
                options--;
        }
 
-       if (mutable->port_config.flags & TNL_F_CSUM) {
-               greh->flags |= GRE_CSUM;
-
-               *options = 0;
+       if (mutable->port_config.flags & TNL_F_CSUM)
                *(__sum16 *)options = csum_fold(skb_checksum(skb,
-                                               sizeof(struct iphdr),
-                                               skb->len - sizeof(struct iphdr),
+                                               skb_transport_offset(skb),
+                                               skb->len - skb_transport_offset(skb),
                                                0));
-       }
-
        /*
         * Allow our local IP stack to fragment the outer packet even if the
         * DF bit is set as a last resort.
@@ -329,6 +337,7 @@ struct tnl_ops gre_tnl_ops = {
        .ipproto        = IPPROTO_GRE,
        .hdr_len        = gre_hdr_len,
        .build_header   = gre_build_header,
+       .update_header  = gre_update_header,
 };
 
 static struct vport *gre_create(const char *name, const void __user *config)
@@ -346,20 +355,14 @@ static int gre_init(void)
        int err;
 
        err = inet_add_protocol(&gre_protocol_handlers, IPPROTO_GRE);
-       if (err) {
+       if (err)
                pr_warn("cannot register gre protocol handler\n");
-               goto out;
-       }
-
-       err = tnl_init();
 
-out:
        return err;
 }
 
 static void gre_exit(void)
 {
-       tnl_exit();
        inet_del_protocol(&gre_protocol_handlers, IPPROTO_GRE);
 }
 
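
The GRE split works the same way: gre_build_header() writes the invariant part of
the header (flags, a zeroed checksum slot, and a configured out_key), while
gre_update_header() walks backwards from the last option word so that a per-flow
key (TNL_F_OUT_KEY_ACTION) is written before the checksum, which can only be
computed once the rest of the packet is final.  For reference, the field order
with both GRE_CSUM and GRE_KEY set follows RFC 2784/2890; the struct below is
purely illustrative, since the code addresses these words through raw __be32
pointers:

struct gre_full_hdr {			/* illustrative layout only */
	struct gre_base_hdr base;	/* flags + protocol; static, cacheable */
	__be16 csum;			/* per packet; filled last by update_header() */
	__be16 reserved1;
	__be32 key;			/* out_key (static) or per-flow tun_id */
};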
index 3737975..dd700d0 100644 (file)
@@ -48,6 +48,7 @@
 #define TNL_F_TOS_INHERIT      (1 << 4) /* Inherit the ToS from the inner packet. */
 #define TNL_F_TTL_INHERIT      (1 << 5) /* Inherit the TTL from the inner packet. */
 #define TNL_F_PMTUD            (1 << 6) /* Enable path MTU discovery. */
+#define TNL_F_HDR_CACHE                (1 << 7) /* Enable tunnel header caching. */
 
 struct tnl_port_config {
        __u32   flags;