datapath: Add support for tun_key to Open vSwitch datapath
[sliver-openvswitch.git] / datapath / tunnel.c
index d651c11..020d9d4 100644
@@ -367,9 +367,9 @@ struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
        return NULL;
 }
 
-static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
+static void ecn_decapsulate(struct sk_buff *skb)
 {
-       if (unlikely(INET_ECN_is_ce(tos))) {
+       if (unlikely(INET_ECN_is_ce(OVS_CB(skb)->tun_key->ipv4_tos))) {
                __be16 protocol = skb->protocol;
 
                skb_set_network_header(skb, ETH_HLEN);
@@ -416,7 +416,7 @@ static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
  * - skb->csum does not include the inner Ethernet header.
  * - The layer pointers are undefined.
  */
-void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos)
+void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb)
 {
        struct ethhdr *eh;
 
@@ -433,7 +433,7 @@ void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos)
        skb_clear_rxhash(skb);
        secpath_reset(skb);
 
-       ecn_decapsulate(skb, tos);
+       ecn_decapsulate(skb);
        vlan_set_tci(skb, 0);
 
        if (unlikely(compute_ip_summed(skb, false))) {
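
With the ToS argument gone, the receive path relies on OVS_CB(skb)->tun_key->ipv4_tos carrying the outer header's ToS byte. Below is a minimal, self-contained sketch of that check, using simplified stand-in types rather than the real datapath structures (the ECN constants mirror include/net/inet_ecn.h):

#include <stdint.h>
#include <stdio.h>

/* Same values as the kernel's include/net/inet_ecn.h. */
#define INET_ECN_MASK 3
#define INET_ECN_CE   3

/* Stand-in for the fields of struct ovs_key_ipv4_tunnel used here. */
struct tun_key_sketch {
	uint32_t ipv4_src;
	uint32_t ipv4_dst;
	uint8_t  ipv4_tos;
	uint8_t  ipv4_ttl;
};

/* Stand-in for the OVS_CB(skb) control block. */
struct cb_sketch {
	struct tun_key_sketch *tun_key;
};

/* Mirrors the test at the top of ecn_decapsulate() after this patch:
 * the outer ToS now comes from the per-packet tunnel key. */
static int outer_header_is_ce(const struct cb_sketch *cb)
{
	return (cb->tun_key->ipv4_tos & INET_ECN_MASK) == INET_ECN_CE;
}

int main(void)
{
	struct tun_key_sketch key = { .ipv4_tos = 0x03 };  /* CE marked */
	struct cb_sketch cb = { .tun_key = &key };

	printf("needs ECN fixup: %d\n", outer_header_is_ce(&cb));
	return 0;
}
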
@@ -613,7 +613,7 @@ static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
 
 bool ovs_tnl_frag_needed(struct vport *vport,
                         const struct tnl_mutable_config *mutable,
-                        struct sk_buff *skb, unsigned int mtu, __be64 flow_key)
+                        struct sk_buff *skb, unsigned int mtu)
 {
        unsigned int eth_hdr_len = ETH_HLEN;
        unsigned int total_length = 0, header_length = 0, payload_length;
@@ -697,17 +697,6 @@ bool ovs_tnl_frag_needed(struct vport *vport,
                ipv6_build_icmp(skb, nskb, mtu, payload_length);
 #endif
 
-       /*
-        * Assume that flow based keys are symmetric with respect to input
-        * and output and use the key that we were going to put on the
-        * outgoing packet for the fake received packet.  If the keys are
-        * not symmetric then PMTUD needs to be disabled since we won't have
-        * any way of synthesizing packets.
-        */
-       if ((mutable->flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
-           (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
-               OVS_CB(nskb)->tun_id = flow_key;
-
        if (unlikely(compute_ip_summed(nskb, false))) {
                kfree_skb(nskb);
                return false;
@@ -721,14 +710,26 @@ bool ovs_tnl_frag_needed(struct vport *vport,
 static bool check_mtu(struct sk_buff *skb,
                      struct vport *vport,
                      const struct tnl_mutable_config *mutable,
-                     const struct rtable *rt, __be16 *frag_offp)
+                     const struct rtable *rt, __be16 *frag_offp,
+                     int tunnel_hlen)
 {
-       bool df_inherit = mutable->flags & TNL_F_DF_INHERIT;
-       bool pmtud = mutable->flags & TNL_F_PMTUD;
-       __be16 frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
+       bool df_inherit;
+       bool pmtud;
+       __be16 frag_off;
        int mtu = 0;
        unsigned int packet_length = skb->len - ETH_HLEN;
 
+       if (OVS_CB(skb)->tun_key->ipv4_dst) {
+               df_inherit = false;
+               pmtud = false;
+               frag_off = OVS_CB(skb)->tun_key->tun_flags & OVS_FLOW_TNL_F_DONT_FRAGMENT ?
+                                 htons(IP_DF) : 0;
+       } else {
+               df_inherit = mutable->flags & TNL_F_DF_INHERIT;
+               pmtud = mutable->flags & TNL_F_PMTUD;
+               frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
+       }
+
        /* Allow for one level of tagging in the packet length. */
        if (!vlan_tx_tag_present(skb) &&
            eth_hdr(skb)->h_proto == htons(ETH_P_8021Q))
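
check_mtu() now picks its fragmentation policy per packet: a flow-supplied tunnel key (signalled by a non-zero ipv4_dst) bypasses DF inheritance and PMTUD and honours only OVS_FLOW_TNL_F_DONT_FRAGMENT, while packets without a key keep the per-port TNL_F_* behaviour. A standalone sketch of that branch, with simplified types and placeholder flag values (not the real datapath ABI):

#include <stdbool.h>
#include <stdint.h>

/* Placeholder flag values for the sketch; the real TNL_F_* and
 * OVS_FLOW_TNL_F_* definitions live in the datapath headers. */
#define SK_TNL_F_DONT_FRAGMENT (1u << 0)
#define SK_TNL_F_DF_INHERIT    (1u << 1)
#define SK_TNL_F_DF_DEFAULT    (1u << 2)
#define SK_TNL_F_PMTUD         (1u << 3)

struct frag_policy {
	bool df_inherit;
	bool pmtud;
	bool set_df;   /* the kernel code stores this as htons(IP_DF) in frag_off */
};

/* tun_ipv4_dst != 0 means the flow supplied a tunnel key. */
static struct frag_policy pick_frag_policy(uint32_t tun_ipv4_dst,
					   uint32_t tun_key_flags,
					   uint32_t port_flags)
{
	struct frag_policy p;

	if (tun_ipv4_dst) {
		/* Flow-based tunnel: the key alone decides DF, and the
		 * port-level DF-inherit/PMTUD machinery is bypassed. */
		p.df_inherit = false;
		p.pmtud = false;
		p.set_df = tun_key_flags & SK_TNL_F_DONT_FRAGMENT;
	} else {
		/* Legacy per-port configuration. */
		p.df_inherit = port_flags & SK_TNL_F_DF_INHERIT;
		p.pmtud = port_flags & SK_TNL_F_PMTUD;
		p.set_df = port_flags & SK_TNL_F_DF_DEFAULT;
	}
	return p;
}
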
@@ -746,7 +747,7 @@ static bool check_mtu(struct sk_buff *skb,
 
                mtu = dst_mtu(&rt_dst(rt))
                        - ETH_HLEN
-                       - mutable->tunnel_hlen
+                       - tunnel_hlen
                        - vlan_header;
        }
 
@@ -760,8 +761,7 @@ static bool check_mtu(struct sk_buff *skb,
                        mtu = max(mtu, IP_MIN_MTU);
 
                        if (packet_length > mtu &&
-                           ovs_tnl_frag_needed(vport, mutable, skb, mtu,
-                                               OVS_CB(skb)->tun_id))
+                           ovs_tnl_frag_needed(vport, mutable, skb, mtu))
                                return false;
                }
        }
@@ -777,8 +777,7 @@ static bool check_mtu(struct sk_buff *skb,
                        mtu = max(mtu, IPV6_MIN_MTU);
 
                        if (packet_length > mtu &&
-                           ovs_tnl_frag_needed(vport, mutable, skb, mtu,
-                                               OVS_CB(skb)->tun_id))
+                           ovs_tnl_frag_needed(vport, mutable, skb, mtu))
                                return false;
                }
        }
@@ -790,6 +789,7 @@ static bool check_mtu(struct sk_buff *skb,
 
 static void create_tunnel_header(const struct vport *vport,
                                 const struct tnl_mutable_config *mutable,
+                                const struct ovs_key_ipv4_tunnel *tun_key,
                                 const struct rtable *rt, void *header)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
@@ -806,7 +806,7 @@ static void create_tunnel_header(const struct vport *vport,
        if (!iph->ttl)
                iph->ttl = ip4_dst_hoplimit(&rt_dst(rt));
 
-       tnl_vport->tnl_ops->build_header(vport, mutable, iph + 1);
+       tnl_vport->tnl_ops->build_header(vport, mutable, tun_key, iph + 1);
 }
 
 static void *get_cached_header(const struct tnl_cache *cache)
@@ -907,14 +907,22 @@ static struct tnl_cache *build_cache(struct vport *vport,
                                     struct rtable *rt)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       static const struct ovs_key_ipv4_tunnel tun_key;
        struct tnl_cache *cache;
        void *cache_data;
        int cache_len;
        struct hh_cache *hh;
+       int tunnel_hlen;
 
        if (!(mutable->flags & TNL_F_HDR_CACHE))
                return NULL;
 
+       tunnel_hlen = tnl_vport->tnl_ops->hdr_len(mutable, &tun_key);
+       if (tunnel_hlen < 0)
+               return NULL;
+
+       tunnel_hlen += sizeof(struct iphdr);
+
        /*
         * If there is no entry in the ARP cache or if this device does not
         * support hard header caching just fall back to the IP stack.
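
Since mutable->tunnel_hlen is removed, build_cache() computes the header length on the spot: the protocol's hdr_len() callback reports the encapsulation size for this tunnel key (negative on error), and the caller adds the outer IP header. A rough standalone sketch of that arithmetic, with a hypothetical GRE-like callback for illustration only:

#include <stddef.h>
#include <stdint.h>

struct iphdr_sketch { uint8_t bytes[20]; };   /* stand-in for struct iphdr */

struct tun_key_sketch {
	uint64_t tun_id;
	uint32_t ipv4_dst;
};

/* Hypothetical per-protocol hdr_len() callback: something GRE-like,
 * 4 bytes of base header plus 4 more when a key is carried. */
static int example_hdr_len(const struct tun_key_sketch *key)
{
	return 4 + (key->tun_id ? 4 : 0);
}

/* Mirrors the pattern in build_cache() and ovs_tnl_send(): a negative
 * callback result aborts, otherwise add the outer IP header. */
static int total_tunnel_hlen(const struct tun_key_sketch *key)
{
	int hlen = example_hdr_len(key);

	if (hlen < 0)
		return hlen;
	return hlen + (int)sizeof(struct iphdr_sketch);
}
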
@@ -937,7 +945,7 @@ static struct tnl_cache *build_cache(struct vport *vport,
        else
                cache = NULL;
 
-       cache_len = LL_RESERVED_SPACE(rt_dst(rt).dev) + mutable->tunnel_hlen;
+       cache_len = LL_RESERVED_SPACE(rt_dst(rt).dev) + tunnel_hlen;
 
        cache = kzalloc(ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN) +
                        cache_len, GFP_ATOMIC);
@@ -946,9 +954,9 @@ static struct tnl_cache *build_cache(struct vport *vport,
 
        create_eth_hdr(cache, hh);
        cache_data = get_cached_header(cache) + cache->hh_len;
-       cache->len = cache->hh_len + mutable->tunnel_hlen;
+       cache->len = cache->hh_len + tunnel_hlen;
 
-       create_tunnel_header(vport, mutable, rt, cache_data);
+       create_tunnel_header(vport, mutable, &tun_key, rt, cache_data);
 
        cache->mutable_seq = mutable->seq;
        cache->rt = rt;
@@ -1000,15 +1008,16 @@ unlock:
 }
 
 static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
-                                  u8 ipproto, u8 tos)
+                                  __be32 saddr, __be32 daddr, u8 ipproto,
+                                  u8 tos)
 {
        /* Tunnel configuration keeps the DSCP part of the TOS bits, but the
         * Linux routing code expects RT_TOS bits only. */
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
        struct flowi fl = { .nl_u = { .ip4_u = {
-                                       .daddr = mutable->key.daddr,
-                                       .saddr = mutable->key.saddr,
+                                       .daddr = daddr,
+                                       .saddr = saddr,
                                        .tos   = RT_TOS(tos) } },
                                        .proto = ipproto };
        struct rtable *rt;
@@ -1018,8 +1027,8 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
 
        return rt;
 #else
-       struct flowi4 fl = { .daddr = mutable->key.daddr,
-                            .saddr = mutable->key.saddr,
+       struct flowi4 fl = { .daddr = daddr,
+                            .saddr = saddr,
                             .flowi4_tos = RT_TOS(tos),
                             .flowi4_proto = ipproto };
 
@@ -1029,7 +1038,8 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
 
 static struct rtable *find_route(struct vport *vport,
                                 const struct tnl_mutable_config *mutable,
-                                u8 tos, struct tnl_cache **cache)
+                                __be32 saddr, __be32 daddr, u8 tos,
+                                struct tnl_cache **cache)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache);
@@ -1037,17 +1047,17 @@ static struct rtable *find_route(struct vport *vport,
        *cache = NULL;
        tos = RT_TOS(tos);
 
-       if (likely(tos == RT_TOS(mutable->tos) &&
-           check_cache_valid(cur_cache, mutable))) {
+       if (tos == RT_TOS(mutable->tos) &&
+           check_cache_valid(cur_cache, mutable)) {
                *cache = cur_cache;
                return cur_cache->rt;
        } else {
                struct rtable *rt;
 
-               rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto, tos);
+               rt = __find_route(mutable, saddr, daddr,
+                                 tnl_vport->tnl_ops->ipproto, tos);
                if (IS_ERR(rt))
                        return NULL;
-
                if (likely(tos == RT_TOS(mutable->tos)))
                        *cache = build_cache(vport, mutable, rt);
 
@@ -1076,13 +1086,14 @@ static bool need_linearize(const struct sk_buff *skb)
 
 static struct sk_buff *handle_offloads(struct sk_buff *skb,
                                       const struct tnl_mutable_config *mutable,
-                                      const struct rtable *rt)
+                                      const struct rtable *rt,
+                                      int tunnel_hlen)
 {
        int min_headroom;
        int err;
 
        min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
-                       + mutable->tunnel_hlen
+                       + tunnel_hlen
                        + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
 
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
@@ -1137,14 +1148,14 @@ error:
 }
 
 static int send_frags(struct sk_buff *skb,
-                     const struct tnl_mutable_config *mutable)
+                     int tunnel_hlen)
 {
        int sent_len;
 
        sent_len = 0;
        while (skb) {
                struct sk_buff *next = skb->next;
-               int frag_len = skb->len - mutable->tunnel_hlen;
+               int frag_len = skb->len - tunnel_hlen;
                int err;
 
                skb->next = NULL;
@@ -1173,15 +1184,17 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
-
        enum vport_err_type err = VPORT_E_TX_ERROR;
        struct rtable *rt;
        struct dst_entry *unattached_dst = NULL;
        struct tnl_cache *cache;
+       struct ovs_key_ipv4_tunnel tun_key;
        int sent_len = 0;
+       int tunnel_hlen;
        __be16 frag_off = 0;
+       __be32 daddr;
+       __be32 saddr;
        u8 ttl;
-       u8 inner_tos;
        u8 tos;
 
        /* Validate the protocol headers before we try to use them. */
@@ -1207,30 +1220,68 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
        }
 #endif
 
-       /* ToS */
-       if (skb->protocol == htons(ETH_P_IP))
-               inner_tos = ip_hdr(skb)->tos;
+       /* If the flow did not supply a tunnel key, zero out the local tun_key
+        * and point OVS_CB(skb)->tun_key at it.
+        */
+       if (!OVS_CB(skb)->tun_key) {
+               memset(&tun_key, 0, sizeof(tun_key));
+               OVS_CB(skb)->tun_key = &tun_key;
+       }
+
+       tunnel_hlen = tnl_vport->tnl_ops->hdr_len(mutable, OVS_CB(skb)->tun_key);
+       if (unlikely(tunnel_hlen < 0)) {
+               err = VPORT_E_TX_DROPPED;
+               goto error_free;
+       }
+       tunnel_hlen += sizeof(struct iphdr);
+
+       if (OVS_CB(skb)->tun_key->ipv4_dst) {
+               daddr = OVS_CB(skb)->tun_key->ipv4_dst;
+               saddr = OVS_CB(skb)->tun_key->ipv4_src;
+               tos = OVS_CB(skb)->tun_key->ipv4_tos;
+               ttl = OVS_CB(skb)->tun_key->ipv4_ttl;
+       } else {
+               u8 inner_tos;
+               daddr = mutable->key.daddr;
+               saddr = mutable->key.saddr;
+
+               /* ToS */
+               if (skb->protocol == htons(ETH_P_IP))
+                       inner_tos = ip_hdr(skb)->tos;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       else if (skb->protocol == htons(ETH_P_IPV6))
-               inner_tos = ipv6_get_dsfield(ipv6_hdr(skb));
+               else if (skb->protocol == htons(ETH_P_IPV6))
+                       inner_tos = ipv6_get_dsfield(ipv6_hdr(skb));
 #endif
-       else
-               inner_tos = 0;
+               else
+                       inner_tos = 0;
 
-       if (mutable->flags & TNL_F_TOS_INHERIT)
-               tos = inner_tos;
-       else
-               tos = mutable->tos;
+               if (mutable->flags & TNL_F_TOS_INHERIT)
+                       tos = inner_tos;
+               else
+                       tos = mutable->tos;
+
+               tos = INET_ECN_encapsulate(tos, inner_tos);
+
+               /* TTL */
+               ttl = mutable->ttl;
+               if (mutable->flags & TNL_F_TTL_INHERIT) {
+                       if (skb->protocol == htons(ETH_P_IP))
+                               ttl = ip_hdr(skb)->ttl;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+                       else if (skb->protocol == htons(ETH_P_IPV6))
+                               ttl = ipv6_hdr(skb)->hop_limit;
+#endif
+               }
+
+       }
 
        /* Route lookup */
-       rt = find_route(vport, mutable, tos, &cache);
+       rt = find_route(vport, mutable, saddr, daddr, tos, &cache);
        if (unlikely(!rt))
                goto error_free;
        if (unlikely(!cache))
                unattached_dst = &rt_dst(rt);
 
-       tos = INET_ECN_encapsulate(tos, inner_tos);
-
        /* Reset SKB */
        nf_reset(skb);
        secpath_reset(skb);
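
The send path now prefers the flow's tunnel key for the outer addresses, ToS, and TTL, and only falls back to the per-port mutable configuration when no key was supplied; the NULL case is handled by pointing OVS_CB(skb)->tun_key at a zeroed local key. A simplified standalone sketch of that selection, with stand-in types and the TOS/TTL inheritance flags left out:

#include <stdint.h>
#include <string.h>

struct tun_key_sketch {
	uint32_t ipv4_src, ipv4_dst;
	uint8_t  ipv4_tos, ipv4_ttl;
};

struct port_cfg_sketch {          /* stands in for tnl_mutable_config */
	uint32_t saddr, daddr;
	uint8_t  tos, ttl;
};

struct outer_params {
	uint32_t saddr, daddr;
	uint8_t  tos, ttl;
};

static struct outer_params
pick_outer_params(struct tun_key_sketch **cb_tun_key,
		  struct tun_key_sketch *local_key,
		  const struct port_cfg_sketch *cfg)
{
	struct outer_params out;

	/* Mirror of the new NULL check in ovs_tnl_send(): make sure the
	 * rest of the send path always has a key to read, even if it is
	 * all zeroes. */
	if (!*cb_tun_key) {
		memset(local_key, 0, sizeof(*local_key));
		*cb_tun_key = local_key;
	}

	if ((*cb_tun_key)->ipv4_dst) {
		/* Flow-supplied key wins outright. */
		out.saddr = (*cb_tun_key)->ipv4_src;
		out.daddr = (*cb_tun_key)->ipv4_dst;
		out.tos   = (*cb_tun_key)->ipv4_tos;
		out.ttl   = (*cb_tun_key)->ipv4_ttl;
	} else {
		/* Per-port fallback; TOS/TTL inheritance and the ECN
		 * encapsulation step are omitted from this sketch. */
		out.saddr = cfg->saddr;
		out.daddr = cfg->daddr;
		out.tos   = cfg->tos;
		out.ttl   = cfg->ttl;
	}
	return out;
}
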
@@ -1238,12 +1289,12 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
        skb_clear_rxhash(skb);
 
        /* Offloading */
-       skb = handle_offloads(skb, mutable, rt);
+       skb = handle_offloads(skb, mutable, rt, tunnel_hlen);
        if (IS_ERR(skb))
                goto error;
 
        /* MTU */
-       if (unlikely(!check_mtu(skb, vport, mutable, rt, &frag_off))) {
+       if (unlikely(!check_mtu(skb, vport, mutable, rt, &frag_off, tunnel_hlen))) {
                err = VPORT_E_TX_DROPPED;
                goto error_free;
        }
@@ -1252,25 +1303,19 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
         * If we are over the MTU, allow the IP stack to handle fragmentation.
         * Fragmentation is a slow path anyways.
         */
-       if (unlikely(skb->len + mutable->tunnel_hlen > dst_mtu(&rt_dst(rt)) &&
+       if (unlikely(skb->len + tunnel_hlen > dst_mtu(&rt_dst(rt)) &&
                     cache)) {
                unattached_dst = &rt_dst(rt);
                dst_hold(unattached_dst);
                cache = NULL;
        }
 
-       /* TTL */
-       ttl = mutable->ttl;
-       if (!ttl)
-               ttl = ip4_dst_hoplimit(&rt_dst(rt));
-
-       if (mutable->flags & TNL_F_TTL_INHERIT) {
-               if (skb->protocol == htons(ETH_P_IP))
-                       ttl = ip_hdr(skb)->ttl;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-               else if (skb->protocol == htons(ETH_P_IPV6))
-                       ttl = ipv6_hdr(skb)->hop_limit;
-#endif
+       /* TTL Fixup. */
+       if (!OVS_CB(skb)->tun_key->ipv4_dst) {
+               if (!(mutable->flags & TNL_F_TTL_INHERIT)) {
+                       if (!ttl)
+                               ttl = ip4_dst_hoplimit(&rt_dst(rt));
+               }
        }
 
        while (skb) {
@@ -1288,8 +1333,8 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
                        skb_set_network_header(skb, cache->hh_len);
 
                } else {
-                       skb_push(skb, mutable->tunnel_hlen);
-                       create_tunnel_header(vport, mutable, rt, skb->data);
+                       skb_push(skb, tunnel_hlen);
+                       create_tunnel_header(vport, mutable, OVS_CB(skb)->tun_key, rt, skb->data);
                        skb_reset_network_header(skb);
 
                        if (next_skb)
@@ -1308,7 +1353,7 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
                ip_select_ident(iph, &rt_dst(rt), NULL);
 
                skb = tnl_vport->tnl_ops->update_header(vport, mutable,
-                                                       &rt_dst(rt), skb);
+                                                       &rt_dst(rt), skb, tunnel_hlen);
                if (unlikely(!skb))
                        goto next;
 
@@ -1341,7 +1386,7 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
                                        sent_len += orig_len;
                        }
                } else
-                       sent_len += send_frags(skb, mutable);
+                       sent_len += send_frags(skb, tunnel_hlen);
 
 next:
                skb = next_skb;
@@ -1427,12 +1472,6 @@ static int tnl_set_config(struct net *net, struct nlattr *options,
        else
                mutable->out_key = nla_get_be64(a[OVS_TUNNEL_ATTR_OUT_KEY]);
 
-       mutable->tunnel_hlen = tnl_ops->hdr_len(mutable);
-       if (mutable->tunnel_hlen < 0)
-               return mutable->tunnel_hlen;
-
-       mutable->tunnel_hlen += sizeof(struct iphdr);
-
        old_vport = port_table_lookup(&mutable->key, &old_mutable);
        if (old_vport && old_vport != cur_vport)
                return -EEXIST;
@@ -1442,7 +1481,8 @@ static int tnl_set_config(struct net *net, struct nlattr *options,
                struct net_device *dev;
                struct rtable *rt;
 
-               rt = __find_route(mutable, tnl_ops->ipproto, mutable->tos);
+               rt = __find_route(mutable, mutable->key.saddr, mutable->key.daddr,
+                                 tnl_ops->ipproto, mutable->tos);
                if (IS_ERR(rt))
                        return -EADDRNOTAVAIL;
                dev = rt_dst(rt).dev;