1.9.x 2.6.18 to 3.8
1.10.x 2.6.18 to 3.8
1.11.x 2.6.18 to 3.8
- 1.12.x 2.6.18 to 3.9
+ 1.12.x 2.6.18 to 3.10
Open vSwitch userspace should also work with the Linux kernel module
built into Linux 3.3 and later.
* New support for matching outer source and destination IP address
of tunneled packets, for tunnel ports configured with the newly
added "remote_ip=flow" and "local_ip=flow" options.
+ * Support for matching on metadata 'pkt_mark' for interacting with
+ other system components. On Linux this corresponds to the skb
+ mark.
- The Interface table in the database has a new "ifindex" column to
report the interface's OS-assigned ifindex.
- New "check-oftest" Makefile target for running OFTest against Open
through database paths (e.g., the private key option with the database name
should look like "--private-key=db:Open_vSwitch,SSL,private_key").
- Added ovs-dev.py, a utility script helpful for Open vSwitch developers.
- - Support for Linux kernels up to 3.9
+ - Support for Linux kernels up to 3.10
- ovs-ofctl:
* New "ofp-parse" for printing OpenFlow messages read from a file.
OVS_GREP_IFELSE([$KSRC/include/net/checksum.h], [csum_replace4])
OVS_GREP_IFELSE([$KSRC/include/net/checksum.h], [csum_unfold])
+ OVS_GREP_IFELSE([$KSRC/include/net/genetlink.h], [parallel_ops])
OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [NLA_NUL_STRING])
OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_get_be16])
OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_put_be16])
datapath.c \
dp_notify.c \
flow.c \
- tunnel.c \
vlan.c \
vport.c \
vport-gre.c \
compat.h \
datapath.h \
flow.h \
- tunnel.h \
vlan.h \
vport.h \
vport-internal_dev.h \
if (unlikely(err))
return err;
- __vlan_hwaccel_put_tag(skb, ntohs(tci));
+ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci));
return 0;
}
/* push down current VLAN tag */
current_tag = vlan_tx_tag_get(skb);
- if (!__vlan_put_tag(skb, current_tag))
+ if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
return -ENOMEM;
if (get_ip_summed(skb) == OVS_CSUM_COMPLETE)
+ (2 * ETH_ALEN), VLAN_HLEN, 0));
}
- __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+ __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
return 0;
}
#ifndef COMPAT_H
#define COMPAT_H 1
+#include <linux/in.h>
+#include <linux/in_route.h>
#include <linux/netlink.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
#ifndef HAVE_NLA_NUL_STRING
static inline int CHECK_NUL_STRING(struct nlattr *attr, int maxlen)
#define SET_NETNSOK .netnsok = true,
#endif
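+/* Linux 3.10 added .parallel_ops to struct genl_family, which lets
+ * Generic Netlink dispatch a family's operations without serializing
+ * them under the global genl_mutex.  configure probes for it (the
+ * "parallel_ops" grep in acinclude.m4 earlier in this series), so
+ * SET_PARALLEL_OPS expands to nothing on older kernels. */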
+#ifdef HAVE_PARALLEL_OPS
+#define SET_PARALLEL_OPS .parallel_ops = true,
+#else
+#define SET_PARALLEL_OPS
+#endif
+
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
#ifdef CONFIG_NETFILTER
static inline u32 skb_get_mark(struct sk_buff *skb)
#define inet_sport(sk) (inet_sk(sk)->inet_sport)
#endif
+static inline struct rtable *find_route(struct net *net,
+ __be32 *saddr, __be32 daddr,
+ u8 ipproto, u8 tos, u32 skb_mark)
+{
+ struct rtable *rt;
+    /* Tunnel configuration keeps the DSCP part of the TOS bits, but the
+     * Linux routing code expects only the RT_TOS bits. */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+ struct flowi fl = { .nl_u = { .ip4_u = {
+ .daddr = daddr,
+ .saddr = *saddr,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+ .fwmark = skb_mark,
+#endif
+ .tos = RT_TOS(tos) } },
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+ .mark = skb_mark,
+#endif
+ .proto = ipproto };
+
+ if (unlikely(ip_route_output_key(net, &rt, &fl)))
+ return ERR_PTR(-EADDRNOTAVAIL);
+ *saddr = fl.nl_u.ip4_u.saddr;
+ return rt;
+#else
+ struct flowi4 fl = { .daddr = daddr,
+ .saddr = *saddr,
+ .flowi4_tos = RT_TOS(tos),
+ .flowi4_mark = skb_mark,
+ .flowi4_proto = ipproto };
+
+ rt = ip_route_output_key(net, &fl);
+ *saddr = fl.saddr;
+ return rt;
+#endif
+}
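+
+/* Call pattern (as used by the tunnel send path later in this series):
+ *
+ *     __be32 saddr = OVS_CB(skb)->tun_key->ipv4_src;
+ *     struct rtable *rt;
+ *
+ *     rt = find_route(net, &saddr, OVS_CB(skb)->tun_key->ipv4_dst,
+ *                     ipproto, OVS_CB(skb)->tun_key->ipv4_tos,
+ *                     skb_get_mark(skb));
+ *     if (IS_ERR(rt))
+ *             return PTR_ERR(rt);
+ *
+ * Both kernel branches return an ERR_PTR() on failure, so callers must
+ * check IS_ERR() rather than NULL; the chosen source address is written
+ * back through 'saddr'. */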
#endif /* compat.h */
#include "datapath.h"
#include "flow.h"
#include "vlan.h"
-#include "tunnel.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) || \
- LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+ LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
-#error Kernels before 2.6.18 or after 3.9 are not supported by this version of Open vSwitch.
+#error Kernels before 2.6.18 or after 3.10 are not supported by this version of Open vSwitch.
#endif
.version = OVS_PACKET_VERSION,
.maxattr = OVS_PACKET_ATTR_MAX,
SET_NETNSOK
+ SET_PARALLEL_OPS
};
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
.version = OVS_FLOW_VERSION,
.maxattr = OVS_FLOW_ATTR_MAX,
SET_NETNSOK
+ SET_PARALLEL_OPS
};
static struct genl_multicast_group ovs_dp_flow_multicast_group = {
.version = OVS_DATAPATH_VERSION,
.maxattr = OVS_DP_ATTR_MAX,
SET_NETNSOK
+ SET_PARALLEL_OPS
};
static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
.version = OVS_VPORT_VERSION,
.maxattr = OVS_VPORT_ATTR_MAX,
SET_NETNSOK
+ SET_PARALLEL_OPS
};
struct genl_multicast_group ovs_dp_vport_multicast_group = {
#include "checksum.h"
#include "compat.h"
#include "flow.h"
-#include "tunnel.h"
#include "vlan.h"
#include "vport.h"
#include <linux/netdevice.h>
#include <net/genetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include "datapath.h"
#include "vport-internal_dev.h"
u8 ipv4_ttl;
};
+static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
+ const struct iphdr *iph, __be64 tun_id,
+ __be16 tun_flags)
+{
+ tun_key->tun_id = tun_id;
+ tun_key->ipv4_src = iph->saddr;
+ tun_key->ipv4_dst = iph->daddr;
+ tun_key->ipv4_tos = iph->tos;
+ tun_key->ipv4_ttl = iph->ttl;
+ tun_key->tun_flags = tun_flags;
+
+    /* Clear struct padding; the flow key is hashed and compared
+     * bytewise, so uninitialized padding would corrupt lookups. */
+ memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
+ sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
+}
+
struct sw_flow_key {
struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
struct {
linux/compat/include/linux/mutex.h \
linux/compat/include/linux/net.h \
linux/compat/include/linux/netdevice.h \
+ linux/compat/include/linux/netdev_features.h \
linux/compat/include/linux/netfilter_bridge.h \
linux/compat/include/linux/netfilter_ipv4.h \
linux/compat/include/linux/netlink.h \
#include <linux/version.h>
#include_next <linux/if_vlan.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
/*
* The behavior of __vlan_put_tag() has changed over time:
*
* to avoid the need to guess whether the version in the kernel tree is
* acceptable.
*/
-#define __vlan_put_tag rpl_vlan_put_tag
-static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
+#define __vlan_put_tag(skb, proto, tag) rpl__vlan_put_tag(skb, tag)
+
+static inline struct sk_buff *rpl__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
{
struct vlan_ethhdr *veth;
return skb;
}
+static inline struct sk_buff *rpl___vlan_hwaccel_put_tag(struct sk_buff *skb,
+ __be16 vlan_proto,
+ u16 vlan_tci)
+{
+ return __vlan_hwaccel_put_tag(skb, vlan_tci);
+}
+
+#define __vlan_hwaccel_put_tag rpl___vlan_hwaccel_put_tag
+
+#endif
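+/* (Linux 3.10 added a VLAN protocol argument to __vlan_put_tag() and
+ * __vlan_hwaccel_put_tag().  The rpl_ wrappers above accept and discard
+ * that argument on older kernels, so every call site can pass the
+ * protocol unconditionally.) */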
/* All of these were introduced in a single commit preceding 2.6.33, so
* presumably all of them or none of them are present. */
--- /dev/null
+#ifndef __LINUX_NETDEV_FEATURES_WRAPPER_H
+#define __LINUX_NETDEV_FEATURES_WRAPPER_H
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+#include_next <linux/netdev_features.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#define NETIF_F_HW_VLAN_CTAG_TX NETIF_F_HW_VLAN_TX
+#endif
+
+#endif
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0)
+
+/* XEN dom0 networking assumes that dev->master is a bond device, and it
+ * tries to access the bond private structure through the dev->master
+ * pointer on the receive path, which causes a panic.  Therefore it is
+ * better not to backport this API. */
static inline int netdev_master_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev)
{
- return netdev_set_master(dev, upper_dev);
+ return 0;
}
static inline void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
- netdev_set_master(dev, NULL);
}
#endif
#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct gre_base_hdr {
__be16 flags;
__be16 protocol;
};
#define GRE_HEADER_SECTION 4
-#define MAX_GRE_PROTO_PRIORITY 255
-struct gre_cisco_protocol {
- int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
- u8 priority;
-};
-
-#define gre_build_header rpl_gre_build_header
-void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
- int hdr_len);
-
-#define gre_handle_offloads rpl_gre_handle_offloads
-struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
-
-int gre_cisco_register(struct gre_cisco_protocol *proto);
-int gre_cisco_unregister(struct gre_cisco_protocol *proto);
-
-static inline int ip_gre_calc_hlen(__be16 o_flags)
-{
- int addend = 4;
-
- if (o_flags & TUNNEL_CSUM)
- addend += 4;
- if (o_flags & TUNNEL_KEY)
- addend += 4;
- if (o_flags & TUNNEL_SEQ)
- addend += 4;
- return addend;
-}
-
static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
{
__be16 tflags = 0;
return flags;
}
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) */
+
+#define MAX_GRE_PROTO_PRIORITY 255
+struct gre_cisco_protocol {
+ int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
+ u8 priority;
+};
+
+int gre_cisco_register(struct gre_cisco_protocol *proto);
+int gre_cisco_unregister(struct gre_cisco_protocol *proto);
+
+#define gre_build_header rpl_gre_build_header
+void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len);
+
+#define gre_handle_offloads rpl_gre_handle_offloads
+struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
+
+static inline int ip_gre_calc_hlen(__be16 o_flags)
+{
+ int addend = 4;
+
+ if (o_flags & TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags & TUNNEL_KEY)
+ addend += 4;
+ if (o_flags & TUNNEL_SEQ)
+ addend += 4;
+ return addend;
+}
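+/* (For example, flags of TUNNEL_CSUM|TUNNEL_KEY|TUNNEL_SEQ yield a GRE
+ * header of 4 + 4 + 4 + 4 = 16 bytes: the 4-byte base header plus one
+ * GRE_HEADER_SECTION per option.) */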
+
#endif
#define PACKET_RCVD 0
#define PACKET_REJECT 1
-static inline void tunnel_ip_select_ident(struct sk_buff *skb,
- const struct iphdr *old_iph,
- struct dst_entry *dst)
-{
- struct iphdr *iph = ip_hdr(skb);
-
- /* Use inner packet iph-id if possible. */
- if (skb->protocol == htons(ETH_P_IP) && old_iph->id)
- iph->id = old_iph->id;
- else
- __ip_select_ident(iph, dst,
- (skb_shinfo(skb)->gso_segs ?: 1) - 1);
-}
-
int iptunnel_xmit(struct net *net, struct rtable *rt,
struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- tunnel_ip_select_ident(skb,
- (const struct iphdr *)skb_inner_network_header(skb),
- &rt_dst(rt));
+ __ip_select_ident(iph, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1);
err = ip_local_out(skb);
if (unlikely(net_xmit_eval(err)))
+++ /dev/null
-/*
- * Copyright (c) 2007-2012 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/in.h>
-#include <linux/in_route.h>
-#include <linux/inetdevice.h>
-#include <linux/jhash.h>
-#include <linux/list.h>
-#include <linux/kernel.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-#include <linux/rculist.h>
-#include <net/route.h>
-#include <net/xfrm.h>
-
-#include "checksum.h"
-#include "compat.h"
-#include "datapath.h"
-#include "tunnel.h"
-#include "vlan.h"
-#include "vport.h"
-
-/**
- * ovs_tnl_rcv - ingress point for generic tunnel code
- *
- * @vport: port this packet was received on
- * @skb: received packet
- * @tos: ToS from encapsulating IP packet, used to copy ECN bits
- *
- * Must be called with rcu_read_lock.
- *
- * Packets received by this function are in the following state:
- * - skb->data points to the inner Ethernet header.
- * - The inner Ethernet header is in the linear data area.
- * - skb->csum does not include the inner Ethernet header.
- * - The layer pointers are undefined.
- */
-void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
- struct ovs_key_ipv4_tunnel *tun_key)
-{
- struct ethhdr *eh;
-
- skb_reset_mac_header(skb);
- eh = eth_hdr(skb);
-
- if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
- skb->protocol = eh->h_proto;
- else
- skb->protocol = htons(ETH_P_802_2);
-
- skb_dst_drop(skb);
- nf_reset(skb);
- skb_clear_rxhash(skb);
- secpath_reset(skb);
- vlan_set_tci(skb, 0);
-
- if (unlikely(compute_ip_summed(skb, false))) {
- kfree_skb(skb);
- return;
- }
-
- ovs_vport_receive(vport, skb, tun_key);
-}
-
-struct rtable *find_route(struct net *net,
- __be32 *saddr, __be32 daddr, u8 ipproto,
- u8 tos, u32 skb_mark)
-{
- struct rtable *rt;
- /* Tunnel configuration keeps DSCP part of TOS bits, But Linux
- * router expect RT_TOS bits only. */
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
- struct flowi fl = { .nl_u = { .ip4_u = {
- .daddr = daddr,
- .saddr = *saddr,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
- .fwmark = skb_mark,
-#endif
- .tos = RT_TOS(tos) } },
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
- .mark = skb_mark,
-#endif
- .proto = ipproto };
-
- if (unlikely(ip_route_output_key(net, &rt, &fl)))
- return ERR_PTR(-EADDRNOTAVAIL);
- *saddr = fl.nl_u.ip4_u.saddr;
- return rt;
-#else
- struct flowi4 fl = { .daddr = daddr,
- .saddr = *saddr,
- .flowi4_tos = RT_TOS(tos),
- .flowi4_mark = skb_mark,
- .flowi4_proto = ipproto };
-
- rt = ip_route_output_key(net, &fl);
- *saddr = fl.saddr;
- return rt;
-#endif
-}
-
-static bool need_linearize(const struct sk_buff *skb)
-{
- int i;
-
- if (unlikely(skb_shinfo(skb)->frag_list))
- return true;
-
- /*
- * Generally speaking we should linearize if there are paged frags.
- * However, if all of the refcounts are 1 we know nobody else can
- * change them from underneath us and we can skip the linearization.
- */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
- return true;
-
- return false;
-}
-
-static struct sk_buff *handle_offloads(struct sk_buff *skb)
-{
- int err;
-
- forward_ip_summed(skb, true);
-
- if (skb_is_gso(skb)) {
- struct sk_buff *nskb;
- char cb[sizeof(skb->cb)];
-
- memcpy(cb, skb->cb, sizeof(cb));
-
- nskb = __skb_gso_segment(skb, 0, false);
- if (IS_ERR(nskb)) {
- err = PTR_ERR(nskb);
- goto error;
- }
-
- consume_skb(skb);
- skb = nskb;
- while (nskb) {
- memcpy(nskb->cb, cb, sizeof(cb));
- nskb = nskb->next;
- }
- } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
- /* Pages aren't locked and could change at any time.
- * If this happens after we compute the checksum, the
- * checksum will be wrong. We linearize now to avoid
- * this problem.
- */
- if (unlikely(need_linearize(skb))) {
- err = __skb_linearize(skb);
- if (unlikely(err))
- goto error;
- }
-
- err = skb_checksum_help(skb);
- if (unlikely(err))
- goto error;
- }
-
- set_ip_summed(skb, OVS_CSUM_NONE);
-
- return skb;
-
-error:
- return ERR_PTR(err);
-}
-
-/* Compute source UDP port for outgoing packet.
- * Currently we use the flow hash.
- */
-u16 ovs_tnl_get_src_port(struct sk_buff *skb)
-{
- int low;
- int high;
- unsigned int range;
- struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key;
- u32 hash = jhash2((const u32 *)pkt_key,
- sizeof(*pkt_key) / sizeof(u32), 0);
-
- inet_get_local_port_range(&low, &high);
- range = (high - low) + 1;
- return (((u64) hash * range) >> 32) + low;
-}
-
-int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
- u8 ipproto, int tunnel_hlen,
- void (*build_header)(const struct vport *,
- struct sk_buff *,
- int tunnel_hlen))
-{
- int min_headroom;
- struct rtable *rt;
- __be32 saddr;
- int sent_len = 0;
- int err;
- struct sk_buff *nskb;
-
- /* Route lookup */
- saddr = OVS_CB(skb)->tun_key->ipv4_src;
- rt = find_route(ovs_dp_get_net(vport->dp),
- &saddr,
- OVS_CB(skb)->tun_key->ipv4_dst,
- ipproto,
- OVS_CB(skb)->tun_key->ipv4_tos,
- skb_get_mark(skb));
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- tunnel_hlen += sizeof(struct iphdr);
-
- min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
- + tunnel_hlen
- + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
-
- if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
- int head_delta = SKB_DATA_ALIGN(min_headroom -
- skb_headroom(skb) +
- 16);
-
- err = pskb_expand_head(skb, max_t(int, head_delta, 0),
- 0, GFP_ATOMIC);
- if (unlikely(err))
- goto err_free_rt;
- }
-
- /* Offloading */
- nskb = handle_offloads(skb);
- if (IS_ERR(nskb)) {
- err = PTR_ERR(nskb);
- goto err_free_rt;
- }
- skb = nskb;
-
- /* Reset SKB */
- nf_reset(skb);
- secpath_reset(skb);
- skb_dst_drop(skb);
- skb_clear_rxhash(skb);
-
- while (skb) {
- struct sk_buff *next_skb = skb->next;
- struct iphdr *iph;
- int frag_len;
-
- skb->next = NULL;
-
- if (unlikely(vlan_deaccel_tag(skb)))
- goto next;
-
- frag_len = skb->len;
- skb_push(skb, tunnel_hlen);
- skb_reset_network_header(skb);
- skb_set_transport_header(skb, sizeof(struct iphdr));
-
- if (next_skb)
- skb_dst_set(skb, dst_clone(&rt_dst(rt)));
- else
- skb_dst_set(skb, &rt_dst(rt));
-
- /* Push Tunnel header. */
- build_header(vport, skb, tunnel_hlen);
-
- /* Push IP header. */
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr) >> 2;
- iph->protocol = ipproto;
- iph->daddr = OVS_CB(skb)->tun_key->ipv4_dst;
- iph->saddr = saddr;
- iph->tos = OVS_CB(skb)->tun_key->ipv4_tos;
- iph->ttl = OVS_CB(skb)->tun_key->ipv4_ttl;
- iph->frag_off = OVS_CB(skb)->tun_key->tun_flags &
- TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- /*
- * Allow our local IP stack to fragment the outer packet even
- * if the DF bit is set as a last resort. We also need to
- * force selection of an IP ID here with __ip_select_ident(),
- * as ip_select_ident() assumes a proper ID is not needed when
- * when the DF bit is set.
- */
- skb->local_df = 1;
- __ip_select_ident(iph, skb_dst(skb), 0);
-
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-
- err = ip_local_out(skb);
- if (unlikely(net_xmit_eval(err)))
- goto next;
-
- sent_len += frag_len;
-
-next:
- skb = next_skb;
- }
-
- return sent_len;
-
-err_free_rt:
- ip_rt_put(rt);
-error:
- return err;
-}
+++ /dev/null
-/*
- * Copyright (c) 2007-2012 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
- */
-
-#ifndef TUNNEL_H
-#define TUNNEL_H 1
-
-#include <linux/version.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-
-#include "flow.h"
-#include "vport.h"
-
-struct rtable *find_route(struct net *net,
- __be32 *saddr, __be32 daddr, u8 ipproto,
- u8 tos, u32 skb_mark);
-
-u16 ovs_tnl_get_src_port(struct sk_buff *skb);
-
-int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
- u8 ipproto, int tunnel_hlen,
- void (*build_header)(const struct vport *,
- struct sk_buff *,
- int tunnel_hlen));
-
-void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
- struct ovs_key_ipv4_tunnel *tun_key);
-
-static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
- const struct iphdr *iph, __be64 tun_id,
- __be16 tun_flags)
-{
- tun_key->tun_id = tun_id;
- tun_key->ipv4_src = iph->saddr;
- tun_key->ipv4_dst = iph->daddr;
- tun_key->ipv4_tos = iph->tos;
- tun_key->ipv4_ttl = iph->ttl;
- tun_key->tun_flags = tun_flags;
-
- /* clear struct padding. */
- memset((unsigned char*) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
- sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
-}
-
-#endif /* TUNNEL_H */
if (!vlan_tx_tag_present(skb))
return 0;
- skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+ skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
if (unlikely(!skb))
return -ENOMEM;
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/route.h>
#include <net/xfrm.h>
-
#include <net/icmp.h>
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/protocol.h>
#include "datapath.h"
-#include "tunnel.h"
#include "vport.h"
/* Returns the least-significant 32 bits of a __be64. */
return PACKET_REJECT;
key = key_to_tunnel_id(tpi->key, tpi->seq);
- tnl_tun_key_init(&tun_key, ip_hdr(skb), key, filter_tnl_flags(tpi->flags));
+ ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, filter_tnl_flags(tpi->flags));
ovs_vport_receive(vport, skb, &tun_key);
return PACKET_RCVD;
static int gre64_send(struct vport *vport, struct sk_buff *skb)
{
- int hlen;
+ int hlen = GRE_HEADER_SECTION + /* GRE Hdr */
+ GRE_HEADER_SECTION + /* GRE Key */
+ GRE_HEADER_SECTION; /* GRE SEQ */
__be32 seq;
if (unlikely(!OVS_CB(skb)->tun_key))
return -EINVAL;
- hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags)
- + GRE_HEADER_SECTION;
+ if (OVS_CB(skb)->tun_key->tun_flags & TUNNEL_CSUM)
+ hlen += GRE_HEADER_SECTION;
seq = be64_get_high32(OVS_CB(skb)->tun_key->tun_id);
- return __send(vport, skb, hlen, seq, TUNNEL_SEQ);
+ return __send(vport, skb, hlen, seq, (TUNNEL_KEY|TUNNEL_SEQ));
}
const struct vport_ops ovs_gre64_vport_ops = {
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/netdev_features.h>
#include <linux/skbuff.h>
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
netdev->vlan_features = netdev->features;
- netdev->features |= NETIF_F_HW_VLAN_TX;
+ netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
#include <net/icmp.h>
#include <net/ip.h>
+#include <net/route.h>
#include <net/udp.h>
+#include <net/xfrm.h>
#include "datapath.h"
-#include "tunnel.h"
#include "vport.h"
-
/*
* LISP encapsulation header:
*
#endif
}
+/* Compute source UDP port for outgoing packet.
+ * Currently we use the flow hash.
+ */
+static u16 ovs_tnl_get_src_port(struct sk_buff *skb)
+{
+ int low;
+ int high;
+ unsigned int range;
+ struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key;
+ u32 hash = jhash2((const u32 *)pkt_key,
+ sizeof(*pkt_key) / sizeof(u32), 0);
+
+ inet_get_local_port_range(&low, &high);
+ range = (high - low) + 1;
+ return (((u64) hash * range) >> 32) + low;
+}
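+/* (The multiply-and-shift above computes low + hash * range / 2^32,
+ * which maps the 32-bit flow hash onto [low, high] without modulo bias;
+ * e.g. for a local port range of 32768 to 61000, range is 28233 and a
+ * hash of 0x80000000 selects port 32768 + 14116 = 46884.) */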
+
static void lisp_build_header(const struct vport *vport,
struct sk_buff *skb,
int tunnel_hlen)
lisph->u2.word2.locator_status_bits = 1;
}
+/**
+ * ovs_tnl_rcv - ingress point for generic tunnel code
+ *
+ * @vport: port this packet was received on
+ * @skb: received packet
+ * @tun_key: tunnel key extracted from the encapsulating IP packet
+ *
+ * Must be called with rcu_read_lock.
+ *
+ * Packets received by this function are in the following state:
+ * - skb->data points to the inner Ethernet header.
+ * - The inner Ethernet header is in the linear data area.
+ * - skb->csum does not include the inner Ethernet header.
+ * - The layer pointers are undefined.
+ */
+static void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
+ struct ovs_key_ipv4_tunnel *tun_key)
+{
+ struct ethhdr *eh;
+
+ skb_reset_mac_header(skb);
+ eh = eth_hdr(skb);
+
+ if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+ skb->protocol = eh->h_proto;
+ else
+ skb->protocol = htons(ETH_P_802_2);
+
+ skb_dst_drop(skb);
+ nf_reset(skb);
+ skb_clear_rxhash(skb);
+ secpath_reset(skb);
+ vlan_set_tci(skb, 0);
+
+ if (unlikely(compute_ip_summed(skb, false))) {
+ kfree_skb(skb);
+ return;
+ }
+
+ ovs_vport_receive(vport, skb, tun_key);
+}
+
/* Called with rcu_read_lock and BH disabled. */
static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
{
/* Save outer tunnel values */
iph = ip_hdr(skb);
- tnl_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+ ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
/* Drop non-IP inner packets */
inner_iph = (struct iphdr *)(lisph + 1);
return ERR_PTR(err);
}
+static bool need_linearize(const struct sk_buff *skb)
+{
+ int i;
+
+ if (unlikely(skb_shinfo(skb)->frag_list))
+ return true;
+
+ /*
+ * Generally speaking we should linearize if there are paged frags.
+ * However, if all of the refcounts are 1 we know nobody else can
+ * change them from underneath us and we can skip the linearization.
+ */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
+ return true;
+
+ return false;
+}
+
+static struct sk_buff *handle_offloads(struct sk_buff *skb)
+{
+ int err;
+
+ forward_ip_summed(skb, true);
+
+ if (skb_is_gso(skb)) {
+ struct sk_buff *nskb;
+ char cb[sizeof(skb->cb)];
+
+ memcpy(cb, skb->cb, sizeof(cb));
+
+ nskb = __skb_gso_segment(skb, 0, false);
+ if (IS_ERR(nskb)) {
+ err = PTR_ERR(nskb);
+ goto error;
+ }
+
+ consume_skb(skb);
+ skb = nskb;
+ while (nskb) {
+ memcpy(nskb->cb, cb, sizeof(cb));
+ nskb = nskb->next;
+ }
+ } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
+ /* Pages aren't locked and could change at any time.
+ * If this happens after we compute the checksum, the
+ * checksum will be wrong. We linearize now to avoid
+ * this problem.
+ */
+ if (unlikely(need_linearize(skb))) {
+ err = __skb_linearize(skb);
+ if (unlikely(err))
+ goto error;
+ }
+
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ }
+
+ set_ip_summed(skb, OVS_CSUM_NONE);
+
+ return skb;
+
+error:
+ return ERR_PTR(err);
+}
+
+static int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
+ u8 ipproto, int tunnel_hlen,
+ void (*build_header)(const struct vport *,
+ struct sk_buff *,
+ int tunnel_hlen))
+{
+ int min_headroom;
+ struct rtable *rt;
+ __be32 saddr;
+ int sent_len = 0;
+ int err;
+ struct sk_buff *nskb;
+
+ /* Route lookup */
+ saddr = OVS_CB(skb)->tun_key->ipv4_src;
+ rt = find_route(ovs_dp_get_net(vport->dp),
+ &saddr,
+ OVS_CB(skb)->tun_key->ipv4_dst,
+ ipproto,
+ OVS_CB(skb)->tun_key->ipv4_tos,
+ skb_get_mark(skb));
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto error;
+ }
+
+ tunnel_hlen += sizeof(struct iphdr);
+
+ min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+ + tunnel_hlen
+ + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ /* Offloading */
+ nskb = handle_offloads(skb);
+ if (IS_ERR(nskb)) {
+ err = PTR_ERR(nskb);
+ goto err_free_rt;
+ }
+ skb = nskb;
+
+ /* Reset SKB */
+ nf_reset(skb);
+ secpath_reset(skb);
+ skb_dst_drop(skb);
+ skb_clear_rxhash(skb);
+
+ while (skb) {
+ struct sk_buff *next_skb = skb->next;
+ struct iphdr *iph;
+ int frag_len;
+
+ skb->next = NULL;
+
+ if (unlikely(vlan_deaccel_tag(skb)))
+ goto next;
+
+ frag_len = skb->len;
+ skb_push(skb, tunnel_hlen);
+ skb_reset_network_header(skb);
+ skb_set_transport_header(skb, sizeof(struct iphdr));
+
+ if (next_skb)
+ skb_dst_set(skb, dst_clone(&rt_dst(rt)));
+ else
+ skb_dst_set(skb, &rt_dst(rt));
+
+ /* Push Tunnel header. */
+ build_header(vport, skb, tunnel_hlen);
+
+ /* Push IP header. */
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->protocol = ipproto;
+ iph->daddr = OVS_CB(skb)->tun_key->ipv4_dst;
+ iph->saddr = saddr;
+ iph->tos = OVS_CB(skb)->tun_key->ipv4_tos;
+ iph->ttl = OVS_CB(skb)->tun_key->ipv4_ttl;
+ iph->frag_off = OVS_CB(skb)->tun_key->tun_flags &
+ TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ /*
+ * Allow our local IP stack to fragment the outer packet even
+ * if the DF bit is set as a last resort. We also need to
+ * force selection of an IP ID here with __ip_select_ident(),
+         * as ip_select_ident() assumes a proper ID is not needed when
+         * the DF bit is set.
+ */
+ skb->local_df = 1;
+ __ip_select_ident(iph, skb_dst(skb), 0);
+
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ err = ip_local_out(skb);
+ if (unlikely(net_xmit_eval(err)))
+ goto next;
+
+ sent_len += frag_len;
+
+next:
+ skb = next_skb;
+ }
+
+ return sent_len;
+
+err_free_rt:
+ ip_rt_put(rt);
+error:
+ return err;
+}
+
static int lisp_tnl_send(struct vport *vport, struct sk_buff *skb)
{
int tnl_len;
nskb = skb->next;
skb->next = NULL;
- skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+ skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
if (likely(skb)) {
len += skb->len;
vlan_set_tci(skb, 0);
}
tag:
- skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+ skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
if (unlikely(!skb))
return 0;
vlan_set_tci(skb, 0);
#include <net/vxlan.h>
#include "datapath.h"
-#include "tunnel.h"
#include "vport.h"
#define OVS_VXLAN_RCV_PRIORITY 8
/* Save outer tunnel values */
iph = ip_hdr(skb);
key = cpu_to_be64(ntohl(vx_vni) >> 8);
- tnl_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+ ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
ovs_vport_receive(vport, skb, &tun_key);
return PACKET_RCVD;
CFLAGS += -O2
endif
+# Old versions of dpkg-buildflags do not understand --export=configure.
+# When dpkg-buildflags does not understand an option, it prints its full
+# --help output on stdout, so we have to avoid that here.
+buildflags := $(shell if dpkg-buildflags --export=configure >/dev/null 2>&1; \
+ then dpkg-buildflags --export=configure; fi)
+
configure: configure-stamp
configure-stamp:
dh_testdir
test -e Makefile || \
../configure --prefix=/usr --localstatedir=/var --enable-ssl \
--sysconfdir=/etc CFLAGS="$(CFLAGS)" \
- $(DATAPATH_CONFIGURE_OPTS))
+ $(buildflags) $(DATAPATH_CONFIGURE_OPTS))
touch configure-stamp
#Architecture
* - NXM_NX_ND_SLL
* - NXM_NX_ND_TLL
* - NXM_NX_REG(idx) for idx in the switch's accepted range.
+ * - NXM_NX_PKT_MARK
* - NXM_NX_TUN_IPV4_SRC
* - NXM_NX_TUN_IPV4_DST
*
*
* - NXM_NX_REG(idx) for idx in the switch's accepted range.
*
+ * - NXM_NX_PKT_MARK
+ *
* - NXM_OF_VLAN_TCI. Modifying this field's value has side effects on the
* packet's 802.1Q header. Setting a value with CFI=0 removes the 802.1Q
* header (if any), ignoring the other bits. Setting a value with CFI=1
#define NXM_NX_TUN_IPV4_DST NXM_HEADER (0x0001, 32, 4)
#define NXM_NX_TUN_IPV4_DST_W NXM_HEADER_W(0x0001, 32, 4)
+/* Metadata marked onto the packet in a system-dependent manner.
+ *
+ * The packet mark may be used to carry contextual information
+ * to other parts of the system outside of Open vSwitch. As a
+ * result, the semantics depend on the system in use.
+ *
+ * Prereqs: None.
+ *
+ * Format: 32-bit integer in network byte order.
+ *
+ * Masking: Fully maskable. */
+#define NXM_NX_PKT_MARK NXM_HEADER (0x0001, 33, 4)
+#define NXM_NX_PKT_MARK_W NXM_HEADER_W(0x0001, 33, 4)
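+/* (Illustrative only: in ovs-ofctl flow syntax this field is written as
+ * "pkt_mark", e.g. "pkt_mark=0x1/0xff,actions=normal" for a masked
+ * match.) */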
+
/* ## --------------------- ## */
/* ## Requests and replies. ## */
/* ## --------------------- ## */
/* Get actual <pthread.h> definitions for us to annotate and build on. */
#include_next <pthread.h>
-#include "compiler.h"
-
-int pthread_mutex_lock(pthread_mutex_t *mutex) OVS_ACQUIRES(mutex);
-int pthread_mutex_unlock(pthread_mutex_t *mutex) OVS_RELEASES(mutex);
-
-int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) OVS_ACQ_RDLOCK(rwlock);
-int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) OVS_ACQ_WRLOCK(rwlock);
-int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) OVS_RELEASES(rwlock);
-
-int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *mutex)
- OVS_REQUIRES(mutex);
-
/* Sparse complains about the proper PTHREAD_*_INITIALIZER definitions.
* Luckily, it's not a real compiler so we can overwrite it with something
* simple. */
#undef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP {}
-
-#define pthread_mutex_trylock(MUTEX) \
- ({ \
- int retval = pthread_mutex_trylock(mutex); \
- if (!retval) { \
- OVS_MACRO_LOCK(MUTEX); \
- } \
- retval; \
- })
-
-#define pthread_rwlock_tryrdlock(RWLOCK) \
- ({ \
- int retval = pthread_rwlock_tryrdlock(rwlock); \
- if (!retval) { \
- OVS_MACRO_LOCK(RWLOCK); \
- } \
- retval; \
- })
-#define pthread_rwlock_trywrlock(RWLOCK) \
- ({ \
- int retval = pthread_rwlock_trywrlock(rwlock); \
- if (!retval) { \
- OVS_MACRO_LOCK(RWLOCK); \
- } \
- retval; \
- })
lib/reconnect.c \
lib/reconnect.h \
lib/sat-math.h \
+ lib/seq.c \
+ lib/seq.h \
lib/sha1.c \
lib/sha1.h \
lib/shash.c \
static atomic_uint16_t udp_src = ATOMIC_VAR_INIT(0);
long long int min_tx, min_rx;
+ bool need_poll = false;
bool cpath_down;
const char *hwaddr;
uint8_t ea[ETH_ADDR_LEN];
|| (!bfd_in_poll(bfd) && bfd->cfg_min_tx < bfd->min_tx)) {
bfd->min_tx = bfd->cfg_min_tx;
}
- bfd_poll(bfd);
+ need_poll = true;
}
min_rx = smap_get_int(cfg, "min_rx", 1000);
|| (!bfd_in_poll(bfd) && bfd->cfg_min_rx > bfd->min_rx)) {
bfd->min_rx = bfd->cfg_min_rx;
}
- bfd_poll(bfd);
+ need_poll = true;
}
cpath_down = smap_get_bool(cfg, "cpath_down", false);
if (bfd->diag == DIAG_NONE || bfd->diag == DIAG_CPATH_DOWN) {
bfd_set_state(bfd, bfd->state, DIAG_NONE);
}
- bfd_poll(bfd);
+ need_poll = true;
}
hwaddr = smap_get(cfg, "bfd_dst_mac");
bfd->eth_dst_set = false;
}
+ if (need_poll) {
+ bfd_poll(bfd);
+ }
ovs_mutex_unlock(&mutex);
return bfd;
}
return (flow->dl_type == htons(ETH_TYPE_IP)
&& flow->nw_proto == IPPROTO_UDP
&& flow->tp_dst == htons(BFD_DEST_PORT)
- && (check_tnl_key || flow->tunnel.tun_id == htonll(0)));
+ && (!check_tnl_key || flow->tunnel.tun_id == htonll(0)));
}
void
ds_put_cstr(&ds, "poll ");
}
+ /* Do not copy the trailing whitespace. */
+ ds_chomp(&ds, ' ');
ovs_strlcpy(flag_str, ds_cstr(&ds), sizeof flag_str);
ds_destroy(&ds);
return flag_str;
poll_timer_wait_until(bond->next_fake_iface_update);
}
- if (!bond->bond_revalidate) {
+ if (bond->bond_revalidate) {
poll_immediate_wake();
}
ovs_rwlock_unlock(&rwlock);
struct flow_wildcards *wc, uint16_t vlan)
{
struct bond_slave *slave;
+ void *aux;
ovs_rwlock_rdlock(&rwlock);
slave = choose_output_slave(bond, flow, wc, vlan);
+ aux = slave ? slave->aux : NULL;
ovs_rwlock_unlock(&rwlock);
- return slave;
+
+ return aux;
}
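+/* (Copying slave->aux while still holding 'rwlock' is the point of the
+ * change above: the bond_slave is only valid under the lock, so the
+ * caller must not be handed a pointer it could dereference after the
+ * unlock.) */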
\f
/* Rebalancing. */
#define OVS_EXCLUDED(...) __attribute__((locks_excluded(__VA_ARGS__)))
#define OVS_ACQ_BEFORE(...) __attribute__((acquired_before(__VA_ARGS__)))
#define OVS_ACQ_AFTER(...) __attribute__((acquired_after(__VA_ARGS__)))
-#elif __CHECKER__
-/* "sparse" annotations for mutexes and mutex-like constructs.
- *
- * Change the thread-safety check annotations to use "context" attribute.
- *
- * OVS_MACRO_LOCK and OVS_MACRO_RELEASE are suitable for use within macros,
- * where there is no function prototype to annotate. */
-#define OVS_LOCKABLE
-#define OVS_REQ_RDLOCK(...) __attribute__((context(MUTEX, 1, 1)))
-#define OVS_ACQ_RDLOCK(...) __attribute__((context(MUTEX, 0, 1)))
-#define OVS_REQ_WRLOCK(...) __attribute__((context(MUTEX, 1, 1)))
-#define OVS_ACQ_WRLOCK(...) __attribute__((context(MUTEX, 0, 1)))
-#define OVS_REQUIRES(...) __attribute__((context(MUTEX, 1, 1)))
-#define OVS_ACQUIRES(...) __attribute__((context(MUTEX, 0, 1)))
-#define OVS_TRY_WRLOCK(RETVAL, ...)
-#define OVS_TRY_RDLOCK(RETVAL, ...)
-#define OVS_TRY_LOCK(REVAL, ...)
-#define OVS_GUARDED
-#define OVS_GUARDED_BY(...)
-#define OVS_EXCLUDED(...)
-#define OVS_RELEASES(...) __attribute__((context(MUTEX, 1, 0)))
-#define OVS_ACQ_BEFORE(...)
-#define OVS_ACQ_AFTER(...)
-#define OVS_MACRO_LOCK(...) __context__(MUTEX, 0, 1)
-#define OVS_MACRO_RELEASE(...) __context__(MUTEX, 1, 0)
-#else
+#else /* not Clang */
#define OVS_LOCKABLE
#define OVS_REQ_RDLOCK(...)
#define OVS_ACQ_RDLOCK(...)
#define OVS_RELEASES(...)
#define OVS_ACQ_BEFORE(...)
#define OVS_ACQ_AFTER(...)
-#define OVS_MACRO_LOCK(...)
-#define OVS_MACRO_RELEASE(...)
#endif
/* ISO C says that a C implementation may choose any integer type for an enum
#include "packets.h"
#include "poll-loop.h"
#include "random.h"
+#include "seq.h"
#include "shash.h"
#include "sset.h"
#include "timeval.h"
struct dp_netdev_queue queues[N_QUEUES];
struct hmap flow_table; /* Flow table. */
+ struct seq *queue_seq; /* Incremented whenever a packet is queued. */
/* Statistics. */
long long int n_hit; /* Number of flow table matches. */
/* Ports. */
struct dp_netdev_port *ports[MAX_PORTS];
struct list port_list;
- unsigned int serial;
+ struct seq *port_seq; /* Incremented whenever a port changes. */
};
/* A port in a netdev-based datapath. */
struct dpif_netdev {
struct dpif dpif;
struct dp_netdev *dp;
- unsigned int dp_serial;
+ uint64_t last_port_seq;
};
/* All netdev-based datapaths. */
static void dp_netdev_port_input(struct dp_netdev *dp,
struct dp_netdev_port *port,
struct ofpbuf *packet, uint32_t skb_priority,
- uint32_t skb_mark, const struct flow_tnl *tnl);
+ uint32_t pkt_mark, const struct flow_tnl *tnl);
static struct dpif_netdev *
dpif_netdev_cast(const struct dpif *dpif)
dpif = xmalloc(sizeof *dpif);
dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
dpif->dp = dp;
- dpif->dp_serial = dp->serial;
+ dpif->last_port_seq = seq_read(dp->port_seq);
return &dpif->dpif;
}
for (i = 0; i < N_QUEUES; i++) {
dp->queues[i].head = dp->queues[i].tail = 0;
}
+ dp->queue_seq = seq_create();
hmap_init(&dp->flow_table);
list_init(&dp->port_list);
+ dp->port_seq = seq_create();
error = do_add_port(dp, name, "internal", ODPP_LOCAL);
if (error) {
do_del_port(dp, port->port_no);
}
dp_netdev_purge_queues(dp);
+ seq_destroy(dp->queue_seq);
hmap_destroy(&dp->flow_table);
+ seq_destroy(dp->port_seq);
free(dp->name);
free(dp);
}
list_push_back(&dp->port_list, &port->node);
dp->ports[odp_to_u32(port_no)] = port;
- dp->serial++;
+ seq_change(dp->port_seq);
return 0;
}
list_remove(&port->node);
dp->ports[odp_to_u32(port_no)] = NULL;
- dp->serial++;
+ seq_change(dp->port_seq);
netdev_close(port->netdev);
netdev_restore_flags(port->sf);
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
{
struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
+ uint64_t new_port_seq;
int error;
ovs_mutex_lock(&dp_netdev_mutex);
- if (dpif->dp_serial != dpif->dp->serial) {
- dpif->dp_serial = dpif->dp->serial;
+ new_port_seq = seq_read(dpif->dp->port_seq);
+ if (dpif->last_port_seq != new_port_seq) {
+ dpif->last_port_seq = new_port_seq;
error = ENOBUFS;
} else {
error = EAGAIN;
{
struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
- /* XXX In a multithreaded process, there is a race window between this
- * function and the poll_block() in one thread and a change in
- * dpif->dp->serial in another thread. */
-
ovs_mutex_lock(&dp_netdev_mutex);
- if (dpif->dp_serial != dpif->dp->serial) {
- poll_immediate_wake();
- }
+ seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
ovs_mutex_unlock(&dp_netdev_mutex);
}
static void
dpif_netdev_recv_wait(struct dpif *dpif)
{
- /* XXX In a multithreaded process, there is a race window between this
- * function and the poll_block() in one thread and a packet being queued in
- * another thread. */
+ struct dp_netdev *dp = get_dp_netdev(dpif);
+ uint64_t seq;
ovs_mutex_lock(&dp_netdev_mutex);
+ seq = seq_read(dp->queue_seq);
if (find_nonempty_queue(dpif)) {
poll_immediate_wake();
+ } else {
+ seq_wait(dp->queue_seq, seq);
}
ovs_mutex_unlock(&dp_netdev_mutex);
}
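+/* (This is the idiom the new seq library enables: snapshot the counter
+ * with seq_read(), re-check the condition of interest, then call
+ * seq_wait(seq, snapshot), which arranges a poll-loop wakeup if any
+ * thread has called seq_change() since the snapshot was taken.  That
+ * closes the check-then-poll_block() races that the deleted XXX
+ * comments described.) */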
static void
dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port,
struct ofpbuf *packet, uint32_t skb_priority,
- uint32_t skb_mark, const struct flow_tnl *tnl)
+ uint32_t pkt_mark, const struct flow_tnl *tnl)
{
struct dp_netdev_flow *flow;
struct flow key;
return;
}
in_port_.odp_port = port->port_no;
- flow_extract(packet, skb_priority, skb_mark, tnl, &in_port_, &key);
+ flow_extract(packet, skb_priority, pkt_mark, tnl, &in_port_, &key);
flow = dp_netdev_lookup_flow(dp, &key);
if (flow) {
dp_netdev_flow_used(flow, packet);
buf->size = packet->size;
upcall->packet = buf;
+ seq_change(dp->queue_seq);
+
return 0;
} else {
dp->n_lost++;
dp->ports[odp_to_u32(port->port_no)] = NULL;
dp->ports[port_no] = port;
port->port_no = u32_to_odp(port_no);
- dp->serial++;
+ seq_change(dp->port_seq);
unixctl_command_reply(conn, NULL);
}
* present and has a correct length, and otherwise NULL.
*/
void
-flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark,
+flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t pkt_mark,
const struct flow_tnl *tnl, const union flow_in_port *in_port,
struct flow *flow)
{
flow->in_port = *in_port;
}
flow->skb_priority = skb_priority;
- flow->skb_mark = skb_mark;
+ flow->pkt_mark = pkt_mark;
packet->l2 = b.data;
packet->l2_5 = NULL;
fmd->tun_dst = flow->tunnel.ip_dst;
fmd->metadata = flow->metadata;
memcpy(fmd->regs, flow->regs, sizeof fmd->regs);
+ fmd->pkt_mark = flow->pkt_mark;
fmd->in_port = flow->in_port.ofp_port;
}
ovs_be32 nw_dst; /* IPv4 destination address. */
ovs_be32 ipv6_label; /* IPv6 flow label. */
union flow_in_port in_port; /* Input port.*/
- uint32_t skb_mark; /* Packet mark. */
+ uint32_t pkt_mark; /* Packet mark. */
ovs_be32 mpls_lse; /* MPLS label stack entry. */
uint16_t mpls_depth; /* Depth of MPLS stack. */
ovs_be16 vlan_tci; /* If 802.1Q, TCI | VLAN_CFI; otherwise 0. */
ovs_be32 tun_dst; /* Tunnel outer IPv4 dst addr */
ovs_be64 metadata; /* OpenFlow 1.1+ metadata field. */
uint32_t regs[FLOW_N_REGS]; /* Registers. */
+ uint32_t pkt_mark; /* Packet mark. */
ofp_port_t in_port; /* OpenFlow port or zero. */
};
memset(&wc->masks.skb_priority, 0xff, sizeof wc->masks.skb_priority);
}
- if (flow->skb_mark) {
- memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark);
+ if (flow->pkt_mark) {
+ memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark);
}
for (i = 0; i < FLOW_N_REGS; i++) {
{
match->flow = *flow;
match->flow.skb_priority = 0;
- match->flow.skb_mark = 0;
flow_wildcards_init_exact(&match->wc);
}
}
void
-match_set_skb_mark(struct match *match, uint32_t skb_mark)
+match_set_pkt_mark(struct match *match, uint32_t pkt_mark)
{
- match->wc.masks.skb_mark = UINT32_MAX;
- match->flow.skb_mark = skb_mark;
+ match_set_pkt_mark_masked(match, pkt_mark, UINT32_MAX);
+}
+
+void
+match_set_pkt_mark_masked(struct match *match, uint32_t pkt_mark, uint32_t mask)
+{
+ match->flow.pkt_mark = pkt_mark & mask;
+ match->wc.masks.pkt_mark = mask;
}
void
ds_put_format(s, "priority=%u,", priority);
}
- if (wc->masks.skb_mark) {
- ds_put_format(s, "skb_mark=%#"PRIx32",", f->skb_mark);
+ switch (wc->masks.pkt_mark) {
+ case 0:
+ break;
+ case UINT32_MAX:
+ ds_put_format(s, "pkt_mark=%#"PRIx32",", f->pkt_mark);
+ break;
+ default:
+ ds_put_format(s, "pkt_mark=%#"PRIx32"/%#"PRIx32",",
+ f->pkt_mark, wc->masks.pkt_mark);
+ break;
}
if (wc->masks.skb_priority) {
void match_set_tun_flags(struct match *match, uint16_t flags);
void match_set_tun_flags_masked(struct match *match, uint16_t flags, uint16_t mask);
void match_set_in_port(struct match *, ofp_port_t ofp_port);
-void match_set_skb_mark(struct match *, uint32_t skb_mark);
+void match_set_pkt_mark(struct match *, uint32_t pkt_mark);
+void match_set_pkt_mark_masked(struct match *, uint32_t pkt_mark, uint32_t mask);
void match_set_skb_priority(struct match *, uint32_t skb_priority);
void match_set_dl_type(struct match *, ovs_be16);
void match_set_dl_src(struct match *, const uint8_t[6]);
0, NULL,
0, NULL,
}, {
- MFF_SKB_MARK, "skb_mark", NULL,
+ MFF_PKT_MARK, "pkt_mark", NULL,
MF_FIELD_SIZES(be32),
- MFM_NONE,
+ MFM_FULLY,
MFS_HEXADECIMAL,
MFP_NONE,
- false,
- 0, NULL,
- 0, NULL,
+ true,
+ NXM_NX_PKT_MARK, "NXM_NX_PKT_MARK",
+ NXM_NX_PKT_MARK, "NXM_NX_PKT_MARK",
},
#define REGISTER(IDX) \
return !wc->masks.in_port.ofp_port;
case MFF_SKB_PRIORITY:
return !wc->masks.skb_priority;
- case MFF_SKB_MARK:
- return !wc->masks.skb_mark;
+ case MFF_PKT_MARK:
+ return !wc->masks.pkt_mark;
CASE_MFF_REGS:
return !wc->masks.regs[mf->id - MFF_REG0];
case MFF_METADATA:
case MFF_IN_PORT:
case MFF_SKB_PRIORITY:
- case MFF_SKB_MARK:
+ case MFF_PKT_MARK:
CASE_MFF_REGS:
case MFF_ETH_SRC:
case MFF_ETH_DST:
value->be32 = htonl(flow->skb_priority);
break;
- case MFF_SKB_MARK:
- value->be32 = htonl(flow->skb_mark);
+ case MFF_PKT_MARK:
+ value->be32 = htonl(flow->pkt_mark);
break;
CASE_MFF_REGS:
match_set_skb_priority(match, ntohl(value->be32));
break;
- case MFF_SKB_MARK:
- match_set_skb_mark(match, ntohl(value->be32));
+ case MFF_PKT_MARK:
+ match_set_pkt_mark(match, ntohl(value->be32));
break;
CASE_MFF_REGS:
flow->skb_priority = ntohl(value->be32);
break;
- case MFF_SKB_MARK:
- flow->skb_mark = ntohl(value->be32);
+ case MFF_PKT_MARK:
+ flow->pkt_mark = ntohl(value->be32);
break;
CASE_MFF_REGS:
match->wc.masks.skb_priority = 0;
break;
- case MFF_SKB_MARK:
- match->flow.skb_mark = 0;
- match->wc.masks.skb_mark = 0;
+ case MFF_PKT_MARK:
+ match->flow.pkt_mark = 0;
+ match->wc.masks.pkt_mark = 0;
break;
CASE_MFF_REGS:
switch (mf->id) {
case MFF_IN_PORT:
case MFF_IN_PORT_OXM:
- case MFF_SKB_MARK:
case MFF_SKB_PRIORITY:
case MFF_ETH_TYPE:
case MFF_DL_VLAN:
ntohl(value->be32), ntohl(mask->be32));
break;
+ case MFF_PKT_MARK:
+ match_set_pkt_mark_masked(match, ntohl(value->be32),
+ ntohl(mask->be32));
+ break;
+
case MFF_ETH_DST:
match_set_dl_dst_masked(match, value->mac, mask->mac);
break;
case MFF_TUN_FLAGS:
case MFF_METADATA:
case MFF_IN_PORT:
- case MFF_SKB_MARK:
+ case MFF_PKT_MARK:
case MFF_SKB_PRIORITY:
CASE_MFF_REGS:
case MFF_ETH_SRC:
MFF_IN_PORT, /* be16 */
MFF_IN_PORT_OXM, /* be32 */
MFF_SKB_PRIORITY, /* be32 */
- MFF_SKB_MARK, /* be32 */
+ MFF_PKT_MARK, /* be32 */
#if FLOW_N_REGS > 0
MFF_REG0, /* be32 */
#include <sys/sysctl.h>
#if defined(__NetBSD__)
#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/if_inarp.h>
#endif
#include "rtbsd.h"
struct netdev_bsd {
struct netdev up;
+
+ /* Never changes after initialization. */
+ char *kernel_name;
+
+ /* Protects all members below. */
+ struct ovs_mutex mutex;
+
unsigned int cache_valid;
unsigned int change_seq;
/* Used for sending packets on non-tap devices. */
pcap_t *pcap;
int fd;
-
- char *kernel_name;
};
static int get_flags(const struct netdev *, int *flagsp);
static int set_flags(const char *, int flags);
static int do_set_addr(struct netdev *netdev,
- int ioctl_nr, const char *ioctl_name,
+ unsigned long ioctl_nr, const char *ioctl_name,
struct in_addr addr);
static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
static int set_etheraddr(const char *netdev_name, int hwaddr_family,
static void ifr_set_flags(struct ifreq *, int flags);
#ifdef __NetBSD__
-static int af_link_ioctl(int command, const void *arg);
+static int af_link_ioctl(unsigned long command, const void *arg);
#endif
static void netdev_bsd_run(void);
return error;
}
+ ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
netdev->change_seq = 1;
netdev->tap_fd = -1;
netdev->kernel_name = xstrdup(netdev_->name);
/* Create a tap device by opening /dev/tap. The TAPGIFNAME ioctl is used
* to retrieve the name of the tap device. */
+ ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
netdev->tap_fd = open("/dev/tap", O_RDWR);
netdev->change_seq = 1;
if (netdev->tap_fd < 0) {
return 0;
error_unref_notifier:
+ ovs_mutex_destroy(&netdev->mutex);
cache_notifier_unref();
error:
free(kernel_name);
pcap_close(netdev->pcap);
}
free(netdev->kernel_name);
+ ovs_mutex_destroy(&netdev->mutex);
}
static void
struct netdev_rx_bsd *rx = netdev_rx_bsd_cast(rx_);
struct netdev *netdev_ = rx->up.netdev;
struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+ int error;
if (!strcmp(netdev_get_type(netdev_), "tap")) {
rx->pcap_handle = NULL;
rx->fd = netdev->tap_fd;
+ error = 0;
} else {
- int error = netdev_bsd_open_pcap(netdev_get_kernel_name(netdev_),
- &rx->pcap_handle, &rx->fd);
- if (error) {
- return error;
+ ovs_mutex_lock(&netdev->mutex);
+ error = netdev_bsd_open_pcap(netdev_get_kernel_name(netdev_),
+ &rx->pcap_handle, &rx->fd);
+ if (!error) {
+ netdev_bsd_changed(netdev);
}
-
- netdev_bsd_changed(netdev);
+ ovs_mutex_unlock(&netdev->mutex);
}
- return 0;
+ return error;
}
static void
{
struct netdev_bsd *dev = netdev_bsd_cast(netdev_);
const char *name = netdev_get_name(netdev_);
+ int error;
+ ovs_mutex_lock(&dev->mutex);
if (dev->tap_fd < 0 && !dev->pcap) {
- int error = netdev_bsd_open_pcap(name, &dev->pcap, &dev->fd);
- if (error) {
- return error;
- }
+ error = netdev_bsd_open_pcap(name, &dev->pcap, &dev->fd);
+ } else {
+ error = 0;
}
- for (;;) {
+ while (!error) {
ssize_t retval;
if (dev->tap_fd >= 0) {
retval = write(dev->tap_fd, data, size);
if (retval < 0) {
if (errno == EINTR) {
continue;
- } else if (errno != EAGAIN) {
- VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
- name, ovs_strerror(errno));
+ } else {
+ error = errno;
+ if (error != EAGAIN) {
+ VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: "
+ "%s", name, ovs_strerror(error));
+ }
}
- return errno;
} else if (retval != size) {
VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
"%zu) on %s", retval, size, name);
- return EMSGSIZE;
+ error = EMSGSIZE;
} else {
- return 0;
+ break;
}
}
+
+ ovs_mutex_unlock(&dev->mutex);
+ return error;
}
/*
{
struct netdev_bsd *dev = netdev_bsd_cast(netdev_);
+ ovs_mutex_lock(&dev->mutex);
if (dev->tap_fd >= 0) {
/* TAP device always accepts packets. */
poll_immediate_wake();
/* We haven't even tried to send a packet yet. */
poll_immediate_wake();
}
+ ovs_mutex_unlock(&dev->mutex);
}
/*
const uint8_t mac[ETH_ADDR_LEN])
{
struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
- int error;
+ int error = 0;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_ETHERADDR)
|| !eth_addr_equals(netdev->etheraddr, mac)) {
error = set_etheraddr(netdev_get_kernel_name(netdev_), AF_LINK,
memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
netdev_bsd_changed(netdev);
}
- } else {
- error = 0;
}
+ ovs_mutex_unlock(&netdev->mutex);
+
return error;
}
uint8_t mac[ETH_ADDR_LEN])
{
struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+ int error = 0;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_ETHERADDR)) {
- int error = get_etheraddr(netdev_get_kernel_name(netdev_),
- netdev->etheraddr);
- if (error) {
- return error;
+ error = get_etheraddr(netdev_get_kernel_name(netdev_),
+ netdev->etheraddr);
+ if (!error) {
+ netdev->cache_valid |= VALID_ETHERADDR;
}
- netdev->cache_valid |= VALID_ETHERADDR;
}
- memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
+ if (!error) {
+ memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
+ }
+ ovs_mutex_unlock(&netdev->mutex);
- return 0;
+ return error;
}
/*
netdev_bsd_get_mtu(const struct netdev *netdev_, int *mtup)
{
struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+ int error = 0;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_MTU)) {
struct ifreq ifr;
- int error;
error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
SIOCGIFMTU, "SIOCGIFMTU");
- if (error) {
- return error;
+ if (!error) {
+ netdev->mtu = ifr.ifr_mtu;
+ netdev->cache_valid |= VALID_MTU;
}
- netdev->mtu = ifr.ifr_mtu;
- netdev->cache_valid |= VALID_MTU;
}
+ if (!error) {
+ *mtup = netdev->mtu;
+ }
+ ovs_mutex_unlock(&netdev->mutex);
- *mtup = netdev->mtu;
-    return 0;
+    return error;
}
static int
-netdev_bsd_get_ifindex(const struct netdev *netdev)
+netdev_bsd_get_ifindex(const struct netdev *netdev_)
{
+ struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
int ifindex, error;
- error = get_ifindex(netdev, &ifindex);
+ ovs_mutex_lock(&netdev->mutex);
+ error = get_ifindex(netdev_, &ifindex);
+ ovs_mutex_unlock(&netdev->mutex);
+
return error ? -error : ifindex;
}
netdev_bsd_get_carrier(const struct netdev *netdev_, bool *carrier)
{
struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+ int error = 0;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_CARRIER)) {
struct ifmediareq ifmr;
- int error;
memset(&ifmr, 0, sizeof(ifmr));
strncpy(ifmr.ifm_name, netdev_get_kernel_name(netdev_),
sizeof ifmr.ifm_name);
error = af_inet_ioctl(SIOCGIFMEDIA, &ifmr);
- if (error) {
+ if (!error) {
+ netdev->carrier = (ifmr.ifm_status & IFM_ACTIVE) == IFM_ACTIVE;
+ netdev->cache_valid |= VALID_CARRIER;
+
+ /* If the interface doesn't report whether the media is active,
+ * just assume it is active. */
+ if ((ifmr.ifm_status & IFM_AVALID) == 0) {
+ netdev->carrier = true;
+ }
+ } else {
VLOG_DBG_RL(&rl, "%s: ioctl(SIOCGIFMEDIA) failed: %s",
netdev_get_name(netdev_), ovs_strerror(error));
- return error;
- }
-
- netdev->carrier = (ifmr.ifm_status & IFM_ACTIVE) == IFM_ACTIVE;
- netdev->cache_valid |= VALID_CARRIER;
-
- /* If the interface doesn't report whether the media is active,
- * just assume it is active. */
- if ((ifmr.ifm_status & IFM_AVALID) == 0) {
- netdev->carrier = true;
}
}
- *carrier = netdev->carrier;
+ if (!error) {
+ *carrier = netdev->carrier;
+ }
+ ovs_mutex_unlock(&netdev->mutex);
- return 0;
+ return error;
}
static void
struct in_addr *netmask)
{
struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+ int error = 0;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_IN4)) {
- const struct sockaddr_in *sin;
struct ifreq ifr;
- int error;
ifr.ifr_addr.sa_family = AF_INET;
error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
SIOCGIFADDR, "SIOCGIFADDR");
- if (error) {
- return error;
- }
+ if (!error) {
+ const struct sockaddr_in *sin;
- sin = (struct sockaddr_in *) &ifr.ifr_addr;
- netdev->in4 = sin->sin_addr;
- error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
- SIOCGIFNETMASK, "SIOCGIFNETMASK");
- if (error) {
- return error;
+ sin = (struct sockaddr_in *) &ifr.ifr_addr;
+ netdev->in4 = sin->sin_addr;
+            error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
+                                        SIOCGIFNETMASK, "SIOCGIFNETMASK");
+            if (!error) {
+                netdev->netmask = sin->sin_addr;
+                netdev->cache_valid |= VALID_IN4;
+            }
}
- netdev->netmask = sin->sin_addr;
- netdev->cache_valid |= VALID_IN4;
}
- *in4 = netdev->in4;
- *netmask = netdev->netmask;
+ if (!error) {
+ *in4 = netdev->in4;
+ *netmask = netdev->netmask;
+ }
+ ovs_mutex_unlock(&netdev->mutex);
- return in4->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
+ return error ? error : in4->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
}
/*
struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", addr);
if (!error) {
if (addr.s_addr != INADDR_ANY) {
}
netdev_bsd_changed(netdev);
}
+ ovs_mutex_unlock(&netdev->mutex);
+
return error;
}
#endif
}
+static int
+netdev_bsd_arp_lookup(const struct netdev *netdev OVS_UNUSED,
+ ovs_be32 ip OVS_UNUSED,
+ uint8_t mac[ETH_ADDR_LEN] OVS_UNUSED)
+{
+#if defined(__NetBSD__)
+ const struct rt_msghdr *rtm;
+ size_t needed;
+ char *buf;
+ const char *cp;
+ const char *ep;
+ int mib[6];
+ int error;
+
+ buf = NULL;
+ mib[0] = CTL_NET;
+ mib[1] = PF_ROUTE;
+ mib[2] = 0;
+ mib[3] = AF_INET;
+ mib[4] = NET_RT_FLAGS;
+ mib[5] = RTF_LLINFO;
+ if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) {
+ error = errno;
+ goto error;
+ }
+ buf = xmalloc(needed);
+ if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) {
+ error = errno;
+ goto error;
+ }
+ ep = buf + needed;
+ for (cp = buf; cp < ep; cp += rtm->rtm_msglen) {
+ const struct sockaddr_inarp *sina;
+ const struct sockaddr_dl *sdl;
+
+ rtm = (const void *)cp;
+ sina = (const void *)(rtm + 1);
+ if (ip != sina->sin_addr.s_addr) {
+ continue;
+ }
+ sdl = (const void *)
+ ((const char *)(const void *)sina + RT_ROUNDUP(sina->sin_len));
+ if (sdl->sdl_alen == ETH_ADDR_LEN) {
+ memcpy(mac, &sdl->sdl_data[sdl->sdl_nlen], ETH_ADDR_LEN);
+ error = 0;
+ goto error;
+ }
+ }
+ error = ENXIO;
+error:
+ free(buf);
+ return error;
+#else
+ return EOPNOTSUPP;
+#endif
+}
+
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
static int
do_set_addr(struct netdev *netdev,
- int ioctl_nr, const char *ioctl_name, struct in_addr addr)
+ unsigned long ioctl_nr, const char *ioctl_name,
+ struct in_addr addr)
{
struct ifreq ifr;
make_in4_sockaddr(&ifr.ifr_addr, addr);
NULL, /* add_router */
netdev_bsd_get_next_hop,
NULL, /* get_status */
- NULL, /* arp_lookup */
+ netdev_bsd_arp_lookup, /* arp_lookup */
netdev_bsd_update_flags,
NULL, /* add_router */
netdev_bsd_get_next_hop,
NULL, /* get_status */
- NULL, /* arp_lookup */
+ netdev_bsd_arp_lookup, /* arp_lookup */
netdev_bsd_update_flags,
/* Calls ioctl() on an AF_LINK sock, passing the specified 'command' and
* 'arg'. Returns 0 if successful, otherwise a positive errno value. */
int
-af_link_ioctl(int command, const void *arg)
+af_link_ioctl(unsigned long command, const void *arg)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
static int sock;
struct list txq;
};
+/* Protects 'dummy_list'. */
+static struct ovs_mutex dummy_list_mutex = OVS_MUTEX_INITIALIZER;
+
+/* Contains all 'struct netdev_dummy's. */
+static struct list dummy_list OVS_GUARDED_BY(dummy_list_mutex)
+ = LIST_INITIALIZER(&dummy_list);
+
struct netdev_dummy {
struct netdev up;
+
+ /* In dummy_list. */
+ struct list list_node OVS_GUARDED_BY(dummy_list_mutex);
+
+ /* Protects all members below. */
+ struct ovs_mutex mutex OVS_ACQ_AFTER(dummy_list_mutex);
+
uint8_t hwaddr[ETH_ADDR_LEN];
int mtu;
struct netdev_stats stats;
struct list rxes; /* List of child "netdev_rx_dummy"s. */
};
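The OVS_GUARDED_BY and OVS_ACQ_AFTER annotations above encode a fixed lock
order: dummy_list_mutex is always acquired before any one device's mutex,
never the other way around.  A minimal sketch of a correctly ordered
traversal (for_each_dummy() is illustrative only):

static void
for_each_dummy(void (*cb)(struct netdev_dummy *))
{
    struct netdev_dummy *dev;

    ovs_mutex_lock(&dummy_list_mutex);            /* 1: global list lock. */
    LIST_FOR_EACH (dev, list_node, &dummy_list) {
        ovs_mutex_lock(&dev->mutex);              /* 2: per-device lock. */
        cb(dev);
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dummy_list_mutex);
}

Because netdev_dummy_destruct() removes the device from dummy_list under the
same mutex, iteration needs no per-device references, which is why the
netdev_close() calls disappear from the run/wait loops below.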
-static const struct netdev_class dummy_class;
-
/* Max 'recv_queue_len' in struct netdev_dummy. */
#define NETDEV_DUMMY_MAX_QUEUE 100
static unixctl_cb_func netdev_dummy_set_admin_state;
static int netdev_dummy_construct(struct netdev *);
-static void netdev_dummy_poll_notify(struct netdev_dummy *);
+static void netdev_dummy_poll_notify(struct netdev_dummy *netdev)
+ OVS_REQUIRES(netdev->mutex);
static void netdev_dummy_queue_packet(struct netdev_dummy *, struct ofpbuf *);
static void dummy_stream_close(struct dummy_stream *);
static void
netdev_dummy_run(void)
{
- struct shash dummy_netdevs;
- struct shash_node *node;
+ struct netdev_dummy *dev;
- shash_init(&dummy_netdevs);
- netdev_get_devices(&dummy_class, &dummy_netdevs);
- SHASH_FOR_EACH (node, &dummy_netdevs) {
- struct netdev_dummy *dev = node->data;
+ ovs_mutex_lock(&dummy_list_mutex);
+ LIST_FOR_EACH (dev, list_node, &dummy_list) {
size_t i;
+ ovs_mutex_lock(&dev->mutex);
+
if (dev->pstream) {
struct stream *new_stream;
int error;
}
}
- netdev_close(&dev->up);
+ ovs_mutex_unlock(&dev->mutex);
}
- shash_destroy(&dummy_netdevs);
+ ovs_mutex_unlock(&dummy_list_mutex);
}
static void
static void
netdev_dummy_wait(void)
{
- struct shash dummy_netdevs;
- struct shash_node *node;
+ struct netdev_dummy *dev;
- shash_init(&dummy_netdevs);
- netdev_get_devices(&dummy_class, &dummy_netdevs);
- SHASH_FOR_EACH (node, &dummy_netdevs) {
- struct netdev_dummy *dev = node->data;
+ ovs_mutex_lock(&dummy_list_mutex);
+ LIST_FOR_EACH (dev, list_node, &dummy_list) {
size_t i;
+ ovs_mutex_lock(&dev->mutex);
if (dev->pstream) {
pstream_wait(dev->pstream);
}
}
stream_recv_wait(s->stream);
}
- netdev_close(&dev->up);
+ ovs_mutex_unlock(&dev->mutex);
}
- shash_destroy(&dummy_netdevs);
+ ovs_mutex_unlock(&dummy_list_mutex);
}
static struct netdev *
unsigned int n;
atomic_add(&next_n, 1, &n);
+
+ ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
netdev->hwaddr[0] = 0xaa;
netdev->hwaddr[1] = 0x55;
netdev->hwaddr[2] = n >> 24;
list_init(&netdev->rxes);
+ ovs_mutex_lock(&dummy_list_mutex);
+ list_push_back(&dummy_list, &netdev->list_node);
+ ovs_mutex_unlock(&dummy_list_mutex);
+
return 0;
}
struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
size_t i;
+ ovs_mutex_lock(&dummy_list_mutex);
+ list_remove(&netdev->list_node);
+ ovs_mutex_unlock(&dummy_list_mutex);
+
pstream_close(netdev->pstream);
for (i = 0; i < netdev->n_streams; i++) {
dummy_stream_close(&netdev->streams[i]);
}
free(netdev->streams);
+ ovs_mutex_destroy(&netdev->mutex);
}
static void
struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
const char *pstream;
+ ovs_mutex_lock(&netdev->mutex);
netdev->ifindex = smap_get_int(args, "ifindex", -EOPNOTSUPP);
pstream = smap_get(args, "pstream");
}
}
}
+ ovs_mutex_unlock(&netdev->mutex);
+
return 0;
}
struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
+ ovs_mutex_lock(&netdev->mutex);
list_push_back(&netdev->rxes, &rx->node);
list_init(&rx->recv_queue);
rx->recv_queue_len = 0;
+ ovs_mutex_unlock(&netdev->mutex);
return 0;
}
netdev_dummy_rx_destruct(struct netdev_rx *rx_)
{
struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+ struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
+ ovs_mutex_lock(&netdev->mutex);
list_remove(&rx->node);
ofpbuf_list_delete(&rx->recv_queue);
+ ovs_mutex_unlock(&netdev->mutex);
}
static void
netdev_dummy_rx_recv(struct netdev_rx *rx_, void *buffer, size_t size)
{
struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+ struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
struct ofpbuf *packet;
int retval;
- if (list_is_empty(&rx->recv_queue)) {
+ ovs_mutex_lock(&netdev->mutex);
+ if (!list_is_empty(&rx->recv_queue)) {
+ packet = ofpbuf_from_list(list_pop_front(&rx->recv_queue));
+ rx->recv_queue_len--;
+ } else {
+ packet = NULL;
+ }
+ ovs_mutex_unlock(&netdev->mutex);
+
+ if (!packet) {
return -EAGAIN;
}
- packet = ofpbuf_from_list(list_pop_front(&rx->recv_queue));
- rx->recv_queue_len--;
if (packet->size <= size) {
memcpy(buffer, packet->data, packet->size);
retval = packet->size;
netdev_dummy_rx_wait(struct netdev_rx *rx_)
{
struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+ struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
+
+ ovs_mutex_lock(&netdev->mutex);
if (!list_is_empty(&rx->recv_queue)) {
poll_immediate_wake();
}
+ ovs_mutex_unlock(&netdev->mutex);
}
static int
netdev_dummy_rx_drain(struct netdev_rx *rx_)
{
struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+ struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
+
+ ovs_mutex_lock(&netdev->mutex);
ofpbuf_list_delete(&rx->recv_queue);
rx->recv_queue_len = 0;
+ ovs_mutex_unlock(&netdev->mutex);
+
return 0;
}
}
}
+ ovs_mutex_lock(&dev->mutex);
dev->stats.tx_packets++;
dev->stats.tx_bytes += size;
list_push_back(&s->txq, &b->list_node);
}
}
+ ovs_mutex_unlock(&dev->mutex);
return 0;
}
{
struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ ovs_mutex_lock(&dev->mutex);
if (!eth_addr_equals(dev->hwaddr, mac)) {
memcpy(dev->hwaddr, mac, ETH_ADDR_LEN);
netdev_dummy_poll_notify(dev);
}
+ ovs_mutex_unlock(&dev->mutex);
return 0;
}
netdev_dummy_get_etheraddr(const struct netdev *netdev,
uint8_t mac[ETH_ADDR_LEN])
{
- const struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ ovs_mutex_lock(&dev->mutex);
memcpy(mac, dev->hwaddr, ETH_ADDR_LEN);
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
static int
netdev_dummy_get_mtu(const struct netdev *netdev, int *mtup)
{
- const struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ ovs_mutex_lock(&dev->mutex);
*mtup = dev->mtu;
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
{
struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ ovs_mutex_lock(&dev->mutex);
dev->mtu = mtu;
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
static int
netdev_dummy_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
- const struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ ovs_mutex_lock(&dev->mutex);
*stats = dev->stats;
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
{
struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ ovs_mutex_lock(&dev->mutex);
dev->stats = *stats;
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
netdev_dummy_get_ifindex(const struct netdev *netdev)
{
struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+ int ifindex;
- return dev->ifindex;
+ ovs_mutex_lock(&dev->mutex);
+ ifindex = dev->ifindex;
+ ovs_mutex_unlock(&dev->mutex);
+
+ return ifindex;
}
static int
-netdev_dummy_update_flags(struct netdev *netdev_,
- enum netdev_flags off, enum netdev_flags on,
- enum netdev_flags *old_flagsp)
+netdev_dummy_update_flags__(struct netdev_dummy *netdev,
+ enum netdev_flags off, enum netdev_flags on,
+ enum netdev_flags *old_flagsp)
+ OVS_REQUIRES(netdev->mutex)
{
- struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
-
if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
return EINVAL;
}
if (*old_flagsp != netdev->flags) {
netdev_dummy_poll_notify(netdev);
}
+
return 0;
}
+static int
+netdev_dummy_update_flags(struct netdev *netdev_,
+ enum netdev_flags off, enum netdev_flags on,
+ enum netdev_flags *old_flagsp)
+{
+ struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
+ int error;
+
+ ovs_mutex_lock(&netdev->mutex);
+ error = netdev_dummy_update_flags__(netdev, off, on, old_flagsp);
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
+}
+
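This split is the convention used throughout these patches: a
double-underscore variant annotated OVS_REQUIRES(mutex) assumes the lock is
already held, and the public entry point acquires it.  It lets
already-locked internal callers such as netdev_dummy_set_admin_state__()
reuse the logic, since ovs_mutex is not recursive.  In miniature (names
illustrative):

static int
example_op__(struct netdev_dummy *dev) OVS_REQUIRES(dev->mutex)
{
    /* ...work on 'dev' with the mutex already held... */
    return 0;
}

static int
example_op(struct netdev_dummy *dev) OVS_EXCLUDED(dev->mutex)
{
    int error;

    ovs_mutex_lock(&dev->mutex);
    error = example_op__(dev);
    ovs_mutex_unlock(&dev->mutex);

    return error;
}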
static unsigned int
-netdev_dummy_change_seq(const struct netdev *netdev)
+netdev_dummy_change_seq(const struct netdev *netdev_)
{
- return netdev_dummy_cast(netdev)->change_seq;
+ struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
+ unsigned int change_seq;
+
+ ovs_mutex_lock(&netdev->mutex);
+ change_seq = netdev->change_seq;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return change_seq;
}
\f
/* Helper functions. */
goto exit;
}
+ ovs_mutex_lock(&dummy_dev->mutex);
dummy_dev->stats.rx_packets++;
dummy_dev->stats.rx_bytes += packet->size;
-
netdev_dummy_queue_packet(dummy_dev, packet);
+ ovs_mutex_unlock(&dummy_dev->mutex);
}
unixctl_command_reply(conn, NULL);
static void
netdev_dummy_set_admin_state__(struct netdev_dummy *dev, bool admin_state)
+ OVS_REQUIRES(dev->mutex)
{
enum netdev_flags old_flags;
if (admin_state) {
- netdev_dummy_update_flags(&dev->up, 0, NETDEV_UP, &old_flags);
+ netdev_dummy_update_flags__(dev, 0, NETDEV_UP, &old_flags);
} else {
- netdev_dummy_update_flags(&dev->up, NETDEV_UP, 0, &old_flags);
+ netdev_dummy_update_flags__(dev, NETDEV_UP, 0, &old_flags);
}
}
if (netdev && is_dummy_class(netdev->netdev_class)) {
struct netdev_dummy *dummy_dev = netdev_dummy_cast(netdev);
+ ovs_mutex_lock(&dummy_dev->mutex);
netdev_dummy_set_admin_state__(dummy_dev, up);
+ ovs_mutex_unlock(&dummy_dev->mutex);
+
netdev_close(netdev);
} else {
unixctl_command_reply_error(conn, "Unknown Dummy Interface");
return;
}
} else {
- struct shash dummy_netdevs;
- struct shash_node *node;
-
- shash_init(&dummy_netdevs);
- netdev_get_devices(&dummy_class, &dummy_netdevs);
- SHASH_FOR_EACH (node, &dummy_netdevs) {
- struct netdev *netdev = node->data;
- netdev_dummy_set_admin_state__(netdev_dummy_cast(netdev), up);
- netdev_close(netdev);
+ struct netdev_dummy *netdev;
+
+ ovs_mutex_lock(&dummy_list_mutex);
+ LIST_FOR_EACH (netdev, list_node, &dummy_list) {
+ ovs_mutex_lock(&netdev->mutex);
+ netdev_dummy_set_admin_state__(netdev, up);
+ ovs_mutex_unlock(&netdev->mutex);
}
- shash_destroy(&dummy_netdevs);
+ ovs_mutex_unlock(&dummy_list_mutex);
}
unixctl_command_reply(conn, "OK");
}
SSET_FOR_EACH (type, &types) {
if (!netdev_unregister_provider(type)) {
struct netdev_class *class;
+ int error;
- class = xmalloc(sizeof *class);
- *class = dummy_class;
+ class = xmemdup(&dummy_class, sizeof dummy_class);
class->type = xstrdup(type);
- netdev_register_provider(class);
+ error = netdev_register_provider(class);
+ if (error) {
+ VLOG_ERR("%s: failed to register netdev provider (%s)",
+ type, ovs_strerror(error));
+ free(CONST_CAST(char *, class->type));
+ free(class);
+ }
}
}
sset_destroy(&types);
#define TC_RTAB_SIZE 1024
#endif
-static struct nln_notifier *netdev_linux_cache_notifier = NULL;
-static int cache_notifier_refcount;
-
enum {
VALID_IFINDEX = 1 << 0,
VALID_ETHERADDR = 1 << 1,
struct netdev_linux {
struct netdev up;
+ /* Protects all members below. */
+ struct ovs_mutex mutex;
+
unsigned int cache_valid;
unsigned int change_seq;
int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, unsigned int *flags);
static int set_flags(const char *, unsigned int flags);
+static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
+ enum netdev_flags on, enum netdev_flags *old_flagsp)
+ OVS_REQUIRES(netdev->mutex);
static int do_get_ifindex(const char *netdev_name);
static int get_ifindex(const struct netdev *, int *ifindexp);
static int do_set_addr(struct netdev *netdev,
return CONTAINER_OF(rx, struct netdev_rx_linux, up);
}
\f
+static void netdev_linux_update(struct netdev_linux *netdev,
+ const struct rtnetlink_link_change *)
+ OVS_REQUIRES(netdev->mutex);
+static void netdev_linux_changed(struct netdev_linux *netdev,
+ unsigned int ifi_flags, unsigned int mask)
+ OVS_REQUIRES(netdev->mutex);
+
+/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
+ * if no such socket could be created. */
+static struct nl_sock *
+netdev_linux_notify_sock(void)
+{
+ static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+ static struct nl_sock *sock;
+
+ if (ovsthread_once_start(&once)) {
+ int error;
+
+ error = nl_sock_create(NETLINK_ROUTE, &sock);
+ if (!error) {
+ error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
+ if (error) {
+ nl_sock_destroy(sock);
+ sock = NULL;
+ }
+ }
+ ovsthread_once_done(&once);
+ }
+
+ return sock;
+}
+
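netdev_linux_notify_sock() relies on the ovsthread_once idiom: the first
caller runs the initialization while later callers wait for it to finish,
after which every call takes the cheap already-done path.  A generic sketch
(resource_create() and get_shared_resource() are hypothetical):

static struct resource *
get_shared_resource(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct resource *resource;

    if (ovsthread_once_start(&once)) {
        resource = resource_create();   /* Runs exactly once, ever. */
        ovsthread_once_done(&once);
    }
    return resource;                    /* NULL if creation failed. */
}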
static void
netdev_linux_run(void)
{
- rtnetlink_link_run();
+ struct nl_sock *sock;
+ int error;
+
netdev_linux_miimon_run();
+
+ sock = netdev_linux_notify_sock();
+ if (!sock) {
+ return;
+ }
+
+ do {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ uint64_t buf_stub[4096 / 8];
+ struct ofpbuf buf;
+
+ ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
+ error = nl_sock_recv(sock, &buf, false);
+ if (!error) {
+ struct rtnetlink_link_change change;
+
+ if (rtnetlink_link_parse(&buf, &change)) {
+ struct netdev *netdev_ = netdev_from_name(change.ifname);
+ if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+
+ ovs_mutex_lock(&netdev->mutex);
+ netdev_linux_update(netdev, &change);
+ ovs_mutex_unlock(&netdev->mutex);
+ }
+ netdev_close(netdev_);
+ }
+ } else if (error == ENOBUFS) {
+ struct shash device_shash;
+ struct shash_node *node;
+
+ nl_sock_drain(sock);
+
+ shash_init(&device_shash);
+ netdev_get_devices(&netdev_linux_class, &device_shash);
+ SHASH_FOR_EACH (node, &device_shash) {
+ struct netdev *netdev_ = node->data;
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ unsigned int flags;
+
+ ovs_mutex_lock(&netdev->mutex);
+ get_flags(netdev_, &flags);
+ netdev_linux_changed(netdev, flags, 0);
+ ovs_mutex_unlock(&netdev->mutex);
+
+ netdev_close(netdev_);
+ }
+ shash_destroy(&device_shash);
+ } else if (error != EAGAIN) {
+ VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
+ ovs_strerror(error));
+ }
+ ofpbuf_uninit(&buf);
+ } while (!error);
}
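The replacement for the shared rtnetlink notifier is a per-process
NETLINK_ROUTE socket subscribed to link changes, plus an ENOBUFS path: when
the kernel's socket queue overflows and notifications are lost, the code
drains the socket and re-reads every device's flags rather than trusting the
stream.  For reference, the kernel-facing half reduces to plain socket
calls; a sketch using only the standard Linux API, outside any OVS wrappers:

#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int
open_rtnl_link_socket(void)
{
    struct sockaddr_nl addr;
    int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

    if (fd < 0) {
        return -1;
    }
    memset(&addr, 0, sizeof addr);
    addr.nl_family = AF_NETLINK;
    addr.nl_groups = RTMGRP_LINK;       /* Link up/down, rename, etc. */
    if (bind(fd, (struct sockaddr *) &addr, sizeof addr) < 0) {
        close(fd);
        return -1;
    }
    return fd;      /* recv() now yields RTM_NEWLINK/RTM_DELLINK messages;
                     * recv() failing with ENOBUFS signals lost messages. */
}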
static void
netdev_linux_wait(void)
{
- rtnetlink_link_wait();
+ struct nl_sock *sock;
+
netdev_linux_miimon_wait();
+ sock = netdev_linux_notify_sock();
+ if (sock) {
+ nl_sock_wait(sock, POLLIN);
+ }
}
static void
netdev_linux_changed(struct netdev_linux *dev,
unsigned int ifi_flags, unsigned int mask)
+ OVS_REQUIRES(dev->mutex)
{
dev->change_seq++;
if (!dev->change_seq) {
static void
netdev_linux_update(struct netdev_linux *dev,
const struct rtnetlink_link_change *change)
+ OVS_REQUIRES(dev->mutex)
{
if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep drv-info */
}
}
-static void
-netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
- void *aux OVS_UNUSED)
-{
- if (change) {
- struct netdev *base_dev = netdev_from_name(change->ifname);
- if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
- netdev_linux_update(netdev_linux_cast(base_dev), change);
- netdev_close(base_dev);
- }
- } else {
- struct shash device_shash;
- struct shash_node *node;
-
- shash_init(&device_shash);
- netdev_get_devices(&netdev_linux_class, &device_shash);
- SHASH_FOR_EACH (node, &device_shash) {
- struct netdev *netdev = node->data;
- struct netdev_linux *dev = netdev_linux_cast(netdev);
- unsigned int flags;
-
- get_flags(&dev->up, &flags);
- netdev_linux_changed(dev, flags, 0);
- netdev_close(netdev);
- }
- shash_destroy(&device_shash);
- }
-}
-
-static int
-cache_notifier_ref(void)
-{
- if (!cache_notifier_refcount) {
- ovs_assert(!netdev_linux_cache_notifier);
-
- netdev_linux_cache_notifier =
- rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
-
- if (!netdev_linux_cache_notifier) {
- return EINVAL;
- }
- }
- cache_notifier_refcount++;
-
- return 0;
-}
-
-static void
-cache_notifier_unref(void)
-{
- ovs_assert(cache_notifier_refcount > 0);
- if (!--cache_notifier_refcount) {
- ovs_assert(netdev_linux_cache_notifier);
- rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
- netdev_linux_cache_notifier = NULL;
- }
-}
-
static struct netdev *
netdev_linux_alloc(void)
{
return &netdev->up;
}
-static int
+static void
netdev_linux_common_construct(struct netdev_linux *netdev)
{
+ ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
netdev->change_seq = 1;
-
- return cache_notifier_ref();
}
/* Creates system and internal devices. */
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
- error = netdev_linux_common_construct(netdev);
- if (error) {
- return error;
- }
+ netdev_linux_common_construct(netdev);
error = get_flags(&netdev->up, &netdev->ifi_flags);
if (error == ENODEV) {
if (netdev->up.netdev_class != &netdev_internal_class) {
/* The device does not exist, so don't allow it to be opened. */
- cache_notifier_unref();
return ENODEV;
} else {
/* "Internal" netdevs have to be created as netdev objects before
struct ifreq ifr;
int error;
- error = netdev_linux_common_construct(netdev);
- if (error) {
- goto error;
- }
+ netdev_linux_common_construct(netdev);
/* Open tap device. */
netdev->tap_fd = open(tap_dev, O_RDWR);
if (netdev->tap_fd < 0) {
error = errno;
VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
- goto error_unref_notifier;
+ return error;
}
/* Create tap device. */
error_close:
close(netdev->tap_fd);
-error_unref_notifier:
- cache_notifier_unref();
-error:
return error;
}
close(netdev->tap_fd);
}
- cache_notifier_unref();
+ ovs_mutex_destroy(&netdev->mutex);
}
static void
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
rx->is_tap = is_tap_netdev(netdev_);
if (rx->is_tap) {
rx->fd = netdev->tap_fd;
goto error;
}
}
+ ovs_mutex_unlock(&netdev->mutex);
return 0;
if (rx->fd >= 0) {
close(rx->fd);
}
+ ovs_mutex_unlock(&netdev->mutex);
return error;
}
struct msghdr msg;
struct iovec iov;
int ifindex;
- int error;
int sock;
sock = af_packet_sock();
return -sock;
}
- error = get_ifindex(netdev_, &ifindex);
- if (error) {
- return error;
+ ifindex = netdev_get_ifindex(netdev_);
+ if (ifindex < 0) {
+ return -ifindex;
}
/* We don't bother setting most fields in sockaddr_ll because the
const uint8_t mac[ETH_ADDR_LEN])
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
- struct netdev_saved_flags *sf = NULL;
+ enum netdev_flags old_flags = 0;
int error;
+ ovs_mutex_lock(&netdev->mutex);
+
if (netdev->cache_valid & VALID_ETHERADDR) {
- if (netdev->ether_addr_error) {
- return netdev->ether_addr_error;
- }
- if (eth_addr_equals(netdev->etheraddr, mac)) {
- return 0;
+ error = netdev->ether_addr_error;
+ if (error || eth_addr_equals(netdev->etheraddr, mac)) {
+ goto exit;
}
netdev->cache_valid &= ~VALID_ETHERADDR;
}
/* Tap devices must be brought down before setting the address. */
if (is_tap_netdev(netdev_)) {
- netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
+ update_flags(netdev, NETDEV_UP, 0, &old_flags);
}
error = set_etheraddr(netdev_get_name(netdev_), mac);
if (!error || error == ENODEV) {
}
}
- netdev_restore_flags(sf);
+ if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
+ update_flags(netdev, 0, NETDEV_UP, &old_flags);
+ }
+exit:
+ ovs_mutex_unlock(&netdev->mutex);
return error;
}
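Where the old code could return from inside the cached-state checks, the
locked version funnels every path through a single unlock, so early exits
become 'goto exit'.  The shape, with hypothetical step_one()/step_two():

static int
guarded_op(struct netdev_linux *netdev)
{
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = step_one(netdev);           /* Hypothetical. */
    if (error) {
        goto exit;
    }
    error = step_two(netdev);           /* Hypothetical. */

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}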
uint8_t mac[ETH_ADDR_LEN])
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ int error;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_ETHERADDR)) {
- int error = get_etheraddr(netdev_get_name(netdev_),
- netdev->etheraddr);
-
- netdev->ether_addr_error = error;
+ netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
+ netdev->etheraddr);
netdev->cache_valid |= VALID_ETHERADDR;
}
- if (!netdev->ether_addr_error) {
+ error = netdev->ether_addr_error;
+ if (!error) {
memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
}
+ ovs_mutex_unlock(&netdev->mutex);
- return netdev->ether_addr_error;
+ return error;
}
/* Returns the maximum size of transmitted (and received) packets on 'netdev',
netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ int error;
+
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_MTU)) {
struct ifreq ifr;
- int error;
-
- error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
- SIOCGIFMTU, "SIOCGIFMTU");
- netdev->netdev_mtu_error = error;
+ netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
+ netdev_get_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
netdev->mtu = ifr.ifr_mtu;
netdev->cache_valid |= VALID_MTU;
}
- if (!netdev->netdev_mtu_error) {
+ error = netdev->netdev_mtu_error;
+ if (!error) {
*mtup = netdev->mtu;
}
- return netdev->netdev_mtu_error;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
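The MTU and Ethernet-address getters share one cache protocol: a VALID_* bit
in 'cache_valid' marks an attribute as fetched, and the errno from fetching
is cached next to the value so a failing device does not retrigger the ioctl
on every call; setters clear the bit before writing through.  Condensed into
a sketch (fetch_attr() stands in for the ioctl):

static int
cached_get_mtu(struct netdev_linux *netdev, int *mtup)
    OVS_REQUIRES(netdev->mutex)
{
    if (!(netdev->cache_valid & VALID_MTU)) {
        netdev->netdev_mtu_error = fetch_attr(netdev, &netdev->mtu);
        netdev->cache_valid |= VALID_MTU;   /* The errno is cached too. */
    }
    if (!netdev->netdev_mtu_error) {
        *mtup = netdev->mtu;
    }
    return netdev->netdev_mtu_error;
}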
/* Sets the maximum size of transmitted (MTU) for given device using linux
struct ifreq ifr;
int error;
+ ovs_mutex_lock(&netdev->mutex);
if (netdev->cache_valid & VALID_MTU) {
- if (netdev->netdev_mtu_error) {
- return netdev->netdev_mtu_error;
- }
- if (netdev->mtu == mtu) {
- return 0;
+ error = netdev->netdev_mtu_error;
+ if (error || netdev->mtu == mtu) {
+ goto exit;
}
netdev->cache_valid &= ~VALID_MTU;
}
netdev->mtu = ifr.ifr_mtu;
netdev->cache_valid |= VALID_MTU;
}
+exit:
+ ovs_mutex_unlock(&netdev->mutex);
return error;
}
/* Returns the ifindex of 'netdev', if successful, as a positive number.
* On failure, returns a negative errno value. */
static int
-netdev_linux_get_ifindex(const struct netdev *netdev)
+netdev_linux_get_ifindex(const struct netdev *netdev_)
{
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int ifindex, error;
- error = get_ifindex(netdev, &ifindex);
+ ovs_mutex_lock(&netdev->mutex);
+ error = get_ifindex(netdev_, &ifindex);
+ ovs_mutex_unlock(&netdev->mutex);
+
return error ? -error : ifindex;
}
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ ovs_mutex_lock(&netdev->mutex);
if (netdev->miimon_interval > 0) {
*carrier = netdev->miimon;
} else {
*carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
}
+ ovs_mutex_unlock(&netdev->mutex);
return 0;
}
static long long int
-netdev_linux_get_carrier_resets(const struct netdev *netdev)
+netdev_linux_get_carrier_resets(const struct netdev *netdev_)
{
- return netdev_linux_cast(netdev)->carrier_resets;
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ long long int carrier_resets;
+
+ ovs_mutex_lock(&netdev->mutex);
+ carrier_resets = netdev->carrier_resets;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return carrier_resets;
}
static int
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ ovs_mutex_lock(&netdev->mutex);
interval = interval > 0 ? MAX(interval, 100) : 0;
if (netdev->miimon_interval != interval) {
netdev->miimon_interval = interval;
timer_set_expired(&netdev->miimon_timer);
}
+ ovs_mutex_unlock(&netdev->mutex);
return 0;
}
struct netdev_linux *dev = netdev_linux_cast(netdev);
bool miimon;
- if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
- netdev_close(netdev);
- continue;
- }
+ ovs_mutex_lock(&dev->mutex);
+ if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
+ netdev_linux_get_miimon(dev->up.name, &miimon);
+ if (miimon != dev->miimon) {
+ dev->miimon = miimon;
+ netdev_linux_changed(dev, dev->ifi_flags, 0);
+ }
- netdev_linux_get_miimon(dev->up.name, &miimon);
- if (miimon != dev->miimon) {
- dev->miimon = miimon;
- netdev_linux_changed(dev, dev->ifi_flags, 0);
+ timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
}
-
- timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
+ ovs_mutex_unlock(&dev->mutex);
netdev_close(netdev);
}
struct netdev *netdev = node->data;
struct netdev_linux *dev = netdev_linux_cast(netdev);
+ ovs_mutex_lock(&dev->mutex);
if (dev->miimon_interval > 0) {
timer_wait(&dev->miimon_timer);
}
+ ovs_mutex_unlock(&dev->mutex);
netdev_close(netdev);
}
shash_destroy(&device_shash);
static int
netdev_linux_sys_get_stats(const struct netdev *netdev_,
- struct netdev_stats *stats)
+ struct netdev_stats *stats)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
static int use_netlink_stats;
struct netdev_stats dev_stats;
int error;
+ ovs_mutex_lock(&netdev->mutex);
get_stats_via_vport(netdev_, stats);
-
error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
-
if (error) {
- if (netdev->vport_stats_error) {
- return error;
- } else {
- return 0;
+ if (!netdev->vport_stats_error) {
+ error = 0;
}
- }
-
- if (netdev->vport_stats_error) {
+ } else if (netdev->vport_stats_error) {
/* stats not available from OVS then use ioctl stats. */
*stats = dev_stats;
} else {
stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
stats->tx_window_errors += dev_stats.tx_window_errors;
}
- return 0;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
/* Retrieves current device stats for 'netdev-tap' netdev or
struct netdev_stats dev_stats;
int error;
+ ovs_mutex_lock(&netdev->mutex);
get_stats_via_vport(netdev_, stats);
-
error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
if (error) {
- if (netdev->vport_stats_error) {
- return error;
- } else {
- return 0;
+ if (!netdev->vport_stats_error) {
+ error = 0;
}
- }
+ } else if (netdev->vport_stats_error) {
+ /* Transmit and receive stats will appear to be swapped relative to the
+ * other ports since we are the one sending the data, not a remote
+ * computer. For consistency, we swap them back here. This does not
+ * apply if we are getting stats from the vport layer because it always
+ * tracks stats from the perspective of the switch. */
- /* If this port is an internal port then the transmit and receive stats
- * will appear to be swapped relative to the other ports since we are the
- * one sending the data, not a remote computer. For consistency, we swap
- * them back here. This does not apply if we are getting stats from the
- * vport layer because it always tracks stats from the perspective of the
- * switch. */
- if (netdev->vport_stats_error) {
*stats = dev_stats;
swap_uint64(&stats->rx_packets, &stats->tx_packets);
swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
stats->multicast += dev_stats.multicast;
stats->collisions += dev_stats.collisions;
}
- return 0;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
static int
struct netdev_stats *stats)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ int error;
+ ovs_mutex_lock(&netdev->mutex);
get_stats_via_vport(netdev_, stats);
- return netdev->vport_stats_error;
+ error = netdev->vport_stats_error;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
static int
static void
netdev_linux_read_features(struct netdev_linux *netdev)
+ OVS_REQUIRES(netdev->mutex)
{
struct ethtool_cmd ecmd;
uint32_t speed;
enum netdev_features *peer)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ int error;
+ ovs_mutex_lock(&netdev->mutex);
netdev_linux_read_features(netdev);
-
if (!netdev->get_features_error) {
*current = netdev->current;
*advertised = netdev->advertised;
*supported = netdev->supported;
*peer = 0; /* XXX */
}
- return netdev->get_features_error;
+ error = netdev->get_features_error;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
/* Set the features advertised by 'netdev' to 'advertise'. */
static int
-netdev_linux_set_advertisements(struct netdev *netdev,
+netdev_linux_set_advertisements(struct netdev *netdev_,
enum netdev_features advertise)
{
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
struct ethtool_cmd ecmd;
int error;
+ ovs_mutex_lock(&netdev->mutex);
+
COVERAGE_INC(netdev_get_ethtool);
memset(&ecmd, 0, sizeof ecmd);
- error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
+ error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
ETHTOOL_GSET, "ETHTOOL_GSET");
if (error) {
- return error;
+ goto exit;
}
ecmd.advertising = 0;
ecmd.advertising |= ADVERTISED_Asym_Pause;
}
COVERAGE_INC(netdev_set_ethtool);
- return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
- ETHTOOL_SSET, "ETHTOOL_SSET");
+ error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
+ ETHTOOL_SSET, "ETHTOOL_SSET");
+
+exit:
+ ovs_mutex_unlock(&netdev->mutex);
+ return error;
}
/* Attempts to set input rate limiting (policing) policy. Returns 0 if
const char *netdev_name = netdev_get_name(netdev_);
int error;
-
kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
: !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
: kbits_burst); /* Stick with user-specified value. */
+ ovs_mutex_lock(&netdev->mutex);
if (netdev->cache_valid & VALID_POLICING) {
- if (netdev->netdev_policing_error) {
- return netdev->netdev_policing_error;
- }
-
- if (netdev->kbits_rate == kbits_rate &&
- netdev->kbits_burst == kbits_burst) {
+ error = netdev->netdev_policing_error;
+ if (error || (netdev->kbits_rate == kbits_rate &&
+ netdev->kbits_burst == kbits_burst)) {
/* Assume that settings haven't changed since we last set them. */
- return 0;
+ goto out;
}
netdev->cache_valid &= ~VALID_POLICING;
}
netdev->netdev_policing_error = error;
netdev->cache_valid |= VALID_POLICING;
}
+
+out:
+    ovs_mutex_unlock(&netdev->mutex);
return error;
}
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
- if (error) {
- return error;
+ if (!error) {
+ *typep = netdev->tc->ops->ovs_name;
+ error = (netdev->tc->ops->qdisc_get
+ ? netdev->tc->ops->qdisc_get(netdev_, details)
+ : 0);
}
+ ovs_mutex_unlock(&netdev->mutex);
- *typep = netdev->tc->ops->ovs_name;
- return (netdev->tc->ops->qdisc_get
- ? netdev->tc->ops->qdisc_get(netdev_, details)
- : 0);
+ return error;
}
static int
return EOPNOTSUPP;
}
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
if (error) {
- return error;
+ goto exit;
}
if (new_ops == netdev->tc->ops) {
- return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
+ error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
} else {
/* Delete existing qdisc. */
error = tc_del_qdisc(netdev_);
if (error) {
- return error;
+ goto exit;
}
ovs_assert(netdev->tc == NULL);
/* Install new qdisc. */
error = new_ops->tc_install(netdev_, details);
ovs_assert((error == 0) == (netdev->tc != NULL));
-
- return error;
}
+
+exit:
+ ovs_mutex_unlock(&netdev->mutex);
+ return error;
}
static int
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
- if (error) {
- return error;
- } else {
+ if (!error) {
struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
- return (queue
+ error = (queue
? netdev->tc->ops->class_get(netdev_, queue, details)
: ENOENT);
}
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
static int
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
- if (error) {
- return error;
- } else if (queue_id >= netdev->tc->ops->n_queues
- || !netdev->tc->ops->class_set) {
- return EINVAL;
+ if (!error) {
+ error = (queue_id < netdev->tc->ops->n_queues
+ && netdev->tc->ops->class_set
+ ? netdev->tc->ops->class_set(netdev_, queue_id, details)
+ : EINVAL);
}
+ ovs_mutex_unlock(&netdev->mutex);
- return netdev->tc->ops->class_set(netdev_, queue_id, details);
+ return error;
}
static int
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
- if (error) {
- return error;
- } else if (!netdev->tc->ops->class_delete) {
- return EINVAL;
- } else {
- struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
- return (queue
- ? netdev->tc->ops->class_delete(netdev_, queue)
- : ENOENT);
+ if (!error) {
+ if (netdev->tc->ops->class_delete) {
+ struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
+ error = (queue
+ ? netdev->tc->ops->class_delete(netdev_, queue)
+ : ENOENT);
+ } else {
+ error = EINVAL;
+ }
}
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
static int
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
- if (error) {
- return error;
- } else if (!netdev->tc->ops->class_get_stats) {
- return EOPNOTSUPP;
- } else {
- const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
- if (!queue) {
- return ENOENT;
+ if (!error) {
+ if (netdev->tc->ops->class_get_stats) {
+ const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
+ if (queue) {
+ stats->created = queue->created;
+ error = netdev->tc->ops->class_get_stats(netdev_, queue,
+ stats);
+ } else {
+ error = ENOENT;
+ }
+ } else {
+ error = EOPNOTSUPP;
}
- stats->created = queue->created;
- return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
}
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
static bool
netdev_dump_queues_cb *cb, void *aux)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
- struct tc_queue *queue, *next_queue;
- struct smap details;
- int last_error;
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
- if (error) {
- return error;
- } else if (!netdev->tc->ops->class_get) {
- return EOPNOTSUPP;
- }
+ if (!error) {
+ if (netdev->tc->ops->class_get) {
+ struct tc_queue *queue, *next_queue;
+ struct smap details;
- last_error = 0;
- smap_init(&details);
- HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
- &netdev->tc->queues) {
- smap_clear(&details);
+ smap_init(&details);
+ HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
+ &netdev->tc->queues) {
+ int retval;
- error = netdev->tc->ops->class_get(netdev_, queue, &details);
- if (!error) {
- (*cb)(queue->queue_id, &details, aux);
+ smap_clear(&details);
+
+ retval = netdev->tc->ops->class_get(netdev_, queue, &details);
+ if (!retval) {
+ (*cb)(queue->queue_id, &details, aux);
+ } else {
+ error = retval;
+ }
+ }
+ smap_destroy(&details);
} else {
- last_error = error;
+ error = EOPNOTSUPP;
}
}
- smap_destroy(&details);
+ ovs_mutex_unlock(&netdev->mutex);
- return last_error;
+ return error;
}
static int
netdev_dump_queue_stats_cb *cb, void *aux)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
- struct nl_dump dump;
- struct ofpbuf msg;
- int last_error;
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
- if (error) {
- return error;
- } else if (!netdev->tc->ops->class_dump_stats) {
- return EOPNOTSUPP;
- }
+ if (!error) {
+ struct nl_dump dump;
- last_error = 0;
- if (!start_queue_dump(netdev_, &dump)) {
- return ENODEV;
- }
- while (nl_dump_next(&dump, &msg)) {
- error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
- if (error) {
- last_error = error;
+ if (!netdev->tc->ops->class_dump_stats) {
+ error = EOPNOTSUPP;
+ } else if (!start_queue_dump(netdev_, &dump)) {
+ error = ENODEV;
+ } else {
+ struct ofpbuf msg;
+ int retval;
+
+ while (nl_dump_next(&dump, &msg)) {
+ retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
+ cb, aux);
+ if (retval) {
+ error = retval;
+ }
+ }
+
+ retval = nl_dump_done(&dump);
+ if (retval) {
+ error = retval;
+ }
}
}
+ ovs_mutex_unlock(&netdev->mutex);
- error = nl_dump_done(&dump);
- return error ? error : last_error;
+ return error;
}
static int
struct in_addr *address, struct in_addr *netmask)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ int error;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_IN4)) {
- int error;
-
error = netdev_linux_get_ipv4(netdev_, &netdev->address,
SIOCGIFADDR, "SIOCGIFADDR");
- if (error) {
- return error;
+ if (!error) {
+ error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
+ SIOCGIFNETMASK, "SIOCGIFNETMASK");
+ if (!error) {
+ netdev->cache_valid |= VALID_IN4;
+ }
}
+ } else {
+ error = 0;
+ }
- error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
- SIOCGIFNETMASK, "SIOCGIFNETMASK");
- if (error) {
- return error;
+ if (!error) {
+ if (netdev->address.s_addr != INADDR_ANY) {
+ *address = netdev->address;
+ *netmask = netdev->netmask;
+ } else {
+ error = EADDRNOTAVAIL;
}
-
- netdev->cache_valid |= VALID_IN4;
}
- *address = netdev->address;
- *netmask = netdev->netmask;
- return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return error;
}
static int
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ ovs_mutex_lock(&netdev->mutex);
error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
if (!error) {
netdev->cache_valid |= VALID_IN4;
"SIOCSIFNETMASK", netmask);
}
}
+ ovs_mutex_unlock(&netdev->mutex);
+
return error;
}
netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_IN6)) {
FILE *file;
char line[128];
netdev->cache_valid |= VALID_IN6;
}
*in6 = netdev->in6;
+ ovs_mutex_unlock(&netdev->mutex);
+
return 0;
}
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error = 0;
+ ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_DRVINFO)) {
struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
smap_add(smap, "driver_version", netdev->drvinfo.version);
smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
}
+ ovs_mutex_unlock(&netdev->mutex);
+
return error;
}
}
static int
-netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
- enum netdev_flags on, enum netdev_flags *old_flagsp)
+update_flags(struct netdev_linux *netdev, enum netdev_flags off,
+ enum netdev_flags on, enum netdev_flags *old_flagsp)
+ OVS_REQUIRES(netdev->mutex)
{
- struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int old_flags, new_flags;
int error = 0;
*old_flagsp = iff_to_nd_flags(old_flags);
new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
if (new_flags != old_flags) {
- error = set_flags(netdev_get_name(netdev_), new_flags);
- get_flags(netdev_, &netdev->ifi_flags);
+ error = set_flags(netdev_get_name(&netdev->up), new_flags);
+ get_flags(&netdev->up, &netdev->ifi_flags);
}
+
+ return error;
+}
+
+static int
+netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
+ enum netdev_flags on, enum netdev_flags *old_flagsp)
+{
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ int error;
+
+ ovs_mutex_lock(&netdev->mutex);
+ error = update_flags(netdev, off, on, old_flagsp);
+ ovs_mutex_unlock(&netdev->mutex);
+
return error;
}
static unsigned int
-netdev_linux_change_seq(const struct netdev *netdev)
+netdev_linux_change_seq(const struct netdev *netdev_)
{
- return netdev_linux_cast(netdev)->change_seq;
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ unsigned int change_seq;
+
+ ovs_mutex_lock(&netdev->mutex);
+ change_seq = netdev->change_seq;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ return change_seq;
}
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
* Network device implementations may read these members but should not modify
* them. */
struct netdev {
+ /* The following do not change during the lifetime of a struct netdev. */
char *name; /* Name of network device. */
const struct netdev_class *netdev_class; /* Functions to control
this device. */
+
+ /* The following are protected by 'netdev_mutex' (internal to netdev.c). */
int ref_cnt; /* Times this devices was opened. */
struct shash_node *node; /* Pointer to element in global map. */
struct list saved_flags_list; /* Contains "struct netdev_saved_flags". */
int netdev_register_provider(const struct netdev_class *);
int netdev_unregister_provider(const char *type);
-const struct netdev_class *netdev_lookup_provider(const char *type);
extern const struct netdev_class netdev_linux_class;
extern const struct netdev_class netdev_internal_class;
struct netdev_vport {
struct netdev up;
+
+ /* Protects all members below. */
+ struct ovs_mutex mutex;
+
unsigned int change_seq;
uint8_t etheraddr[ETH_ADDR_LEN];
struct netdev_stats stats;
};
static int netdev_vport_construct(struct netdev *);
-static int get_patch_config(const struct netdev *, struct smap *args);
+static int get_patch_config(const struct netdev *netdev, struct smap *args);
static int get_tunnel_config(const struct netdev *, struct smap *args);
-static void netdev_vport_poll_notify(struct netdev_vport *);
+static void netdev_vport_poll_notify(struct netdev_vport *netdev)
+ OVS_REQUIRES(netdev->mutex);
static bool
is_vport_class(const struct netdev_class *class)
{
struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+ ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
netdev->change_seq = 1;
eth_addr_random(netdev->etheraddr);
route_table_unregister();
free(netdev->peer);
+ ovs_mutex_destroy(&netdev->mutex);
}
static void
const uint8_t mac[ETH_ADDR_LEN])
{
struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+
+ ovs_mutex_lock(&netdev->mutex);
memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
netdev_vport_poll_notify(netdev);
+ ovs_mutex_unlock(&netdev->mutex);
+
return 0;
}
static int
-netdev_vport_get_etheraddr(const struct netdev *netdev,
+netdev_vport_get_etheraddr(const struct netdev *netdev_,
uint8_t mac[ETH_ADDR_LEN])
{
- memcpy(mac, netdev_vport_cast(netdev)->etheraddr, ETH_ADDR_LEN);
+ struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+
+ ovs_mutex_lock(&netdev->mutex);
+ memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
+ ovs_mutex_unlock(&netdev->mutex);
+
return 0;
}
static int
-tunnel_get_status(const struct netdev *netdev, struct smap *smap)
+tunnel_get_status(const struct netdev *netdev_, struct smap *smap)
{
+ struct netdev_vport *netdev = netdev_vport_cast(netdev_);
char iface[IFNAMSIZ];
ovs_be32 route;
- route = netdev_vport_cast(netdev)->tnl_cfg.ip_dst;
+ ovs_mutex_lock(&netdev->mutex);
+ route = netdev->tnl_cfg.ip_dst;
+ ovs_mutex_unlock(&netdev->mutex);
+
if (route_table_get_name(route, iface)) {
struct netdev *egress_netdev;
&tnl_cfg.out_key_present,
&tnl_cfg.out_key_flow);
+ ovs_mutex_lock(&dev->mutex);
dev->tnl_cfg = tnl_cfg;
netdev_vport_poll_notify(dev);
+ ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
get_tunnel_config(const struct netdev *dev, struct smap *args)
{
- const struct netdev_tunnel_config *tnl_cfg =
- &netdev_vport_cast(dev)->tnl_cfg;
+ struct netdev_vport *netdev = netdev_vport_cast(dev);
+ struct netdev_tunnel_config tnl_cfg;
- if (tnl_cfg->ip_dst) {
- smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(tnl_cfg->ip_dst));
- } else if (tnl_cfg->ip_dst_flow) {
+ ovs_mutex_lock(&netdev->mutex);
+ tnl_cfg = netdev->tnl_cfg;
+ ovs_mutex_unlock(&netdev->mutex);
+
+ if (tnl_cfg.ip_dst) {
+ smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_dst));
+ } else if (tnl_cfg.ip_dst_flow) {
smap_add(args, "remote_ip", "flow");
}
- if (tnl_cfg->ip_src) {
- smap_add_format(args, "local_ip", IP_FMT, IP_ARGS(tnl_cfg->ip_src));
- } else if (tnl_cfg->ip_src_flow) {
+ if (tnl_cfg.ip_src) {
+ smap_add_format(args, "local_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_src));
+ } else if (tnl_cfg.ip_src_flow) {
smap_add(args, "local_ip", "flow");
}
- if (tnl_cfg->in_key_flow && tnl_cfg->out_key_flow) {
+ if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) {
smap_add(args, "key", "flow");
- } else if (tnl_cfg->in_key_present && tnl_cfg->out_key_present
- && tnl_cfg->in_key == tnl_cfg->out_key) {
- smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg->in_key));
+ } else if (tnl_cfg.in_key_present && tnl_cfg.out_key_present
+ && tnl_cfg.in_key == tnl_cfg.out_key) {
+ smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key));
} else {
- if (tnl_cfg->in_key_flow) {
+ if (tnl_cfg.in_key_flow) {
smap_add(args, "in_key", "flow");
- } else if (tnl_cfg->in_key_present) {
+ } else if (tnl_cfg.in_key_present) {
smap_add_format(args, "in_key", "%"PRIu64,
- ntohll(tnl_cfg->in_key));
+ ntohll(tnl_cfg.in_key));
}
- if (tnl_cfg->out_key_flow) {
+ if (tnl_cfg.out_key_flow) {
smap_add(args, "out_key", "flow");
- } else if (tnl_cfg->out_key_present) {
+ } else if (tnl_cfg.out_key_present) {
smap_add_format(args, "out_key", "%"PRIu64,
- ntohll(tnl_cfg->out_key));
+ ntohll(tnl_cfg.out_key));
}
}
- if (tnl_cfg->ttl_inherit) {
+ if (tnl_cfg.ttl_inherit) {
smap_add(args, "ttl", "inherit");
- } else if (tnl_cfg->ttl != DEFAULT_TTL) {
- smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg->ttl);
+ } else if (tnl_cfg.ttl != DEFAULT_TTL) {
+ smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl);
}
- if (tnl_cfg->tos_inherit) {
+ if (tnl_cfg.tos_inherit) {
smap_add(args, "tos", "inherit");
- } else if (tnl_cfg->tos) {
- smap_add_format(args, "tos", "0x%x", tnl_cfg->tos);
+ } else if (tnl_cfg.tos) {
+ smap_add_format(args, "tos", "0x%x", tnl_cfg.tos);
}
- if (tnl_cfg->dst_port) {
- uint16_t dst_port = ntohs(tnl_cfg->dst_port);
+ if (tnl_cfg.dst_port) {
+ uint16_t dst_port = ntohs(tnl_cfg.dst_port);
const char *type = netdev_get_type(dev);
if ((!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) ||
}
}
- if (tnl_cfg->csum) {
+ if (tnl_cfg.csum) {
smap_add(args, "csum", "true");
}
- if (!tnl_cfg->dont_fragment) {
+ if (!tnl_cfg.dont_fragment) {
smap_add(args, "df_default", "false");
}
\f
/* Code specific to patch ports. */
-const char *
-netdev_vport_patch_peer(const struct netdev *netdev)
+/* If 'netdev' is a patch port, returns the name of its peer as a malloc()'d
+ * string that the caller must free.
+ *
+ * If 'netdev' is not a patch port, returns NULL. */
+char *
+netdev_vport_patch_peer(const struct netdev *netdev_)
{
- return (netdev_vport_is_patch(netdev)
- ? netdev_vport_cast(netdev)->peer
- : NULL);
+ char *peer = NULL;
+
+ if (netdev_vport_is_patch(netdev_)) {
+ struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+
+ ovs_mutex_lock(&netdev->mutex);
+ if (netdev->peer) {
+ peer = xstrdup(netdev->peer);
+ }
+ ovs_mutex_unlock(&netdev->mutex);
+ }
+
+ return peer;
}
void
{
if (is_vport_class(netdev_get_class(netdev))) {
struct netdev_vport *dev = netdev_vport_cast(netdev);
+
+ ovs_mutex_lock(&dev->mutex);
dev->stats.rx_packets += stats->n_packets;
dev->stats.rx_bytes += stats->n_bytes;
+ ovs_mutex_unlock(&dev->mutex);
}
}
{
if (is_vport_class(netdev_get_class(netdev))) {
struct netdev_vport *dev = netdev_vport_cast(netdev);
+
+ ovs_mutex_lock(&dev->mutex);
dev->stats.tx_packets += stats->n_packets;
dev->stats.tx_bytes += stats->n_bytes;
+ ovs_mutex_unlock(&dev->mutex);
}
}
{
struct netdev_vport *dev = netdev_vport_cast(dev_);
+ ovs_mutex_lock(&dev->mutex);
if (dev->peer) {
smap_add(args, "peer", dev->peer);
}
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
return EINVAL;
}
+ ovs_mutex_lock(&dev->mutex);
free(dev->peer);
dev->peer = xstrdup(peer);
netdev_vport_poll_notify(dev);
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
struct netdev_vport *dev = netdev_vport_cast(netdev);
- memcpy(stats, &dev->stats, sizeof *stats);
+
+ ovs_mutex_lock(&dev->mutex);
+ *stats = dev->stats;
+ ovs_mutex_unlock(&dev->mutex);
+
return 0;
}
\f
TUNNEL_CLASS("vxlan", "vxlan_system"),
TUNNEL_CLASS("lisp", "lisp_system")
};
- static bool inited;
+ static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
- int i;
+ if (ovsthread_once_start(&once)) {
+ int i;
- if (!inited) {
- inited = true;
for (i = 0; i < ARRAY_SIZE(vport_classes); i++) {
netdev_register_provider(&vport_classes[i].netdev_class);
}
+ ovsthread_once_done(&once);
}
}
bool netdev_vport_is_patch(const struct netdev *);
-const char *netdev_vport_patch_peer(const struct netdev *netdev);
+char *netdev_vport_patch_peer(const struct netdev *netdev);
void netdev_vport_inc_rx(const struct netdev *,
const struct dpif_flow_stats *);
enum netdev_flags saved_values;
};
-static struct shash netdev_classes = SHASH_INITIALIZER(&netdev_classes);
+/* Protects 'netdev_shash' and the mutable members of struct netdev. */
+static struct ovs_mutex netdev_mutex = OVS_MUTEX_INITIALIZER;
/* All created network devices. */
-static struct shash netdev_shash = SHASH_INITIALIZER(&netdev_shash);
+static struct shash netdev_shash OVS_GUARDED_BY(netdev_mutex)
+ = SHASH_INITIALIZER(&netdev_shash);
+
+/* Protects 'netdev_classes' against insertions or deletions.
+ *
+ * This is an rwlock rather than a mutex not for performance reasons but
+ * because read acquisition is recursive, which matters when calling into
+ * providers.  For example, netdev_run() calls into provider 'run' functions,
+ * which might reasonably want to call one of the netdev functions that takes
+ * netdev_class_rwlock read-only. */
+static struct ovs_rwlock netdev_class_rwlock OVS_ACQ_BEFORE(netdev_mutex)
+ = OVS_RWLOCK_INITIALIZER;
+
+/* Contains 'struct netdev_registered_class'es. */
+static struct hmap netdev_classes OVS_GUARDED_BY(netdev_class_rwlock)
+ = HMAP_INITIALIZER(&netdev_classes);
+
+struct netdev_registered_class {
+ struct hmap_node hmap_node; /* In 'netdev_classes', by class->type. */
+ const struct netdev_class *class;
+ atomic_int ref_cnt; /* Number of 'struct netdev's of this class. */
+};
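The split between the rwlock and the atomic counter is deliberate: many
threads may create and destroy netdev instances concurrently under the read
lock, so the per-class instance count must be atomic, while only
unregistration, under the write lock, needs the count and the map to agree.
A sketch of the creation side under this reading of the annotations
(class_instance_created() is illustrative):

static void
class_instance_created(struct netdev_registered_class *rc)
    OVS_REQ_RDLOCK(netdev_class_rwlock)
{
    int orig;

    /* Creators run concurrently under the read lock, so the count must be
     * atomic; netdev_unregister_provider() reads it under the write lock,
     * when no creator can be mid-flight. */
    atomic_add(&rc->ref_cnt, 1, &orig);
}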
/* This is set pretty low because we probably won't learn anything from the
* additional log messages. */
static void
netdev_initialize(void)
+ OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
{
- static bool inited;
-
- if (!inited) {
- inited = true;
+ static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+ if (ovsthread_once_start(&once)) {
fatal_signal_add_hook(restore_all_flags, NULL, NULL, true);
netdev_vport_patch_register();
netdev_register_provider(&netdev_tap_class);
netdev_register_provider(&netdev_bsd_class);
#endif
- netdev_register_provider(&netdev_tunnel_class);
- netdev_register_provider(&netdev_pltap_class);
+ netdev_register_provider(&netdev_tunnel_class);
+ netdev_register_provider(&netdev_pltap_class);
+
+ ovsthread_once_done(&once);
}
}
* main poll loop. */
void
netdev_run(void)
+ OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
{
- struct shash_node *node;
- SHASH_FOR_EACH(node, &netdev_classes) {
- const struct netdev_class *netdev_class = node->data;
- if (netdev_class->run) {
- netdev_class->run();
- }
+ struct netdev_registered_class *rc;
+
+ ovs_rwlock_rdlock(&netdev_class_rwlock);
+ HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+        if (rc->class->run) {
+            rc->class->run();
+        }
}
+ ovs_rwlock_unlock(&netdev_class_rwlock);
}
/* Arranges for poll_block() to wake up when netdev_run() needs to be called.
* main poll loop. */
void
netdev_wait(void)
+ OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
{
- struct shash_node *node;
- SHASH_FOR_EACH(node, &netdev_classes) {
- const struct netdev_class *netdev_class = node->data;
- if (netdev_class->wait) {
- netdev_class->wait();
+ struct netdev_registered_class *rc;
+
+ ovs_rwlock_rdlock(&netdev_class_rwlock);
+ HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+        if (rc->class->wait) {
+            rc->class->wait();
+        }
+ }
+ ovs_rwlock_unlock(&netdev_class_rwlock);
+}
+
+static struct netdev_registered_class *
+netdev_lookup_class(const char *type)
+ OVS_REQ_RDLOCK(netdev_class_rwlock)
+{
+ struct netdev_registered_class *rc;
+
+ HMAP_FOR_EACH_WITH_HASH (rc, hmap_node, hash_string(type, 0),
+ &netdev_classes) {
+ if (!strcmp(type, rc->class->type)) {
+ return rc;
}
}
+ return NULL;
}
/* Initializes and registers a new netdev provider. After successful
* registration, new netdevs of that type can be opened using netdev_open(). */
int
netdev_register_provider(const struct netdev_class *new_class)
+ OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
{
- if (shash_find(&netdev_classes, new_class->type)) {
+ int error;
+
+ ovs_rwlock_wrlock(&netdev_class_rwlock);
+ if (netdev_lookup_class(new_class->type)) {
VLOG_WARN("attempted to register duplicate netdev provider: %s",
new_class->type);
- return EEXIST;
- }
-
- if (new_class->init) {
- int error = new_class->init();
- if (error) {
+ error = EEXIST;
+ } else {
+ error = new_class->init ? new_class->init() : 0;
+ if (!error) {
+ struct netdev_registered_class *rc;
+
+ rc = xmalloc(sizeof *rc);
+ hmap_insert(&netdev_classes, &rc->hmap_node,
+ hash_string(new_class->type, 0));
+ rc->class = new_class;
+ atomic_init(&rc->ref_cnt, 0);
+ } else {
VLOG_ERR("failed to initialize %s network device class: %s",
new_class->type, ovs_strerror(error));
- return error;
}
}
+ ovs_rwlock_unlock(&netdev_class_rwlock);
- shash_add(&netdev_classes, new_class->type, new_class);
-
- return 0;
+ return error;
}
/* Unregisters a netdev provider. 'type' must have been previously
* new netdevs of that type cannot be opened using netdev_open(). */
int
netdev_unregister_provider(const char *type)
+ OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
{
- struct shash_node *del_node, *netdev_node;
+ struct netdev_registered_class *rc;
+ int error;
- del_node = shash_find(&netdev_classes, type);
- if (!del_node) {
+ ovs_rwlock_wrlock(&netdev_class_rwlock);
+ rc = netdev_lookup_class(type);
+ if (!rc) {
VLOG_WARN("attempted to unregister a netdev provider that is not "
"registered: %s", type);
- return EAFNOSUPPORT;
- }
+ error = EAFNOSUPPORT;
+ } else {
+ int ref_cnt;
- SHASH_FOR_EACH (netdev_node, &netdev_shash) {
- struct netdev *netdev = netdev_node->data;
- if (!strcmp(netdev->netdev_class->type, type)) {
+ atomic_read(&rc->ref_cnt, &ref_cnt);
+ if (!ref_cnt) {
+ hmap_remove(&netdev_classes, &rc->hmap_node);
+ free(rc);
+ error = 0;
+ } else {
VLOG_WARN("attempted to unregister in use netdev provider: %s",
type);
- return EBUSY;
+ error = EBUSY;
}
}
+ ovs_rwlock_unlock(&netdev_class_rwlock);
- shash_delete(&netdev_classes, del_node);
-
- return 0;
-}
-
-const struct netdev_class *
-netdev_lookup_provider(const char *type)
-{
- netdev_initialize();
- return shash_find_data(&netdev_classes, type && type[0] ? type : "system");
+ return error;
}
/* Clears 'types' and enumerates the types of all currently registered netdev
* providers into it. The caller must first initialize the sset. */
void
netdev_enumerate_types(struct sset *types)
+ OVS_EXCLUDED(netdev_mutex)
{
- struct shash_node *node;
+ struct netdev_registered_class *rc;
netdev_initialize();
sset_clear(types);
- SHASH_FOR_EACH(node, &netdev_classes) {
- const struct netdev_class *netdev_class = node->data;
- sset_add(types, netdev_class->type);
+ ovs_rwlock_rdlock(&netdev_class_rwlock);
+ HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+ sset_add(types, rc->class->type);
}
+ ovs_rwlock_unlock(&netdev_class_rwlock);
}
/* Check that the network device name is not the same as any of the registered
* Returns true if there is a name conflict, false otherwise. */
bool
netdev_is_reserved_name(const char *name)
+ OVS_EXCLUDED(netdev_mutex)
{
- struct shash_node *node;
+ struct netdev_registered_class *rc;
netdev_initialize();
- SHASH_FOR_EACH (node, &netdev_classes) {
- const char *dpif_port;
- dpif_port = netdev_vport_class_get_dpif_port(node->data);
+
+ ovs_rwlock_rdlock(&netdev_class_rwlock);
+ HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+ const char *dpif_port = netdev_vport_class_get_dpif_port(rc->class);
if (dpif_port && !strcmp(dpif_port, name)) {
+ ovs_rwlock_unlock(&netdev_class_rwlock);
return true;
}
}
+ ovs_rwlock_unlock(&netdev_class_rwlock);
if (!strncmp(name, "ovs-", 4)) {
struct sset types;
* before they can be used. */
int
netdev_open(const char *name, const char *type, struct netdev **netdevp)
+ OVS_EXCLUDED(netdev_mutex)
{
struct netdev *netdev;
int error;
netdev_initialize();
+ ovs_rwlock_rdlock(&netdev_class_rwlock);
+ ovs_mutex_lock(&netdev_mutex);
netdev = shash_find_data(&netdev_shash, name);
if (!netdev) {
- const struct netdev_class *class;
+ struct netdev_registered_class *rc;
- class = netdev_lookup_provider(type);
- if (class) {
- netdev = class->alloc();
+ rc = netdev_lookup_class(type && type[0] ? type : "system");
+ if (rc) {
+ netdev = rc->class->alloc();
if (netdev) {
memset(netdev, 0, sizeof *netdev);
- netdev->netdev_class = class;
+ netdev->netdev_class = rc->class;
netdev->name = xstrdup(name);
netdev->node = shash_add(&netdev_shash, name, netdev);
list_init(&netdev->saved_flags_list);
- error = class->construct(netdev);
- if (error) {
- class->dealloc(netdev);
+ error = rc->class->construct(netdev);
+ if (!error) {
+ int old_ref_cnt;
+
+ atomic_add(&rc->ref_cnt, 1, &old_ref_cnt);
+ } else {
+ free(netdev->name);
+ ovs_assert(list_is_empty(&netdev->saved_flags_list));
+ shash_delete(&netdev_shash, netdev->node);
+ rc->class->dealloc(netdev);
}
} else {
error = ENOMEM;
error = 0;
}
+ ovs_mutex_unlock(&netdev_mutex);
+ ovs_rwlock_unlock(&netdev_class_rwlock);
+
if (!error) {
netdev->ref_cnt++;
*netdevp = netdev;
* 'netdev_' is null. */
struct netdev *
netdev_ref(const struct netdev *netdev_)
+ OVS_EXCLUDED(netdev_mutex)
{
struct netdev *netdev = CONST_CAST(struct netdev *, netdev_);
if (netdev) {
+ ovs_mutex_lock(&netdev_mutex);
ovs_assert(netdev->ref_cnt > 0);
netdev->ref_cnt++;
+ ovs_mutex_unlock(&netdev_mutex);
}
return netdev;
}
* or NULL if none are needed. */
int
netdev_set_config(struct netdev *netdev, const struct smap *args)
+ OVS_EXCLUDED(netdev_mutex)
{
if (netdev->netdev_class->set_config) {
- struct smap no_args = SMAP_INITIALIZER(&no_args);
+ const struct smap no_args = SMAP_INITIALIZER(&no_args);
return netdev->netdev_class->set_config(netdev,
args ? args : &no_args);
} else if (args && !smap_is_empty(args)) {
* smap_destroy(). */
int
netdev_get_config(const struct netdev *netdev, struct smap *args)
+ OVS_EXCLUDED(netdev_mutex)
{
int error;
const struct netdev_tunnel_config *
netdev_get_tunnel_config(const struct netdev *netdev)
+ OVS_EXCLUDED(netdev_mutex)
{
if (netdev->netdev_class->get_tunnel_config) {
return netdev->netdev_class->get_tunnel_config(netdev);
static void
netdev_unref(struct netdev *dev)
+ OVS_RELEASES(netdev_mutex)
{
ovs_assert(dev->ref_cnt);
if (!--dev->ref_cnt) {
+ const struct netdev_class *class = dev->netdev_class;
+ struct netdev_registered_class *rc;
+ int old_ref_cnt;
+
dev->netdev_class->destruct(dev);
shash_delete(&netdev_shash, dev->node);
free(dev->name);
dev->netdev_class->dealloc(dev);
+ ovs_mutex_unlock(&netdev_mutex);
+
+ ovs_rwlock_rdlock(&netdev_class_rwlock);
+ rc = netdev_lookup_class(class->type);
+ atomic_sub(&rc->ref_cnt, 1, &old_ref_cnt);
+ ovs_assert(old_ref_cnt > 0);
+ ovs_rwlock_unlock(&netdev_class_rwlock);
+ } else {
+ ovs_mutex_unlock(&netdev_mutex);
}
}
/* Closes and destroys 'netdev'. */
void
netdev_close(struct netdev *netdev)
+ OVS_EXCLUDED(netdev_mutex)
{
if (netdev) {
+ ovs_mutex_lock(&netdev_mutex);
netdev_unref(netdev);
}
}
int
netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp)
+ OVS_EXCLUDED(netdev_mutex)
{
int error;
rx->netdev = netdev;
error = netdev->netdev_class->rx_construct(rx);
if (!error) {
+ ovs_mutex_lock(&netdev_mutex);
netdev->ref_cnt++;
+ ovs_mutex_unlock(&netdev_mutex);
+
*rxp = rx;
return 0;
}
void
netdev_rx_close(struct netdev_rx *rx)
+ OVS_EXCLUDED(netdev_mutex)
{
if (rx) {
struct netdev *netdev = rx->netdev;
do_update_flags(struct netdev *netdev, enum netdev_flags off,
enum netdev_flags on, enum netdev_flags *old_flagsp,
struct netdev_saved_flags **sfp)
+ OVS_EXCLUDED(netdev_mutex)
{
struct netdev_saved_flags *sf = NULL;
enum netdev_flags old_flags;
enum netdev_flags new_flags = (old_flags & ~off) | on;
enum netdev_flags changed_flags = old_flags ^ new_flags;
if (changed_flags) {
+ ovs_mutex_lock(&netdev_mutex);
*sfp = sf = xmalloc(sizeof *sf);
sf->netdev = netdev;
list_push_front(&netdev->saved_flags_list, &sf->node);
sf->saved_values = changed_flags & new_flags;
netdev->ref_cnt++;
+ ovs_mutex_unlock(&netdev_mutex);
}
}
* Does nothing if 'sf' is NULL. */
void
netdev_restore_flags(struct netdev_saved_flags *sf)
+ OVS_EXCLUDED(netdev_mutex)
{
if (sf) {
struct netdev *netdev = sf->netdev;
sf->saved_flags & sf->saved_values,
sf->saved_flags & ~sf->saved_values,
&old_flags);
+
+ ovs_mutex_lock(&netdev_mutex);
list_remove(&sf->node);
free(sf);
-
netdev_unref(netdev);
}
}
* The caller must free the returned netdev with netdev_close(). */
struct netdev *
netdev_from_name(const char *name)
+ OVS_EXCLUDED(netdev_mutex)
{
struct netdev *netdev;
+ ovs_mutex_lock(&netdev_mutex);
netdev = shash_find_data(&netdev_shash, name);
if (netdev) {
- netdev_ref(netdev);
+ netdev->ref_cnt++;
}
+ ovs_mutex_unlock(&netdev_mutex);
return netdev;
}
void
netdev_get_devices(const struct netdev_class *netdev_class,
struct shash *device_list)
+ OVS_EXCLUDED(netdev_mutex)
{
struct shash_node *node;
+
+ ovs_mutex_lock(&netdev_mutex);
SHASH_FOR_EACH (node, &netdev_shash) {
struct netdev *dev = node->data;
shash_add(device_list, node->name, node->data);
}
}
+ ovs_mutex_unlock(&netdev_mutex);
}
const char *
* Every port on a switch must have a corresponding netdev that must minimally
* support a few operations, such as the ability to read the netdev's MTU.
* The PORTING file at the top of the source tree has more information in the
- * "Writing a netdev Provider" section. */
+ * "Writing a netdev Provider" section.
+ *
+ * Thread-safety
+ * =============
+ *
+ * Most of the netdev functions are fully thread-safe: they may be called from
+ * any number of threads on the same or different netdev objects. The
+ * exceptions are:
+ *
+ * netdev_rx_recv()
+ * netdev_rx_wait()
+ * netdev_rx_drain()
+ *
+ * These functions are conditionally thread-safe: they may be called from
+ * different threads only on different netdev_rx objects. (The client may
+ * create multiple netdev_rx objects for a single netdev and access each
+ * of those from a different thread.) */
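To make the conditional rule concrete, the sketch below dedicates one netdev_rx to each receiving thread. It is illustrative only: rx_thread_main() is a hypothetical function, and it assumes netdev_rx_recv() fills a caller-supplied ofpbuf and returns 0 on success, which this patch does not show.

    /* Minimal sketch: one netdev_rx per thread, so the conditionally
     * thread-safe rx functions are never shared between threads.  A real
     * thread would also watch an exit latch instead of looping forever. */
    static void *
    rx_thread_main(void *netdev_)
    {
        struct netdev *netdev = netdev_;
        struct netdev_rx *rx;

        if (!netdev_rx_open(netdev, &rx)) {
            for (;;) {
                uint64_t stub[2048 / 8];
                struct ofpbuf buf;

                ofpbuf_use_stub(&buf, stub, sizeof stub);
                if (!netdev_rx_recv(rx, &buf)) {
                    /* ...process the packet in 'buf'... */
                }
                ofpbuf_uninit(&buf);
            }
        }
        return NULL;
    }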
struct netdev;
struct netdev_class;
htonl(flow->regs[i]), htonl(match->wc.masks.regs[i]));
}
+ /* Mark. */
+ nxm_put_32m(b, NXM_NX_PKT_MARK, htonl(flow->pkt_mark),
+ htonl(match->wc.masks.pkt_mark));
+
/* OpenFlow 1.1+ Metadata. */
nxm_put_64m(b, OXM_OF_METADATA, flow->metadata, match->wc.masks.metadata);
break;
case OVS_KEY_ATTR_SKB_MARK:
- flow->skb_mark = nl_attr_get_u32(a);
+ flow->pkt_mark = nl_attr_get_u32(a);
break;
case OVS_KEY_ATTR_ETHERNET:
tun_key_to_attr(buf, &data->tunnel);
}
- nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->skb_mark);
+ nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark);
/* Add an ingress port attribute if this is a mask or 'odp_in_port'
* is not the magical value "ODPP_NONE". */
}
if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK)) {
- flow->skb_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]);
+ flow->pkt_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]);
expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK;
}
}
void
-odp_put_skb_mark_action(const uint32_t skb_mark,
+odp_put_pkt_mark_action(const uint32_t pkt_mark,
struct ofpbuf *odp_actions)
{
- commit_set_action(odp_actions, OVS_KEY_ATTR_SKB_MARK, &skb_mark,
- sizeof(skb_mark));
+ commit_set_action(odp_actions, OVS_KEY_ATTR_SKB_MARK, &pkt_mark,
+ sizeof(pkt_mark));
}
/* If any of the flow key data that ODP actions can modify are different in
}
static void
-commit_set_skb_mark_action(const struct flow *flow, struct flow *base,
+commit_set_pkt_mark_action(const struct flow *flow, struct flow *base,
struct ofpbuf *odp_actions,
struct flow_wildcards *wc)
{
- if (base->skb_mark == flow->skb_mark) {
+ if (base->pkt_mark == flow->pkt_mark) {
return;
}
- memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark);
- base->skb_mark = flow->skb_mark;
+ memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark);
+ base->pkt_mark = flow->pkt_mark;
- odp_put_skb_mark_action(base->skb_mark, odp_actions);
+ odp_put_pkt_mark_action(base->pkt_mark, odp_actions);
}
/* If any of the flow key data that ODP actions can modify are different in
* 'base' and 'flow', appends ODP actions to 'odp_actions' that change the flow
*/
commit_mpls_action(flow, base, odp_actions, wc);
commit_set_priority_action(flow, base, odp_actions, wc);
- commit_set_skb_mark_action(flow, base, odp_actions, wc);
+ commit_set_pkt_mark_action(flow, base, odp_actions, wc);
}
struct ofpbuf *odp_actions);
void odp_put_tunnel_action(const struct flow_tnl *tunnel,
struct ofpbuf *odp_actions);
-void odp_put_skb_mark_action(const uint32_t skb_mark,
+void odp_put_pkt_mark_action(const uint32_t pkt_mark,
struct ofpbuf *odp_actions);
/* Reasons why a subfacet might not be fast-pathable. */
}
}
+ if (pin.fmd.pkt_mark != 0) {
+ ds_put_format(string, " pkt_mark=0x%"PRIx32, pin.fmd.pkt_mark);
+ }
+
ds_put_format(string, " (via %s)",
ofputil_packet_in_reason_to_string(pin.reason, reasonbuf,
sizeof reasonbuf));
return OFPUTIL_P_NONE;
}
- /* skb_mark and skb_priority can't be sent in a flow_mod */
- if (wc->masks.skb_mark || wc->masks.skb_priority) {
+ /* skb_priority can't be sent in a flow_mod */
+ if (wc->masks.skb_priority) {
return OFPUTIL_P_NONE;
}
+ /* NXM and OXM support pkt_mark */
+ if (wc->masks.pkt_mark) {
+ return OFPUTIL_P_OF10_NXM_ANY | OFPUTIL_P_OF12_OXM
+ | OFPUTIL_P_OF13_OXM;
+ }
+
/* NXM, OXM, and OF1.1 support bitwise matching on ethernet addresses. */
if (!eth_mask_is_exact(wc->masks.dl_src)
&& !eth_addr_is_zero(wc->masks.dl_src)) {
pin->fmd.tun_dst = match->flow.tunnel.ip_dst;
pin->fmd.metadata = match->flow.metadata;
memcpy(pin->fmd.regs, match->flow.regs, sizeof pin->fmd.regs);
+ pin->fmd.pkt_mark = match->flow.pkt_mark;
}
enum ofperr
}
}
+ if (pin->fmd.pkt_mark != 0) {
+ match_set_pkt_mark(match, pin->fmd.pkt_mark);
+ }
+
match_set_in_port(match, pin->fmd.in_port);
}
}
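Since pkt_mark now lives in struct flow and in the flow metadata, controller-side code can match on it directly. A minimal sketch using the match helpers referenced above; match_init_catchall() is assumed to come from match.h:

    struct match match;

    match_init_catchall(&match);
    match_set_pkt_mark(&match, 0x1);  /* Match packets carrying mark 1. */

    /* Per the protocol check above, such a match is only expressible in
     * NXM or OXM, so flow_mod encoding falls back to those protocols. */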
static inline bool ovsthread_once_start(struct ovsthread_once *once)
- OVS_TRY_LOCK(true, &once->mutex);
+ OVS_TRY_LOCK(true, once->mutex);
void ovsthread_once_done(struct ovsthread_once *once)
- OVS_RELEASES(&once->mutex);
+ OVS_RELEASES(once->mutex);
bool ovsthread_once_start__(struct ovsthread_once *once)
- OVS_TRY_LOCK(false, &once->mutex);
+ OVS_TRY_LOCK(false, once->mutex);
static inline bool
ovsthread_once_is_done__(const struct ovsthread_once *once)
return OVS_UNLIKELY(!ovsthread_once_is_done__(once)
&& !ovsthread_once_start__(once));
}
-
-#ifdef __CHECKER__
-#define ovsthread_once_start(ONCE) \
- ((ONCE)->done ? false : ({ OVS_MACRO_LOCK((&ONCE->mutex)); true; }))
-#endif
\f
/* Thread ID.
*
#include "fatal-signal.h"
#include "list.h"
#include "ovs-thread.h"
+#include "seq.h"
#include "socket-util.h"
#include "timeval.h"
#include "vlog.h"
/* Handle any pending signals before doing anything else. */
fatal_signal_run();
+
+ seq_woke();
}
\f
static void
--- /dev/null
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "seq.h"
+
+#include <stdbool.h>
+
+#include "hash.h"
+#include "hmap.h"
+#include "latch.h"
+#include "list.h"
+#include "ovs-thread.h"
+#include "poll-loop.h"
+
+/* A sequence number object. */
+struct seq {
+ uint64_t value OVS_GUARDED;
+ struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
+};
+
+/* A thread waiting on a particular seq. */
+struct seq_waiter {
+ struct seq *seq OVS_GUARDED; /* Seq being waited for. */
+ struct hmap_node hmap_node OVS_GUARDED; /* In 'seq->waiters'. */
+ unsigned int ovsthread_id OVS_GUARDED; /* Key in 'waiters' hmap. */
+
+ struct seq_thread *thread OVS_GUARDED; /* Thread preparing to wait. */
+ struct list list_node OVS_GUARDED; /* In 'thread->waiters'. */
+
+ uint64_t value OVS_GUARDED; /* seq->value we're waiting to change. */
+};
+
+/* A thread that might be waiting on one or more seqs. */
+struct seq_thread {
+ struct list waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
+ struct latch latch OVS_GUARDED; /* Wakeup latch for this thread. */
+ bool waiting OVS_GUARDED; /* True if latch_wait() already called. */
+};
+
+static struct ovs_mutex seq_mutex = OVS_ADAPTIVE_MUTEX_INITIALIZER;
+
+static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
+
+static pthread_key_t seq_thread_key;
+
+static void seq_init(void);
+static struct seq_thread *seq_thread_get(void) OVS_REQUIRES(seq_mutex);
+static void seq_thread_exit(void *thread_) OVS_EXCLUDED(seq_mutex);
+static void seq_thread_woke(struct seq_thread *) OVS_REQUIRES(seq_mutex);
+static void seq_waiter_destroy(struct seq_waiter *) OVS_REQUIRES(seq_mutex);
+static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
+
+/* Creates and returns a new 'seq' object. */
+struct seq * OVS_EXCLUDED(seq_mutex)
+seq_create(void)
+{
+ struct seq *seq;
+
+ seq_init();
+
+ seq = xmalloc(sizeof *seq);
+ ovs_mutex_lock(&seq_mutex);
+ seq->value = seq_next++;
+ hmap_init(&seq->waiters);
+ ovs_mutex_unlock(&seq_mutex);
+
+ return seq;
+}
+
+/* Destroys 'seq', waking up threads that were waiting on it, if any. */
+void
+seq_destroy(struct seq *seq)
+ OVS_EXCLUDED(seq_mutex)
+{
+ ovs_mutex_lock(&seq_mutex);
+ seq_wake_waiters(seq);
+ hmap_destroy(&seq->waiters);
+ free(seq);
+ ovs_mutex_unlock(&seq_mutex);
+}
+
+/* Increments 'seq''s sequence number, waking up any threads that are waiting
+ * on 'seq'. */
+void
+seq_change(struct seq *seq)
+ OVS_EXCLUDED(seq_mutex)
+{
+ ovs_mutex_lock(&seq_mutex);
+ seq->value = seq_next++;
+ seq_wake_waiters(seq);
+ ovs_mutex_unlock(&seq_mutex);
+}
+
+/* Returns 'seq''s current sequence number (which could change immediately).
+ *
+ * seq_read() and seq_wait() can be used together to yield a race-free wakeup
+ * when an object changes, even without an ability to lock the object. See
+ * Usage in seq.h for details. */
+uint64_t
+seq_read(const struct seq *seq)
+ OVS_EXCLUDED(seq_mutex)
+{
+ uint64_t value;
+
+ ovs_mutex_lock(&seq_mutex);
+ value = seq->value;
+ ovs_mutex_unlock(&seq_mutex);
+
+ return value;
+}
+
+static void
+seq_wait__(struct seq *seq, uint64_t value)
+ OVS_REQUIRES(seq_mutex)
+{
+ unsigned int id = ovsthread_id_self();
+ uint32_t hash = hash_int(id, 0);
+ struct seq_waiter *waiter;
+
+ HMAP_FOR_EACH_IN_BUCKET (waiter, hmap_node, hash, &seq->waiters) {
+ if (waiter->ovsthread_id == id) {
+ if (waiter->value != value) {
+ /* The current value is different from the value we've already
+ * waited for, so wake up immediately. */
+ poll_immediate_wake();
+ } else {
+ /* Already waiting on 'value', nothing more to do. */
+ }
+ return;
+ }
+ }
+
+ waiter = xmalloc(sizeof *waiter);
+ waiter->seq = seq;
+ hmap_insert(&seq->waiters, &waiter->hmap_node, hash);
+ waiter->value = value;
+ waiter->thread = seq_thread_get();
+ list_push_back(&waiter->thread->waiters, &waiter->list_node);
+
+ if (!waiter->thread->waiting) {
+ latch_wait(&waiter->thread->latch);
+ waiter->thread->waiting = true;
+ }
+}
+
+/* Causes the following poll_block() to wake up when 'seq''s sequence number
+ * changes from 'value'. (If 'seq''s sequence number isn't 'value', then
+ * poll_block() won't block at all.)
+ *
+ * seq_read() and seq_wait() can be used together to yield a race-free wakeup
+ * when an object changes, even without an ability to lock the object. See
+ * Usage in seq.h for details. */
+void
+seq_wait(const struct seq *seq_, uint64_t value)
+ OVS_EXCLUDED(seq_mutex)
+{
+ struct seq *seq = CONST_CAST(struct seq *, seq_);
+
+ ovs_mutex_lock(&seq_mutex);
+ if (value == seq->value) {
+ seq_wait__(seq, value);
+ } else {
+ poll_immediate_wake();
+ }
+ ovs_mutex_unlock(&seq_mutex);
+}
+
+/* Called by poll_block() just before it returns, this function destroys any
+ * seq_waiter objects associated with the current thread. */
+void
+seq_woke(void)
+ OVS_EXCLUDED(seq_mutex)
+{
+ struct seq_thread *thread;
+
+ seq_init();
+
+ thread = pthread_getspecific(seq_thread_key);
+ if (thread) {
+ ovs_mutex_lock(&seq_mutex);
+ seq_thread_woke(thread);
+ thread->waiting = false;
+ ovs_mutex_unlock(&seq_mutex);
+ }
+}
+\f
+static void
+seq_init(void)
+{
+ static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+
+ if (ovsthread_once_start(&once)) {
+ xpthread_key_create(&seq_thread_key, seq_thread_exit);
+ ovsthread_once_done(&once);
+ }
+}
+
+static struct seq_thread *
+seq_thread_get(void)
+ OVS_REQUIRES(seq_mutex)
+{
+ struct seq_thread *thread = pthread_getspecific(seq_thread_key);
+ if (!thread) {
+ thread = xmalloc(sizeof *thread);
+ list_init(&thread->waiters);
+ latch_init(&thread->latch);
+ thread->waiting = false;
+
+ xpthread_setspecific(seq_thread_key, thread);
+ }
+ return thread;
+}
+
+static void
+seq_thread_exit(void *thread_)
+ OVS_EXCLUDED(seq_mutex)
+{
+ struct seq_thread *thread = thread_;
+
+ ovs_mutex_lock(&seq_mutex);
+ seq_thread_woke(thread);
+ latch_destroy(&thread->latch);
+ free(thread);
+ ovs_mutex_unlock(&seq_mutex);
+}
+
+static void
+seq_thread_woke(struct seq_thread *thread)
+ OVS_REQUIRES(seq_mutex)
+{
+ struct seq_waiter *waiter, *next_waiter;
+
+ LIST_FOR_EACH_SAFE (waiter, next_waiter, list_node, &thread->waiters) {
+ ovs_assert(waiter->thread == thread);
+ seq_waiter_destroy(waiter);
+ }
+ latch_poll(&thread->latch);
+}
+
+static void
+seq_waiter_destroy(struct seq_waiter *waiter)
+ OVS_REQUIRES(seq_mutex)
+{
+ hmap_remove(&waiter->seq->waiters, &waiter->hmap_node);
+ list_remove(&waiter->list_node);
+ free(waiter);
+}
+
+static void
+seq_wake_waiters(struct seq *seq)
+ OVS_REQUIRES(seq_mutex)
+{
+ struct seq_waiter *waiter, *next_waiter;
+
+ HMAP_FOR_EACH_SAFE (waiter, next_waiter, hmap_node, &seq->waiters) {
+ latch_set(&waiter->thread->latch);
+ seq_waiter_destroy(waiter);
+ }
+}
--- /dev/null
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SEQ_H
+#define SEQ_H 1
+
+/* Thread-safe, pollable sequence number.
+ *
+ *
+ * Motivation
+ * ==========
+ *
+ * It is sometimes desirable to take an action whenever an object changes.
+ * Suppose we associate a sequence number with an object and increment the
+ * sequence number whenever we change the object. An observer can then record
+ * the sequence number it sees. Later on, if the current sequence number
+ * differs from the one it saw last, then the observer knows to examine the
+ * object for changes.
+ *
+ * Code that wants to run when a sequence number changes is challenging to
+ * implement in a multithreaded environment. A naive implementation that
+ * simply checks whether the sequence number changed and, if so, calls
+ * poll_immediate_wake(), will fail when another thread increments the sequence
+ * number after the check (including during poll_block()).
+ *
+ * struct seq is a solution. It implements a sequence number along with enough
+ * internal infrastructure so that a thread waiting on a particular value will
+ * wake up if the sequence number changes, or even if the "struct seq" is
+ * destroyed.
+ *
+ *
+ * Usage
+ * =====
+ *
+ * The object that includes a sequence number should use seq_create() and
+ * seq_destroy() at creation and destruction, and seq_change() whenever the
+ * object's observable state changes.
+ *
+ * An observer may seq_read() to read the current sequence number and
+ * seq_wait() to cause poll_block() to wake up when the sequence number changes
+ * from a specified value.
+ *
+ * To avoid races, observers should use seq_read() to check for changes,
+ * process any changes, and then use seq_wait() to wait for a change from the
+ * previously read value. That is, a correct usage looks something like this:
+ *
+ * new_seq = seq_read(seq);
+ * if (new_seq != last_seq) {
+ * ...process changes...
+ * last_seq = new_seq;
+ * }
+ * seq_wait(seq, new_seq);
+ * poll_block();
+ *
+ *
+ * Alternate Usage
+ * ===============
+ *
+ * struct seq can also be used as a sort of pollable condition variable.
+ * Suppose that we want a thread to process items in a queue, and thus to be
+ * able to wake up whenever the queue is nonempty. This requires a lock to
+ * protect the queue and a seq to signal that the queue has become nonempty,
+ * e.g.:
+ *
+ * struct ovs_mutex mutex;
+ * struct list queue OVS_GUARDED_BY(mutex);
+ * struct seq *nonempty_seq;
+ *
+ * To add an element to the queue:
+ *
+ * ovs_mutex_lock(&mutex);
+ * list_push_back(&queue, ...element...);
+ * if (list_is_singleton(&queue)) { // The 'if' test here is optional.
+ * seq_change(nonempty_seq);
+ * }
+ * ovs_mutex_unlock(&mutex);
+ *
+ * To wait for the queue to become nonempty:
+ *
+ * ovs_mutex_lock(&mutex);
+ * if (list_is_empty(&queue)) {
+ * seq_wait(nonempty_seq, seq_read(nonempty_seq));
+ * } else {
+ * poll_immediate_wake();
+ * }
+ * ovs_mutex_unlock(&mutex);
+ *
+ * (In the above code 'mutex' prevents the queue from changing between
+ * seq_read() and seq_wait(). Otherwise, it would be necessary to seq_read(),
+ * check for a nonempty queue, and then seq_wait() on the previously read
+ * sequence number, as under Usage above.)
+ *
+ *
+ * Thread-safety
+ * =============
+ *
+ * Fully thread safe.
+ */
+
+#include <stdint.h>
+
+/* For implementation of an object with a sequence number attached. */
+struct seq *seq_create(void);
+void seq_destroy(struct seq *);
+void seq_change(struct seq *);
+
+/* For observers. */
+uint64_t seq_read(const struct seq *);
+void seq_wait(const struct seq *, uint64_t value);
+
+/* For poll_block() internal use. */
+void seq_woke(void);
+
+#endif /* seq.h */
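As a worked example of the Usage pattern above, here is a minimal sketch of an object that owns a seq; struct counter and counter_inc() are purely illustrative names:

    struct counter {
        struct ovs_mutex mutex;
        int value OVS_GUARDED;       /* Guarded by 'mutex'. */
        struct seq *change_seq;      /* From seq_create(); bumped on change. */
    };

    static void
    counter_inc(struct counter *c)
    {
        ovs_mutex_lock(&c->mutex);
        c->value++;
        seq_change(c->change_seq);   /* Wakes threads blocked in seq_wait(). */
        ovs_mutex_unlock(&c->mutex);
    }

An observer then follows the seq_read()/seq_wait()/poll_block() loop shown under Usage.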
ofproto/ofproto-dpif-mirror.h \
ofproto/ofproto-dpif-sflow.c \
ofproto/ofproto-dpif-sflow.h \
+ ofproto/ofproto-dpif-upcall.c \
+ ofproto/ofproto-dpif-upcall.h \
ofproto/ofproto-dpif-xlate.c \
ofproto/ofproto-dpif-xlate.h \
ofproto/ofproto-provider.h \
--- /dev/null
+/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include <config.h>
+#include "ofproto-dpif-upcall.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "dpif.h"
+#include "fail-open.h"
+#include "latch.h"
+#include "seq.h"
+#include "list.h"
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "ofproto-dpif.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "vlog.h"
+
+#define MAX_QUEUE_LENGTH 512
+
+VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall);
+
+COVERAGE_DEFINE(upcall_queue_overflow);
+COVERAGE_DEFINE(drop_queue_overflow);
+COVERAGE_DEFINE(miss_queue_overflow);
+COVERAGE_DEFINE(fmb_queue_overflow);
+
+/* A thread that processes each upcall handed to it by the dispatcher thread,
+ * forwards the upcall's packet, and then queues it to the main ofproto_dpif
+ * to possibly set up a kernel flow as a cache. */
+struct handler {
+ struct udpif *udpif; /* Parent udpif. */
+ pthread_t thread; /* Thread ID. */
+
+ struct ovs_mutex mutex; /* Mutex guarding the following. */
+
+ /* Atomic queue of unprocessed miss upcalls. */
+ struct list upcalls OVS_GUARDED;
+ size_t n_upcalls OVS_GUARDED;
+
+ pthread_cond_t wake_cond; /* Wakes 'thread' while holding
+ 'mutex'. */
+};
+
+/* An upcall handler for ofproto_dpif.
+ *
+ * udpif is implemented as a "dispatcher" thread that reads upcalls from the
+ * kernel. It processes each upcall just enough to figure out its next
+ * destination. For a "miss" upcall (MISS_UPCALL), this is one of several
+ * "handler" threads (see struct handler). Other upcalls are queued to the
+ * main ofproto_dpif. */
+struct udpif {
+ struct dpif *dpif; /* Datapath handle. */
+ struct dpif_backer *backer; /* Opaque dpif_backer pointer. */
+
+ uint32_t secret; /* Random seed for upcall hash. */
+
+ pthread_t dispatcher; /* Dispatcher thread ID. */
+
+ struct handler *handlers; /* Miss handlers. */
+ size_t n_handlers;
+
+ /* Atomic queue of unprocessed drop keys. */
+ struct ovs_mutex drop_key_mutex;
+ struct list drop_keys OVS_GUARDED;
+ size_t n_drop_keys OVS_GUARDED;
+
+ /* Atomic queue of special upcalls for ofproto-dpif to process. */
+ struct ovs_mutex upcall_mutex;
+ struct list upcalls OVS_GUARDED;
+ size_t n_upcalls OVS_GUARDED;
+
+ /* Atomic queue of flow_miss_batches. */
+ struct ovs_mutex fmb_mutex;
+ struct list fmbs OVS_GUARDED;
+ size_t n_fmbs OVS_GUARDED;
+
+ /* Number of times udpif_revalidate() has been called. */
+ atomic_uint reval_seq;
+
+ struct seq *wait_seq;
+ uint64_t last_seq;
+
+ struct latch exit_latch; /* Tells child threads to exit. */
+};
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+static void recv_upcalls(struct udpif *);
+static void handle_miss_upcalls(struct udpif *, struct list *upcalls);
+static void miss_destroy(struct flow_miss *);
+static void *udpif_dispatcher(void *);
+static void *udpif_miss_handler(void *);
+
+struct udpif *
+udpif_create(struct dpif_backer *backer, struct dpif *dpif)
+{
+ struct udpif *udpif = xzalloc(sizeof *udpif);
+
+ udpif->dpif = dpif;
+ udpif->backer = backer;
+ udpif->secret = random_uint32();
+ udpif->wait_seq = seq_create();
+ latch_init(&udpif->exit_latch);
+ list_init(&udpif->drop_keys);
+ list_init(&udpif->upcalls);
+ list_init(&udpif->fmbs);
+ atomic_init(&udpif->reval_seq, 0);
+ ovs_mutex_init(&udpif->drop_key_mutex, PTHREAD_MUTEX_NORMAL);
+ ovs_mutex_init(&udpif->upcall_mutex, PTHREAD_MUTEX_NORMAL);
+ ovs_mutex_init(&udpif->fmb_mutex, PTHREAD_MUTEX_NORMAL);
+
+ return udpif;
+}
+
+void
+udpif_destroy(struct udpif *udpif)
+{
+ struct flow_miss_batch *fmb;
+ struct drop_key *drop_key;
+ struct upcall *upcall;
+
+ udpif_recv_set(udpif, 0, false);
+
+ while ((drop_key = drop_key_next(udpif))) {
+ drop_key_destroy(drop_key);
+ }
+
+ while ((upcall = upcall_next(udpif))) {
+ upcall_destroy(upcall);
+ }
+
+ while ((fmb = flow_miss_batch_next(udpif))) {
+ flow_miss_batch_destroy(fmb);
+ }
+
+ ovs_mutex_destroy(&udpif->drop_key_mutex);
+ ovs_mutex_destroy(&udpif->upcall_mutex);
+ ovs_mutex_destroy(&udpif->fmb_mutex);
+ latch_destroy(&udpif->exit_latch);
+ seq_destroy(udpif->wait_seq);
+ free(udpif);
+}
+
+/* Tells 'udpif' to begin or stop handling flow misses depending on the value
+ * of 'enable'. 'n_handlers' is the number of miss_handler threads to create.
+ * Passing 'n_handlers' as zero is equivalent to passing 'enable' as false. */
+void
+udpif_recv_set(struct udpif *udpif, size_t n_handlers, bool enable)
+{
+ n_handlers = enable ? n_handlers : 0;
+ n_handlers = MIN(n_handlers, 64);
+
+ /* Stop the old threads (if any). */
+ if (udpif->handlers && udpif->n_handlers != n_handlers) {
+ size_t i;
+
+ latch_set(&udpif->exit_latch);
+
+ /* Wake the handlers so they can exit. */
+ for (i = 0; i < udpif->n_handlers; i++) {
+ struct handler *handler = &udpif->handlers[i];
+
+ ovs_mutex_lock(&handler->mutex);
+ xpthread_cond_signal(&handler->wake_cond);
+ ovs_mutex_unlock(&handler->mutex);
+ }
+
+ xpthread_join(udpif->dispatcher, NULL);
+ for (i = 0; i < udpif->n_handlers; i++) {
+ struct handler *handler = &udpif->handlers[i];
+ struct upcall *miss, *next;
+
+ xpthread_join(handler->thread, NULL);
+
+ ovs_mutex_lock(&handler->mutex);
+ LIST_FOR_EACH_SAFE (miss, next, list_node, &handler->upcalls) {
+ list_remove(&miss->list_node);
+ upcall_destroy(miss);
+ }
+ ovs_mutex_unlock(&handler->mutex);
+ ovs_mutex_destroy(&handler->mutex);
+
+ xpthread_cond_destroy(&handler->wake_cond);
+ }
+ latch_poll(&udpif->exit_latch);
+
+ free(udpif->handlers);
+ udpif->handlers = NULL;
+ udpif->n_handlers = 0;
+ }
+
+ /* Start new threads (if necessary). */
+ if (!udpif->handlers && n_handlers) {
+ size_t i;
+
+ udpif->n_handlers = n_handlers;
+ udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers);
+ for (i = 0; i < udpif->n_handlers; i++) {
+ struct handler *handler = &udpif->handlers[i];
+
+ handler->udpif = udpif;
+ list_init(&handler->upcalls);
+ xpthread_cond_init(&handler->wake_cond, NULL);
+ ovs_mutex_init(&handler->mutex, PTHREAD_MUTEX_NORMAL);
+ xpthread_create(&handler->thread, NULL, udpif_miss_handler, handler);
+ }
+ xpthread_create(&udpif->dispatcher, NULL, udpif_dispatcher, udpif);
+ }
+}
+
+void
+udpif_run(struct udpif *udpif)
+{
+ udpif->last_seq = seq_read(udpif->wait_seq);
+}
+
+void
+udpif_wait(struct udpif *udpif)
+{
+ ovs_mutex_lock(&udpif->drop_key_mutex);
+ if (udpif->n_drop_keys) {
+ poll_immediate_wake();
+ }
+ ovs_mutex_unlock(&udpif->drop_key_mutex);
+
+ ovs_mutex_lock(&udpif->upcall_mutex);
+ if (udpif->n_upcalls) {
+ poll_immediate_wake();
+ }
+ ovs_mutex_unlock(&udpif->upcall_mutex);
+
+ ovs_mutex_lock(&udpif->fmb_mutex);
+ if (udpif->n_fmbs) {
+ poll_immediate_wake();
+ }
+ ovs_mutex_unlock(&udpif->fmb_mutex);
+
+ seq_wait(udpif->wait_seq, udpif->last_seq);
+}
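Taken together, the intended driving pattern from the backer looks roughly like the sketch below; 'exiting' and 'n_handlers' are illustrative, and error handling is elided:

    struct udpif *udpif = udpif_create(backer, dpif);

    udpif_recv_set(udpif, n_handlers, true); /* Start dispatcher + handlers. */
    while (!exiting) {
        udpif_run(udpif);
        /* ...drain upcall_next(), flow_miss_batch_next(), drop_key_next()... */
        udpif_wait(udpif);
        poll_block();
    }
    udpif_recv_set(udpif, 0, false);         /* Join and tear down threads. */
    udpif_destroy(udpif);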
+
+/* Notifies 'udpif' that something changed which may render previous
+ * xlate_actions() results invalid. */
+void
+udpif_revalidate(struct udpif *udpif)
+{
+ struct flow_miss_batch *fmb, *next_fmb;
+ unsigned int junk;
+
+ /* Since we remove the misses on revalidation, their statistics won't be
+ * accounted to the appropriate 'facet's in the upper layer. In most
+ * cases, this is alright because we've already pushed the stats to the
+ * relevant rules. However, NetFlow requires absolute packet counts on
+ * 'facet's which could now be incorrect. */
+ ovs_mutex_lock(&udpif->fmb_mutex);
+ atomic_add(&udpif->reval_seq, 1, &junk);
+ LIST_FOR_EACH_SAFE (fmb, next_fmb, list_node, &udpif->fmbs) {
+ list_remove(&fmb->list_node);
+ flow_miss_batch_destroy(fmb);
+ udpif->n_fmbs--;
+ }
+ ovs_mutex_unlock(&udpif->fmb_mutex);
+ udpif_drop_key_clear(udpif);
+}
+
+/* Retrieves the next upcall that ofproto-dpif is responsible for handling.
+ * The caller is responsible for destroying the returned upcall with
+ * upcall_destroy(). */
+struct upcall *
+upcall_next(struct udpif *udpif)
+{
+ struct upcall *next = NULL;
+
+ ovs_mutex_lock(&udpif->upcall_mutex);
+ if (udpif->n_upcalls) {
+ udpif->n_upcalls--;
+ next = CONTAINER_OF(list_pop_front(&udpif->upcalls), struct upcall,
+ list_node);
+ }
+ ovs_mutex_unlock(&udpif->upcall_mutex);
+ return next;
+}
+
+/* Destroys and deallocates 'upcall'. */
+void
+upcall_destroy(struct upcall *upcall)
+{
+ if (upcall) {
+ ofpbuf_uninit(&upcall->upcall_buf);
+ free(upcall);
+ }
+}
+
+/* Retrieves the next batch of processed flow misses for 'udpif' to install.
+ * The caller is responsible for destroying it with flow_miss_batch_destroy().
+ */
+struct flow_miss_batch *
+flow_miss_batch_next(struct udpif *udpif)
+{
+ struct flow_miss_batch *next = NULL;
+
+ ovs_mutex_lock(&udpif->fmb_mutex);
+ if (udpif->n_fmbs) {
+ udpif->n_fmbs--;
+ next = CONTAINER_OF(list_pop_front(&udpif->fmbs),
+ struct flow_miss_batch, list_node);
+ }
+ ovs_mutex_unlock(&udpif->fmb_mutex);
+ return next;
+}
+
+/* Destroys and deallocates 'fmb'. */
+void
+flow_miss_batch_destroy(struct flow_miss_batch *fmb)
+{
+ struct flow_miss *miss, *next;
+
+ if (!fmb) {
+ return;
+ }
+
+ HMAP_FOR_EACH_SAFE (miss, next, hmap_node, &fmb->misses) {
+ hmap_remove(&fmb->misses, &miss->hmap_node);
+ miss_destroy(miss);
+ }
+
+ hmap_destroy(&fmb->misses);
+ free(fmb);
+}
+
+/* Retrieves the next drop key that ofproto-dpif needs to process. The caller
+ * is responsible for destroying it with drop_key_destroy(). */
+struct drop_key *
+drop_key_next(struct udpif *udpif)
+{
+ struct drop_key *next = NULL;
+
+ ovs_mutex_lock(&udpif->drop_key_mutex);
+ if (udpif->n_drop_keys) {
+ udpif->n_drop_keys--;
+ next = CONTAINER_OF(list_pop_front(&udpif->drop_keys), struct drop_key,
+ list_node);
+ }
+ ovs_mutex_unlock(&udpif->drop_key_mutex);
+ return next;
+}
+
+/* Destroys and deallocates 'drop_key'. */
+void
+drop_key_destroy(struct drop_key *drop_key)
+{
+ if (drop_key) {
+ free(drop_key->key);
+ free(drop_key);
+ }
+}
+
+/* Clears all drop keys waiting to be processed by drop_key_next(). */
+void
+udpif_drop_key_clear(struct udpif *udpif)
+{
+ struct drop_key *drop_key, *next;
+
+ ovs_mutex_lock(&udpif->drop_key_mutex);
+ LIST_FOR_EACH_SAFE (drop_key, next, list_node, &udpif->drop_keys) {
+ list_remove(&drop_key->list_node);
+ drop_key_destroy(drop_key);
+ udpif->n_drop_keys--;
+ }
+ ovs_mutex_unlock(&udpif->drop_key_mutex);
+}
+\f
+/* The dispatcher thread is responsible for receiving upcalls from the kernel,
+ * assigning the miss upcalls to a miss_handler thread, and passing the more
+ * complex ones to ofproto-dpif directly. */
+static void *
+udpif_dispatcher(void *arg)
+{
+ struct udpif *udpif = arg;
+
+ set_subprogram_name("dispatcher");
+ while (!latch_is_set(&udpif->exit_latch)) {
+ recv_upcalls(udpif);
+ dpif_recv_wait(udpif->dpif);
+ latch_wait(&udpif->exit_latch);
+ poll_block();
+ }
+
+ return NULL;
+}
+
+/* The miss handler thread is responsible for processing miss upcalls retrieved
+ * by the dispatcher thread. Once finished, it passes the processed miss
+ * upcalls to ofproto-dpif, where they're installed in the datapath. */
+static void *
+udpif_miss_handler(void *arg)
+{
+ struct list misses = LIST_INITIALIZER(&misses);
+ struct handler *handler = arg;
+
+ set_subprogram_name("miss_handler");
+ for (;;) {
+ size_t i;
+
+ ovs_mutex_lock(&handler->mutex);
+
+ if (latch_is_set(&handler->udpif->exit_latch)) {
+ ovs_mutex_unlock(&handler->mutex);
+ return NULL;
+ }
+
+ if (!handler->n_upcalls) {
+ ovs_mutex_cond_wait(&handler->wake_cond, &handler->mutex);
+ }
+
+ for (i = 0; i < FLOW_MISS_MAX_BATCH; i++) {
+ if (handler->n_upcalls) {
+ handler->n_upcalls--;
+ list_push_back(&misses, list_pop_front(&handler->upcalls));
+ } else {
+ break;
+ }
+ }
+ ovs_mutex_unlock(&handler->mutex);
+
+ handle_miss_upcalls(handler->udpif, &misses);
+ }
+}
+\f
+static void
+miss_destroy(struct flow_miss *miss)
+{
+ struct upcall *upcall, *next;
+
+ LIST_FOR_EACH_SAFE (upcall, next, list_node, &miss->upcalls) {
+ list_remove(&upcall->list_node);
+ upcall_destroy(upcall);
+ }
+ xlate_out_uninit(&miss->xout);
+}
+
+static enum upcall_type
+classify_upcall(const struct upcall *upcall)
+{
+ const struct dpif_upcall *dpif_upcall = &upcall->dpif_upcall;
+ union user_action_cookie cookie;
+ size_t userdata_len;
+
+ /* First look at the upcall type. */
+ switch (dpif_upcall->type) {
+ case DPIF_UC_ACTION:
+ break;
+
+ case DPIF_UC_MISS:
+ return MISS_UPCALL;
+
+ case DPIF_N_UC_TYPES:
+ default:
+ VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32,
+ dpif_upcall->type);
+ return BAD_UPCALL;
+ }
+
+ /* "action" upcalls need a closer look. */
+ if (!dpif_upcall->userdata) {
+ VLOG_WARN_RL(&rl, "action upcall missing cookie");
+ return BAD_UPCALL;
+ }
+ userdata_len = nl_attr_get_size(dpif_upcall->userdata);
+ if (userdata_len < sizeof cookie.type
+ || userdata_len > sizeof cookie) {
+ VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %zu",
+ userdata_len);
+ return BAD_UPCALL;
+ }
+ memset(&cookie, 0, sizeof cookie);
+ memcpy(&cookie, nl_attr_get(dpif_upcall->userdata), userdata_len);
+ if (userdata_len == sizeof cookie.sflow
+ && cookie.type == USER_ACTION_COOKIE_SFLOW) {
+ return SFLOW_UPCALL;
+ } else if (userdata_len == sizeof cookie.slow_path
+ && cookie.type == USER_ACTION_COOKIE_SLOW_PATH) {
+ return MISS_UPCALL;
+ } else if (userdata_len == sizeof cookie.flow_sample
+ && cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE) {
+ return FLOW_SAMPLE_UPCALL;
+ } else if (userdata_len == sizeof cookie.ipfix
+ && cookie.type == USER_ACTION_COOKIE_IPFIX) {
+ return IPFIX_UPCALL;
+ } else {
+ VLOG_WARN_RL(&rl, "invalid user cookie of type %"PRIu16
+ " and size %zu", cookie.type, userdata_len);
+ return BAD_UPCALL;
+ }
+}
+
+static void
+recv_upcalls(struct udpif *udpif)
+{
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+ for (;;) {
+ struct upcall *upcall;
+ int error;
+
+ upcall = xmalloc(sizeof *upcall);
+ ofpbuf_use_stub(&upcall->upcall_buf, upcall->upcall_stub,
+ sizeof upcall->upcall_stub);
+ error = dpif_recv(udpif->dpif, &upcall->dpif_upcall,
+ &upcall->upcall_buf);
+ if (error) {
+ upcall_destroy(upcall);
+ break;
+ }
+
+ upcall->type = classify_upcall(upcall);
+ if (upcall->type == BAD_UPCALL) {
+ upcall_destroy(upcall);
+ } else if (upcall->type == MISS_UPCALL) {
+ struct dpif_upcall *dupcall = &upcall->dpif_upcall;
+ uint32_t hash = udpif->secret;
+ struct handler *handler;
+ struct nlattr *nla;
+ size_t n_bytes, left;
+
+ n_bytes = 0;
+ NL_ATTR_FOR_EACH (nla, left, dupcall->key, dupcall->key_len) {
+ enum ovs_key_attr type = nl_attr_type(nla);
+ if (type == OVS_KEY_ATTR_IN_PORT
+ || type == OVS_KEY_ATTR_TCP
+ || type == OVS_KEY_ATTR_UDP) {
+ if (nl_attr_get_size(nla) == 4) {
+ ovs_be32 attr = nl_attr_get_be32(nla);
+ hash = mhash_add(hash, (OVS_FORCE uint32_t) attr);
+ n_bytes += 4;
+ } else {
+ VLOG_WARN("Netlink attribute with incorrect size.");
+ }
+ }
+ }
+ hash = mhash_finish(hash, n_bytes);
+
+ handler = &udpif->handlers[hash % udpif->n_handlers];
+
+ ovs_mutex_lock(&handler->mutex);
+ if (handler->n_upcalls < MAX_QUEUE_LENGTH) {
+ list_push_back(&handler->upcalls, &upcall->list_node);
+ handler->n_upcalls++;
+ xpthread_cond_signal(&handler->wake_cond);
+ ovs_mutex_unlock(&handler->mutex);
+ if (!VLOG_DROP_DBG(&rl)) {
+ struct ds ds = DS_EMPTY_INITIALIZER;
+
+ odp_flow_key_format(upcall->dpif_upcall.key,
+ upcall->dpif_upcall.key_len,
+ &ds);
+ VLOG_DBG("dispatcher: miss enqueue (%s)", ds_cstr(&ds));
+ ds_destroy(&ds);
+ }
+ } else {
+ ovs_mutex_unlock(&handler->mutex);
+ COVERAGE_INC(miss_queue_overflow);
+ upcall_destroy(upcall);
+ }
+ } else {
+ ovs_mutex_lock(&udpif->upcall_mutex);
+ if (udpif->n_upcalls < MAX_QUEUE_LENGTH) {
+ udpif->n_upcalls++;
+ list_push_back(&udpif->upcalls, &upcall->list_node);
+ ovs_mutex_unlock(&udpif->upcall_mutex);
+ seq_change(udpif->wait_seq);
+ } else {
+ ovs_mutex_unlock(&udpif->upcall_mutex);
+ COVERAGE_INC(upcall_queue_overflow);
+ upcall_destroy(upcall);
+ }
+ }
+ }
+}
+
+static struct flow_miss *
+flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto,
+ const struct flow *flow, uint32_t hash)
+{
+ struct flow_miss *miss;
+
+ HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) {
+ if (miss->ofproto == ofproto && flow_equal(&miss->flow, flow)) {
+ return miss;
+ }
+ }
+
+ return NULL;
+}
+
+/* Executes flow miss 'miss'. May add any required datapath operations
+ * to 'ops', incrementing '*n_ops' for each new op. */
+static void
+execute_flow_miss(struct flow_miss *miss, struct dpif_op *ops, size_t *n_ops)
+{
+ struct ofproto_dpif *ofproto = miss->ofproto;
+ struct flow_wildcards wc;
+ struct rule_dpif *rule;
+ struct ofpbuf *packet;
+ struct xlate_in xin;
+
+ memset(&miss->stats, 0, sizeof miss->stats);
+ miss->stats.used = time_msec();
+ LIST_FOR_EACH (packet, list_node, &miss->packets) {
+ miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow);
+ miss->stats.n_bytes += packet->size;
+ miss->stats.n_packets++;
+ }
+
+ flow_wildcards_init_catchall(&wc);
+ rule_dpif_lookup(ofproto, &miss->flow, &wc, &rule);
+ rule_credit_stats(rule, &miss->stats);
+ xlate_in_init(&xin, ofproto, &miss->flow, rule, miss->stats.tcp_flags,
+ NULL);
+ xin.may_learn = true;
+ xin.resubmit_stats = &miss->stats;
+ xlate_actions(&xin, &miss->xout);
+ flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc);
+
+ if (rule->up.cr.priority == FAIL_OPEN_PRIORITY) {
+ struct ofputil_packet_in pin;
+
+ /* Extra-special case for fail-open mode.
+ *
+ * We are in fail-open mode and the packet matched the fail-open
+ * rule, but we are connected to a controller too. We should send
+ * the packet up to the controller in the hope that it will try to
+ * set up a flow and thereby allow us to exit fail-open.
+ *
+ * See the top-level comment in fail-open.c for more information. */
+ pin.packet = packet->data;
+ pin.packet_len = packet->size;
+ pin.reason = OFPR_NO_MATCH;
+ pin.controller_id = 0;
+ pin.table_id = 0;
+ pin.cookie = 0;
+ pin.send_len = 0; /* Not used for flow table misses. */
+ flow_get_metadata(&miss->flow, &pin.fmd);
+ ofproto_dpif_send_packet_in(ofproto, &pin);
+ }
+
+ if (miss->xout.slow) {
+ LIST_FOR_EACH (packet, list_node, &miss->packets) {
+ struct xlate_in xin;
+
+ xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet);
+ xlate_actions_for_side_effects(&xin);
+ }
+ }
+ rule_release(rule);
+
+ if (miss->xout.odp_actions.size) {
+ LIST_FOR_EACH (packet, list_node, &miss->packets) {
+ struct dpif_op *op = &ops[*n_ops];
+ struct dpif_execute *execute = &op->u.execute;
+
+ if (miss->flow.in_port.ofp_port
+ != vsp_realdev_to_vlandev(miss->ofproto,
+ miss->flow.in_port.ofp_port,
+ miss->flow.vlan_tci)) {
+ /* This packet was received on a VLAN splinter port. We
+ * added a VLAN to the packet to make the packet resemble
+ * the flow, but the actions were composed assuming that
+ * the packet contained no VLAN. So, we must remove the
+ * VLAN header from the packet before trying to execute the
+ * actions. */
+ eth_pop_vlan(packet);
+ }
+
+ op->type = DPIF_OP_EXECUTE;
+ execute->key = miss->key;
+ execute->key_len = miss->key_len;
+ execute->packet = packet;
+ execute->actions = miss->xout.odp_actions.data;
+ execute->actions_len = miss->xout.odp_actions.size;
+
+ (*n_ops)++;
+ }
+ }
+}
+
+static void
+handle_miss_upcalls(struct udpif *udpif, struct list *upcalls)
+{
+ struct dpif_op *opsp[FLOW_MISS_MAX_BATCH];
+ struct dpif_op ops[FLOW_MISS_MAX_BATCH];
+ unsigned int old_reval_seq, new_reval_seq;
+ struct upcall *upcall, *next;
+ struct flow_miss_batch *fmb;
+ size_t n_upcalls, n_ops, i;
+ struct flow_miss *miss;
+
+ atomic_read(&udpif->reval_seq, &old_reval_seq);
+
+ /* Construct the to-do list.
+ *
+ * This just amounts to extracting the flow from each packet and sticking
+ * the packets that have the same flow in the same "flow_miss" structure so
+ * that we can process them together. */
+ fmb = xmalloc(sizeof *fmb);
+ hmap_init(&fmb->misses);
+ n_upcalls = 0;
+ LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) {
+ struct dpif_upcall *dupcall = &upcall->dpif_upcall;
+ struct flow_miss *miss = &fmb->miss_buf[n_upcalls];
+ struct flow_miss *existing_miss;
+ struct ofproto_dpif *ofproto;
+ odp_port_t odp_in_port;
+ struct flow flow;
+ uint32_t hash;
+ int error;
+
+ error = xlate_receive(udpif->backer, dupcall->packet, dupcall->key,
+ dupcall->key_len, &flow, &miss->key_fitness,
+ &ofproto, &odp_in_port);
+
+ if (error == ENODEV) {
+ struct drop_key *drop_key;
+
+ /* Received packet on datapath port for which we couldn't
+ * associate an ofproto. This can happen if a port is removed
+ * while traffic is being received. Print a rate-limited message
+ * in case it happens frequently. Install a drop flow so
+ * that future packets of the flow are inexpensively dropped
+ * in the kernel. */
+ VLOG_INFO_RL(&rl, "received packet on unassociated datapath port "
+ "%"PRIu32, odp_in_port);
+
+ drop_key = xmalloc(sizeof *drop_key);
+ drop_key->key = xmemdup(dupcall->key, dupcall->key_len);
+ drop_key->key_len = dupcall->key_len;
+
+ ovs_mutex_lock(&udpif->drop_key_mutex);
+ if (udpif->n_drop_keys < MAX_QUEUE_LENGTH) {
+ udpif->n_drop_keys++;
+ list_push_back(&udpif->drop_keys, &drop_key->list_node);
+ ovs_mutex_unlock(&udpif->drop_key_mutex);
+ seq_change(udpif->wait_seq);
+ } else {
+ ovs_mutex_unlock(&udpif->drop_key_mutex);
+ COVERAGE_INC(drop_queue_overflow);
+ drop_key_destroy(drop_key);
+ }
+ continue;
+ } else if (error) {
+ continue;
+ }
+
+ flow_extract(dupcall->packet, flow.skb_priority, flow.pkt_mark,
+ &flow.tunnel, &flow.in_port, &miss->flow);
+
+ /* Add other packets to a to-do list. */
+ hash = flow_hash(&miss->flow, 0);
+ existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, hash);
+ if (!existing_miss) {
+ hmap_insert(&fmb->misses, &miss->hmap_node, hash);
+ miss->ofproto = ofproto;
+ miss->key = dupcall->key;
+ miss->key_len = dupcall->key_len;
+ miss->upcall_type = dupcall->type;
+ list_init(&miss->packets);
+ list_init(&miss->upcalls);
+
+ n_upcalls++;
+ } else {
+ miss = existing_miss;
+ }
+ list_push_back(&miss->packets, &dupcall->packet->list_node);
+
+ list_remove(&upcall->list_node);
+ list_push_back(&miss->upcalls, &upcall->list_node);
+ }
+
+ LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) {
+ list_remove(&upcall->list_node);
+ upcall_destroy(upcall);
+ }
+
+ /* Process each element in the to-do list, constructing the set of
+ * operations to batch. */
+ n_ops = 0;
+ HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
+ execute_flow_miss(miss, ops, &n_ops);
+ }
+ ovs_assert(n_ops <= ARRAY_SIZE(ops));
+
+ /* Execute batch. */
+ for (i = 0; i < n_ops; i++) {
+ opsp[i] = &ops[i];
+ }
+ dpif_operate(udpif->dpif, opsp, n_ops);
+
+ ovs_mutex_lock(&udpif->fmb_mutex);
+ atomic_read(&udpif->reval_seq, &new_reval_seq);
+ if (old_reval_seq != new_reval_seq) {
+ /* udpif_revalidate() was called as we were calculating the actions.
+ * To be safe, we need to assume all the misses need revalidation. */
+ ovs_mutex_unlock(&udpif->fmb_mutex);
+ flow_miss_batch_destroy(fmb);
+ } else if (udpif->n_fmbs < MAX_QUEUE_LENGTH) {
+ udpif->n_fmbs++;
+ list_push_back(&udpif->fmbs, &fmb->list_node);
+ ovs_mutex_unlock(&udpif->fmb_mutex);
+ seq_change(udpif->wait_seq);
+ } else {
+ COVERAGE_INC(fmb_queue_overflow);
+ ovs_mutex_unlock(&udpif->fmb_mutex);
+ flow_miss_batch_destroy(fmb);
+ }
+}
--- /dev/null
+/* Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#ifndef OFPROTO_DPIF_UPCALL_H
+#define OFPROTO_DPIF_UPCALL_H
+
+#define FLOW_MISS_MAX_BATCH 50
+
+#include "dpif.h"
+#include "flow.h"
+#include "hmap.h"
+#include "list.h"
+#include "odp-util.h"
+#include "ofpbuf.h"
+#include "ofproto-dpif-xlate.h"
+
+struct dpif;
+struct dpif_backer;
+
+/* udpif is responsible for retrieving upcalls from the kernel, processing miss
+ * upcalls, and handing more complex ones up to the main ofproto-dpif
+ * module. */
+
+struct udpif *udpif_create(struct dpif_backer *, struct dpif *);
+void udpif_recv_set(struct udpif *, size_t n_workers, bool enable);
+void udpif_destroy(struct udpif *);
+
+void udpif_run(struct udpif *);
+void udpif_wait(struct udpif *);
+
+void udpif_revalidate(struct udpif *);
+\f
+/* udpif can handle some upcalls on its own. Others need the main ofproto_dpif
+ * code to handle them. This interface passes upcalls not handled by udpif up
+ * to the ofproto_dpif main thread. */
+
+/* Type of an upcall. */
+enum upcall_type {
+ /* Handled internally by udpif code. Not returned by upcall_next(). */
+ BAD_UPCALL, /* Some kind of bug somewhere. */
+ MISS_UPCALL, /* A flow miss. */
+
+ /* Require main thread's involvement. May be returned by upcall_next(). */
+ SFLOW_UPCALL, /* sFlow sample. */
+ FLOW_SAMPLE_UPCALL, /* Per-flow sampling. */
+ IPFIX_UPCALL /* Per-bridge sampling. */
+};
+
+/* An upcall. */
+struct upcall {
+ struct list list_node; /* For queuing upcalls. */
+
+ enum upcall_type type; /* Classification. */
+
+ /* Raw upcall plus data for keeping track of the memory backing it. */
+ struct dpif_upcall dpif_upcall; /* As returned by dpif_recv(). */
+ struct ofpbuf upcall_buf; /* Owns some data in 'dpif_upcall'. */
+ uint64_t upcall_stub[256 / 8]; /* Buffer to reduce need for malloc(). */
+};
+
+struct upcall *upcall_next(struct udpif *);
+void upcall_destroy(struct upcall *);
+\f
+/* udpif figures out how to forward packets, and does forward them, but it
+ * can't set up datapath flows on its own. This interface passes packet
+ * forwarding data from udpif to the higher level ofproto_dpif to allow the
+ * latter to set up datapath flows. */
+
+/* Flow miss batching.
+ *
+ * Some dpifs implement operations faster when you hand them off in a batch.
+ * To allow batching, "struct flow_miss" queues the dpif-related work needed
+ * for a given flow. Each "struct flow_miss" corresponds to sending one or
+ * more packets, plus possibly installing the flow in the dpif. */
+struct flow_miss {
+ struct hmap_node hmap_node;
+ struct ofproto_dpif *ofproto;
+
+ struct flow flow;
+ enum odp_key_fitness key_fitness;
+ const struct nlattr *key;
+ size_t key_len;
+ struct list packets;
+ enum dpif_upcall_type upcall_type;
+ struct dpif_flow_stats stats;
+
+ struct xlate_out xout;
+
+ struct list upcalls;
+};
+
+struct flow_miss_batch {
+ struct list list_node;
+
+ struct flow_miss miss_buf[FLOW_MISS_MAX_BATCH];
+ struct hmap misses;
+};
+
+struct flow_miss_batch *flow_miss_batch_next(struct udpif *);
+void flow_miss_batch_destroy(struct flow_miss_batch *);
+\f
+/* Drop keys are odp flow keys which have drop flows installed in the kernel.
+ * These are datapath flows that have no associated ofproto; if they did, we
+ * would use facets.
+ *
+ * udpif can't install drop flows by itself. This interface allows udpif to
+ * pass the drop flows up to ofproto_dpif, which installs them. */
+struct drop_key {
+ struct hmap_node hmap_node;
+ struct list list_node;
+ struct nlattr *key;
+ size_t key_len;
+};
+
+struct drop_key *drop_key_next(struct udpif *);
+void drop_key_destroy(struct drop_key *);
+void udpif_drop_key_clear(struct udpif *);
+
+#endif /* ofproto-dpif-upcall.h */
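On the consuming side, ofproto-dpif's main loop drains these queues. A minimal sketch for the flow-miss queue (the actual dpif flow installation is elided):

    struct flow_miss_batch *fmb;

    while ((fmb = flow_miss_batch_next(udpif))) {
        struct flow_miss *miss;

        HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
            /* ...install 'miss->key' with 'miss->xout.odp_actions' in the
             * dpif... */
        }
        flow_miss_batch_destroy(fmb);
    }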
uint16_t vlan);
static void compose_output_action(struct xlate_ctx *, ofp_port_t ofp_port);
-static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-
static struct xbridge *xbridge_lookup(const struct ofproto_dpif *);
static struct xbundle *xbundle_lookup(const struct ofbundle *);
static struct xport *xport_lookup(const struct ofport_dpif *);
struct flow_wildcards *wc = &ctx->xout->wc;
struct flow *flow = &ctx->xin->flow;
ovs_be16 flow_vlan_tci;
- uint32_t flow_skb_mark;
+ uint32_t flow_pkt_mark;
uint8_t flow_nw_tos;
odp_port_t out_port, odp_port;
uint8_t dscp;
}
flow_vlan_tci = flow->vlan_tci;
- flow_skb_mark = flow->skb_mark;
+ flow_pkt_mark = flow->pkt_mark;
flow_nw_tos = flow->nw_tos;
if (dscp_from_skb_priority(xport, flow->skb_priority, &dscp)) {
out_port = ofp_port_to_odp_port(ctx->xbridge, vlandev_port);
flow->vlan_tci = htons(0);
}
- flow->skb_mark &= ~IPSEC_MARK;
}
if (out_port != ODPP_NONE) {
out:
/* Restore flow */
flow->vlan_tci = flow_vlan_tci;
- flow->skb_mark = flow_skb_mark;
+ flow->pkt_mark = flow_pkt_mark;
flow->nw_tos = flow_nw_tos;
}
compose_output_action__(ctx, ofp_port, true);
}
-/* Common rule processing in one place to avoid duplicating code. */
-static struct rule_dpif *
-ctx_rule_hooks(struct xlate_ctx *ctx, struct rule_dpif *rule,
- bool may_packet_in)
-{
- if (ctx->xin->resubmit_hook) {
- ctx->xin->resubmit_hook(ctx->xin, rule, ctx->recurse);
- }
- if (rule == NULL && may_packet_in) {
- struct xport *xport;
-
- /* XXX
- * check if table configuration flags
- * OFPTC_TABLE_MISS_CONTROLLER, default.
- * OFPTC_TABLE_MISS_CONTINUE,
- * OFPTC_TABLE_MISS_DROP
- * When OF1.0, OFPTC_TABLE_MISS_CONTINUE is used. What to do? */
- xport = get_ofp_port(ctx->xbridge, ctx->xin->flow.in_port.ofp_port);
- rule = choose_miss_rule(xport ? xport->config : 0,
- ctx->xbridge->miss_rule,
- ctx->xbridge->no_packet_in_rule);
- }
- if (rule && ctx->xin->resubmit_stats) {
- rule_credit_stats(rule, ctx->xin->resubmit_stats);
- }
- return rule;
-}
-
static void
xlate_table_action(struct xlate_ctx *ctx,
ofp_port_t in_port, uint8_t table_id, bool may_packet_in)
/* Look up a flow with 'in_port' as the input port. */
ctx->xin->flow.in_port.ofp_port = in_port;
- rule = rule_dpif_lookup_in_table(ctx->xbridge->ofproto,
- &ctx->xin->flow, &ctx->xout->wc,
- table_id);
+ rule_dpif_lookup_in_table(ctx->xbridge->ofproto, &ctx->xin->flow,
+ &ctx->xout->wc, table_id, &rule);
/* Restore the original input port. Otherwise OFPP_NORMAL and
* OFPP_IN_PORT will have surprising behavior. */
ctx->xin->flow.in_port.ofp_port = old_in_port;
- rule = ctx_rule_hooks(ctx, rule, may_packet_in);
+ if (ctx->xin->resubmit_hook) {
+ ctx->xin->resubmit_hook(ctx->xin, rule, ctx->recurse);
+ }
+
+ if (rule == NULL && may_packet_in) {
+ struct xport *xport;
+
+ /* Makes clang's thread safety analysis happy. */
+ rule_release(rule);
+
+ /* XXX
+ * check if table configuration flags
+ * OFPTC_TABLE_MISS_CONTROLLER, default.
+ * OFPTC_TABLE_MISS_CONTINUE,
+ * OFPTC_TABLE_MISS_DROP
+ * When OF1.0, OFPTC_TABLE_MISS_CONTINUE is used. What to do? */
+ xport = get_ofp_port(ctx->xbridge, ctx->xin->flow.in_port.ofp_port);
+ rule = choose_miss_rule(xport ? xport->config : 0,
+ ctx->xbridge->miss_rule,
+ ctx->xbridge->no_packet_in_rule);
+ ovs_rwlock_rdlock(&rule->up.evict);
+ }
+
+ if (rule && ctx->xin->resubmit_stats) {
+ rule_credit_stats(rule, ctx->xin->resubmit_stats);
+ }
if (rule) {
struct rule_dpif *old_rule = ctx->rule;
ctx->rule = old_rule;
ctx->recurse--;
}
+ rule_release(rule);
ctx->table_id = old_table_id;
} else {
packet = ofpbuf_clone(ctx->xin->packet);
key.skb_priority = 0;
- key.skb_mark = 0;
+ key.pkt_mark = 0;
memset(&key.tunnel, 0, sizeof key.tunnel);
commit_odp_actions(&ctx->xin->flow, &ctx->base_flow,
return true;
}
-static bool
-tunnel_ecn_ok(struct xlate_ctx *ctx)
-{
- if (is_ip_any(&ctx->base_flow)
- && (ctx->xin->flow.tunnel.ip_tos & IP_ECN_MASK) == IP_ECN_CE) {
- if ((ctx->base_flow.nw_tos & IP_ECN_MASK) == IP_ECN_NOT_ECT) {
- VLOG_WARN_RL(&rl, "dropping tunnel packet marked ECN CE"
- " but is not ECN capable");
- return false;
- } else {
- /* Set the ECN CE value in the tunneled packet. */
- ctx->xin->flow.nw_tos |= IP_ECN_CE;
- }
- }
-
- return true;
-}
-
static void
do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
struct xlate_ctx *ctx)
{
struct flow_wildcards *wc = &ctx->xout->wc;
struct flow *flow = &ctx->xin->flow;
- bool was_evictable = true;
const struct ofpact *a;
- if (ctx->rule) {
- /* Don't let the rule we're working on get evicted underneath us. */
- was_evictable = ctx->rule->up.evictable;
- ctx->rule->up.evictable = false;
- }
-
OFPACT_FOR_EACH (a, ofpacts, ofpacts_len) {
struct ofpact_controller *controller;
const struct ofpact_metadata *metadata;
case OFPACT_SET_MPLS_TTL:
if (compose_set_mpls_ttl_action(ctx,
ofpact_get_SET_MPLS_TTL(a)->ttl)) {
- goto out;
+ return;
}
break;
case OFPACT_DEC_MPLS_TTL:
if (compose_dec_mpls_ttl_action(ctx)) {
- goto out;
+ return;
}
break;
case OFPACT_DEC_TTL:
wc->masks.nw_ttl = 0xff;
if (compose_dec_ttl(ctx, ofpact_get_DEC_TTL(a))) {
- goto out;
+ return;
}
break;
break;
}
}
-
-out:
- if (ctx->rule) {
- ctx->rule->up.evictable = was_evictable;
- }
}
void
struct flow orig_flow;
struct xlate_ctx ctx;
size_t ofpacts_len;
+ bool tnl_may_send;
COVERAGE_INC(xlate_actions);
memset(&wc->masks.dl_type, 0xff, sizeof wc->masks.dl_type);
wc->masks.nw_frag |= FLOW_NW_FRAG_MASK;
- if (tnl_port_should_receive(&ctx.xin->flow)) {
- memset(&wc->masks.tunnel, 0xff, sizeof wc->masks.tunnel);
- /* skb_mark is currently used only by tunnels but that will likely
- * change in the future. */
- memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark);
- }
+ tnl_may_send = tnl_xlate_init(&ctx.base_flow, flow, wc);
if (ctx.xbridge->has_netflow) {
netflow_mask_wc(flow, wc);
}
add_ipfix_action(&ctx);
sample_actions_len = ctx.xout->odp_actions.size;
- if (tunnel_ecn_ok(&ctx) && (!in_port || may_receive(in_port, &ctx))) {
+ if (tnl_may_send && (!in_port || may_receive(in_port, &ctx))) {
do_xlate_actions(ofpacts, ofpacts_len, &ctx);
/* We've let OFPP_NORMAL and the learning action look at the
* See the License for the specific language governing permissions and
* limitations under the License. */
-#ifndef OFPROT_DPIF_XLATE_H
-#define OFPROT_DPIF_XLATE_H 1
+#ifndef OFPROTO_DPIF_XLATE_H
+#define OFPROTO_DPIF_XLATE_H 1
#include "flow.h"
#include "meta-flow.h"
#include "ofproto-dpif-ipfix.h"
#include "ofproto-dpif-mirror.h"
#include "ofproto-dpif-sflow.h"
+#include "ofproto-dpif-upcall.h"
#include "ofproto-dpif-xlate.h"
#include "poll-loop.h"
#include "simap.h"
COVERAGE_DEFINE(packet_in_overflow);
COVERAGE_DEFINE(flow_mod_overflow);
+#define N_THREADS 16
+
/* Number of implemented OpenFlow tables. */
enum { N_TABLES = 255 };
enum { TBL_INTERNAL = N_TABLES - 1 }; /* Used for internal hidden rules. */
struct flow_miss;
struct facet;
-static struct rule_dpif *rule_dpif_lookup(struct ofproto_dpif *,
- const struct flow *,
- struct flow_wildcards *wc);
-
static void rule_get_stats(struct rule *, uint64_t *packets, uint64_t *bytes);
struct ofbundle {
#define SUBFACET_DESTROY_MAX_BATCH 50
-static struct subfacet *subfacet_create(struct facet *, struct flow_miss *miss,
- long long int now);
+static struct subfacet *subfacet_create(struct facet *, struct flow_miss *);
static struct subfacet *subfacet_find(struct dpif_backer *,
const struct nlattr *key, size_t key_len,
uint32_t key_hash);
uint8_t tcp_flags; /* TCP flags seen for this 'rule'. */
struct xlate_out xout;
- bool fail_open; /* Facet matched the fail open rule. */
/* Storage for a single subfacet, to reduce malloc() time and space
* overhead. (A facet always has at least one subfacet and in the common
long long int learn_rl; /* Rate limiter for facet_learn(). */
};
-static struct facet *facet_create(const struct flow_miss *, struct rule_dpif *,
- struct xlate_out *,
- struct dpif_flow_stats *);
+static struct facet *facet_create(const struct flow_miss *);
static void facet_remove(struct facet *);
static void facet_free(struct facet *);
static void facet_flush_stats(struct facet *);
static void facet_reset_counters(struct facet *);
+static void flow_push_stats(struct ofproto_dpif *, struct flow *,
+ struct dpif_flow_stats *, bool may_learn);
static void facet_push_stats(struct facet *, bool may_learn);
static void facet_learn(struct facet *);
static void facet_account(struct facet *);
COVERAGE_DEFINE(rev_mac_learning);
COVERAGE_DEFINE(rev_inconsistency);
-/* Drop keys are odp flow keys which have drop flows installed in the kernel.
- * These are datapath flows which have no associated ofproto, if they did we
- * would use facets. */
-struct drop_key {
- struct hmap_node hmap_node;
- struct nlattr *key;
- size_t key_len;
-};
-
struct avg_subfacet_rates {
double add_rate; /* Moving average of new flows created per minute. */
double del_rate; /* Moving average of flows deleted per minute. */
char *type;
int refcount;
struct dpif *dpif;
+ struct udpif *udpif;
struct timer next_expiration;
struct ovs_rwlock odp_to_ofport_lock;
const struct ofpbuf *packet, struct ds *);
/* Upcalls. */
-#define FLOW_MISS_MAX_BATCH 50
-static int handle_upcalls(struct dpif_backer *, unsigned int max_batch);
+static void handle_upcalls(struct dpif_backer *);
/* Flow expiration. */
static int expire(struct dpif_backer *);
error = dpif_recv_set(backer->dpif, backer->recv_set_enable);
if (error) {
+ udpif_recv_set(backer->udpif, 0, false);
VLOG_ERR("Failed to enable receiving packets in dpif.");
return error;
}
+ udpif_recv_set(backer->udpif, N_THREADS, backer->recv_set_enable);
dpif_flow_flush(backer->dpif);
backer->need_revalidate = REV_RECONFIGURE;
}
run_fast_rl();
}
}
+
+ udpif_revalidate(backer->udpif);
}
if (!backer->recv_set_enable) {
}
static int
-dpif_backer_run_fast(struct dpif_backer *backer, int max_batch)
+dpif_backer_run_fast(struct dpif_backer *backer)
{
- unsigned int work;
-
- /* If recv_set_enable is false, we should not handle upcalls. */
- if (!backer->recv_set_enable) {
- return 0;
- }
-
- /* Handle one or more batches of upcalls, until there's nothing left to do
- * or until we do a fixed total amount of work.
- *
- * We do work in batches because it can be much cheaper to set up a number
- * of flows and fire off their patches all at once. We do multiple batches
- * because in some cases handling a packet can cause another packet to be
- * queued almost immediately as part of the return flow. Both
- * optimizations can make major improvements on some benchmarks and
- * presumably for real traffic as well. */
- work = 0;
- while (work < max_batch) {
- int retval = handle_upcalls(backer, max_batch - work);
- if (retval <= 0) {
- return -retval;
- }
- work += retval;
- }
+ udpif_run(backer->udpif);
+ handle_upcalls(backer);
return 0;
}
return 0;
}
- return dpif_backer_run_fast(backer, FLOW_MISS_MAX_BATCH);
+ return dpif_backer_run_fast(backer);
}
static void
run_fast_rl(void)
{
static long long int port_rl = LLONG_MIN;
- static unsigned int backer_rl = 0;
if (time_msec() >= port_rl) {
struct ofproto_dpif *ofproto;
}
port_rl = time_msec() + 200;
}
-
- /* XXX: We have to be careful not to do too much work in this function. If
- * we call dpif_backer_run_fast() too often, or with too large a batch,
- * performance improves signifcantly, but at a cost. It's possible for the
- * number of flows in the datapath to increase without bound, and for poll
- * loops to take 10s of seconds. The correct solution to this problem,
- * long term, is to separate flow miss handling into it's own thread so it
- * isn't affected by revalidations, and expirations. Until then, this is
- * the best we can do. */
- if (++backer_rl >= 10) {
- struct shash_node *node;
-
- backer_rl = 0;
- SHASH_FOR_EACH (node, &all_dpif_backers) {
- dpif_backer_run_fast(node->data, 1);
- }
- }
}
static void
node = shash_find(&all_dpif_backers, backer->type);
free(backer->type);
shash_delete(&all_dpif_backers, node);
+ udpif_destroy(backer->udpif);
dpif_close(backer->dpif);
ovs_assert(hmap_is_empty(&backer->subfacets));
free(backer);
return error;
}
+ backer->udpif = udpif_create(backer, backer->dpif);
backer->type = xstrdup(type);
backer->governor = NULL;
close_dpif_backer(backer);
return error;
}
+ udpif_recv_set(backer->udpif, N_THREADS, backer->recv_set_enable);
backer->max_n_subfacet = 0;
backer->created = time_msec();
return error;
}
- *rulep = rule_dpif_lookup_in_table(ofproto, &fm.match.flow, NULL,
- TBL_INTERNAL);
- ovs_assert(*rulep != NULL);
+ if (rule_dpif_lookup_in_table(ofproto, &fm.match.flow, NULL, TBL_INTERNAL,
+ rulep)) {
+ ovs_rwlock_unlock(&(*rulep)->up.evict);
+ } else {
+ NOT_REACHED();
+ }
return 0;
}
}
dpif_wait(ofproto->backer->dpif);
- dpif_recv_wait(ofproto->backer->dpif);
+ udpif_wait(ofproto->backer->udpif);
if (ofproto->sflow) {
dpif_sflow_wait(ofproto->sflow);
}
if (port->bundle != bundle) {
bundle->ofproto->backer->need_revalidate = REV_RECONFIGURE;
if (port->bundle) {
- bundle_del_port(port);
+ bundle_remove(&port->up);
}
port->bundle = bundle;
{
const struct ofproto_dpif *ofproto;
struct dpif_backer *backer;
- const char *peer_name;
+ char *peer_name;
if (!netdev_vport_is_patch(ofport->up.netdev)) {
return;
HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
struct ofport *peer_ofport;
struct ofport_dpif *peer;
- const char *peer_peer;
+ char *peer_peer;
if (ofproto->backer != backer) {
continue;
ofport->peer = peer;
ofport->peer->peer = ofport;
}
+ free(peer_peer);
- return;
+ break;
}
+ free(peer_name);
}
static void
\f
/* Upcall handling. */
-/* Flow miss batching.
- *
- * Some dpifs implement operations faster when you hand them off in a batch.
- * To allow batching, "struct flow_miss" queues the dpif-related work needed
- * for a given flow. Each "struct flow_miss" corresponds to sending one or
- * more packets, plus possibly installing the flow in the dpif.
- *
- * So far we only batch the operations that affect flow setup time the most.
- * It's possible to batch more than that, but the benefit might be minimal. */
-struct flow_miss {
- struct hmap_node hmap_node;
- struct ofproto_dpif *ofproto;
- struct flow flow;
- enum odp_key_fitness key_fitness;
- const struct nlattr *key;
- size_t key_len;
- struct list packets;
- enum dpif_upcall_type upcall_type;
-};
-
struct flow_miss_op {
struct dpif_op dpif_op;
struct subfacet *subfacet;
};
-/* Sends an OFPT_PACKET_IN message for 'packet' of type OFPR_NO_MATCH to each
- * OpenFlow controller as necessary according to their individual
- * configurations. */
-static void
-send_packet_in_miss(struct ofproto_dpif *ofproto, const struct ofpbuf *packet,
- const struct flow *flow)
-{
- struct ofputil_packet_in pin;
-
- pin.packet = packet->data;
- pin.packet_len = packet->size;
- pin.reason = OFPR_NO_MATCH;
- pin.controller_id = 0;
-
- pin.table_id = 0;
- pin.cookie = 0;
-
- pin.send_len = 0; /* not used for flow table misses */
-
- flow_get_metadata(flow, &pin.fmd);
-
- connmgr_send_packet_in(ofproto->up.connmgr, &pin);
-}
-
-static struct flow_miss *
-flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto,
- const struct flow *flow, uint32_t hash)
-{
- struct flow_miss *miss;
-
- HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) {
- if (miss->ofproto == ofproto && flow_equal(&miss->flow, flow)) {
- return miss;
- }
- }
-
- return NULL;
-}
-
-/* Partially Initializes 'op' as an "execute" operation for 'miss' and
- * 'packet'. The caller must initialize op->actions and op->actions_len. If
- * 'miss' is associated with a subfacet the caller must also initialize the
- * returned op->subfacet, and if anything needs to be freed after processing
- * the op, the caller must initialize op->garbage also. */
-static void
-init_flow_miss_execute_op(struct flow_miss *miss, struct ofpbuf *packet,
- struct flow_miss_op *op)
-{
- if (miss->flow.in_port.ofp_port
- != vsp_realdev_to_vlandev(miss->ofproto, miss->flow.in_port.ofp_port,
- miss->flow.vlan_tci)) {
- /* This packet was received on a VLAN splinter port. We
- * added a VLAN to the packet to make the packet resemble
- * the flow, but the actions were composed assuming that
- * the packet contained no VLAN. So, we must remove the
- * VLAN header from the packet before trying to execute the
- * actions. */
- eth_pop_vlan(packet);
- }
-
- op->subfacet = NULL;
- op->xout_garbage = false;
- op->dpif_op.type = DPIF_OP_EXECUTE;
- op->dpif_op.u.execute.key = miss->key;
- op->dpif_op.u.execute.key_len = miss->key_len;
- op->dpif_op.u.execute.packet = packet;
- ofpbuf_use_stack(&op->mask, &op->maskbuf, sizeof op->maskbuf);
-}
-
-/* Helper for handle_flow_miss_without_facet() and
- * handle_flow_miss_with_facet(). */
-static void
-handle_flow_miss_common(struct ofproto_dpif *ofproto, struct ofpbuf *packet,
- const struct flow *flow, bool fail_open)
-{
- if (fail_open) {
- /*
- * Extra-special case for fail-open mode.
- *
- * We are in fail-open mode and the packet matched the fail-open
- * rule, but we are connected to a controller too. We should send
- * the packet up to the controller in the hope that it will try to
- * set up a flow and thereby allow us to exit fail-open.
- *
- * See the top-level comment in fail-open.c for more information.
- */
- send_packet_in_miss(ofproto, packet, flow);
- }
-}
-
/* Figures out whether a flow that missed in 'ofproto', whose details are in
 * 'miss', is likely to be worth tracking in detail in userspace
* and (usually) installing a datapath flow. The answer is usually "yes" (a
* flows we impose some heuristics to decide which flows are likely to be worth
* tracking. */
static bool
-flow_miss_should_make_facet(struct flow_miss *miss, struct flow_wildcards *wc)
+flow_miss_should_make_facet(struct flow_miss *miss)
{
struct dpif_backer *backer = miss->ofproto->backer;
uint32_t hash;
backer->governor = governor_create();
}
- hash = flow_hash_in_wildcards(&miss->flow, wc, 0);
+ hash = flow_hash_in_wildcards(&miss->flow, &miss->xout.wc, 0);
return governor_should_install_flow(backer->governor, hash,
list_size(&miss->packets));
}
-/* Handles 'miss' without creating a facet or subfacet or creating any datapath
- * flow. 'miss->flow' must have matched 'rule' and been xlated into 'xout'.
- * May add an "execute" operation to 'ops' and increment '*n_ops'. */
-static void
-handle_flow_miss_without_facet(struct rule_dpif *rule, struct xlate_out *xout,
- struct flow_miss *miss,
- struct flow_miss_op *ops, size_t *n_ops)
-{
- struct ofpbuf *packet;
-
- LIST_FOR_EACH (packet, list_node, &miss->packets) {
-
- COVERAGE_INC(facet_suppress);
-
- handle_flow_miss_common(miss->ofproto, packet, &miss->flow,
- rule->up.cr.priority == FAIL_OPEN_PRIORITY);
-
- if (xout->slow) {
- struct xlate_in xin;
-
- xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet);
- xlate_actions_for_side_effects(&xin);
- }
-
- if (xout->odp_actions.size) {
- struct flow_miss_op *op = &ops[*n_ops];
- struct dpif_execute *execute = &op->dpif_op.u.execute;
-
- init_flow_miss_execute_op(miss, packet, op);
- xlate_out_copy(&op->xout, xout);
- execute->actions = op->xout.odp_actions.data;
- execute->actions_len = op->xout.odp_actions.size;
- op->xout_garbage = true;
-
- (*n_ops)++;
- }
- }
-}
-
/* Handles 'miss', which matches 'facet'. May add any required datapath
* operations to 'ops', incrementing '*n_ops' for each new op.
*
- * All of the packets in 'miss' are considered to have arrived at time 'now'.
- * This is really important only for new facets: if we just called time_msec()
- * here, then the new subfacet or its packets could look (occasionally) as
- * though it was used some time after the facet was used. That can make a
- * one-packet flow look like it has a nonzero duration, which looks odd in
- * e.g. NetFlow statistics.
- *
- * If non-null, 'stats' will be folded into 'facet'. */
+ * All of the packets in 'miss' are considered to have arrived at time
+ * 'miss->stats.used'. This is really important only for new facets: if we
+ * just called time_msec() here, then the new subfacet or its packets could
+ * look (occasionally) as though it was used some time after the facet was
+ * used. That can make a one-packet flow look like it has a nonzero duration,
+ * which looks odd in e.g. NetFlow statistics. */
static void
handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet,
- long long int now, struct dpif_flow_stats *stats,
struct flow_miss_op *ops, size_t *n_ops)
{
enum subfacet_path want_path;
struct subfacet *subfacet;
- struct ofpbuf *packet;
- want_path = facet->xout.slow ? SF_SLOW_PATH : SF_FAST_PATH;
-
- LIST_FOR_EACH (packet, list_node, &miss->packets) {
- struct flow_miss_op *op = &ops[*n_ops];
-
- handle_flow_miss_common(miss->ofproto, packet, &miss->flow,
- facet->fail_open);
-
- if (want_path != SF_FAST_PATH) {
- struct rule_dpif *rule;
- struct xlate_in xin;
-
- rule = rule_dpif_lookup(facet->ofproto, &facet->flow, NULL);
- xlate_in_init(&xin, facet->ofproto, &miss->flow, rule, 0, packet);
- xlate_actions_for_side_effects(&xin);
- }
-
- if (facet->xout.odp_actions.size) {
- struct dpif_execute *execute = &op->dpif_op.u.execute;
+ facet->packet_count += miss->stats.n_packets;
+ facet->prev_packet_count += miss->stats.n_packets;
+ facet->byte_count += miss->stats.n_bytes;
+ facet->prev_byte_count += miss->stats.n_bytes;
- init_flow_miss_execute_op(miss, packet, op);
- execute->actions = facet->xout.odp_actions.data,
- execute->actions_len = facet->xout.odp_actions.size;
- (*n_ops)++;
- }
- }
+ want_path = facet->xout.slow ? SF_SLOW_PATH : SF_FAST_PATH;
/* Don't install the flow if it's the result of the "userspace"
* action for an already installed facet. This can occur when a
* be rejected as overlapping by the datapath. */
if (miss->upcall_type == DPIF_UC_ACTION
&& !list_is_empty(&facet->subfacets)) {
- if (stats) {
- facet->used = MAX(facet->used, stats->used);
- facet->packet_count += stats->n_packets;
- facet->byte_count += stats->n_bytes;
- facet->tcp_flags |= stats->tcp_flags;
- }
return;
}
- subfacet = subfacet_create(facet, miss, now);
- if (stats) {
- subfacet_update_stats(subfacet, stats);
- }
-
+ subfacet = subfacet_create(facet, miss);
if (subfacet->path != want_path) {
struct flow_miss_op *op = &ops[(*n_ops)++];
struct dpif_flow_put *put = &op->dpif_op.u.flow_put;
handle_flow_miss(struct flow_miss *miss, struct flow_miss_op *ops,
size_t *n_ops)
{
- struct ofproto_dpif *ofproto = miss->ofproto;
- struct dpif_flow_stats stats__;
- struct dpif_flow_stats *stats = &stats__;
- struct ofpbuf *packet;
struct facet *facet;
- long long int now;
- now = time_msec();
- memset(stats, 0, sizeof *stats);
- stats->used = now;
- LIST_FOR_EACH (packet, list_node, &miss->packets) {
- stats->tcp_flags |= packet_get_tcp_flags(packet, &miss->flow);
- stats->n_bytes += packet->size;
- stats->n_packets++;
- }
+ miss->ofproto->n_missed += list_size(&miss->packets);
- facet = facet_lookup_valid(ofproto, &miss->flow);
+ facet = facet_lookup_valid(miss->ofproto, &miss->flow);
if (!facet) {
- struct flow_wildcards wc;
- struct rule_dpif *rule;
- struct xlate_out xout;
- struct xlate_in xin;
-
- flow_wildcards_init_catchall(&wc);
- rule = rule_dpif_lookup(ofproto, &miss->flow, &wc);
- rule_credit_stats(rule, stats);
-
- xlate_in_init(&xin, ofproto, &miss->flow, rule, stats->tcp_flags,
- NULL);
- xin.resubmit_stats = stats;
- xin.may_learn = true;
- xlate_actions(&xin, &xout);
- flow_wildcards_or(&xout.wc, &xout.wc, &wc);
-
/* There does not exist a bijection between 'struct flow' and datapath
* flow keys with fitness ODP_FIT_TO_LITTLE. This breaks a fundamental
* assumption used throughout the facet and subfacet handling code.
* Since we have to handle these misses in userspace anyway, we simply
* skip facet creation, avoiding the problem altogether. */
if (miss->key_fitness == ODP_FIT_TOO_LITTLE
- || !flow_miss_should_make_facet(miss, &xout.wc)) {
- handle_flow_miss_without_facet(rule, &xout, miss, ops, n_ops);
+ || !flow_miss_should_make_facet(miss)) {
return;
}
- facet = facet_create(miss, rule, &xout, stats);
- stats = NULL;
+ facet = facet_create(miss);
}
- handle_flow_miss_with_facet(miss, facet, now, stats, ops, n_ops);
+ handle_flow_miss_with_facet(miss, facet, ops, n_ops);
}
static struct drop_key *
}
hmap_remove(&backer->drop_keys, &drop_key->hmap_node);
- free(drop_key->key);
- free(drop_key);
+ drop_key_destroy(drop_key);
}
+
+ udpif_drop_key_clear(backer->udpif);
}
static void
-handle_miss_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls,
- size_t n_upcalls)
+handle_flow_misses(struct dpif_backer *backer, struct flow_miss_batch *fmb)
{
- struct dpif_upcall *upcall;
+ struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH];
+ struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH];
struct flow_miss *miss;
- struct flow_miss misses[FLOW_MISS_MAX_BATCH];
- struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
- struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
- struct hmap todo;
- int n_misses;
- size_t n_ops;
- size_t i;
-
- if (!n_upcalls) {
- return;
- }
-
- /* Construct the to-do list.
- *
- * This just amounts to extracting the flow from each packet and sticking
- * the packets that have the same flow in the same "flow_miss" structure so
- * that we can process them together. */
- hmap_init(&todo);
- n_misses = 0;
- for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
- struct flow_miss *miss = &misses[n_misses];
- struct flow_miss *existing_miss;
- struct ofproto_dpif *ofproto;
- odp_port_t odp_in_port;
- struct flow flow;
- uint32_t hash;
- int error;
-
- error = xlate_receive(backer, upcall->packet, upcall->key,
- upcall->key_len, &flow, &miss->key_fitness,
- &ofproto, &odp_in_port);
- if (error == ENODEV) {
- struct drop_key *drop_key;
-
- /* Received packet on datapath port for which we couldn't
- * associate an ofproto. This can happen if a port is removed
- * while traffic is being received. Print a rate-limited message
- * in case it happens frequently. Install a drop flow so
- * that future packets of the flow are inexpensively dropped
- * in the kernel. */
- VLOG_INFO_RL(&rl, "received packet on unassociated datapath port "
- "%"PRIu32, odp_in_port);
-
- drop_key = drop_key_lookup(backer, upcall->key, upcall->key_len);
- if (!drop_key) {
- int ret;
- ret = dpif_flow_put(backer->dpif,
- DPIF_FP_CREATE | DPIF_FP_MODIFY,
- upcall->key, upcall->key_len,
- NULL, 0, NULL, 0, NULL);
-
- if (!ret) {
- drop_key = xmalloc(sizeof *drop_key);
- drop_key->key = xmemdup(upcall->key, upcall->key_len);
- drop_key->key_len = upcall->key_len;
-
- hmap_insert(&backer->drop_keys, &drop_key->hmap_node,
- hash_bytes(drop_key->key, drop_key->key_len, 0));
- }
- }
- continue;
- }
- if (error) {
- continue;
- }
-
- ofproto->n_missed++;
- flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark,
- &flow.tunnel, &flow.in_port, &miss->flow);
-
- /* Add other packets to a to-do list. */
- hash = flow_hash(&miss->flow, 0);
- existing_miss = flow_miss_find(&todo, ofproto, &miss->flow, hash);
- if (!existing_miss) {
- hmap_insert(&todo, &miss->hmap_node, hash);
- miss->ofproto = ofproto;
- miss->key = upcall->key;
- miss->key_len = upcall->key_len;
- miss->upcall_type = upcall->type;
- list_init(&miss->packets);
-
- n_misses++;
- } else {
- miss = existing_miss;
- }
- list_push_back(&miss->packets, &upcall->packet->list_node);
- }
+ size_t n_ops, i;
/* Process each element in the to-do list, constructing the set of
* operations to batch. */
n_ops = 0;
- HMAP_FOR_EACH (miss, hmap_node, &todo) {
+ HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
handle_flow_miss(miss, flow_miss_ops, &n_ops);
}
ovs_assert(n_ops <= ARRAY_SIZE(flow_miss_ops));
subfacet->path = SF_NOT_INSTALLED;
}
-
- /* Free memory. */
- if (flow_miss_ops[i].xout_garbage) {
- xlate_out_uninit(&flow_miss_ops[i].xout);
- }
- }
- hmap_destroy(&todo);
-}
-
-static enum { SFLOW_UPCALL, MISS_UPCALL, BAD_UPCALL, FLOW_SAMPLE_UPCALL,
- IPFIX_UPCALL }
-classify_upcall(const struct dpif_upcall *upcall)
-{
- size_t userdata_len;
- union user_action_cookie cookie;
-
- /* First look at the upcall type. */
- switch (upcall->type) {
- case DPIF_UC_ACTION:
- break;
-
- case DPIF_UC_MISS:
- return MISS_UPCALL;
-
- case DPIF_N_UC_TYPES:
- default:
- VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32, upcall->type);
- return BAD_UPCALL;
- }
-
- /* "action" upcalls need a closer look. */
- if (!upcall->userdata) {
- VLOG_WARN_RL(&rl, "action upcall missing cookie");
- return BAD_UPCALL;
- }
- userdata_len = nl_attr_get_size(upcall->userdata);
- if (userdata_len < sizeof cookie.type
- || userdata_len > sizeof cookie) {
- VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %zu",
- userdata_len);
- return BAD_UPCALL;
- }
- memset(&cookie, 0, sizeof cookie);
- memcpy(&cookie, nl_attr_get(upcall->userdata), userdata_len);
- if (userdata_len == sizeof cookie.sflow
- && cookie.type == USER_ACTION_COOKIE_SFLOW) {
- return SFLOW_UPCALL;
- } else if (userdata_len == sizeof cookie.slow_path
- && cookie.type == USER_ACTION_COOKIE_SLOW_PATH) {
- return MISS_UPCALL;
- } else if (userdata_len == sizeof cookie.flow_sample
- && cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE) {
- return FLOW_SAMPLE_UPCALL;
- } else if (userdata_len == sizeof cookie.ipfix
- && cookie.type == USER_ACTION_COOKIE_IPFIX) {
- return IPFIX_UPCALL;
- } else {
- VLOG_WARN_RL(&rl, "invalid user cookie of type %"PRIu16
- " and size %zu", cookie.type, userdata_len);
- return BAD_UPCALL;
}
}
dpif_ipfix_bridge_sample(ofproto->ipfix, upcall->packet, &flow);
}
-static int
-handle_upcalls(struct dpif_backer *backer, unsigned int max_batch)
+static void
+handle_upcalls(struct dpif_backer *backer)
{
- struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
- struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
- uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
+ struct flow_miss_batch *fmb;
int n_processed;
- int n_misses;
- int i;
-
- ovs_assert(max_batch <= FLOW_MISS_MAX_BATCH);
- n_misses = 0;
- for (n_processed = 0; n_processed < max_batch; n_processed++) {
- struct dpif_upcall *upcall = &misses[n_misses];
- struct ofpbuf *buf = &miss_bufs[n_misses];
- int error;
+ for (n_processed = 0; n_processed < FLOW_MISS_MAX_BATCH; n_processed++) {
+ struct upcall *upcall = upcall_next(backer->udpif);
- ofpbuf_use_stub(buf, miss_buf_stubs[n_misses],
- sizeof miss_buf_stubs[n_misses]);
- error = dpif_recv(backer->dpif, upcall, buf);
- if (error) {
- ofpbuf_uninit(buf);
+ if (!upcall) {
break;
}
- switch (classify_upcall(upcall)) {
- case MISS_UPCALL:
- /* Handle it later. */
- n_misses++;
- break;
-
+ switch (upcall->type) {
case SFLOW_UPCALL:
- handle_sflow_upcall(backer, upcall);
- ofpbuf_uninit(buf);
+ handle_sflow_upcall(backer, &upcall->dpif_upcall);
break;
case FLOW_SAMPLE_UPCALL:
- handle_flow_sample_upcall(backer, upcall);
- ofpbuf_uninit(buf);
+ handle_flow_sample_upcall(backer, &upcall->dpif_upcall);
break;
case IPFIX_UPCALL:
- handle_ipfix_upcall(backer, upcall);
- ofpbuf_uninit(buf);
+ handle_ipfix_upcall(backer, &upcall->dpif_upcall);
break;
case BAD_UPCALL:
- ofpbuf_uninit(buf);
break;
+
+ case MISS_UPCALL:
+ NOT_REACHED();
}
+
+ upcall_destroy(upcall);
}
- /* Handle deferred MISS_UPCALL processing. */
- handle_miss_upcalls(backer, misses, n_misses);
- for (i = 0; i < n_misses; i++) {
- ofpbuf_uninit(&miss_bufs[i]);
+ for (n_processed = 0; n_processed < FLOW_MISS_MAX_BATCH; n_processed++) {
+ struct drop_key *drop_key = drop_key_next(backer->udpif);
+ if (!drop_key) {
+ break;
+ }
+
+ if (!drop_key_lookup(backer, drop_key->key, drop_key->key_len)) {
+ hmap_insert(&backer->drop_keys, &drop_key->hmap_node,
+ hash_bytes(drop_key->key, drop_key->key_len, 0));
+ dpif_flow_put(backer->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY,
+ drop_key->key, drop_key->key_len,
+ NULL, 0, NULL, 0, NULL);
+ } else {
+ drop_key_destroy(drop_key);
+ }
}
- return n_processed;
+ fmb = flow_miss_batch_next(backer->udpif);
+ if (fmb) {
+ handle_flow_misses(backer, fmb);
+ flow_miss_batch_destroy(fmb);
+ }
}
\f
/* Flow expiration. */
return;
}
- COVERAGE_INC(ofproto_dpif_expired);
+ if (!ovs_rwlock_trywrlock(&rule->up.evict)) {
+ COVERAGE_INC(ofproto_dpif_expired);
- /* Get rid of the rule. */
- ofproto_rule_expire(&rule->up, reason);
+ /* Get rid of the rule. */
+ ofproto_rule_expire(&rule->up, reason);
+ }
}
\f
/* Facets. */
* The facet will initially have no subfacets. The caller should create (at
* least) one subfacet with subfacet_create(). */
static struct facet *
-facet_create(const struct flow_miss *miss, struct rule_dpif *rule,
- struct xlate_out *xout, struct dpif_flow_stats *stats)
+facet_create(const struct flow_miss *miss)
{
struct ofproto_dpif *ofproto = miss->ofproto;
struct facet *facet;
facet = xzalloc(sizeof *facet);
facet->ofproto = miss->ofproto;
- facet->packet_count = facet->prev_packet_count = stats->n_packets;
- facet->byte_count = facet->prev_byte_count = stats->n_bytes;
- facet->tcp_flags = stats->tcp_flags;
- facet->used = stats->used;
+ facet->used = miss->stats.used;
facet->flow = miss->flow;
facet->learn_rl = time_msec() + 500;
netflow_flow_init(&facet->nf_flow);
netflow_flow_update_time(ofproto->netflow, &facet->nf_flow, facet->used);
- xlate_out_copy(&facet->xout, xout);
+ xlate_out_copy(&facet->xout, &miss->xout);
match_init(&match, &facet->flow, &facet->xout.wc);
cls_rule_init(&facet->cr, &match, OFP_DEFAULT_PRIORITY);
ovs_rwlock_unlock(&ofproto->facets.rwlock);
facet->nf_flow.output_iface = facet->xout.nf_output_iface;
- facet->fail_open = rule->up.cr.priority == FAIL_OPEN_PRIORITY;
-
return facet;
}
{
if (facet) {
struct ofproto_dpif *ofproto = facet->ofproto;
- const struct rule_dpif *rule = rule_dpif_lookup(ofproto, &facet->flow,
- NULL);
- const struct ofpact *ofpacts = rule->up.ofpacts;
- size_t ofpacts_len = rule->up.ofpacts_len;
-
- if (ofpacts_len > 0 &&
- ofpacts->type == OFPACT_CONTROLLER &&
- ofpact_next(ofpacts) >= ofpact_end(ofpacts, ofpacts_len)) {
- return true;
- }
+ const struct ofpact *ofpacts;
+ struct rule_dpif *rule;
+ size_t ofpacts_len;
+ bool is_controller;
+
+ rule_dpif_lookup(ofproto, &facet->flow, NULL, &rule);
+ ofpacts_len = rule->up.ofpacts_len;
+ ofpacts = rule->up.ofpacts;
+ is_controller = ofpacts_len > 0
+ && ofpacts->type == OFPACT_CONTROLLER
+ && ofpact_next(ofpacts) >= ofpact_end(ofpacts, ofpacts_len);
+ rule_release(rule);
+ return is_controller;
}
return false;
}
struct xlate_in xin;
struct rule_dpif *rule;
- bool ok, fail_open;
+ bool ok;
/* Check the datapath actions for consistency. */
- rule = rule_dpif_lookup(facet->ofproto, &facet->flow, NULL);
+ rule_dpif_lookup(facet->ofproto, &facet->flow, NULL, &rule);
xlate_in_init(&xin, facet->ofproto, &facet->flow, rule, 0, NULL);
xlate_actions(&xin, &xout);
+ rule_release(rule);
- fail_open = rule->up.cr.priority == FAIL_OPEN_PRIORITY;
ok = ofpbuf_equal(&facet->xout.odp_actions, &xout.odp_actions)
- && facet->xout.slow == xout.slow
- && facet->fail_open == fail_open;
+ && facet->xout.slow == xout.slow;
if (!ok && !VLOG_DROP_WARN(&rl)) {
struct ds s = DS_EMPTY_INITIALIZER;
ds_put_format(&s, " slow path incorrect. should be %d", xout.slow);
}
- if (facet->fail_open != fail_open) {
- ds_put_format(&s, " fail open incorrect. should be %s",
- fail_open ? "true" : "false");
- }
ds_destroy(&s);
}
xlate_out_uninit(&xout);
}
flow_wildcards_init_catchall(&wc);
- new_rule = rule_dpif_lookup(ofproto, &facet->flow, &wc);
+ rule_dpif_lookup(ofproto, &facet->flow, &wc, &new_rule);
/* Calculate new datapath actions.
*
|| memcmp(&facet->xout.wc, &xout.wc, sizeof xout.wc)) {
facet_remove(facet);
xlate_out_uninit(&xout);
+ rule_release(new_rule);
return false;
}
facet->xout.mirrors = xout.mirrors;
facet->nf_flow.output_iface = facet->xout.nf_output_iface;
facet->used = MAX(facet->used, new_rule->up.created);
- facet->fail_open = new_rule->up.cr.priority == FAIL_OPEN_PRIORITY;
xlate_out_uninit(&xout);
+ rule_release(new_rule);
return true;
}
facet->accounted_bytes = 0;
}
+static void
+flow_push_stats(struct ofproto_dpif *ofproto, struct flow *flow,
+ struct dpif_flow_stats *stats, bool may_learn)
+{
+ struct ofport_dpif *in_port;
+ struct rule_dpif *rule;
+ struct xlate_in xin;
+
+ in_port = get_ofp_port(ofproto, flow->in_port.ofp_port);
+ if (in_port && in_port->is_tunnel) {
+ netdev_vport_inc_rx(in_port->up.netdev, stats);
+ }
+
+ rule_dpif_lookup(ofproto, flow, NULL, &rule);
+ rule_credit_stats(rule, stats);
+ xlate_in_init(&xin, ofproto, flow, rule, stats->tcp_flags, NULL);
+ xin.resubmit_stats = stats;
+ xin.may_learn = may_learn;
+ xlate_actions_for_side_effects(&xin);
+ rule_release(rule);
+}
+
static void
facet_push_stats(struct facet *facet, bool may_learn)
{
stats.tcp_flags = facet->tcp_flags;
if (may_learn || stats.n_packets || facet->used > facet->prev_used) {
- struct ofproto_dpif *ofproto = facet->ofproto;
- struct ofport_dpif *in_port;
- struct rule_dpif *rule;
- struct xlate_in xin;
-
facet->prev_packet_count = facet->packet_count;
facet->prev_byte_count = facet->byte_count;
facet->prev_used = facet->used;
- in_port = get_ofp_port(ofproto, facet->flow.in_port.ofp_port);
- if (in_port && in_port->is_tunnel) {
- netdev_vport_inc_rx(in_port->up.netdev, &stats);
- }
-
- rule = rule_dpif_lookup(ofproto, &facet->flow, NULL);
- rule_credit_stats(rule, &stats);
- netflow_flow_update_time(ofproto->netflow, &facet->nf_flow,
+ netflow_flow_update_time(facet->ofproto->netflow, &facet->nf_flow,
facet->used);
netflow_flow_update_flags(&facet->nf_flow, facet->tcp_flags);
- mirror_update_stats(ofproto->mbridge, facet->xout.mirrors,
+ mirror_update_stats(facet->ofproto->mbridge, facet->xout.mirrors,
stats.n_packets, stats.n_bytes);
-
- xlate_in_init(&xin, ofproto, &facet->flow, rule, stats.tcp_flags,
- NULL);
- xin.resubmit_stats = &stats;
- xin.may_learn = may_learn;
- xlate_actions_for_side_effects(&xin);
+ flow_push_stats(facet->ofproto, &facet->flow, &stats, may_learn);
}
}
* existing subfacet if there is one, otherwise creates and returns a
* new subfacet. */
static struct subfacet *
-subfacet_create(struct facet *facet, struct flow_miss *miss,
- long long int now)
+subfacet_create(struct facet *facet, struct flow_miss *miss)
{
struct dpif_backer *backer = miss->ofproto->backer;
enum odp_key_fitness key_fitness = miss->key_fitness;
subfacet->key_fitness = key_fitness;
subfacet->key = xmemdup(key, key_len);
subfacet->key_len = key_len;
- subfacet->used = now;
- subfacet->created = now;
+ subfacet->used = miss->stats.used;
+ subfacet->created = subfacet->used;
subfacet->dp_packet_count = 0;
subfacet->dp_byte_count = 0;
subfacet->path = SF_NOT_INSTALLED;
/* Lookup 'flow' in 'ofproto''s classifier. If 'wc' is non-null, sets
 * the fields that were relevant as part of the lookup. Stores the rule found
 * in '*rule' and takes a read lock on its evict rwlock; the caller must
 * release it with rule_release(). */
-static struct rule_dpif *
+void
rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow,
- struct flow_wildcards *wc)
+ struct flow_wildcards *wc, struct rule_dpif **rule)
{
struct ofport_dpif *port;
- struct rule_dpif *rule;
- rule = rule_dpif_lookup_in_table(ofproto, flow, wc, 0);
- if (rule) {
- return rule;
+ if (rule_dpif_lookup_in_table(ofproto, flow, wc, 0, rule)) {
+ return;
}
port = get_ofp_port(ofproto, flow->in_port.ofp_port);
if (!port) {
flow->in_port.ofp_port);
}
- return choose_miss_rule(port ? port->up.pp.config : 0, ofproto->miss_rule,
- ofproto->no_packet_in_rule);
+ *rule = choose_miss_rule(port ? port->up.pp.config : 0, ofproto->miss_rule,
+ ofproto->no_packet_in_rule);
+ ovs_rwlock_rdlock(&(*rule)->up.evict);
}
-struct rule_dpif *
+bool
rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto,
const struct flow *flow, struct flow_wildcards *wc,
- uint8_t table_id)
+ uint8_t table_id, struct rule_dpif **rule)
+ OVS_ACQ_RDLOCK((*rule)->up.evict)
{
struct cls_rule *cls_rule;
struct classifier *cls;
bool frag;
+ *rule = NULL;
if (table_id >= N_TABLES) {
- return NULL;
+ return false;
}
if (wc) {
}
cls = &ofproto->up.tables[table_id].cls;
+ ovs_rwlock_rdlock(&cls->rwlock);
frag = (flow->nw_frag & FLOW_NW_FRAG_ANY) != 0;
if (frag && ofproto->up.frag_handling == OFPC_FRAG_NORMAL) {
/* We must pretend that transport ports are unavailable. */
struct flow ofpc_normal_flow = *flow;
ofpc_normal_flow.tp_src = htons(0);
ofpc_normal_flow.tp_dst = htons(0);
- ovs_rwlock_rdlock(&cls->rwlock);
cls_rule = classifier_lookup(cls, &ofpc_normal_flow, wc);
- ovs_rwlock_unlock(&cls->rwlock);
} else if (frag && ofproto->up.frag_handling == OFPC_FRAG_DROP) {
cls_rule = &ofproto->drop_frags_rule->up.cr;
if (wc) {
flow_wildcards_init_exact(wc);
}
} else {
- ovs_rwlock_rdlock(&cls->rwlock);
cls_rule = classifier_lookup(cls, flow, wc);
- ovs_rwlock_unlock(&cls->rwlock);
}
- return rule_dpif_cast(rule_from_cls_rule(cls_rule));
+
+ *rule = rule_dpif_cast(rule_from_cls_rule(cls_rule));
+ if (*rule && ovs_rwlock_tryrdlock(&(*rule)->up.evict)) {
+ /* The rule is in the process of being removed. Best we can do is
+ * pretend it isn't there. */
+ *rule = NULL;
+ }
+ ovs_rwlock_unlock(&cls->rwlock);
+
+ return *rule != NULL;
}
/* Given a port configuration (specified as zero if there's no port), chooses
return config & OFPUTIL_PC_NO_PACKET_IN ? no_packet_in_rule : miss_rule;
}
+void
+rule_release(struct rule_dpif *rule)
+{
+ if (rule) {
+ ovs_rwlock_unlock(&rule->up.evict);
+ }
+}
+
static void
complete_operation(struct rule_dpif *rule)
{
/* Use the metadata from the flow and the packet argument
* to reconstruct the flow. */
- flow_extract(packet, flow.skb_priority, flow.skb_mark, NULL,
+ flow_extract(packet, flow.skb_priority, flow.pkt_mark, NULL,
&in_port_, &flow);
}
}
const struct ofpbuf *packet, struct ds *ds)
{
struct rule_dpif *rule;
+ struct flow_wildcards wc;
ds_put_cstr(ds, "Flow: ");
flow_format(ds, flow);
ds_put_char(ds, '\n');
- rule = rule_dpif_lookup(ofproto, flow, NULL);
+ flow_wildcards_init_catchall(&wc);
+ rule_dpif_lookup(ofproto, flow, &wc, &rule);
trace_format_rule(ds, 0, rule);
if (rule == ofproto->miss_rule) {
trace.xin.report_hook = trace_report;
xlate_actions(&trace.xin, &trace.xout);
+ flow_wildcards_or(&trace.xout.wc, &trace.xout.wc, &wc);
ds_put_char(ds, '\n');
trace_format_flow(ds, 0, "Final flow", &trace);
xlate_out_uninit(&trace.xout);
}
+
+ rule_release(rule);
}
static void
#include <stdint.h>
#include "hmapx.h"
+#include "odp-util.h"
#include "ofproto/ofproto-provider.h"
#include "ovs-thread.h"
#include "timer.h"
struct ofport_dpif;
struct dpif_backer;
+/* Ofproto-dpif -- DPIF based ofproto implementation.
+ *
+ * Ofproto-dpif provides an ofproto implementation for those platforms which
+ * implement the netdev and dpif interfaces defined in netdev.h and dpif.h,
+ * the most important of which is the Linux kernel module (dpif-linux).
+ * Alternatives are also supported, such as a userspace-only implementation
+ * (dpif-netdev) and a dummy implementation used for unit testing.
+ *
+ * Ofproto-dpif is divided into three major chunks.
+ *
+ * - ofproto-dpif.c
+ * The main ofproto-dpif module is responsible for implementing the
+ * provider interface, installing and removing datapath flows, maintaining
+ * packet statistics, running protocols (BFD, LACP, STP, etc.), and
+ * configuring relevant submodules.
+ *
+ * - ofproto-dpif-upcall.c
+ * Ofproto-dpif-upcall is responsible for retrieving upcalls from the kernel,
+ * processing miss upcalls, and handing more complex ones up to the main
+ * ofproto-dpif module. Miss upcall processing boils down to figuring out
+ * what each packet's actions are, executing them (i.e. asking the kernel to
+ * forward the packet), and handing it up to ofproto-dpif to decide whether
+ * or not to install a kernel flow.
+ *
+ * - ofproto-dpif-xlate.c
+ * Ofproto-dpif-xlate is responsible for translating OpenFlow actions into
+ * datapath actions. */
+
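The division above implies a simple per-backer dispatch for the upcall types
that udpif hands back to ofproto-dpif. A condensed sketch, mirroring the
rewritten handle_upcalls() in ofproto-dpif.c earlier in this patch (drop-key
and flow-miss-batch handling elided; illustrative only):

    struct upcall *upcall;

    while ((upcall = upcall_next(backer->udpif))) {
        switch (upcall->type) {
        case SFLOW_UPCALL:
            handle_sflow_upcall(backer, &upcall->dpif_upcall);
            break;
        case FLOW_SAMPLE_UPCALL:
            handle_flow_sample_upcall(backer, &upcall->dpif_upcall);
            break;
        case IPFIX_UPCALL:
            handle_ipfix_upcall(backer, &upcall->dpif_upcall);
            break;
        case BAD_UPCALL:
            break;
        case MISS_UPCALL:
            NOT_REACHED(); /* Misses are batched by the udpif threads. */
        }
        upcall_destroy(upcall);
    }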
struct rule_dpif {
struct rule up;
return rule ? CONTAINER_OF(rule, struct rule_dpif, up) : NULL;
}
-struct rule_dpif *rule_dpif_lookup_in_table(struct ofproto_dpif *,
- const struct flow *,
- struct flow_wildcards *,
- uint8_t table_id);
+void rule_dpif_lookup(struct ofproto_dpif *, const struct flow *,
+ struct flow_wildcards *, struct rule_dpif **rule)
+ OVS_ACQ_RDLOCK((*rule)->up.evict);
+
+bool rule_dpif_lookup_in_table(struct ofproto_dpif *, const struct flow *,
+ struct flow_wildcards *, uint8_t table_id,
+ struct rule_dpif **rule)
+ OVS_ACQ_RDLOCK((*rule)->up.evict);
+
+void rule_release(struct rule_dpif *rule) OVS_RELEASES(rule->up.evict);
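Taken together, these declarations imply a bracketed lookup/release pattern.
A minimal sketch, using only the functions declared above and elsewhere in
this patch ('wc' may be passed as NULL when wildcard masks are not needed):

    struct flow_wildcards wc;
    struct rule_dpif *rule;

    flow_wildcards_init_catchall(&wc);
    rule_dpif_lookup(ofproto, &flow, &wc, &rule); /* Takes rdlock on rule->up.evict. */

    /* ... use 'rule', e.g. xlate_in_init()/xlate_actions() or
     * rule_credit_stats() ... */

    rule_release(rule); /* Drops the rdlock; safe on NULL. */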
void rule_credit_stats(struct rule_dpif *, const struct dpif_flow_stats *);
uint16_t idle_timeout OVS_GUARDED; /* In seconds from ->used. */
/* Eviction groups. */
- bool evictable; /* If false, prevents eviction. */
struct heap_node evg_node; /* In eviction_group's "rules" heap. */
struct eviction_group *eviction_group; /* NULL if not in any group. */
+ /* The evict lock is used to prevent rules from being evicted while child
+ * threads are using them to xlate flows. A read lock means the rule is
+ * currently being used. A write lock means the rule is in the process of
+ * being evicted and should be considered gone. A rule will not be evicted
+ * unless both its own write lock and its classifier's write lock are held.
+ * Therefore, while holding a classifier read lock, one can be assured that
+ * even write-locked rules are safe. (A usage sketch follows this struct.) */
+ struct ovs_rwlock evict;
+
struct ofpact *ofpacts; /* Sequence of "struct ofpacts". */
unsigned int ofpacts_len; /* Size of 'ofpacts', in bytes. */
}
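On the writer side, an evictor must win a non-blocking write-lock attempt
before removing a rule; failure simply means a reader is still translating
with it. A minimal sketch of the pattern used by ofproto_rule_destroy() and
choose_rule_to_evict() in this patch (illustrative only):

    if (!ovs_rwlock_trywrlock(&rule->evict)) { /* Returns 0 on success. */
        /* No readers: safe to remove. The removal path releases 'evict'. */
        oftable_remove_rule__(ofproto, cls, rule);
    } else {
        /* A reader holds 'evict'; skip this rule and try another candidate. */
    }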
void ofproto_rule_update_used(struct rule *, long long int used);
-void ofproto_rule_expire(struct rule *, uint8_t reason);
+void ofproto_rule_expire(struct rule *rule, uint8_t reason)
+ OVS_RELEASES(rule->evict);
void ofproto_rule_destroy(struct ofproto *, struct classifier *cls,
struct rule *) OVS_REQ_WRLOCK(cls->rwlock);
.RS
.IP \fIskb_priority\fR
Packet QoS priority.
-.IP \fIskb_mark\fR
-SKB mark of the packet.
+.IP \fIpkt_mark\fR
+Mark of the packet.
.IP \fItun_id\fR
The tunnel ID on which the packet arrived.
.IP \fIin_port\fR
const struct mf_subfield *fields,
size_t n_fields);
-static void oftable_remove_rule(struct rule *);
+static void oftable_remove_rule(struct rule *rule) OVS_RELEASES(rule->evict);
static void oftable_remove_rule__(struct ofproto *ofproto,
struct classifier *cls, struct rule *rule)
- OVS_REQ_WRLOCK(cls->rwlock);
+ OVS_REQ_WRLOCK(cls->rwlock) OVS_RELEASES(rule->evict);
static struct rule *oftable_replace_rule(struct rule *);
static void oftable_substitute_rule(struct rule *old, struct rule *new);
struct heap rules; /* Contains "struct rule"s. */
};
-static struct rule *choose_rule_to_evict(struct oftable *);
+static bool choose_rule_to_evict(struct oftable *table, struct rule **rulep)
+ OVS_TRY_WRLOCK(true, (*rulep)->evict);
static void ofproto_evict(struct ofproto *);
static uint32_t rule_eviction_priority(struct rule *);
static enum ofperr add_flow(struct ofproto *, struct ofconn *,
struct ofputil_flow_mod *,
const struct ofp_header *);
-static void delete_flow__(struct rule *, struct ofopgroup *,
- enum ofp_flow_removed_reason);
+static void delete_flow__(struct rule *rule, struct ofopgroup *,
+ enum ofp_flow_removed_reason)
+ OVS_RELEASES(rule->evict);
static bool handle_openflow(struct ofconn *, const struct ofpbuf *);
static enum ofperr handle_flow_mod__(struct ofproto *, struct ofconn *,
struct ofputil_flow_mod *,
if (!rule->pending) {
ofoperation_create(group, rule, OFOPERATION_DELETE,
OFPRR_DELETE);
+ ovs_rwlock_wrlock(&rule->evict);
oftable_remove_rule__(ofproto, &table->cls, rule);
ofproto->ofproto_class->rule_destruct(rule);
}
/* Initiate deletion -> success. */
struct ofopgroup *group = ofopgroup_create_unattached(ofproto);
ofoperation_create(group, rule, OFOPERATION_DELETE, OFPRR_DELETE);
+ ovs_rwlock_wrlock(&rule->evict);
oftable_remove_rule(rule);
ofproto->ofproto_class->rule_destruct(rule);
ofopgroup_submit(group);
cls_rule_destroy(&rule->cr);
free(rule->ofpacts);
ovs_mutex_destroy(&rule->timeout_mutex);
+ ovs_rwlock_destroy(&rule->evict);
rule->ofproto->ofproto_class->rule_dealloc(rule);
}
}
struct rule *rule) OVS_REQ_WRLOCK(cls->rwlock)
{
ovs_assert(!rule->pending);
- oftable_remove_rule__(ofproto, cls, rule);
+ if (!ovs_rwlock_trywrlock(&rule->evict)) {
+ oftable_remove_rule__(ofproto, cls, rule);
+ } else {
+ NOT_REACHED();
+ }
ofproto_rule_destroy__(rule);
}
rule->ofpacts_len = fm->ofpacts_len;
rule->meter_id = find_meter(rule->ofpacts, rule->ofpacts_len);
list_init(&rule->meter_list_node);
- rule->evictable = true;
rule->eviction_group = NULL;
list_init(&rule->expirable);
rule->monitor_flags = 0;
rule->add_seqno = 0;
rule->modify_seqno = 0;
+ ovs_rwlock_init(&rule->evict);
/* Insert new rule. */
victim = oftable_replace_rule(rule);
n_rules = classifier_count(&table->cls);
ovs_rwlock_unlock(&table->cls.rwlock);
if (n_rules > table->max_flows) {
- bool was_evictable;
-
- was_evictable = rule->evictable;
- rule->evictable = false;
- evict = choose_rule_to_evict(table);
- rule->evictable = was_evictable;
-
- if (!evict) {
+ ovs_rwlock_rdlock(&rule->evict);
+ if (choose_rule_to_evict(table, &evict)) {
+ ovs_rwlock_unlock(&rule->evict);
+ ovs_rwlock_unlock(&evict->evict);
+ if (evict->pending) {
+ error = OFPROTO_POSTPONE;
+ goto exit;
+ }
+ } else {
+ ovs_rwlock_unlock(&rule->evict);
error = OFPERR_OFPFMFC_TABLE_FULL;
goto exit;
- } else if (evict->pending) {
- error = OFPROTO_POSTPONE;
- goto exit;
}
} else {
evict = NULL;
op->group->n_running--;
ofoperation_destroy(rule->pending);
} else if (evict) {
+ /* It would be better if we maintained the lock we took in
+ * choose_rule_to_evict() earlier, but that confuses the thread
+ * safety analysis, and this code is fragile enough that we really
+ * need that analysis. In the worst case, we'll have to block a
+ * little while before we perform the eviction, which doesn't seem
+ * like a big problem. */
+ ovs_rwlock_wrlock(&evict->evict);
delete_flow__(evict, group, OFPRR_EVICTION);
}
ofopgroup_submit(group);
group = ofopgroup_create(ofproto, ofconn, request, UINT32_MAX);
LIST_FOR_EACH_SAFE (rule, next, ofproto_node, rules) {
+ ovs_rwlock_wrlock(&rule->evict);
delete_flow__(rule, group, reason);
}
ofopgroup_submit(group);
\f
/* Table overflow policy. */
-/* Chooses and returns a rule to evict from 'table'. Returns NULL if the table
- * is not configured to evict rules or if the table contains no evictable
- * rules. (Rules with 'evictable' set to false or with no timeouts are not
- * evictable.) */
-static struct rule *
-choose_rule_to_evict(struct oftable *table)
+/* Chooses a rule to evict from 'table', storing it in '*rulep' and returning
+ * true on success, in which case the caller holds the write lock on the
+ * rule's evict rwlock. Returns false, leaving '*rulep' NULL, if the table is
+ * not configured to evict rules or contains no evictable rules. (Rules whose
+ * evict rwlock is read-locked, or that have no timeouts, are not evictable.) */
+static bool
+choose_rule_to_evict(struct oftable *table, struct rule **rulep)
{
struct eviction_group *evg;
+ *rulep = NULL;
if (!table->eviction_fields) {
- return NULL;
+ return false;
}
/* In the common case, the outer and inner loops here will each be entered
struct rule *rule;
HEAP_FOR_EACH (rule, evg_node, &evg->rules) {
- if (rule->evictable) {
- return rule;
+ if (!ovs_rwlock_trywrlock(&rule->evict)) {
+ *rulep = rule;
+ return true;
}
}
}
- return NULL;
+ return false;
}
/* Searches 'ofproto' for tables that have more flows than their configured
break;
}
- rule = choose_rule_to_evict(table);
- if (!rule || rule->pending) {
+ if (!choose_rule_to_evict(table, &rule)) {
+ break;
+ }
+
+ if (rule->pending) {
+ ovs_rwlock_unlock(&rule->evict);
break;
}
/* Removes 'rule' from the oftable that contains it. */
static void
oftable_remove_rule__(struct ofproto *ofproto, struct classifier *cls,
- struct rule *rule) OVS_REQ_WRLOCK(cls->rwlock)
+ struct rule *rule)
+ OVS_REQ_WRLOCK(cls->rwlock) OVS_RELEASES(rule->evict)
{
classifier_remove(cls, &rule->cr);
if (rule->meter_id) {
if (!list_is_empty(&rule->meter_list_node)) {
list_remove(&rule->meter_list_node);
}
+ ovs_rwlock_unlock(&rule->evict);
}
static void
if (new) {
oftable_replace_rule(new);
} else {
+ ovs_rwlock_wrlock(&old->evict);
oftable_remove_rule(old);
}
}
VLOG_DEFINE_THIS_MODULE(tunnel);
+/* skb mark used for IPsec tunnel packets */
+#define IPSEC_MARK 1
+
struct tnl_match {
ovs_be64 in_key;
ovs_be32 ip_src;
ovs_be32 ip_dst;
odp_port_t odp_port;
- uint32_t skb_mark;
+ uint32_t pkt_mark;
bool in_key_flow;
bool ip_src_flow;
bool ip_dst_flow;
tnl_port->match.ip_dst = cfg->ip_dst;
tnl_port->match.ip_src_flow = cfg->ip_src_flow;
tnl_port->match.ip_dst_flow = cfg->ip_dst_flow;
- tnl_port->match.skb_mark = cfg->ipsec ? IPSEC_MARK : 0;
+ tnl_port->match.pkt_mark = cfg->ipsec ? IPSEC_MARK : 0;
tnl_port->match.in_key_flow = cfg->in_key_flow;
tnl_port->match.odp_port = odp_port;
match.ip_src = flow->tunnel.ip_dst;
match.ip_dst = flow->tunnel.ip_src;
match.in_key = flow->tunnel.tun_id;
- match.skb_mark = flow->skb_mark;
+ match.pkt_mark = flow->pkt_mark;
ovs_rwlock_rdlock(&rwlock);
tnl_port = tnl_find(&match);
return ofport;
}
+static bool
+tnl_ecn_ok(const struct flow *base_flow, struct flow *flow)
+{
+ if (is_ip_any(base_flow)
+ && (flow->tunnel.ip_tos & IP_ECN_MASK) == IP_ECN_CE) {
+ if ((base_flow->nw_tos & IP_ECN_MASK) == IP_ECN_NOT_ECT) {
+ VLOG_WARN_RL(&rl, "dropping tunnel packet marked ECN CE, but the"
+ " inner packet is not ECN capable");
+ return false;
+ } else {
+ /* Set the ECN CE value in the tunneled packet. */
+ flow->nw_tos |= IP_ECN_CE;
+ }
+ }
+
+ return true;
+}
+
+/* Should be called at the beginning of action translation to initialize
+ * wildcards and perform any processing needed for packets received on a
+ * tunnel port.
+ *
+ * Returns false if the packet must be dropped. */
+bool
+tnl_xlate_init(const struct flow *base_flow, struct flow *flow,
+ struct flow_wildcards *wc)
+{
+ if (tnl_port_should_receive(flow)) {
+ memset(&wc->masks.tunnel, 0xff, sizeof wc->masks.tunnel);
+ memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark);
+
+ if (!tnl_ecn_ok(base_flow, flow)) {
+ return false;
+ }
+
+ flow->pkt_mark &= ~IPSEC_MARK;
+ }
+
+ return true;
+}
+
/* Given that 'flow' should be output to the ofport corresponding to
* 'tnl_port', updates 'flow''s tunnel headers and returns the actual datapath
* port that the output should happen on. May return ODPP_NONE if the output
if (!cfg->ip_dst_flow) {
flow->tunnel.ip_dst = tnl_port->match.ip_dst;
}
- flow->skb_mark = tnl_port->match.skb_mark;
+ flow->pkt_mark = tnl_port->match.pkt_mark;
if (!cfg->out_key_flow) {
flow->tunnel.tun_id = cfg->out_key;
}
ds_put_format(ds, ", dp port=%"PRIu32, match->odp_port);
- ds_put_format(ds, ", skb mark=%"PRIu32, match->skb_mark);
+ ds_put_format(ds, ", pkt mark=%"PRIu32, match->pkt_mark);
}
static void
#include <stdint.h>
#include "flow.h"
-/* skb mark used for IPsec tunnel packets */
-#define IPSEC_MARK 1
-
/* Tunnel port emulation layer.
*
* These functions emulate tunnel virtual ports based on the outer
void tnl_port_del(const struct ofport_dpif *);
const struct ofport_dpif *tnl_port_receive(const struct flow *);
+bool tnl_xlate_init(const struct flow *base_flow, struct flow *flow,
+ struct flow_wildcards *);
odp_port_t tnl_port_send(const struct ofport_dpif *, struct flow *,
struct flow_wildcards *wc);
-/ovsdbmonitor.py
+/ovsdbmonitor
OVS_VSWITCHD_STOP
AT_CLEANUP
+
+AT_SETUP([bfd - check_tnl_key])
+OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=gre \
+ options:remote_ip=2.2.2.2 options:key=1 ofport_request=1 -- \
+ set interface p1 bfd:enable=true -- \
+ set bridge br0 fail-mode=standalone])
+
+# By default check_tnl_key is false, so a BFD packet arriving with tun_id=1
+# should still be processed as a BFD packet.
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout])
+# check that the packet should be handled as BFD packet.
+AT_CHECK([tail -2 stdout], [0], [dnl
+This flow is handled by the userspace slow path because it:
+ - Consists of BFD packets.
+], [])
+
+# Turn on check_tnl_key.
+AT_CHECK([ovs-vsctl set interface p1 bfd:check_tnl_key=true])
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout])
+# check that the packet should be handled as normal packet.
+AT_CHECK([tail -1 stdout], [0],[dnl
+Datapath actions: 100
+], [])
+
+# Set the tunnel key to 0.
+AT_CHECK([ovs-vsctl set interface p1 options:key=0])
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x0,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout])
+# check that the packet should be handled as BFD packet.
+AT_CHECK([tail -2 stdout], [0], [dnl
+This flow is handled by the userspace slow path because it:
+ - Consists of BFD packets.
+], [])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
# Trace some packets arriving. The particular packets don't matter.
for i in 1 2 3 4 5 6 7 8 9 10; do
ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9)'
+ ovs-appctl time/warp 10
done
# Check for the learning entry.
+ovs-appctl time/warp 1000
AT_CHECK([ovs-ofctl dump-flows br0 | ofctl_strip | sort], [0],
[[ n_packets=1, n_bytes=60, actions=load:0x3->NXM_NX_REG0[0..15],learn(table=0,priority=65535,NXM_OF_ETH_SRC[],NXM_OF_VLAN_TCI[0..11],output:NXM_NX_REG0[0..15]),output:2
- priority=65535,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:05 actions=output:3
+ n_packets=9, n_bytes=540, priority=65535,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:05 actions=output:3
NXST_FLOW reply:
]])
AT_CHECK([head -n 3 stdout], [0], [dnl
Bridge: br0
Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
-Flow: skb_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
+Flow: pkt_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
])
# Test command: ofproto/trace dp_name odp_flow packet
AT_CHECK([head -n 3 stdout], [0], [dnl
Bridge: br0
Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
-Flow: skb_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
+Flow: pkt_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
])
# Test command: ofproto/trace br_name br_flow packet
AT_CHECK([ovs-appctl ofproto/trace br0 \
- "in_port=2,skb_priority=2,skb_mark=1" "$pkt2to1"], [0], [stdout],[stderr])
+ "in_port=2,skb_priority=2,pkt_mark=1" "$pkt2to1"], [0], [stdout],[stderr])
AT_CHECK([tail -1 stdout], [0], [dnl
-Datapath actions: set(skb_mark(0)),1
+Datapath actions: 1
])
AT_CHECK([head -n 2 stdout], [0], [dnl
Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
-Flow: skb_mark=0x1,skb_priority=0x2,arp,metadata=0,in_port=2,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
+Flow: pkt_mark=0x1,skb_priority=0x2,arp,metadata=0,in_port=2,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
])
OVS_VSWITCHD_STOP
table=0 in_port=1 actions=load:2->NXM_NX_REG0[[0..15]],learn(table=1,priority=65535,NXM_OF_ETH_SRC[[]],NXM_OF_VLAN_TCI[[0..11]],output:NXM_NX_REG0[[0..15]]),output:2
])
AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
-AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
-AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+# We send each packet twice because the first packet in each flow causes the
+# flow table to change and thus revalidations, which (depending on timing)
+# can keep a megaflow from being installed. The revalidations are done by
+# the second iteration, allowing the flows to be installed.
+for i in 1 2; do
+ AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+ AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+ ovs-appctl time/warp 100
+done
dnl The original flow is missing due to a revalidation.
AT_CHECK([ovs-appctl dpif/dump-megaflows br0 | STRIP_XOUT], [0], [dnl
skb_priority=0,ip,in_port=1,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:09,nw_frag=no, n_subfacets:1, used:0.0s, Datapath actions: <del>
AT_CAPTURE_FILE([monitor.log])
# Send a packet-out with a load action to set some metadata, and forward to controller
-AT_CHECK([ovs-ofctl packet-out br0 controller 'load(0xfafafafa5a5a5a5a->OXM_OF_METADATA[[0..63]]), controller' '0001020304050010203040501234'])
+AT_CHECK([ovs-ofctl packet-out br0 controller 'load(0xfafafafa5a5a5a5a->OXM_OF_METADATA[[0..63]]), load(0xaa->NXM_NX_PKT_MARK[[]]), controller' '0001020304050010203040501234'])
# Stop the monitor and check its output.
ovs-appctl -t ovs-ofctl ofctl/barrier
ovs-appctl -t ovs-ofctl exit
AT_CHECK([sed 's/ (xid=0x[[0-9a-fA-F]]*)//' monitor.log], [0], [dnl
-NXT_PACKET_IN: total_len=14 in_port=CONTROLLER metadata=0xfafafafa5a5a5a5a (via action) data_len=14 (unbuffered)
+NXT_PACKET_IN: total_len=14 in_port=CONTROLLER metadata=0xfafafafa5a5a5a5a pkt_mark=0xaa (via action) data_len=14 (unbuffered)
metadata=0,in_port=0,vlan_tci=0x0000,dl_src=00:10:20:30:40:50,dl_dst=00:01:02:03:04:05,dl_type=0x1234
OFPT_BARRIER_REPLY:
])
'metadata=0 NXM,OXM' \
'in_port=1 any' \
'skb_priority=0 none' \
- 'skb_mark=1 none' \
+ 'pkt_mark=1 NXM,OXM' \
'reg0=0 NXM,OXM' \
'reg1=1 NXM,OXM' \
'reg2=2 NXM,OXM' \
AT_CLEANUP
-AT_SETUP([ovs-ofctl parse-flows (skb_mark and skb_priority)])
+AT_SETUP([ovs-ofctl parse-flows (skb_priority)])
AT_DATA([flows.txt], [[
-skb_mark=0x12345678,skb_priority=0x12341234,tcp,tp_src=123,actions=flood
+skb_priority=0x12341234,tcp,tp_src=123,actions=flood
]])
AT_CHECK([ovs-ofctl parse-flows flows.txt
# comment
tcp,tp_src=123,actions=flood
in_port=LOCAL dl_vlan=9 dl_src=00:0A:E4:25:6B:B0 actions=drop
+pkt_mark=0xbb,actions=set_field:0xaa->pkt_mark
udp dl_vlan_pcp=7 idle_timeout=5 actions=strip_vlan output:0
tcp,nw_src=192.168.0.3,tp_dst=80 actions=set_queue:37,output:1
udp,nw_src=192.168.0.3,tp_dst=53 actions=pop_queue,output:1
chosen protocol: NXM+table_id
NXT_FLOW_MOD: ADD table:255 tcp,tp_src=123 actions=FLOOD
NXT_FLOW_MOD: ADD table:255 in_port=LOCAL,dl_vlan=9,dl_src=00:0a:e4:25:6b:b0 actions=drop
+NXT_FLOW_MOD: ADD table:255 pkt_mark=0xbb actions=load:0xaa->NXM_NX_PKT_MARK[]
NXT_FLOW_MOD: ADD table:255 udp,dl_vlan_pcp=7 idle:5 actions=strip_vlan,output:0
NXT_FLOW_MOD: ADD table:255 tcp,nw_src=192.168.0.3,tp_dst=80 actions=set_queue:37,output:1
NXT_FLOW_MOD: ADD table:255 udp,nw_src=192.168.0.3,tp_dst=53 actions=pop_queue,output:1
When a packet enters an OpenFlow switch, all of the registers are set
to 0. Only explicit Nicira extension actions change register values.
.
+.IP \fBpkt_mark=\fIvalue\fR[\fB/\fImask\fR]
+Matches packet metadata mark \fIvalue\fR either exactly or with optional
+\fImask\fR. The mark is associated data that may be passed into other
+system components in order to facilitate interaction between subsystems.
+On Linux this corresponds to the skb mark but the exact implementation is
+platform-dependent.
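+For example, \fBpkt_mark=0xbb\fR matches only packets whose mark is exactly
+0xbb, while \fBpkt_mark=0x1/0x1\fR matches any packet whose mark has the
+low\-order bit set.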
+.
.PP
Defining IPv6 flows (those with \fBdl_type\fR equal to 0x86dd) requires
support for NXM. The following shorthand notations are available for
Open Flow 1.2 and above.)
.
.IP
-Example: \fBset_field:fe80:0123:4567:890a:a6ba:dbff:fefe:59fa\->ipv6_src\fR
+Example: \fBset_field:00:11:22:33:44:55->eth_src\fR.
.
.IP "\fBmultipath(\fIfields\fB, \fIbasis\fB, \fIalgorithm\fB, \fIn_links\fB, \fIarg\fB, \fIdst\fB[\fIstart\fB..\fIend\fB])\fR"
Hashes \fIfields\fR using \fIbasis\fR as a universal hash parameter,