From: Giuseppe Lettieri
Date: Thu, 15 Aug 2013 18:43:14 +0000 (+0200)
Subject: Merge branch 'mainstream'
X-Git-Tag: sliver-openvswitch-2.0.90-1~27
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=fa04edcedfe5285fd8ad3a4d70fecb38df18293d;hp=82377f4d4ab6de76318f649190d0301f117b270e;p=sliver-openvswitch.git

Merge branch 'mainstream'

Conflicts:
	lib/netdev.c
---

diff --git a/FAQ b/FAQ
index 810803e6d..75d90076b 100644
--- a/FAQ
+++ b/FAQ
@@ -148,7 +148,7 @@ A: The following table lists the Linux kernel versions against which the
        1.9.x      2.6.18 to 3.8
        1.10.x     2.6.18 to 3.8
        1.11.x     2.6.18 to 3.8
-       1.12.x     2.6.18 to 3.9
+       1.12.x     2.6.18 to 3.10
 
 Open vSwitch userspace should also work with the Linux kernel module
 built into Linux 3.3 and later.

diff --git a/NEWS b/NEWS
index f9953ab36..1246383de 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,9 @@ v1.12.0 - xx xxx xxxx
     * New support for matching outer source and destination IP address
       of tunneled packets, for tunnel ports configured with the newly
       added "remote_ip=flow" and "local_ip=flow" options.
+    * Support for matching on metadata 'pkt_mark' for interacting with
+      other system components. On Linux this corresponds to the skb
+      mark.
     - The Interface table in the database has a new "ifindex" column to
       report the interface's OS-assigned ifindex.
     - New "check-oftest" Makefile target for running OFTest against Open
@@ -19,7 +22,7 @@ v1.12.0 - xx xxx xxxx
       through database paths (e.g. Private key option with the database
       name should look like "--private-key=db:Open_vSwitch,SSL,private_key").
     - Added ovs-dev.py, a utility script helpful for Open vSwitch developers.
-    - Support for Linux kernels up to 3.9
+    - Support for Linux kernels up to 3.10
     - ovs-ofctl:
       * New "ofp-parse" for printing OpenFlow messages read from a file.
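
Both hunks raise the supported-kernel ceiling from 3.9 to 3.10. The build-time
guard lives in datapath/datapath.c; its hunk later in this merge moves the
cutoff to KERNEL_VERSION(3,11,0) but leaves the #error text saying "after
3.9". A minimal sketch of the idiom, with the message matching the new bound:

	#include <linux/version.h>

	#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) || \
	    LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
	#error Kernels before 2.6.18 or after 3.10 are not supported.
	#endif
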
diff --git a/acinclude.m4 b/acinclude.m4 index 6033bfa38..73ee5ce30 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -271,6 +271,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/net/checksum.h], [csum_replace4]) OVS_GREP_IFELSE([$KSRC/include/net/checksum.h], [csum_unfold]) + OVS_GREP_IFELSE([$KSRC/include/net/genetlink.h], [parallel_ops]) OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [NLA_NUL_STRING]) OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_get_be16]) OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_put_be16]) diff --git a/datapath/Modules.mk b/datapath/Modules.mk index 2ce888894..ccf4dfac0 100644 --- a/datapath/Modules.mk +++ b/datapath/Modules.mk @@ -12,7 +12,6 @@ openvswitch_sources = \ datapath.c \ dp_notify.c \ flow.c \ - tunnel.c \ vlan.c \ vport.c \ vport-gre.c \ @@ -26,7 +25,6 @@ openvswitch_headers = \ compat.h \ datapath.h \ flow.h \ - tunnel.h \ vlan.h \ vport.h \ vport-internal_dev.h \ diff --git a/datapath/actions.c b/datapath/actions.c index 0a2def677..2c09d57a8 100644 --- a/datapath/actions.c +++ b/datapath/actions.c @@ -100,7 +100,7 @@ static int pop_vlan(struct sk_buff *skb) if (unlikely(err)) return err; - __vlan_hwaccel_put_tag(skb, ntohs(tci)); + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci)); return 0; } @@ -112,7 +112,7 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla /* push down current VLAN tag */ current_tag = vlan_tx_tag_get(skb); - if (!__vlan_put_tag(skb, current_tag)) + if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) return -ENOMEM; if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) @@ -120,7 +120,7 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla + (2 * ETH_ALEN), VLAN_HLEN, 0)); } - __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); + __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); return 0; } diff --git a/datapath/compat.h b/datapath/compat.h index 4dfd1926d..8457dbf31 100644 --- a/datapath/compat.h +++ b/datapath/compat.h @@ -19,7 +19,12 @@ #ifndef COMPAT_H #define COMPAT_H 1 +#include +#include #include +#include +#include + #ifndef HAVE_NLA_NUL_STRING static inline int CHECK_NUL_STRING(struct nlattr *attr, int maxlen) @@ -61,6 +66,13 @@ static inline void skb_clear_rxhash(struct sk_buff *skb) #define SET_NETNSOK .netnsok = true, #endif +#ifdef HAVE_PARALLEL_OPS +#define SET_PARALLEL_OPS .parallel_ops = true, +#else +#define SET_PARALLEL_OPS +#endif + + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) #ifdef CONFIG_NETFILTER static inline u32 skb_get_mark(struct sk_buff *skb) @@ -106,4 +118,41 @@ static inline void skb_set_mark(struct sk_buff *skb, u32 mark) #define inet_sport(sk) (inet_sk(sk)->inet_sport) #endif +static inline struct rtable *find_route(struct net *net, + __be32 *saddr, __be32 daddr, + u8 ipproto, u8 tos, u32 skb_mark) +{ + struct rtable *rt; + /* Tunnel configuration keeps DSCP part of TOS bits, But Linux + * router expect RT_TOS bits only. 
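
The new SET_PARALLEL_OPS macro above expands to ".parallel_ops = true," only
when the acinclude.m4 check finds parallel_ops in the kernel's genetlink.h.
Each generic netlink family then opts in unconditionally, exactly as the
datapath.c hunks below do. A sketch; the family shown here is an illustrative
stand-in, and the .id/.hdrsize values are assumptions typical of this kernel
era, not part of the patch:

	static struct genl_family example_genl_family = {
		.id = GENL_ID_GENERATE,   /* assumed; not from this patch */
		.hdrsize = 0,
		.name = "example",        /* illustrative name */
		.version = 1,
		.maxattr = 0,
		SET_NETNSOK               /* no trailing commas: the macros */
		SET_PARALLEL_OPS          /* supply their own when enabled  */
	};
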
*/ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) + struct flowi fl = { .nl_u = { .ip4_u = { + .daddr = daddr, + .saddr = *saddr, +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) + .fwmark = skb_mark, +#endif + .tos = RT_TOS(tos) } }, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) + .mark = skb_mark, +#endif + .proto = ipproto }; + + if (unlikely(ip_route_output_key(net, &rt, &fl))) + return ERR_PTR(-EADDRNOTAVAIL); + *saddr = fl.nl_u.ip4_u.saddr; + return rt; +#else + struct flowi4 fl = { .daddr = daddr, + .saddr = *saddr, + .flowi4_tos = RT_TOS(tos), + .flowi4_mark = skb_mark, + .flowi4_proto = ipproto }; + + rt = ip_route_output_key(net, &fl); + *saddr = fl.saddr; + return rt; +#endif +} #endif /* compat.h */ diff --git a/datapath/datapath.c b/datapath/datapath.c index 190b61b9e..48f17c091 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -58,12 +58,11 @@ #include "datapath.h" #include "flow.h" #include "vlan.h" -#include "tunnel.h" #include "vport-internal_dev.h" #include "vport-netdev.h" #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) || \ - LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) #error Kernels before 2.6.18 or after 3.9 are not supported by this version of Open vSwitch. #endif @@ -280,6 +279,7 @@ static struct genl_family dp_packet_genl_family = { .version = OVS_PACKET_VERSION, .maxattr = OVS_PACKET_ATTR_MAX, SET_NETNSOK + SET_PARALLEL_OPS }; int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, @@ -1010,6 +1010,7 @@ static struct genl_family dp_flow_genl_family = { .version = OVS_FLOW_VERSION, .maxattr = OVS_FLOW_ATTR_MAX, SET_NETNSOK + SET_PARALLEL_OPS }; static struct genl_multicast_group ovs_dp_flow_multicast_group = { @@ -1589,6 +1590,7 @@ static struct genl_family dp_datapath_genl_family = { .version = OVS_DATAPATH_VERSION, .maxattr = OVS_DP_ATTR_MAX, SET_NETNSOK + SET_PARALLEL_OPS }; static struct genl_multicast_group ovs_dp_datapath_multicast_group = { @@ -1968,6 +1970,7 @@ static struct genl_family dp_vport_genl_family = { .version = OVS_VPORT_VERSION, .maxattr = OVS_VPORT_ATTR_MAX, SET_NETNSOK + SET_PARALLEL_OPS }; struct genl_multicast_group ovs_dp_vport_multicast_group = { diff --git a/datapath/datapath.h b/datapath/datapath.h index eda87fdd3..064211dbc 100644 --- a/datapath/datapath.h +++ b/datapath/datapath.h @@ -29,7 +29,6 @@ #include "checksum.h" #include "compat.h" #include "flow.h" -#include "tunnel.h" #include "vlan.h" #include "vport.h" diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c index ec573a51f..d5308933f 100644 --- a/datapath/dp_notify.c +++ b/datapath/dp_notify.c @@ -18,6 +18,8 @@ #include #include +#include +#include #include "datapath.h" #include "vport-internal_dev.h" diff --git a/datapath/flow.h b/datapath/flow.h index 5d1578363..d8277b5b2 100644 --- a/datapath/flow.h +++ b/datapath/flow.h @@ -58,6 +58,22 @@ struct ovs_key_ipv4_tunnel { u8 ipv4_ttl; }; +static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, + const struct iphdr *iph, __be64 tun_id, + __be16 tun_flags) +{ + tun_key->tun_id = tun_id; + tun_key->ipv4_src = iph->saddr; + tun_key->ipv4_dst = iph->daddr; + tun_key->ipv4_tos = iph->tos; + tun_key->ipv4_ttl = iph->ttl; + tun_key->tun_flags = tun_flags; + + /* clear struct padding. */ + memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); +} + struct sw_flow_key { struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. 
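
ovs_flow_tun_key_init() above is the former tnl_tun_key_init() from the
deleted datapath/tunnel.h, renamed and moved so the tunnel vports can share
it. Receive paths fill the key straight from the outer IP header, as the
gre_rcv()/lisp_rcv()/vxlan_rcv() hunks below show; 'key' stands for the
tunnel ID already extracted by the caller:

	struct ovs_key_ipv4_tunnel tun_key;

	ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, TUNNEL_KEY);
	ovs_vport_receive(vport, skb, &tun_key);
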
*/ struct { diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk index edaeabbd9..5f9c79239 100644 --- a/datapath/linux/Modules.mk +++ b/datapath/linux/Modules.mk @@ -48,6 +48,7 @@ openvswitch_headers += \ linux/compat/include/linux/mutex.h \ linux/compat/include/linux/net.h \ linux/compat/include/linux/netdevice.h \ + linux/compat/include/linux/netdev_features.h \ linux/compat/include/linux/netfilter_bridge.h \ linux/compat/include/linux/netfilter_ipv4.h \ linux/compat/include/linux/netlink.h \ diff --git a/datapath/linux/compat/include/linux/if_vlan.h b/datapath/linux/compat/include/linux/if_vlan.h index b8b1961f7..730175be5 100644 --- a/datapath/linux/compat/include/linux/if_vlan.h +++ b/datapath/linux/compat/include/linux/if_vlan.h @@ -5,6 +5,7 @@ #include #include_next +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) /* * The behavior of __vlan_put_tag() has changed over time: * @@ -19,8 +20,9 @@ * to avoid the need to guess whether the version in the kernel tree is * acceptable. */ -#define __vlan_put_tag rpl_vlan_put_tag -static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci) +#define __vlan_put_tag(skb, proto, tag) rpl__vlan_put_tag(skb, tag) + +static inline struct sk_buff *rpl__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci) { struct vlan_ethhdr *veth; @@ -45,6 +47,16 @@ static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci) return skb; } +static inline struct sk_buff *rpl___vlan_hwaccel_put_tag(struct sk_buff *skb, + __be16 vlan_proto, + u16 vlan_tci) +{ + return __vlan_hwaccel_put_tag(skb, vlan_tci); +} + +#define __vlan_hwaccel_put_tag rpl___vlan_hwaccel_put_tag + +#endif /* All of these were introduced in a single commit preceding 2.6.33, so * presumably all of them or none of them are present. */ diff --git a/datapath/linux/compat/include/linux/netdev_features.h b/datapath/linux/compat/include/linux/netdev_features.h new file mode 100644 index 000000000..0259413d9 --- /dev/null +++ b/datapath/linux/compat/include/linux/netdev_features.h @@ -0,0 +1,12 @@ +#ifndef __LINUX_NETDEV_FEATURES_WRAPPER_H +#define __LINUX_NETDEV_FEATURES_WRAPPER_H + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) +#include_next +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) +#define NETIF_F_HW_VLAN_CTAG_TX NETIF_F_HW_VLAN_TX +#endif + +#endif diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h index 3f66d3a32..f62bd6de0 100644 --- a/datapath/linux/compat/include/linux/netdevice.h +++ b/datapath/linux/compat/include/linux/netdevice.h @@ -190,16 +190,21 @@ static inline struct sk_buff *__skb_gso_segment(struct sk_buff *skb, #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) + +/* XEN dom0 networking assumes dev->master is bond device + * and it tries to access bond private structure from dev->master + * ptr on receive path. This causes panic. Therefore it is better + * not to backport this API. 
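
The if_vlan.h wrapper above lets every caller use the three-argument
__vlan_put_tag()/__vlan_hwaccel_put_tag() signatures introduced in Linux
3.10; on older kernels the macros simply drop the protocol argument. Caller
side, as the vlan.h and vport-netdev.c hunks below now read:

	skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
	if (unlikely(!skb))
		return -ENOMEM;   /* __vlan_put_tag() consumed the skb */
	vlan_set_tci(skb, 0);
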
+ **/ static inline int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return netdev_set_master(dev, upper_dev); + return 0; } static inline void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - netdev_set_master(dev, NULL); } #endif diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h index bd0c3d42c..5f46aed68 100644 --- a/datapath/linux/compat/include/net/gre.h +++ b/datapath/linux/compat/include/net/gre.h @@ -21,41 +21,13 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version); #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) struct gre_base_hdr { __be16 flags; __be16 protocol; }; #define GRE_HEADER_SECTION 4 -#define MAX_GRE_PROTO_PRIORITY 255 -struct gre_cisco_protocol { - int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi); - u8 priority; -}; - -#define gre_build_header rpl_gre_build_header -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len); - -#define gre_handle_offloads rpl_gre_handle_offloads -struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum); - -int gre_cisco_register(struct gre_cisco_protocol *proto); -int gre_cisco_unregister(struct gre_cisco_protocol *proto); - -static inline int ip_gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags & TUNNEL_CSUM) - addend += 4; - if (o_flags & TUNNEL_KEY) - addend += 4; - if (o_flags & TUNNEL_SEQ) - addend += 4; - return addend; -} - static inline __be16 gre_flags_to_tnl_flags(__be16 flags) { __be16 tflags = 0; @@ -99,4 +71,36 @@ static inline __be16 tnl_flags_to_gre_flags(__be16 tflags) return flags; } +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) */ + +#define MAX_GRE_PROTO_PRIORITY 255 +struct gre_cisco_protocol { + int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi); + u8 priority; +}; + +int gre_cisco_register(struct gre_cisco_protocol *proto); +int gre_cisco_unregister(struct gre_cisco_protocol *proto); + +#define gre_build_header rpl_gre_build_header +void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len); + +#define gre_handle_offloads rpl_gre_handle_offloads +struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum); + +static inline int ip_gre_calc_hlen(__be16 o_flags) +{ + int addend = 4; + + if (o_flags & TUNNEL_CSUM) + addend += 4; + if (o_flags & TUNNEL_KEY) + addend += 4; + if (o_flags & TUNNEL_SEQ) + addend += 4; + return addend; +} + + #endif diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h index ad17c9d9a..14f55d068 100644 --- a/datapath/linux/compat/include/net/ip_tunnels.h +++ b/datapath/linux/compat/include/net/ip_tunnels.h @@ -31,20 +31,6 @@ struct tnl_ptk_info { #define PACKET_RCVD 0 #define PACKET_REJECT 1 -static inline void tunnel_ip_select_ident(struct sk_buff *skb, - const struct iphdr *old_iph, - struct dst_entry *dst) -{ - struct iphdr *iph = ip_hdr(skb); - - /* Use inner packet iph-id if possible. 
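
ip_gre_calc_hlen(), kept above but moved outside the pre-3.10 compatibility
guard, sizes the GRE header from the tunnel flags. A worked example, not part
of the patch:

	int hlen;

	hlen = ip_gre_calc_hlen(TUNNEL_KEY | TUNNEL_SEQ);
	/* 4 (base) + 4 (key) + 4 (seq) = 12 bytes */
	hlen = ip_gre_calc_hlen(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
	/* + 4 (csum) = 16 bytes */
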
*/ - if (skb->protocol == htons(ETH_P_IP) && old_iph->id) - iph->id = old_iph->id; - else - __ip_select_ident(iph, dst, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); -} - int iptunnel_xmit(struct net *net, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c index 03c47a22f..01cc2fbc6 100644 --- a/datapath/linux/compat/ip_tunnels_core.c +++ b/datapath/linux/compat/ip_tunnels_core.c @@ -68,9 +68,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - tunnel_ip_select_ident(skb, - (const struct iphdr *)skb_inner_network_header(skb), - &rt_dst(rt)); + __ip_select_ident(iph, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1); err = ip_local_out(skb); if (unlikely(net_xmit_eval(err))) diff --git a/datapath/tunnel.c b/datapath/tunnel.c deleted file mode 100644 index bd63da555..000000000 --- a/datapath/tunnel.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "checksum.h" -#include "compat.h" -#include "datapath.h" -#include "tunnel.h" -#include "vlan.h" -#include "vport.h" - -/** - * ovs_tnl_rcv - ingress point for generic tunnel code - * - * @vport: port this packet was received on - * @skb: received packet - * @tos: ToS from encapsulating IP packet, used to copy ECN bits - * - * Must be called with rcu_read_lock. - * - * Packets received by this function are in the following state: - * - skb->data points to the inner Ethernet header. - * - The inner Ethernet header is in the linear data area. - * - skb->csum does not include the inner Ethernet header. - * - The layer pointers are undefined. - */ -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key) -{ - struct ethhdr *eh; - - skb_reset_mac_header(skb); - eh = eth_hdr(skb); - - if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) - skb->protocol = eh->h_proto; - else - skb->protocol = htons(ETH_P_802_2); - - skb_dst_drop(skb); - nf_reset(skb); - skb_clear_rxhash(skb); - secpath_reset(skb); - vlan_set_tci(skb, 0); - - if (unlikely(compute_ip_summed(skb, false))) { - kfree_skb(skb); - return; - } - - ovs_vport_receive(vport, skb, tun_key); -} - -struct rtable *find_route(struct net *net, - __be32 *saddr, __be32 daddr, u8 ipproto, - u8 tos, u32 skb_mark) -{ - struct rtable *rt; - /* Tunnel configuration keeps DSCP part of TOS bits, But Linux - * router expect RT_TOS bits only. 
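
find_route() is deleted from tunnel.c here only because it moved verbatim
into datapath/compat.h as an inline (see the compat.h hunk earlier in this
merge). The caller passes the tunnel endpoints plus the skb mark and receives
the route and the chosen source address, as in ovs_tnl_send() below:

	__be32 saddr = OVS_CB(skb)->tun_key->ipv4_src;
	struct rtable *rt;

	rt = find_route(ovs_dp_get_net(vport->dp), &saddr,
			OVS_CB(skb)->tun_key->ipv4_dst, ipproto,
			OVS_CB(skb)->tun_key->ipv4_tos, skb_get_mark(skb));
	if (IS_ERR(rt))
		return PTR_ERR(rt);
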
*/ - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) - struct flowi fl = { .nl_u = { .ip4_u = { - .daddr = daddr, - .saddr = *saddr, -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) - .fwmark = skb_mark, -#endif - .tos = RT_TOS(tos) } }, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) - .mark = skb_mark, -#endif - .proto = ipproto }; - - if (unlikely(ip_route_output_key(net, &rt, &fl))) - return ERR_PTR(-EADDRNOTAVAIL); - *saddr = fl.nl_u.ip4_u.saddr; - return rt; -#else - struct flowi4 fl = { .daddr = daddr, - .saddr = *saddr, - .flowi4_tos = RT_TOS(tos), - .flowi4_mark = skb_mark, - .flowi4_proto = ipproto }; - - rt = ip_route_output_key(net, &fl); - *saddr = fl.saddr; - return rt; -#endif -} - -static bool need_linearize(const struct sk_buff *skb) -{ - int i; - - if (unlikely(skb_shinfo(skb)->frag_list)) - return true; - - /* - * Generally speaking we should linearize if there are paged frags. - * However, if all of the refcounts are 1 we know nobody else can - * change them from underneath us and we can skip the linearization. - */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1)) - return true; - - return false; -} - -static struct sk_buff *handle_offloads(struct sk_buff *skb) -{ - int err; - - forward_ip_summed(skb, true); - - if (skb_is_gso(skb)) { - struct sk_buff *nskb; - char cb[sizeof(skb->cb)]; - - memcpy(cb, skb->cb, sizeof(cb)); - - nskb = __skb_gso_segment(skb, 0, false); - if (IS_ERR(nskb)) { - err = PTR_ERR(nskb); - goto error; - } - - consume_skb(skb); - skb = nskb; - while (nskb) { - memcpy(nskb->cb, cb, sizeof(cb)); - nskb = nskb->next; - } - } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) { - /* Pages aren't locked and could change at any time. - * If this happens after we compute the checksum, the - * checksum will be wrong. We linearize now to avoid - * this problem. - */ - if (unlikely(need_linearize(skb))) { - err = __skb_linearize(skb); - if (unlikely(err)) - goto error; - } - - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } - - set_ip_summed(skb, OVS_CSUM_NONE); - - return skb; - -error: - return ERR_PTR(err); -} - -/* Compute source UDP port for outgoing packet. - * Currently we use the flow hash. - */ -u16 ovs_tnl_get_src_port(struct sk_buff *skb) -{ - int low; - int high; - unsigned int range; - struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key; - u32 hash = jhash2((const u32 *)pkt_key, - sizeof(*pkt_key) / sizeof(u32), 0); - - inet_get_local_port_range(&low, &high); - range = (high - low) + 1; - return (((u64) hash * range) >> 32) + low; -} - -int ovs_tnl_send(struct vport *vport, struct sk_buff *skb, - u8 ipproto, int tunnel_hlen, - void (*build_header)(const struct vport *, - struct sk_buff *, - int tunnel_hlen)) -{ - int min_headroom; - struct rtable *rt; - __be32 saddr; - int sent_len = 0; - int err; - struct sk_buff *nskb; - - /* Route lookup */ - saddr = OVS_CB(skb)->tun_key->ipv4_src; - rt = find_route(ovs_dp_get_net(vport->dp), - &saddr, - OVS_CB(skb)->tun_key->ipv4_dst, - ipproto, - OVS_CB(skb)->tun_key->ipv4_tos, - skb_get_mark(skb)); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - tunnel_hlen += sizeof(struct iphdr); - - min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len - + tunnel_hlen - + (vlan_tx_tag_present(skb) ? 
VLAN_HLEN : 0); - - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - /* Offloading */ - nskb = handle_offloads(skb); - if (IS_ERR(nskb)) { - err = PTR_ERR(nskb); - goto err_free_rt; - } - skb = nskb; - - /* Reset SKB */ - nf_reset(skb); - secpath_reset(skb); - skb_dst_drop(skb); - skb_clear_rxhash(skb); - - while (skb) { - struct sk_buff *next_skb = skb->next; - struct iphdr *iph; - int frag_len; - - skb->next = NULL; - - if (unlikely(vlan_deaccel_tag(skb))) - goto next; - - frag_len = skb->len; - skb_push(skb, tunnel_hlen); - skb_reset_network_header(skb); - skb_set_transport_header(skb, sizeof(struct iphdr)); - - if (next_skb) - skb_dst_set(skb, dst_clone(&rt_dst(rt))); - else - skb_dst_set(skb, &rt_dst(rt)); - - /* Push Tunnel header. */ - build_header(vport, skb, tunnel_hlen); - - /* Push IP header. */ - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->protocol = ipproto; - iph->daddr = OVS_CB(skb)->tun_key->ipv4_dst; - iph->saddr = saddr; - iph->tos = OVS_CB(skb)->tun_key->ipv4_tos; - iph->ttl = OVS_CB(skb)->tun_key->ipv4_ttl; - iph->frag_off = OVS_CB(skb)->tun_key->tun_flags & - TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - /* - * Allow our local IP stack to fragment the outer packet even - * if the DF bit is set as a last resort. We also need to - * force selection of an IP ID here with __ip_select_ident(), - * as ip_select_ident() assumes a proper ID is not needed when - * when the DF bit is set. - */ - skb->local_df = 1; - __ip_select_ident(iph, skb_dst(skb), 0); - - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - - err = ip_local_out(skb); - if (unlikely(net_xmit_eval(err))) - goto next; - - sent_len += frag_len; - -next: - skb = next_skb; - } - - return sent_len; - -err_free_rt: - ip_rt_put(rt); -error: - return err; -} diff --git a/datapath/tunnel.h b/datapath/tunnel.h deleted file mode 100644 index 17de7c47a..000000000 --- a/datapath/tunnel.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
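
ovs_tnl_get_src_port(), deleted above and re-added as a private helper in
vport-lisp.c below, scales the 32-bit flow hash into the local ephemeral
port range. Worked numbers, assuming an illustrative range of [32768, 61000]:

	u32 hash = 0x9e3779b9;          /* illustrative flow hash */
	int low = 32768, high = 61000;  /* from inet_get_local_port_range() */
	unsigned int range = (high - low) + 1;           /* 28233 */
	u16 port = (((u64) hash * range) >> 32) + low;
	/* (hash * range) >> 32 lies in [0, range - 1], so port always
	 * falls within [low, high]. */
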
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef TUNNEL_H -#define TUNNEL_H 1 - -#include -#include -#include - -#include "flow.h" -#include "vport.h" - -struct rtable *find_route(struct net *net, - __be32 *saddr, __be32 daddr, u8 ipproto, - u8 tos, u32 skb_mark); - -u16 ovs_tnl_get_src_port(struct sk_buff *skb); - -int ovs_tnl_send(struct vport *vport, struct sk_buff *skb, - u8 ipproto, int tunnel_hlen, - void (*build_header)(const struct vport *, - struct sk_buff *, - int tunnel_hlen)); - -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key); - -static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, - const struct iphdr *iph, __be64 tun_id, - __be16 tun_flags) -{ - tun_key->tun_id = tun_id; - tun_key->ipv4_src = iph->saddr; - tun_key->ipv4_dst = iph->daddr; - tun_key->ipv4_tos = iph->tos; - tun_key->ipv4_ttl = iph->ttl; - tun_key->tun_flags = tun_flags; - - /* clear struct padding. */ - memset((unsigned char*) tun_key + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); -} - -#endif /* TUNNEL_H */ diff --git a/datapath/vlan.h b/datapath/vlan.h index 46d0db356..aee555144 100644 --- a/datapath/vlan.h +++ b/datapath/vlan.h @@ -89,7 +89,7 @@ static inline int vlan_deaccel_tag(struct sk_buff *skb) if (!vlan_tx_tag_present(skb)) return 0; - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb)); if (unlikely(!skb)) return -ENOMEM; diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c index c74f5fccd..5af6dbec7 100644 --- a/datapath/vport-gre.c +++ b/datapath/vport-gre.c @@ -35,10 +35,11 @@ #include #include #include +#include +#include #include #include - #include #include #include @@ -46,7 +47,6 @@ #include #include "datapath.h" -#include "tunnel.h" #include "vport.h" /* Returns the least-significant 32 bits of a __be64. 
*/ @@ -112,7 +112,7 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - tnl_tun_key_init(&tun_key, ip_hdr(skb), key, filter_tnl_flags(tpi->flags)); + ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, filter_tnl_flags(tpi->flags)); ovs_vport_receive(vport, skb, &tun_key); return PACKET_RCVD; @@ -335,17 +335,19 @@ static __be32 be64_get_high32(__be64 x) static int gre64_send(struct vport *vport, struct sk_buff *skb) { - int hlen; + int hlen = GRE_HEADER_SECTION + /* GRE Hdr */ + GRE_HEADER_SECTION + /* GRE Key */ + GRE_HEADER_SECTION; /* GRE SEQ */ __be32 seq; if (unlikely(!OVS_CB(skb)->tun_key)) return -EINVAL; - hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags) - + GRE_HEADER_SECTION; + if (OVS_CB(skb)->tun_key->tun_flags & TUNNEL_CSUM) + hlen += GRE_HEADER_SECTION; seq = be64_get_high32(OVS_CB(skb)->tun_key->tun_id); - return __send(vport, skb, hlen, seq, TUNNEL_SEQ); + return __send(vport, skb, hlen, seq, (TUNNEL_KEY|TUNNEL_SEQ)); } const struct vport_ops ovs_gre64_vport_ops = { diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c index 9ee1c42ed..db55ee0e5 100644 --- a/datapath/vport-internal_dev.c +++ b/datapath/vport-internal_dev.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -188,7 +189,7 @@ static void do_setup(struct net_device *netdev) #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27) netdev->vlan_features = netdev->features; - netdev->features |= NETIF_F_HW_VLAN_TX; + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) diff --git a/datapath/vport-lisp.c b/datapath/vport-lisp.c index 54c10ae44..2f62d1193 100644 --- a/datapath/vport-lisp.c +++ b/datapath/vport-lisp.c @@ -30,13 +30,13 @@ #include #include +#include #include +#include #include "datapath.h" -#include "tunnel.h" #include "vport.h" - /* * LISP encapsulation header: * @@ -160,6 +160,23 @@ static __be64 instance_id_to_tunnel_id(__u8 *iid) #endif } +/* Compute source UDP port for outgoing packet. + * Currently we use the flow hash. + */ +static u16 ovs_tnl_get_src_port(struct sk_buff *skb) +{ + int low; + int high; + unsigned int range; + struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key; + u32 hash = jhash2((const u32 *)pkt_key, + sizeof(*pkt_key) / sizeof(u32), 0); + + inet_get_local_port_range(&low, &high); + range = (high - low) + 1; + return (((u64) hash * range) >> 32) + low; +} + static void lisp_build_header(const struct vport *vport, struct sk_buff *skb, int tunnel_hlen) @@ -189,6 +206,48 @@ static void lisp_build_header(const struct vport *vport, lisph->u2.word2.locator_status_bits = 1; } +/** + * ovs_tnl_rcv - ingress point for generic tunnel code + * + * @vport: port this packet was received on + * @skb: received packet + * @tos: ToS from encapsulating IP packet, used to copy ECN bits + * + * Must be called with rcu_read_lock. + * + * Packets received by this function are in the following state: + * - skb->data points to the inner Ethernet header. + * - The inner Ethernet header is in the linear data area. + * - skb->csum does not include the inner Ethernet header. + * - The layer pointers are undefined. 
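
In the gre64_send() hunk above, the header length is now assembled
explicitly: the base GRE header, the key, and the 32-bit sequence field that
carries the upper half of the tunnel ID each take one GRE_HEADER_SECTION
(4 bytes), and the checksum section is added only when TUNNEL_CSUM is set;
the send now also passes TUNNEL_KEY|TUNNEL_SEQ. Restated, with 'tun_flags'
standing in for OVS_CB(skb)->tun_key->tun_flags:

	int hlen = GRE_HEADER_SECTION    /* GRE header */
		 + GRE_HEADER_SECTION    /* GRE key    */
		 + GRE_HEADER_SECTION;   /* GRE seq    */   /* = 12 */

	if (tun_flags & TUNNEL_CSUM)
		hlen += GRE_HEADER_SECTION;                 /* = 16 */
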
+ */ +static void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, + struct ovs_key_ipv4_tunnel *tun_key) +{ + struct ethhdr *eh; + + skb_reset_mac_header(skb); + eh = eth_hdr(skb); + + if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + skb->protocol = eh->h_proto; + else + skb->protocol = htons(ETH_P_802_2); + + skb_dst_drop(skb); + nf_reset(skb); + skb_clear_rxhash(skb); + secpath_reset(skb); + vlan_set_tci(skb, 0); + + if (unlikely(compute_ip_summed(skb, false))) { + kfree_skb(skb); + return; + } + + ovs_vport_receive(vport, skb, tun_key); +} + /* Called with rcu_read_lock and BH disabled. */ static int lisp_rcv(struct sock *sk, struct sk_buff *skb) { @@ -218,7 +277,7 @@ static int lisp_rcv(struct sock *sk, struct sk_buff *skb) /* Save outer tunnel values */ iph = ip_hdr(skb); - tnl_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); /* Drop non-IP inner packets */ inner_iph = (struct iphdr *)(lisph + 1); @@ -361,6 +420,196 @@ error: return ERR_PTR(err); } +static bool need_linearize(const struct sk_buff *skb) +{ + int i; + + if (unlikely(skb_shinfo(skb)->frag_list)) + return true; + + /* + * Generally speaking we should linearize if there are paged frags. + * However, if all of the refcounts are 1 we know nobody else can + * change them from underneath us and we can skip the linearization. + */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1)) + return true; + + return false; +} + +static struct sk_buff *handle_offloads(struct sk_buff *skb) +{ + int err; + + forward_ip_summed(skb, true); + + + if (skb_is_gso(skb)) { + struct sk_buff *nskb; + char cb[sizeof(skb->cb)]; + + memcpy(cb, skb->cb, sizeof(cb)); + + nskb = __skb_gso_segment(skb, 0, false); + if (IS_ERR(nskb)) { + err = PTR_ERR(nskb); + goto error; + } + + consume_skb(skb); + skb = nskb; + while (nskb) { + memcpy(nskb->cb, cb, sizeof(cb)); + nskb = nskb->next; + } + } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) { + /* Pages aren't locked and could change at any time. + * If this happens after we compute the checksum, the + * checksum will be wrong. We linearize now to avoid + * this problem. + */ + if (unlikely(need_linearize(skb))) { + err = __skb_linearize(skb); + if (unlikely(err)) + goto error; + } + + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } + + set_ip_summed(skb, OVS_CSUM_NONE); + + return skb; + +error: + return ERR_PTR(err); +} + +static int ovs_tnl_send(struct vport *vport, struct sk_buff *skb, + u8 ipproto, int tunnel_hlen, + void (*build_header)(const struct vport *, + struct sk_buff *, + int tunnel_hlen)) +{ + int min_headroom; + struct rtable *rt; + __be32 saddr; + int sent_len = 0; + int err; + struct sk_buff *nskb; + + /* Route lookup */ + saddr = OVS_CB(skb)->tun_key->ipv4_src; + rt = find_route(ovs_dp_get_net(vport->dp), + &saddr, + OVS_CB(skb)->tun_key->ipv4_dst, + ipproto, + OVS_CB(skb)->tun_key->ipv4_tos, + skb_get_mark(skb)); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + tunnel_hlen += sizeof(struct iphdr); + + min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len + + tunnel_hlen + + (vlan_tx_tag_present(skb) ? 
VLAN_HLEN : 0); + + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + /* Offloading */ + nskb = handle_offloads(skb); + if (IS_ERR(nskb)) { + err = PTR_ERR(nskb); + goto err_free_rt; + } + skb = nskb; + + /* Reset SKB */ + nf_reset(skb); + secpath_reset(skb); + skb_dst_drop(skb); + skb_clear_rxhash(skb); + + while (skb) { + struct sk_buff *next_skb = skb->next; + struct iphdr *iph; + int frag_len; + + skb->next = NULL; + + if (unlikely(vlan_deaccel_tag(skb))) + goto next; + + frag_len = skb->len; + skb_push(skb, tunnel_hlen); + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(struct iphdr)); + + if (next_skb) + skb_dst_set(skb, dst_clone(&rt_dst(rt))); + else + skb_dst_set(skb, &rt_dst(rt)); + + /* Push Tunnel header. */ + build_header(vport, skb, tunnel_hlen); + + /* Push IP header. */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->protocol = ipproto; + iph->daddr = OVS_CB(skb)->tun_key->ipv4_dst; + iph->saddr = saddr; + iph->tos = OVS_CB(skb)->tun_key->ipv4_tos; + iph->ttl = OVS_CB(skb)->tun_key->ipv4_ttl; + iph->frag_off = OVS_CB(skb)->tun_key->tun_flags & + TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + /* + * Allow our local IP stack to fragment the outer packet even + * if the DF bit is set as a last resort. We also need to + * force selection of an IP ID here with __ip_select_ident(), + * as ip_select_ident() assumes a proper ID is not needed when + * when the DF bit is set. + */ + skb->local_df = 1; + __ip_select_ident(iph, skb_dst(skb), 0); + + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + err = ip_local_out(skb); + if (unlikely(net_xmit_eval(err))) + goto next; + + sent_len += frag_len; + +next: + skb = next_skb; + } + + return sent_len; + +err_free_rt: + ip_rt_put(rt); +error: + return err; +} + static int lisp_tnl_send(struct vport *vport, struct sk_buff *skb) { int tnl_len; diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c index 4bc16175c..50373b1ab 100644 --- a/datapath/vport-netdev.c +++ b/datapath/vport-netdev.c @@ -340,7 +340,7 @@ static int netdev_send(struct vport *vport, struct sk_buff *skb) nskb = skb->next; skb->next = NULL; - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb)); if (likely(skb)) { len += skb->len; vlan_set_tci(skb, 0); @@ -354,7 +354,7 @@ static int netdev_send(struct vport *vport, struct sk_buff *skb) } tag: - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb)); if (unlikely(!skb)) return 0; vlan_set_tci(skb, 0); diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c index 5546820e6..f3ef94707 100644 --- a/datapath/vport-vxlan.c +++ b/datapath/vport-vxlan.c @@ -42,7 +42,6 @@ #include #include "datapath.h" -#include "tunnel.h" #include "vport.h" #define OVS_VXLAN_RCV_PRIORITY 8 @@ -73,7 +72,7 @@ static int vxlan_rcv(struct vxlan_handler *vh, struct sk_buff *skb, __be32 vx_vn /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - tnl_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); ovs_vport_receive(vport, skb, &tun_key); return PACKET_RCVD; diff --git a/debian/rules b/debian/rules index d34bdb3d0..b21c8dbf0 100755 --- a/debian/rules +++ 
b/debian/rules @@ -29,6 +29,12 @@ else CFLAGS += -O2 endif +# Old versions of dpkg-buildflags do not understand --export=configure. +# When dpkg-buildflags does not understand an option, it prints its full +# --help output on stdout, so we have to avoid that here. +buildflags := $(shell if dpkg-buildflags --export=configure >/dev/null 2>&1; \ + then dpkg-buildflags --export=configure; fi) + configure: configure-stamp configure-stamp: dh_testdir @@ -40,7 +46,7 @@ configure-stamp: test -e Makefile || \ ../configure --prefix=/usr --localstatedir=/var --enable-ssl \ --sysconfdir=/etc CFLAGS="$(CFLAGS)" \ - $(DATAPATH_CONFIGURE_OPTS)) + $(buildflags) $(DATAPATH_CONFIGURE_OPTS)) touch configure-stamp #Architecture diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h index 6319f4e77..de5ff6ae2 100644 --- a/include/openflow/nicira-ext.h +++ b/include/openflow/nicira-ext.h @@ -477,6 +477,7 @@ OFP_ASSERT(sizeof(struct nx_action_pop_queue) == 16); * - NXM_NX_ND_SLL * - NXM_NX_ND_TLL * - NXM_NX_REG(idx) for idx in the switch's accepted range. + * - NXM_NX_PKT_MARK * - NXM_NX_TUN_IPV4_SRC * - NXM_NX_TUN_IPV4_DST * @@ -498,6 +499,8 @@ OFP_ASSERT(sizeof(struct nx_action_pop_queue) == 16); * * - NXM_NX_REG(idx) for idx in the switch's accepted range. * + * - NXM_NX_PKT_MARK + * * - NXM_OF_VLAN_TCI. Modifying this field's value has side effects on the * packet's 802.1Q header. Setting a value with CFI=0 removes the 802.1Q * header (if any), ignoring the other bits. Setting a value with CFI=1 @@ -1766,6 +1769,20 @@ OFP_ASSERT(sizeof(struct nx_action_output_reg) == 24); #define NXM_NX_TUN_IPV4_DST NXM_HEADER (0x0001, 32, 4) #define NXM_NX_TUN_IPV4_DST_W NXM_HEADER_W(0x0001, 32, 4) +/* Metadata marked onto the packet in a system-dependent manner. + * + * The packet mark may be used to carry contextual information + * to other parts of the system outside of Open vSwitch. As a + * result, the semantics depend on system in use. + * + * Prereqs: None. + * + * Format: 32-bit integer in network byte order. + * + * Masking: Fully maskable. */ +#define NXM_NX_PKT_MARK NXM_HEADER (0x0001, 33, 4) +#define NXM_NX_PKT_MARK_W NXM_HEADER_W(0x0001, 33, 4) + /* ## --------------------- ## */ /* ## Requests and replies. ## */ /* ## --------------------- ## */ diff --git a/include/sparse/pthread.h b/include/sparse/pthread.h index aa4652efc..40c5ca3a5 100644 --- a/include/sparse/pthread.h +++ b/include/sparse/pthread.h @@ -21,18 +21,6 @@ /* Get actual definitions for us to annotate and build on. */ #include_next -#include "compiler.h" - -int pthread_mutex_lock(pthread_mutex_t *mutex) OVS_ACQUIRES(mutex); -int pthread_mutex_unlock(pthread_mutex_t *mutex) OVS_RELEASES(mutex); - -int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) OVS_ACQ_RDLOCK(rwlock); -int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) OVS_ACQ_WRLOCK(rwlock); -int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) OVS_RELEASES(rwlock); - -int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *mutex) - OVS_REQUIRES(mutex); - /* Sparse complains about the proper PTHREAD_*_INITIALIZER definitions. * Luckily, it's not a real compiler so we can overwrite it with something * simple. 
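
The two NXM_NX_PKT_MARK constants added to nicira-ext.h above follow the
usual NXM header encoding (vendor << 16 | field << 9 | has_mask << 8 |
length; the layout is an assumption stated here, not shown in the hunk).
A throwaway check program:

	#include <stdint.h>
	#include <stdio.h>

	#define NXM_HDR(VENDOR, FIELD, HASMASK, LEN) \
		(((uint32_t)(VENDOR) << 16) | ((FIELD) << 9) | \
		 ((HASMASK) << 8) | (LEN))

	int main(void)
	{
		printf("NXM_NX_PKT_MARK   = %#010x\n", NXM_HDR(0x0001, 33, 0, 4));
		printf("NXM_NX_PKT_MARK_W = %#010x\n", NXM_HDR(0x0001, 33, 1, 8));
		return 0;   /* prints 0x00014204 and 0x00014308 */
	}
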
*/ @@ -47,29 +35,3 @@ int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *mutex) #undef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP #define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP {} - -#define pthread_mutex_trylock(MUTEX) \ - ({ \ - int retval = pthread_mutex_trylock(mutex); \ - if (!retval) { \ - OVS_MACRO_LOCK(MUTEX); \ - } \ - retval; \ - }) - -#define pthread_rwlock_tryrdlock(RWLOCK) \ - ({ \ - int retval = pthread_rwlock_tryrdlock(rwlock); \ - if (!retval) { \ - OVS_MACRO_LOCK(RWLOCK); \ - } \ - retval; \ - }) -#define pthread_rwlock_trywrlock(RWLOCK) \ - ({ \ - int retval = pthread_rwlock_trywrlock(rwlock); \ - if (!retval) { \ - OVS_MACRO_LOCK(RWLOCK); \ - } \ - retval; \ - }) diff --git a/lib/automake.mk b/lib/automake.mk index cd50e9122..fa7f17332 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -161,6 +161,8 @@ lib_libopenvswitch_a_SOURCES = \ lib/reconnect.c \ lib/reconnect.h \ lib/sat-math.h \ + lib/seq.c \ + lib/seq.h \ lib/sha1.c \ lib/sha1.h \ lib/shash.c \ diff --git a/lib/bfd.c b/lib/bfd.c index 81fd17877..74b27c476 100644 --- a/lib/bfd.c +++ b/lib/bfd.c @@ -261,6 +261,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg) static atomic_uint16_t udp_src = ATOMIC_VAR_INIT(0); long long int min_tx, min_rx; + bool need_poll = false; bool cpath_down; const char *hwaddr; uint8_t ea[ETH_ADDR_LEN]; @@ -315,7 +316,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg) || (!bfd_in_poll(bfd) && bfd->cfg_min_tx < bfd->min_tx)) { bfd->min_tx = bfd->cfg_min_tx; } - bfd_poll(bfd); + need_poll = true; } min_rx = smap_get_int(cfg, "min_rx", 1000); @@ -326,7 +327,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg) || (!bfd_in_poll(bfd) && bfd->cfg_min_rx > bfd->min_rx)) { bfd->min_rx = bfd->cfg_min_rx; } - bfd_poll(bfd); + need_poll = true; } cpath_down = smap_get_bool(cfg, "cpath_down", false); @@ -335,7 +336,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg) if (bfd->diag == DIAG_NONE || bfd->diag == DIAG_CPATH_DOWN) { bfd_set_state(bfd, bfd->state, DIAG_NONE); } - bfd_poll(bfd); + need_poll = true; } hwaddr = smap_get(cfg, "bfd_dst_mac"); @@ -347,6 +348,9 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg) bfd->eth_dst_set = false; } + if (need_poll) { + bfd_poll(bfd); + } ovs_mutex_unlock(&mutex); return bfd; } @@ -516,7 +520,7 @@ bfd_should_process_flow(const struct bfd *bfd, const struct flow *flow, return (flow->dl_type == htons(ETH_TYPE_IP) && flow->nw_proto == IPPROTO_UDP && flow->tp_dst == htons(BFD_DEST_PORT) - && (check_tnl_key || flow->tunnel.tun_id == htonll(0))); + && (!check_tnl_key || flow->tunnel.tun_id == htonll(0))); } void @@ -782,6 +786,8 @@ bfd_flag_str(enum flags flags) ds_put_cstr(&ds, "poll "); } + /* Do not copy the trailing whitespace. */ + ds_chomp(&ds, ' '); ovs_strlcpy(flag_str, ds_cstr(&ds), sizeof flag_str); ds_destroy(&ds); return flag_str; diff --git a/lib/bond.c b/lib/bond.c index 06dd3627f..3834774d4 100644 --- a/lib/bond.c +++ b/lib/bond.c @@ -475,7 +475,7 @@ bond_wait(struct bond *bond) poll_timer_wait_until(bond->next_fake_iface_update); } - if (!bond->bond_revalidate) { + if (bond->bond_revalidate) { poll_immediate_wake(); } ovs_rwlock_unlock(&rwlock); @@ -661,11 +661,14 @@ bond_choose_output_slave(struct bond *bond, const struct flow *flow, struct flow_wildcards *wc, uint16_t vlan) { struct bond_slave *slave; + void *aux; ovs_rwlock_rdlock(&rwlock); slave = choose_output_slave(bond, flow, wc, vlan); + aux = slave ? 
slave->aux : NULL; ovs_rwlock_unlock(&rwlock); - return slave; + + return aux; } /* Rebalancing. */ diff --git a/lib/compiler.h b/lib/compiler.h index 2ca81bdbc..519b83291 100644 --- a/lib/compiler.h +++ b/lib/compiler.h @@ -128,32 +128,7 @@ #define OVS_EXCLUDED(...) __attribute__((locks_excluded(__VA_ARGS__))) #define OVS_ACQ_BEFORE(...) __attribute__((acquired_before(__VA_ARGS__))) #define OVS_ACQ_AFTER(...) __attribute__((acquired_after(__VA_ARGS__))) -#elif __CHECKER__ -/* "sparse" annotations for mutexes and mutex-like constructs. - * - * Change the thread-safety check annotations to use "context" attribute. - * - * OVS_MACRO_LOCK and OVS_MACRO_RELEASE are suitable for use within macros, - * where there is no function prototype to annotate. */ -#define OVS_LOCKABLE -#define OVS_REQ_RDLOCK(...) __attribute__((context(MUTEX, 1, 1))) -#define OVS_ACQ_RDLOCK(...) __attribute__((context(MUTEX, 0, 1))) -#define OVS_REQ_WRLOCK(...) __attribute__((context(MUTEX, 1, 1))) -#define OVS_ACQ_WRLOCK(...) __attribute__((context(MUTEX, 0, 1))) -#define OVS_REQUIRES(...) __attribute__((context(MUTEX, 1, 1))) -#define OVS_ACQUIRES(...) __attribute__((context(MUTEX, 0, 1))) -#define OVS_TRY_WRLOCK(RETVAL, ...) -#define OVS_TRY_RDLOCK(RETVAL, ...) -#define OVS_TRY_LOCK(REVAL, ...) -#define OVS_GUARDED -#define OVS_GUARDED_BY(...) -#define OVS_EXCLUDED(...) -#define OVS_RELEASES(...) __attribute__((context(MUTEX, 1, 0))) -#define OVS_ACQ_BEFORE(...) -#define OVS_ACQ_AFTER(...) -#define OVS_MACRO_LOCK(...) __context__(MUTEX, 0, 1) -#define OVS_MACRO_RELEASE(...) __context__(MUTEX, 1, 0) -#else +#else /* not Clang */ #define OVS_LOCKABLE #define OVS_REQ_RDLOCK(...) #define OVS_ACQ_RDLOCK(...) @@ -170,8 +145,6 @@ #define OVS_RELEASES(...) #define OVS_ACQ_BEFORE(...) #define OVS_ACQ_AFTER(...) -#define OVS_MACRO_LOCK(...) -#define OVS_MACRO_RELEASE(...) #endif /* ISO C says that a C implementation may choose any integer type for an enum diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index a8a54a1a6..07c146775 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -49,6 +49,7 @@ #include "packets.h" #include "poll-loop.h" #include "random.h" +#include "seq.h" #include "shash.h" #include "sset.h" #include "timeval.h" @@ -92,6 +93,7 @@ struct dp_netdev { struct dp_netdev_queue queues[N_QUEUES]; struct hmap flow_table; /* Flow table. */ + struct seq *queue_seq; /* Incremented whenever a packet is queued. */ /* Statistics. */ long long int n_hit; /* Number of flow table matches. */ @@ -101,7 +103,7 @@ struct dp_netdev { /* Ports. */ struct dp_netdev_port *ports[MAX_PORTS]; struct list port_list; - unsigned int serial; + struct seq *port_seq; /* Incremented whenever a port changes. */ }; /* A port in a netdev-based datapath. */ @@ -134,7 +136,7 @@ struct dp_netdev_flow { struct dpif_netdev { struct dpif dpif; struct dp_netdev *dp; - unsigned int dp_serial; + uint64_t last_port_seq; }; /* All netdev-based datapaths. 
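
The dpif-netdev hunks that follow replace the hand-rolled 'serial' and
'dp_serial' counters, and the XXX comments about their poll_block() races,
with the new seq library added to lib/automake.mk above. The pattern, using
the same calls the patch uses:

	struct seq *port_seq = seq_create();     /* at datapath creation */
	uint64_t last = seq_read(port_seq);      /* reader snapshots     */

	seq_change(port_seq);                    /* writer: port changed */

	if (seq_read(port_seq) != last) {        /* reader: poll         */
		/* ... report the change, take a new snapshot ... */
	}
	seq_wait(port_seq, last);                /* or: wake poll_block()
						  * on the next change  */
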
*/ @@ -164,7 +166,7 @@ static void dp_netdev_execute_actions(struct dp_netdev *, static void dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port, struct ofpbuf *packet, uint32_t skb_priority, - uint32_t skb_mark, const struct flow_tnl *tnl); + uint32_t pkt_mark, const struct flow_tnl *tnl); static struct dpif_netdev * dpif_netdev_cast(const struct dpif *dpif) @@ -225,7 +227,7 @@ create_dpif_netdev(struct dp_netdev *dp) dpif = xmalloc(sizeof *dpif); dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id); dpif->dp = dp; - dpif->dp_serial = dp->serial; + dpif->last_port_seq = seq_read(dp->port_seq); return &dpif->dpif; } @@ -288,8 +290,10 @@ create_dp_netdev(const char *name, const struct dpif_class *class, for (i = 0; i < N_QUEUES; i++) { dp->queues[i].head = dp->queues[i].tail = 0; } + dp->queue_seq = seq_create(); hmap_init(&dp->flow_table); list_init(&dp->port_list); + dp->port_seq = seq_create(); error = do_add_port(dp, name, "internal", ODPP_LOCAL); if (error) { @@ -352,7 +356,9 @@ dp_netdev_free(struct dp_netdev *dp) do_del_port(dp, port->port_no); } dp_netdev_purge_queues(dp); + seq_destroy(dp->queue_seq); hmap_destroy(&dp->flow_table); + seq_destroy(dp->port_seq); free(dp->name); free(dp); } @@ -454,7 +460,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type, list_push_back(&dp->port_list, &port->node); dp->ports[odp_to_u32(port_no)] = port; - dp->serial++; + seq_change(dp->port_seq); return 0; } @@ -554,7 +560,7 @@ do_del_port(struct dp_netdev *dp, odp_port_t port_no) list_remove(&port->node); dp->ports[odp_to_u32(port_no)] = NULL; - dp->serial++; + seq_change(dp->port_seq); netdev_close(port->netdev); netdev_restore_flags(port->sf); @@ -700,11 +706,13 @@ static int dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED) { struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); + uint64_t new_port_seq; int error; ovs_mutex_lock(&dp_netdev_mutex); - if (dpif->dp_serial != dpif->dp->serial) { - dpif->dp_serial = dpif->dp->serial; + new_port_seq = seq_read(dpif->dp->port_seq); + if (dpif->last_port_seq != new_port_seq) { + dpif->last_port_seq = new_port_seq; error = ENOBUFS; } else { error = EAGAIN; @@ -719,14 +727,8 @@ dpif_netdev_port_poll_wait(const struct dpif *dpif_) { struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); - /* XXX In a multithreaded process, there is a race window between this - * function and the poll_block() in one thread and a change in - * dpif->dp->serial in another thread. */ - ovs_mutex_lock(&dp_netdev_mutex); - if (dpif->dp_serial != dpif->dp->serial) { - poll_immediate_wake(); - } + seq_wait(dpif->dp->port_seq, dpif->last_port_seq); ovs_mutex_unlock(&dp_netdev_mutex); } @@ -1107,13 +1109,15 @@ dpif_netdev_recv(struct dpif *dpif, struct dpif_upcall *upcall, static void dpif_netdev_recv_wait(struct dpif *dpif) { - /* XXX In a multithreaded process, there is a race window between this - * function and the poll_block() in one thread and a packet being queued in - * another thread. 
*/ + struct dp_netdev *dp = get_dp_netdev(dpif); + uint64_t seq; ovs_mutex_lock(&dp_netdev_mutex); + seq = seq_read(dp->queue_seq); if (find_nonempty_queue(dpif)) { poll_immediate_wake(); + } else { + seq_wait(dp->queue_seq, seq); } ovs_mutex_unlock(&dp_netdev_mutex); } @@ -1139,7 +1143,7 @@ dp_netdev_flow_used(struct dp_netdev_flow *flow, const struct ofpbuf *packet) static void dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port, struct ofpbuf *packet, uint32_t skb_priority, - uint32_t skb_mark, const struct flow_tnl *tnl) + uint32_t pkt_mark, const struct flow_tnl *tnl) { struct dp_netdev_flow *flow; struct flow key; @@ -1149,7 +1153,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port, return; } in_port_.odp_port = port->port_no; - flow_extract(packet, skb_priority, skb_mark, tnl, &in_port_, &key); + flow_extract(packet, skb_priority, pkt_mark, tnl, &in_port_, &key); flow = dp_netdev_lookup_flow(dp, &key); if (flow) { dp_netdev_flow_used(flow, packet); @@ -1274,6 +1278,8 @@ dp_netdev_output_userspace(struct dp_netdev *dp, const struct ofpbuf *packet, buf->size = packet->size; upcall->packet = buf; + seq_change(dp->queue_seq); + return 0; } else { dp->n_lost++; @@ -1375,7 +1381,7 @@ dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED, dp->ports[odp_to_u32(port->port_no)] = NULL; dp->ports[port_no] = port; port->port_no = u32_to_odp(port_no); - dp->serial++; + seq_change(dp->port_seq); unixctl_command_reply(conn, NULL); } diff --git a/lib/flow.c b/lib/flow.c index d899d260d..3e29aa152 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -356,7 +356,7 @@ invalid: * present and has a correct length, and otherwise NULL. */ void -flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark, +flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t pkt_mark, const struct flow_tnl *tnl, const union flow_in_port *in_port, struct flow *flow) { @@ -375,7 +375,7 @@ flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark, flow->in_port = *in_port; } flow->skb_priority = skb_priority; - flow->skb_mark = skb_mark; + flow->pkt_mark = pkt_mark; packet->l2 = b.data; packet->l2_5 = NULL; @@ -500,6 +500,7 @@ flow_get_metadata(const struct flow *flow, struct flow_metadata *fmd) fmd->tun_dst = flow->tunnel.ip_dst; fmd->metadata = flow->metadata; memcpy(fmd->regs, flow->regs, sizeof fmd->regs); + fmd->pkt_mark = flow->pkt_mark; fmd->in_port = flow->in_port.ofp_port; } diff --git a/lib/flow.h b/lib/flow.h index 7c3654b00..8164d9c73 100644 --- a/lib/flow.h +++ b/lib/flow.h @@ -96,7 +96,7 @@ struct flow { ovs_be32 nw_dst; /* IPv4 destination address. */ ovs_be32 ipv6_label; /* IPv6 flow label. */ union flow_in_port in_port; /* Input port.*/ - uint32_t skb_mark; /* Packet mark. */ + uint32_t pkt_mark; /* Packet mark. */ ovs_be32 mpls_lse; /* MPLS label stack entry. */ uint16_t mpls_depth; /* Depth of MPLS stack. */ ovs_be16 vlan_tci; /* If 802.1Q, TCI | VLAN_CFI; otherwise 0. */ @@ -128,6 +128,7 @@ struct flow_metadata { ovs_be32 tun_dst; /* Tunnel outer IPv4 dst addr */ ovs_be64 metadata; /* OpenFlow 1.1+ metadata field. */ uint32_t regs[FLOW_N_REGS]; /* Registers. */ + uint32_t pkt_mark; /* Packet mark. */ ofp_port_t in_port; /* OpenFlow port or zero. 
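
With skb_mark renamed to pkt_mark throughout the flow structures above,
masked matches become possible through the helper added in lib/match.c just
below. A hedged sketch; match_init_catchall() is the usual match
initializer, not part of this patch:

	struct match match;

	match_init_catchall(&match);
	match_set_pkt_mark_masked(&match, 0x1, 0xff);
	/* match_format() renders this as "pkt_mark=0x1/0xff". */
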
*/ }; diff --git a/lib/match.c b/lib/match.c index 91c05a764..e97b0b18f 100644 --- a/lib/match.c +++ b/lib/match.c @@ -60,8 +60,8 @@ match_wc_init(struct match *match, const struct flow *flow) memset(&wc->masks.skb_priority, 0xff, sizeof wc->masks.skb_priority); } - if (flow->skb_mark) { - memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark); + if (flow->pkt_mark) { + memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark); } for (i = 0; i < FLOW_N_REGS; i++) { @@ -138,7 +138,6 @@ match_init_exact(struct match *match, const struct flow *flow) { match->flow = *flow; match->flow.skb_priority = 0; - match->flow.skb_mark = 0; flow_wildcards_init_exact(&match->wc); } @@ -286,10 +285,16 @@ match_set_skb_priority(struct match *match, uint32_t skb_priority) } void -match_set_skb_mark(struct match *match, uint32_t skb_mark) +match_set_pkt_mark(struct match *match, uint32_t pkt_mark) { - match->wc.masks.skb_mark = UINT32_MAX; - match->flow.skb_mark = skb_mark; + match_set_pkt_mark_masked(match, pkt_mark, UINT32_MAX); +} + +void +match_set_pkt_mark_masked(struct match *match, uint32_t pkt_mark, uint32_t mask) +{ + match->flow.pkt_mark = pkt_mark & mask; + match->wc.masks.pkt_mark = mask; } void @@ -836,8 +841,16 @@ match_format(const struct match *match, struct ds *s, unsigned int priority) ds_put_format(s, "priority=%u,", priority); } - if (wc->masks.skb_mark) { - ds_put_format(s, "skb_mark=%#"PRIx32",", f->skb_mark); + switch (wc->masks.pkt_mark) { + case 0: + break; + case UINT32_MAX: + ds_put_format(s, "pkt_mark=%#"PRIx32",", f->pkt_mark); + break; + default: + ds_put_format(s, "pkt_mark=%#"PRIx32"/%#"PRIx32",", + f->pkt_mark, wc->masks.pkt_mark); + break; } if (wc->masks.skb_priority) { diff --git a/lib/match.h b/lib/match.h index 0ea1f2d6b..57887219f 100644 --- a/lib/match.h +++ b/lib/match.h @@ -61,7 +61,8 @@ void match_set_tun_tos_masked(struct match *match, uint8_t tos, uint8_t mask); void match_set_tun_flags(struct match *match, uint16_t flags); void match_set_tun_flags_masked(struct match *match, uint16_t flags, uint16_t mask); void match_set_in_port(struct match *, ofp_port_t ofp_port); -void match_set_skb_mark(struct match *, uint32_t skb_mark); +void match_set_pkt_mark(struct match *, uint32_t pkt_mark); +void match_set_pkt_mark_masked(struct match *, uint32_t pkt_mark, uint32_t mask); void match_set_skb_priority(struct match *, uint32_t skb_priority); void match_set_dl_type(struct match *, ovs_be16); void match_set_dl_src(struct match *, const uint8_t[6]); diff --git a/lib/meta-flow.c b/lib/meta-flow.c index 11fdfaa2f..ce061a35b 100644 --- a/lib/meta-flow.c +++ b/lib/meta-flow.c @@ -137,14 +137,14 @@ static const struct mf_field mf_fields[MFF_N_IDS] = { 0, NULL, 0, NULL, }, { - MFF_SKB_MARK, "skb_mark", NULL, + MFF_PKT_MARK, "pkt_mark", NULL, MF_FIELD_SIZES(be32), - MFM_NONE, + MFM_FULLY, MFS_HEXADECIMAL, MFP_NONE, - false, - 0, NULL, - 0, NULL, + true, + NXM_NX_PKT_MARK, "NXM_NX_PKT_MARK", + NXM_NX_PKT_MARK, "NXM_NX_PKT_MARK", }, #define REGISTER(IDX) \ @@ -706,8 +706,8 @@ mf_is_all_wild(const struct mf_field *mf, const struct flow_wildcards *wc) return !wc->masks.in_port.ofp_port; case MFF_SKB_PRIORITY: return !wc->masks.skb_priority; - case MFF_SKB_MARK: - return !wc->masks.skb_mark; + case MFF_PKT_MARK: + return !wc->masks.pkt_mark; CASE_MFF_REGS: return !wc->masks.regs[mf->id - MFF_REG0]; @@ -912,7 +912,7 @@ mf_is_value_valid(const struct mf_field *mf, const union mf_value *value) case MFF_METADATA: case MFF_IN_PORT: case MFF_SKB_PRIORITY: - case MFF_SKB_MARK: + case 
MFF_PKT_MARK: CASE_MFF_REGS: case MFF_ETH_SRC: case MFF_ETH_DST: @@ -1026,8 +1026,8 @@ mf_get_value(const struct mf_field *mf, const struct flow *flow, value->be32 = htonl(flow->skb_priority); break; - case MFF_SKB_MARK: - value->be32 = htonl(flow->skb_mark); + case MFF_PKT_MARK: + value->be32 = htonl(flow->pkt_mark); break; CASE_MFF_REGS: @@ -1216,8 +1216,8 @@ mf_set_value(const struct mf_field *mf, match_set_skb_priority(match, ntohl(value->be32)); break; - case MFF_SKB_MARK: - match_set_skb_mark(match, ntohl(value->be32)); + case MFF_PKT_MARK: + match_set_pkt_mark(match, ntohl(value->be32)); break; CASE_MFF_REGS: @@ -1405,8 +1405,8 @@ mf_set_flow_value(const struct mf_field *mf, flow->skb_priority = ntohl(value->be32); break; - case MFF_SKB_MARK: - flow->skb_mark = ntohl(value->be32); + case MFF_PKT_MARK: + flow->pkt_mark = ntohl(value->be32); break; CASE_MFF_REGS: @@ -1607,9 +1607,9 @@ mf_set_wild(const struct mf_field *mf, struct match *match) match->wc.masks.skb_priority = 0; break; - case MFF_SKB_MARK: - match->flow.skb_mark = 0; - match->wc.masks.skb_mark = 0; + case MFF_PKT_MARK: + match->flow.pkt_mark = 0; + match->wc.masks.pkt_mark = 0; break; CASE_MFF_REGS: @@ -1780,7 +1780,6 @@ mf_set(const struct mf_field *mf, switch (mf->id) { case MFF_IN_PORT: case MFF_IN_PORT_OXM: - case MFF_SKB_MARK: case MFF_SKB_PRIORITY: case MFF_ETH_TYPE: case MFF_DL_VLAN: @@ -1829,6 +1828,11 @@ mf_set(const struct mf_field *mf, ntohl(value->be32), ntohl(mask->be32)); break; + case MFF_PKT_MARK: + match_set_pkt_mark_masked(match, ntohl(value->be32), + ntohl(mask->be32)); + break; + case MFF_ETH_DST: match_set_dl_dst_masked(match, value->mac, mask->mac); break; @@ -1985,7 +1989,7 @@ mf_random_value(const struct mf_field *mf, union mf_value *value) case MFF_TUN_FLAGS: case MFF_METADATA: case MFF_IN_PORT: - case MFF_SKB_MARK: + case MFF_PKT_MARK: case MFF_SKB_PRIORITY: CASE_MFF_REGS: case MFF_ETH_SRC: diff --git a/lib/meta-flow.h b/lib/meta-flow.h index bc402dc0a..93b894dcb 100644 --- a/lib/meta-flow.h +++ b/lib/meta-flow.h @@ -41,7 +41,7 @@ enum mf_field_id { MFF_IN_PORT, /* be16 */ MFF_IN_PORT_OXM, /* be32 */ MFF_SKB_PRIORITY, /* be32 */ - MFF_SKB_MARK, /* be32 */ + MFF_PKT_MARK, /* be32 */ #if FLOW_N_REGS > 0 MFF_REG0, /* be32 */ diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index 0f625afd3..180ce7fa9 100644 --- a/lib/netdev-bsd.c +++ b/lib/netdev-bsd.c @@ -42,6 +42,8 @@ #include #if defined(__NetBSD__) #include +#include +#include #endif #include "rtbsd.h" @@ -76,6 +78,13 @@ struct netdev_rx_bsd { struct netdev_bsd { struct netdev up; + + /* Never changes after initialization. */ + char *kernel_name; + + /* Protects all members below. */ + struct ovs_mutex mutex; + unsigned int cache_valid; unsigned int change_seq; @@ -92,8 +101,6 @@ struct netdev_bsd { /* Used for sending packets on non-tap devices. 
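
The netdev-bsd.c changes here and below retrofit per-device locking: the new
'mutex' member protects cache_valid and everything cached behind it, so each
accessor takes the lock, refreshes the cache on demand, and funnels every
exit through a single unlock. (Note the netdev_bsd_get_mtu() hunk still ends
in "return 0;", so the ioctl error is computed but never returned; the sketch
returns it.) The shape of the change:

	ovs_mutex_lock(&netdev->mutex);
	if (!(netdev->cache_valid & VALID_MTU)) {
		error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_),
					    &ifr, SIOCGIFMTU, "SIOCGIFMTU");
		if (!error) {
			netdev->mtu = ifr.ifr_mtu;
			netdev->cache_valid |= VALID_MTU;
		}
	}
	if (!error) {
		*mtup = netdev->mtu;
	}
	ovs_mutex_unlock(&netdev->mutex);

	return error;
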
*/ pcap_t *pcap; int fd; - - char *kernel_name; }; @@ -128,7 +135,7 @@ static void destroy_tap(int fd, const char *name); static int get_flags(const struct netdev *, int *flagsp); static int set_flags(const char *, int flags); static int do_set_addr(struct netdev *netdev, - int ioctl_nr, const char *ioctl_name, + unsigned long ioctl_nr, const char *ioctl_name, struct in_addr addr); static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]); static int set_etheraddr(const char *netdev_name, int hwaddr_family, @@ -139,7 +146,7 @@ static int ifr_get_flags(const struct ifreq *); static void ifr_set_flags(struct ifreq *, int flags); #ifdef __NetBSD__ -static int af_link_ioctl(int command, const void *arg); +static int af_link_ioctl(unsigned long command, const void *arg); #endif static void netdev_bsd_run(void); @@ -286,6 +293,7 @@ netdev_bsd_construct_system(struct netdev *netdev_) return error; } + ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL); netdev->change_seq = 1; netdev->tap_fd = -1; netdev->kernel_name = xstrdup(netdev_->name); @@ -319,6 +327,7 @@ netdev_bsd_construct_tap(struct netdev *netdev_) /* Create a tap device by opening /dev/tap. The TAPGIFNAME ioctl is used * to retrieve the name of the tap device. */ + ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL); netdev->tap_fd = open("/dev/tap", O_RDWR); netdev->change_seq = 1; if (netdev->tap_fd < 0) { @@ -373,6 +382,7 @@ netdev_bsd_construct_tap(struct netdev *netdev_) return 0; error_unref_notifier: + ovs_mutex_destroy(&netdev->mutex); cache_notifier_unref(); error: free(kernel_name); @@ -393,6 +403,7 @@ netdev_bsd_destruct(struct netdev *netdev_) pcap_close(netdev->pcap); } free(netdev->kernel_name); + ovs_mutex_destroy(&netdev->mutex); } static void @@ -485,21 +496,23 @@ netdev_bsd_rx_construct(struct netdev_rx *rx_) struct netdev_rx_bsd *rx = netdev_rx_bsd_cast(rx_); struct netdev *netdev_ = rx->up.netdev; struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); + int error; if (!strcmp(netdev_get_type(netdev_), "tap")) { rx->pcap_handle = NULL; rx->fd = netdev->tap_fd; + error = 0; } else { - int error = netdev_bsd_open_pcap(netdev_get_kernel_name(netdev_), - &rx->pcap_handle, &rx->fd); - if (error) { - return error; + ovs_mutex_lock(&netdev->mutex); + error = netdev_bsd_open_pcap(netdev_get_kernel_name(netdev_), + &rx->pcap_handle, &rx->fd); + if (!error) { + netdev_bsd_changed(netdev); } - - netdev_bsd_changed(netdev); + ovs_mutex_unlock(&netdev->mutex); } - return 0; + return error; } static void @@ -662,15 +675,16 @@ netdev_bsd_send(struct netdev *netdev_, const void *data, size_t size) { struct netdev_bsd *dev = netdev_bsd_cast(netdev_); const char *name = netdev_get_name(netdev_); + int error; + ovs_mutex_lock(&dev->mutex); if (dev->tap_fd < 0 && !dev->pcap) { - int error = netdev_bsd_open_pcap(name, &dev->pcap, &dev->fd); - if (error) { - return error; - } + error = netdev_bsd_open_pcap(name, &dev->pcap, &dev->fd); + } else { + error = 0; } - for (;;) { + while (!error) { ssize_t retval; if (dev->tap_fd >= 0) { retval = write(dev->tap_fd, data, size); @@ -680,19 +694,24 @@ netdev_bsd_send(struct netdev *netdev_, const void *data, size_t size) if (retval < 0) { if (errno == EINTR) { continue; - } else if (errno != EAGAIN) { - VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s", - name, ovs_strerror(errno)); + } else { + error = errno; + if (error != EAGAIN) { + VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: " + "%s", name, ovs_strerror(error)); + } } - return errno; } else if 
(retval != size) { VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of " "%zu) on %s", retval, size, name); - return EMSGSIZE; + error = EMSGSIZE; } else { - return 0; + break; } } + + ovs_mutex_unlock(&dev->mutex); + return error; } /* @@ -705,6 +724,7 @@ netdev_bsd_send_wait(struct netdev *netdev_) { struct netdev_bsd *dev = netdev_bsd_cast(netdev_); + ovs_mutex_lock(&dev->mutex); if (dev->tap_fd >= 0) { /* TAP device always accepts packets. */ poll_immediate_wake(); @@ -714,6 +734,7 @@ netdev_bsd_send_wait(struct netdev *netdev_) /* We haven't even tried to send a packet yet. */ poll_immediate_wake(); } + ovs_mutex_unlock(&dev->mutex); } /* @@ -725,8 +746,9 @@ netdev_bsd_set_etheraddr(struct netdev *netdev_, const uint8_t mac[ETH_ADDR_LEN]) { struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); - int error; + int error = 0; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_ETHERADDR) || !eth_addr_equals(netdev->etheraddr, mac)) { error = set_etheraddr(netdev_get_kernel_name(netdev_), AF_LINK, @@ -736,9 +758,9 @@ netdev_bsd_set_etheraddr(struct netdev *netdev_, memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN); netdev_bsd_changed(netdev); } - } else { - error = 0; } + ovs_mutex_unlock(&netdev->mutex); + return error; } @@ -751,18 +773,22 @@ netdev_bsd_get_etheraddr(const struct netdev *netdev_, uint8_t mac[ETH_ADDR_LEN]) { struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); + int error = 0; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_ETHERADDR)) { - int error = get_etheraddr(netdev_get_kernel_name(netdev_), - netdev->etheraddr); - if (error) { - return error; + error = get_etheraddr(netdev_get_kernel_name(netdev_), + netdev->etheraddr); + if (!error) { + netdev->cache_valid |= VALID_ETHERADDR; } - netdev->cache_valid |= VALID_ETHERADDR; } - memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN); + if (!error) { + memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN); + } + ovs_mutex_unlock(&netdev->mutex); - return 0; + return error; } /* @@ -774,30 +800,37 @@ static int netdev_bsd_get_mtu(const struct netdev *netdev_, int *mtup) { struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); + int error = 0; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_MTU)) { struct ifreq ifr; - int error; error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU"); - if (error) { - return error; + if (!error) { + netdev->mtu = ifr.ifr_mtu; + netdev->cache_valid |= VALID_MTU; } - netdev->mtu = ifr.ifr_mtu; - netdev->cache_valid |= VALID_MTU; } + if (!error) { + *mtup = netdev->mtu; + } + ovs_mutex_unlock(&netdev->mutex); - *mtup = netdev->mtu; return 0; } static int -netdev_bsd_get_ifindex(const struct netdev *netdev) +netdev_bsd_get_ifindex(const struct netdev *netdev_) { + struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); int ifindex, error; - error = get_ifindex(netdev, &ifindex); + ovs_mutex_lock(&netdev->mutex); + error = get_ifindex(netdev_, &ifindex); + ovs_mutex_unlock(&netdev->mutex); + return error ? 
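
The netdev-bsd getters above all follow one refactored shape: compute or refresh a cached attribute at most once, under the device mutex, record the VALID_* bit, and copy the result out before unlocking. (Note that, as committed, netdev_bsd_get_mtu() still appears to end with "return 0;" rather than "return error;".) A minimal sketch of the pattern, with illustrative names and a stand-in for the ioctl query:

    #include <pthread.h>

    enum { VALID_MTU = 1 << 0 };      /* One bit per cached attribute. */

    struct dev {
        pthread_mutex_t mutex;        /* Protects the fields below. */
        unsigned int cache_valid;     /* VALID_* bits. */
        int mtu;                      /* Meaningful only with VALID_MTU. */
    };

    /* Stand-in for the SIOCGIFMTU ioctl; always "succeeds" here. */
    static int
    query_mtu_from_kernel(struct dev *d)
    {
        d->mtu = 1500;
        return 0;
    }

    /* Query at most once, under the lock; copy the result out before
     * unlocking so the caller never sees a torn update. */
    static int
    dev_get_mtu(struct dev *d, int *mtup)
    {
        int error = 0;

        pthread_mutex_lock(&d->mutex);
        if (!(d->cache_valid & VALID_MTU)) {
            error = query_mtu_from_kernel(d);
            if (!error) {
                d->cache_valid |= VALID_MTU;
            }
        }
        if (!error) {
            *mtup = d->mtu;
        }
        pthread_mutex_unlock(&d->mutex);

        return error;
    }

    int
    main(void)
    {
        struct dev d = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
        int mtu;

        return dev_get_mtu(&d, &mtu);  /* mtu == 1500 on success. */
    }
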
-error : ifindex; } @@ -805,34 +838,37 @@ static int netdev_bsd_get_carrier(const struct netdev *netdev_, bool *carrier) { struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); + int error = 0; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_CARRIER)) { struct ifmediareq ifmr; - int error; memset(&ifmr, 0, sizeof(ifmr)); strncpy(ifmr.ifm_name, netdev_get_kernel_name(netdev_), sizeof ifmr.ifm_name); error = af_inet_ioctl(SIOCGIFMEDIA, &ifmr); - if (error) { + if (!error) { + netdev->carrier = (ifmr.ifm_status & IFM_ACTIVE) == IFM_ACTIVE; + netdev->cache_valid |= VALID_CARRIER; + + /* If the interface doesn't report whether the media is active, + * just assume it is active. */ + if ((ifmr.ifm_status & IFM_AVALID) == 0) { + netdev->carrier = true; + } + } else { VLOG_DBG_RL(&rl, "%s: ioctl(SIOCGIFMEDIA) failed: %s", netdev_get_name(netdev_), ovs_strerror(error)); - return error; - } - - netdev->carrier = (ifmr.ifm_status & IFM_ACTIVE) == IFM_ACTIVE; - netdev->cache_valid |= VALID_CARRIER; - - /* If the interface doesn't report whether the media is active, - * just assume it is active. */ - if ((ifmr.ifm_status & IFM_AVALID) == 0) { - netdev->carrier = true; } } - *carrier = netdev->carrier; + if (!error) { + *carrier = netdev->carrier; + } + ovs_mutex_unlock(&netdev->mutex); - return 0; + return error; } static void @@ -1074,33 +1110,35 @@ netdev_bsd_get_in4(const struct netdev *netdev_, struct in_addr *in4, struct in_addr *netmask) { struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); + int error = 0; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_IN4)) { - const struct sockaddr_in *sin; struct ifreq ifr; - int error; ifr.ifr_addr.sa_family = AF_INET; error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr, SIOCGIFADDR, "SIOCGIFADDR"); - if (error) { - return error; - } + if (!error) { + const struct sockaddr_in *sin; - sin = (struct sockaddr_in *) &ifr.ifr_addr; - netdev->in4 = sin->sin_addr; - error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr, - SIOCGIFNETMASK, "SIOCGIFNETMASK"); - if (error) { - return error; + sin = (struct sockaddr_in *) &ifr.ifr_addr; + netdev->in4 = sin->sin_addr; + netdev->cache_valid |= VALID_IN4; + error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr, + SIOCGIFNETMASK, "SIOCGIFNETMASK"); + if (!error) { + *netmask = sin->sin_addr; + } } - netdev->netmask = sin->sin_addr; - netdev->cache_valid |= VALID_IN4; } - *in4 = netdev->in4; - *netmask = netdev->netmask; + if (!error) { + *in4 = netdev->in4; + *netmask = netdev->netmask; + } + ovs_mutex_unlock(&netdev->mutex); - return in4->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0; + return error ? error : in4->s_addr == INADDR_ANY ? 
EADDRNOTAVAIL : 0; } /* @@ -1115,6 +1153,7 @@ netdev_bsd_set_in4(struct netdev *netdev_, struct in_addr addr, struct netdev_bsd *netdev = netdev_bsd_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", addr); if (!error) { if (addr.s_addr != INADDR_ANY) { @@ -1128,6 +1167,8 @@ netdev_bsd_set_in4(struct netdev *netdev_, struct in_addr addr, } netdev_bsd_changed(netdev); } + ovs_mutex_unlock(&netdev->mutex); + return error; } @@ -1293,6 +1334,63 @@ netdev_bsd_get_next_hop(const struct in_addr *host OVS_UNUSED, #endif } +static int +netdev_bsd_arp_lookup(const struct netdev *netdev OVS_UNUSED, + ovs_be32 ip OVS_UNUSED, + uint8_t mac[ETH_ADDR_LEN] OVS_UNUSED) +{ +#if defined(__NetBSD__) + const struct rt_msghdr *rtm; + size_t needed; + char *buf; + const char *cp; + const char *ep; + int mib[6]; + int error; + + buf = NULL; + mib[0] = CTL_NET; + mib[1] = PF_ROUTE; + mib[2] = 0; + mib[3] = AF_INET; + mib[4] = NET_RT_FLAGS; + mib[5] = RTF_LLINFO; + if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) { + error = errno; + goto error; + } + buf = xmalloc(needed); + if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) { + error = errno; + goto error; + } + ep = buf + needed; + for (cp = buf; cp < ep; cp += rtm->rtm_msglen) { + const struct sockaddr_inarp *sina; + const struct sockaddr_dl *sdl; + + rtm = (const void *)cp; + sina = (const void *)(rtm + 1); + if (ip != sina->sin_addr.s_addr) { + continue; + } + sdl = (const void *) + ((const char *)(const void *)sina + RT_ROUNDUP(sina->sin_len)); + if (sdl->sdl_alen == ETH_ADDR_LEN) { + memcpy(mac, &sdl->sdl_data[sdl->sdl_nlen], ETH_ADDR_LEN); + error = 0; + goto error; + } + } + error = ENXIO; +error: + free(buf); + return error; +#else + return EOPNOTSUPP; +#endif +} + static void make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr) { @@ -1308,7 +1406,8 @@ make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr) static int do_set_addr(struct netdev *netdev, - int ioctl_nr, const char *ioctl_name, struct in_addr addr) + unsigned long ioctl_nr, const char *ioctl_name, + struct in_addr addr) { struct ifreq ifr; make_in4_sockaddr(&ifr.ifr_addr, addr); @@ -1420,7 +1519,7 @@ const struct netdev_class netdev_bsd_class = { NULL, /* add_router */ netdev_bsd_get_next_hop, NULL, /* get_status */ - NULL, /* arp_lookup */ + netdev_bsd_arp_lookup, /* arp_lookup */ netdev_bsd_update_flags, @@ -1483,7 +1582,7 @@ const struct netdev_class netdev_tap_class = { NULL, /* add_router */ netdev_bsd_get_next_hop, NULL, /* get_status */ - NULL, /* arp_lookup */ + netdev_bsd_arp_lookup, /* arp_lookup */ netdev_bsd_update_flags, @@ -1677,7 +1776,7 @@ ifr_set_flags(struct ifreq *ifr, int flags) /* Calls ioctl() on an AF_LINK sock, passing the specified 'command' and * 'arg'. Returns 0 if successful, otherwise a positive errno value. */ int -af_link_ioctl(int command, const void *arg) +af_link_ioctl(unsigned long command, const void *arg) { static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; static int sock; diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 0560adecf..5c312109b 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -44,8 +44,22 @@ struct dummy_stream { struct list txq; }; +/* Protects 'dummy_list'. */ +static struct ovs_mutex dummy_list_mutex = OVS_MUTEX_INITIALIZER; + +/* Contains all 'struct dummy_dev's. */ +static struct list dummy_list OVS_GUARDED_BY(dummy_list_mutex) + = LIST_INITIALIZER(&dummy_list); + struct netdev_dummy { struct netdev up; + + /* In dummy_list. 
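
The dummy netdev changes above introduce two-level locking: a global mutex guards membership of dummy_list, and each device carries its own mutex for its fields, with the OVS_ACQ_AFTER(dummy_list_mutex) annotation fixing the lock order as list first, device second. A compact sketch of that discipline (illustrative names, plain pthreads in place of ovs_mutex):

    #include <pthread.h>

    struct node {
        struct node *next;
        pthread_mutex_t mutex;        /* Protects 'value'. */
        int value;
    };

    static pthread_mutex_t list_mutex = PTHREAD_MUTEX_INITIALIZER;
    static struct node *list_head;    /* Protected by 'list_mutex'. */

    /* Iterate all devices, taking each device mutex strictly after the
     * list mutex, matching the OVS_ACQ_AFTER ordering above. */
    static void
    for_each_device(void (*cb)(struct node *))
    {
        struct node *n;

        pthread_mutex_lock(&list_mutex);
        for (n = list_head; n; n = n->next) {
            pthread_mutex_lock(&n->mutex);
            cb(n);
            pthread_mutex_unlock(&n->mutex);
        }
        pthread_mutex_unlock(&list_mutex);
    }

    static void
    bump(struct node *n)
    {
        n->value++;
    }

    int
    main(void)
    {
        struct node a = { NULL, PTHREAD_MUTEX_INITIALIZER, 0 };

        list_head = &a;
        for_each_device(bump);        /* a.value == 1 afterward. */
        return 0;
    }
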
*/ + struct list list_node OVS_GUARDED_BY(dummy_list_mutex); + + /* Protects all members below. */ + struct ovs_mutex mutex OVS_ACQ_AFTER(dummy_list_mutex); + uint8_t hwaddr[ETH_ADDR_LEN]; int mtu; struct netdev_stats stats; @@ -60,8 +74,6 @@ struct netdev_dummy { struct list rxes; /* List of child "netdev_rx_dummy"s. */ }; -static const struct netdev_class dummy_class; - /* Max 'recv_queue_len' in struct netdev_dummy. */ #define NETDEV_DUMMY_MAX_QUEUE 100 @@ -75,7 +87,8 @@ struct netdev_rx_dummy { static unixctl_cb_func netdev_dummy_set_admin_state; static int netdev_dummy_construct(struct netdev *); -static void netdev_dummy_poll_notify(struct netdev_dummy *); +static void netdev_dummy_poll_notify(struct netdev_dummy *netdev) + OVS_REQUIRES(netdev->mutex); static void netdev_dummy_queue_packet(struct netdev_dummy *, struct ofpbuf *); static void dummy_stream_close(struct dummy_stream *); @@ -103,15 +116,14 @@ netdev_rx_dummy_cast(const struct netdev_rx *rx) static void netdev_dummy_run(void) { - struct shash dummy_netdevs; - struct shash_node *node; + struct netdev_dummy *dev; - shash_init(&dummy_netdevs); - netdev_get_devices(&dummy_class, &dummy_netdevs); - SHASH_FOR_EACH (node, &dummy_netdevs) { - struct netdev_dummy *dev = node->data; + ovs_mutex_lock(&dummy_list_mutex); + LIST_FOR_EACH (dev, list_node, &dummy_list) { size_t i; + ovs_mutex_lock(&dev->mutex); + if (dev->pstream) { struct stream *new_stream; int error; @@ -203,9 +215,9 @@ netdev_dummy_run(void) } } - netdev_close(&dev->up); + ovs_mutex_unlock(&dev->mutex); } - shash_destroy(&dummy_netdevs); + ovs_mutex_unlock(&dummy_list_mutex); } static void @@ -219,15 +231,13 @@ dummy_stream_close(struct dummy_stream *s) static void netdev_dummy_wait(void) { - struct shash dummy_netdevs; - struct shash_node *node; + struct netdev_dummy *dev; - shash_init(&dummy_netdevs); - netdev_get_devices(&dummy_class, &dummy_netdevs); - SHASH_FOR_EACH (node, &dummy_netdevs) { - struct netdev_dummy *dev = node->data; + ovs_mutex_lock(&dummy_list_mutex); + LIST_FOR_EACH (dev, list_node, &dummy_list) { size_t i; + ovs_mutex_lock(&dev->mutex); if (dev->pstream) { pstream_wait(dev->pstream); } @@ -240,9 +250,9 @@ netdev_dummy_wait(void) } stream_recv_wait(s->stream); } - netdev_close(&dev->up); + ovs_mutex_unlock(&dev->mutex); } - shash_destroy(&dummy_netdevs); + ovs_mutex_unlock(&dummy_list_mutex); } static struct netdev * @@ -260,6 +270,8 @@ netdev_dummy_construct(struct netdev *netdev_) unsigned int n; atomic_add(&next_n, 1, &n); + + ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL); netdev->hwaddr[0] = 0xaa; netdev->hwaddr[1] = 0x55; netdev->hwaddr[2] = n >> 24; @@ -277,6 +289,10 @@ netdev_dummy_construct(struct netdev *netdev_) list_init(&netdev->rxes); + ovs_mutex_lock(&dummy_list_mutex); + list_push_back(&dummy_list, &netdev->list_node); + ovs_mutex_unlock(&dummy_list_mutex); + return 0; } @@ -286,11 +302,16 @@ netdev_dummy_destruct(struct netdev *netdev_) struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); size_t i; + ovs_mutex_lock(&dummy_list_mutex); + list_remove(&netdev->list_node); + ovs_mutex_unlock(&dummy_list_mutex); + pstream_close(netdev->pstream); for (i = 0; i < netdev->n_streams; i++) { dummy_stream_close(&netdev->streams[i]); } free(netdev->streams); + ovs_mutex_destroy(&netdev->mutex); } static void @@ -321,6 +342,7 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args) struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); const char *pstream; + ovs_mutex_lock(&netdev->mutex); netdev->ifindex = 
smap_get_int(args, "ifindex", -EOPNOTSUPP); pstream = smap_get(args, "pstream"); @@ -340,6 +362,8 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args) } } } + ovs_mutex_unlock(&netdev->mutex); + return 0; } @@ -356,9 +380,11 @@ netdev_dummy_rx_construct(struct netdev_rx *rx_) struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_); struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev); + ovs_mutex_lock(&netdev->mutex); list_push_back(&netdev->rxes, &rx->node); list_init(&rx->recv_queue); rx->recv_queue_len = 0; + ovs_mutex_unlock(&netdev->mutex); return 0; } @@ -367,9 +393,12 @@ static void netdev_dummy_rx_destruct(struct netdev_rx *rx_) { struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_); + struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev); + ovs_mutex_lock(&netdev->mutex); list_remove(&rx->node); ofpbuf_list_delete(&rx->recv_queue); + ovs_mutex_unlock(&netdev->mutex); } static void @@ -384,15 +413,23 @@ static int netdev_dummy_rx_recv(struct netdev_rx *rx_, void *buffer, size_t size) { struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_); + struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev); struct ofpbuf *packet; int retval; - if (list_is_empty(&rx->recv_queue)) { + ovs_mutex_lock(&netdev->mutex); + if (!list_is_empty(&rx->recv_queue)) { + packet = ofpbuf_from_list(list_pop_front(&rx->recv_queue)); + rx->recv_queue_len--; + } else { + packet = NULL; + } + ovs_mutex_unlock(&netdev->mutex); + + if (!packet) { return -EAGAIN; } - packet = ofpbuf_from_list(list_pop_front(&rx->recv_queue)); - rx->recv_queue_len--; if (packet->size <= size) { memcpy(buffer, packet->data, packet->size); retval = packet->size; @@ -408,17 +445,26 @@ static void netdev_dummy_rx_wait(struct netdev_rx *rx_) { struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_); + struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev); + + ovs_mutex_lock(&netdev->mutex); if (!list_is_empty(&rx->recv_queue)) { poll_immediate_wake(); } + ovs_mutex_unlock(&netdev->mutex); } static int netdev_dummy_rx_drain(struct netdev_rx *rx_) { struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_); + struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev); + + ovs_mutex_lock(&netdev->mutex); ofpbuf_list_delete(&rx->recv_queue); rx->recv_queue_len = 0; + ovs_mutex_unlock(&netdev->mutex); + return 0; } @@ -443,6 +489,7 @@ netdev_dummy_send(struct netdev *netdev, const void *buffer, size_t size) } } + ovs_mutex_lock(&dev->mutex); dev->stats.tx_packets++; dev->stats.tx_bytes += size; @@ -457,6 +504,7 @@ netdev_dummy_send(struct netdev *netdev, const void *buffer, size_t size) list_push_back(&s->txq, &b->list_node); } } + ovs_mutex_unlock(&dev->mutex); return 0; } @@ -467,10 +515,12 @@ netdev_dummy_set_etheraddr(struct netdev *netdev, { struct netdev_dummy *dev = netdev_dummy_cast(netdev); + ovs_mutex_lock(&dev->mutex); if (!eth_addr_equals(dev->hwaddr, mac)) { memcpy(dev->hwaddr, mac, ETH_ADDR_LEN); netdev_dummy_poll_notify(dev); } + ovs_mutex_unlock(&dev->mutex); return 0; } @@ -479,18 +529,24 @@ static int netdev_dummy_get_etheraddr(const struct netdev *netdev, uint8_t mac[ETH_ADDR_LEN]) { - const struct netdev_dummy *dev = netdev_dummy_cast(netdev); + struct netdev_dummy *dev = netdev_dummy_cast(netdev); + ovs_mutex_lock(&dev->mutex); memcpy(mac, dev->hwaddr, ETH_ADDR_LEN); + ovs_mutex_unlock(&dev->mutex); + return 0; } static int netdev_dummy_get_mtu(const struct netdev *netdev, int *mtup) { - const struct netdev_dummy *dev = netdev_dummy_cast(netdev); + struct 
netdev_dummy *dev = netdev_dummy_cast(netdev); + ovs_mutex_lock(&dev->mutex); *mtup = dev->mtu; + ovs_mutex_unlock(&dev->mutex); + return 0; } @@ -499,16 +555,22 @@ netdev_dummy_set_mtu(const struct netdev *netdev, int mtu) { struct netdev_dummy *dev = netdev_dummy_cast(netdev); + ovs_mutex_lock(&dev->mutex); dev->mtu = mtu; + ovs_mutex_unlock(&dev->mutex); + return 0; } static int netdev_dummy_get_stats(const struct netdev *netdev, struct netdev_stats *stats) { - const struct netdev_dummy *dev = netdev_dummy_cast(netdev); + struct netdev_dummy *dev = netdev_dummy_cast(netdev); + ovs_mutex_lock(&dev->mutex); *stats = dev->stats; + ovs_mutex_unlock(&dev->mutex); + return 0; } @@ -517,7 +579,10 @@ netdev_dummy_set_stats(struct netdev *netdev, const struct netdev_stats *stats) { struct netdev_dummy *dev = netdev_dummy_cast(netdev); + ovs_mutex_lock(&dev->mutex); dev->stats = *stats; + ovs_mutex_unlock(&dev->mutex); + return 0; } @@ -525,17 +590,21 @@ static int netdev_dummy_get_ifindex(const struct netdev *netdev) { struct netdev_dummy *dev = netdev_dummy_cast(netdev); + int ifindex; - return dev->ifindex; + ovs_mutex_lock(&dev->mutex); + ifindex = dev->ifindex; + ovs_mutex_unlock(&dev->mutex); + + return ifindex; } static int -netdev_dummy_update_flags(struct netdev *netdev_, - enum netdev_flags off, enum netdev_flags on, - enum netdev_flags *old_flagsp) +netdev_dummy_update_flags__(struct netdev_dummy *netdev, + enum netdev_flags off, enum netdev_flags on, + enum netdev_flags *old_flagsp) + OVS_REQUIRES(netdev->mutex) { - struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); - if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) { return EINVAL; } @@ -546,13 +615,36 @@ netdev_dummy_update_flags(struct netdev *netdev_, if (*old_flagsp != netdev->flags) { netdev_dummy_poll_notify(netdev); } + return 0; } +static int +netdev_dummy_update_flags(struct netdev *netdev_, + enum netdev_flags off, enum netdev_flags on, + enum netdev_flags *old_flagsp) +{ + struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + error = netdev_dummy_update_flags__(netdev, off, on, old_flagsp); + ovs_mutex_unlock(&netdev->mutex); + + return error; +} + static unsigned int -netdev_dummy_change_seq(const struct netdev *netdev) +netdev_dummy_change_seq(const struct netdev *netdev_) { - return netdev_dummy_cast(netdev)->change_seq; + struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); + unsigned int change_seq; + + ovs_mutex_lock(&netdev->mutex); + change_seq = netdev->change_seq; + ovs_mutex_unlock(&netdev->mutex); + + return change_seq; } /* Helper functions. 
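
The netdev_dummy_update_flags__() split above is a recurring OVS idiom: the "__" variant requires the mutex to be held (so callers already inside the lock, such as the admin-state helper, avoid recursive locking), while the public wrapper takes and releases the lock itself. A sketch with illustrative names:

    #include <pthread.h>

    struct dev {
        pthread_mutex_t mutex;
        unsigned int flags;           /* Protected by 'mutex'. */
    };

    /* Internal variant: caller must already hold d->mutex. */
    static int
    dev_update_flags__(struct dev *d, unsigned int off, unsigned int on)
    {
        d->flags = (d->flags & ~off) | on;
        return 0;
    }

    /* Public variant: wraps the helper in lock/unlock. */
    static int
    dev_update_flags(struct dev *d, unsigned int off, unsigned int on)
    {
        int error;

        pthread_mutex_lock(&d->mutex);
        error = dev_update_flags__(d, off, on);
        pthread_mutex_unlock(&d->mutex);

        return error;
    }

    int
    main(void)
    {
        struct dev d = { PTHREAD_MUTEX_INITIALIZER, 0 };

        return dev_update_flags(&d, 0, 1 /* e.g. NETDEV_UP */);
    }
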
*/ @@ -722,10 +814,11 @@ netdev_dummy_receive(struct unixctl_conn *conn, goto exit; } + ovs_mutex_lock(&dummy_dev->mutex); dummy_dev->stats.rx_packets++; dummy_dev->stats.rx_bytes += packet->size; - netdev_dummy_queue_packet(dummy_dev, packet); + ovs_mutex_unlock(&dummy_dev->mutex); } unixctl_command_reply(conn, NULL); @@ -736,13 +829,14 @@ exit: static void netdev_dummy_set_admin_state__(struct netdev_dummy *dev, bool admin_state) + OVS_REQUIRES(dev->mutex) { enum netdev_flags old_flags; if (admin_state) { - netdev_dummy_update_flags(&dev->up, 0, NETDEV_UP, &old_flags); + netdev_dummy_update_flags__(dev, 0, NETDEV_UP, &old_flags); } else { - netdev_dummy_update_flags(&dev->up, NETDEV_UP, 0, &old_flags); + netdev_dummy_update_flags__(dev, NETDEV_UP, 0, &old_flags); } } @@ -766,7 +860,10 @@ netdev_dummy_set_admin_state(struct unixctl_conn *conn, int argc, if (netdev && is_dummy_class(netdev->netdev_class)) { struct netdev_dummy *dummy_dev = netdev_dummy_cast(netdev); + ovs_mutex_lock(&dummy_dev->mutex); netdev_dummy_set_admin_state__(dummy_dev, up); + ovs_mutex_unlock(&dummy_dev->mutex); + netdev_close(netdev); } else { unixctl_command_reply_error(conn, "Unknown Dummy Interface"); @@ -774,17 +871,15 @@ netdev_dummy_set_admin_state(struct unixctl_conn *conn, int argc, return; } } else { - struct shash dummy_netdevs; - struct shash_node *node; - - shash_init(&dummy_netdevs); - netdev_get_devices(&dummy_class, &dummy_netdevs); - SHASH_FOR_EACH (node, &dummy_netdevs) { - struct netdev *netdev = node->data; - netdev_dummy_set_admin_state__(netdev_dummy_cast(netdev), up); - netdev_close(netdev); + struct netdev_dummy *netdev; + + ovs_mutex_lock(&dummy_list_mutex); + LIST_FOR_EACH (netdev, list_node, &dummy_list) { + ovs_mutex_lock(&netdev->mutex); + netdev_dummy_set_admin_state__(netdev, up); + ovs_mutex_unlock(&netdev->mutex); } - shash_destroy(&dummy_netdevs); + ovs_mutex_unlock(&dummy_list_mutex); } unixctl_command_reply(conn, "OK"); } @@ -807,11 +902,17 @@ netdev_dummy_register(bool override) SSET_FOR_EACH (type, &types) { if (!netdev_unregister_provider(type)) { struct netdev_class *class; + int error; - class = xmalloc(sizeof *class); - *class = dummy_class; + class = xmemdup(&dummy_class, sizeof dummy_class); class->type = xstrdup(type); - netdev_register_provider(class); + error = netdev_register_provider(class); + if (error) { + VLOG_ERR("%s: failed to register netdev provider (%s)", + type, ovs_strerror(error)); + free(CONST_CAST(char *, class->type)); + free(class); + } } } sset_destroy(&types); diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index cf459059b..9a80b676c 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -106,9 +106,6 @@ COVERAGE_DEFINE(netdev_set_ethtool); #define TC_RTAB_SIZE 1024 #endif -static struct nln_notifier *netdev_linux_cache_notifier = NULL; -static int cache_notifier_refcount; - enum { VALID_IFINDEX = 1 << 0, VALID_ETHERADDR = 1 << 1, @@ -355,6 +352,9 @@ static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes); struct netdev_linux { struct netdev up; + /* Protects all members below. 
*/ + struct ovs_mutex mutex; + unsigned int cache_valid; unsigned int change_seq; @@ -410,6 +410,9 @@ static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *, int cmd, const char *cmd_name); static int get_flags(const struct netdev *, unsigned int *flags); static int set_flags(const char *, unsigned int flags); +static int update_flags(struct netdev_linux *netdev, enum netdev_flags off, + enum netdev_flags on, enum netdev_flags *old_flagsp) + OVS_REQUIRES(netdev->mutex); static int do_get_ifindex(const char *netdev_name); static int get_ifindex(const struct netdev *, int *ifindexp); static int do_set_addr(struct netdev *netdev, @@ -450,23 +453,117 @@ netdev_rx_linux_cast(const struct netdev_rx *rx) return CONTAINER_OF(rx, struct netdev_rx_linux, up); } +static void netdev_linux_update(struct netdev_linux *netdev, + const struct rtnetlink_link_change *) + OVS_REQUIRES(netdev->mutex); +static void netdev_linux_changed(struct netdev_linux *netdev, + unsigned int ifi_flags, unsigned int mask) + OVS_REQUIRES(netdev->mutex); + +/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL + * if no such socket could be created. */ +static struct nl_sock * +netdev_linux_notify_sock(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + static struct nl_sock *sock; + + if (ovsthread_once_start(&once)) { + int error; + + error = nl_sock_create(NETLINK_ROUTE, &sock); + if (!error) { + error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK); + if (error) { + nl_sock_destroy(sock); + sock = NULL; + } + } + ovsthread_once_done(&once); + } + + return sock; +} + static void netdev_linux_run(void) { - rtnetlink_link_run(); + struct nl_sock *sock; + int error; + netdev_linux_miimon_run(); + + sock = netdev_linux_notify_sock(); + if (!sock) { + return; + } + + do { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + uint64_t buf_stub[4096 / 8]; + struct ofpbuf buf; + + ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub); + error = nl_sock_recv(sock, &buf, false); + if (!error) { + struct rtnetlink_link_change change; + + if (rtnetlink_link_parse(&buf, &change)) { + struct netdev *netdev_ = netdev_from_name(change.ifname); + if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) { + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + + ovs_mutex_lock(&netdev->mutex); + netdev_linux_update(netdev, &change); + ovs_mutex_unlock(&netdev->mutex); + } + netdev_close(netdev_); + } + } else if (error == ENOBUFS) { + struct shash device_shash; + struct shash_node *node; + + nl_sock_drain(sock); + + shash_init(&device_shash); + netdev_get_devices(&netdev_linux_class, &device_shash); + SHASH_FOR_EACH (node, &device_shash) { + struct netdev *netdev_ = node->data; + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + unsigned int flags; + + ovs_mutex_lock(&netdev->mutex); + get_flags(netdev_, &flags); + netdev_linux_changed(netdev, flags, 0); + ovs_mutex_unlock(&netdev->mutex); + + netdev_close(netdev_); + } + shash_destroy(&device_shash); + } else if (error != EAGAIN) { + VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)", + ovs_strerror(error)); + } + ofpbuf_uninit(&buf); + } while (!error); } static void netdev_linux_wait(void) { - rtnetlink_link_wait(); + struct nl_sock *sock; + netdev_linux_miimon_wait(); + sock = netdev_linux_notify_sock(); + if (sock) { + nl_sock_wait(sock, POLLIN); + } } static void netdev_linux_changed(struct netdev_linux *dev, unsigned int ifi_flags, unsigned int mask) + OVS_REQUIRES(dev->mutex) { 
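
netdev_linux_notify_sock() above creates its NETLINK_ROUTE socket exactly once, via ovsthread_once, and never retries after a failure: a NULL result stays NULL for the process lifetime, and every caller just checks for it. A sketch of that once-only pattern using plain pthread_once() in place of ovsthread_once (names and the failing stand-in are illustrative):

    #include <pthread.h>
    #include <stddef.h>

    static pthread_once_t once = PTHREAD_ONCE_INIT;
    static void *notify_sock;          /* NULL if creation failed. */

    static void *create_socket(void);  /* Stand-in for nl_sock_create(). */

    static void
    init_notify_sock(void)
    {
        notify_sock = create_socket(); /* May be NULL; never retried. */
    }

    static void *
    get_notify_sock(void)
    {
        pthread_once(&once, init_notify_sock);
        return notify_sock;
    }

    static void *
    create_socket(void)
    {
        return NULL;                   /* Pretend creation failed. */
    }

    int
    main(void)
    {
        (void) get_notify_sock();      /* NULL here; callers must check. */
        return 0;
    }
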
dev->change_seq++; if (!dev->change_seq) { @@ -484,6 +581,7 @@ netdev_linux_changed(struct netdev_linux *dev, static void netdev_linux_update(struct netdev_linux *dev, const struct rtnetlink_link_change *change) + OVS_REQUIRES(dev->mutex) { if (change->nlmsg_type == RTM_NEWLINK) { /* Keep drv-info */ @@ -511,64 +609,6 @@ netdev_linux_update(struct netdev_linux *dev, } } -static void -netdev_linux_cache_cb(const struct rtnetlink_link_change *change, - void *aux OVS_UNUSED) -{ - if (change) { - struct netdev *base_dev = netdev_from_name(change->ifname); - if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) { - netdev_linux_update(netdev_linux_cast(base_dev), change); - netdev_close(base_dev); - } - } else { - struct shash device_shash; - struct shash_node *node; - - shash_init(&device_shash); - netdev_get_devices(&netdev_linux_class, &device_shash); - SHASH_FOR_EACH (node, &device_shash) { - struct netdev *netdev = node->data; - struct netdev_linux *dev = netdev_linux_cast(netdev); - unsigned int flags; - - get_flags(&dev->up, &flags); - netdev_linux_changed(dev, flags, 0); - netdev_close(netdev); - } - shash_destroy(&device_shash); - } -} - -static int -cache_notifier_ref(void) -{ - if (!cache_notifier_refcount) { - ovs_assert(!netdev_linux_cache_notifier); - - netdev_linux_cache_notifier = - rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL); - - if (!netdev_linux_cache_notifier) { - return EINVAL; - } - } - cache_notifier_refcount++; - - return 0; -} - -static void -cache_notifier_unref(void) -{ - ovs_assert(cache_notifier_refcount > 0); - if (!--cache_notifier_refcount) { - ovs_assert(netdev_linux_cache_notifier); - rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier); - netdev_linux_cache_notifier = NULL; - } -} - static struct netdev * netdev_linux_alloc(void) { @@ -576,12 +616,11 @@ netdev_linux_alloc(void) return &netdev->up; } -static int +static void netdev_linux_common_construct(struct netdev_linux *netdev) { + ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL); netdev->change_seq = 1; - - return cache_notifier_ref(); } /* Creates system and internal devices. */ @@ -591,16 +630,12 @@ netdev_linux_construct(struct netdev *netdev_) struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; - error = netdev_linux_common_construct(netdev); - if (error) { - return error; - } + netdev_linux_common_construct(netdev); error = get_flags(&netdev->up, &netdev->ifi_flags); if (error == ENODEV) { if (netdev->up.netdev_class != &netdev_internal_class) { /* The device does not exist, so don't allow it to be opened. */ - cache_notifier_unref(); return ENODEV; } else { /* "Internal" netdevs have to be created as netdev objects before @@ -628,17 +663,14 @@ netdev_linux_construct_tap(struct netdev *netdev_) struct ifreq ifr; int error; - error = netdev_linux_common_construct(netdev); - if (error) { - goto error; - } + netdev_linux_common_construct(netdev); /* Open tap device. */ netdev->tap_fd = open(tap_dev, O_RDWR); if (netdev->tap_fd < 0) { error = errno; VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error)); - goto error_unref_notifier; + return error; } /* Create tap device. 
*/ @@ -661,9 +693,6 @@ netdev_linux_construct_tap(struct netdev *netdev_) error_close: close(netdev->tap_fd); -error_unref_notifier: - cache_notifier_unref(); -error: return error; } @@ -682,7 +711,7 @@ netdev_linux_destruct(struct netdev *netdev_) close(netdev->tap_fd); } - cache_notifier_unref(); + ovs_mutex_destroy(&netdev->mutex); } static void @@ -707,6 +736,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); rx->is_tap = is_tap_netdev(netdev_); if (rx->is_tap) { rx->fd = netdev->tap_fd; @@ -766,6 +796,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) goto error; } } + ovs_mutex_unlock(&netdev->mutex); return 0; @@ -773,6 +804,7 @@ error: if (rx->fd >= 0) { close(rx->fd); } + ovs_mutex_unlock(&netdev->mutex); return error; } @@ -863,7 +895,6 @@ netdev_linux_send(struct netdev *netdev_, const void *data, size_t size) struct msghdr msg; struct iovec iov; int ifindex; - int error; int sock; sock = af_packet_sock(); @@ -871,9 +902,9 @@ netdev_linux_send(struct netdev *netdev_, const void *data, size_t size) return -sock; } - error = get_ifindex(netdev_, &ifindex); - if (error) { - return error; + ifindex = netdev_get_ifindex(netdev_); + if (ifindex < 0) { + return -ifindex; } /* We don't bother setting most fields in sockaddr_ll because the @@ -951,22 +982,22 @@ netdev_linux_set_etheraddr(struct netdev *netdev_, const uint8_t mac[ETH_ADDR_LEN]) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); - struct netdev_saved_flags *sf = NULL; + enum netdev_flags old_flags = 0; int error; + ovs_mutex_lock(&netdev->mutex); + if (netdev->cache_valid & VALID_ETHERADDR) { - if (netdev->ether_addr_error) { - return netdev->ether_addr_error; - } - if (eth_addr_equals(netdev->etheraddr, mac)) { - return 0; + error = netdev->ether_addr_error; + if (error || eth_addr_equals(netdev->etheraddr, mac)) { + goto exit; } netdev->cache_valid &= ~VALID_ETHERADDR; } /* Tap devices must be brought down before setting the address. 
*/ if (is_tap_netdev(netdev_)) { - netdev_turn_flags_off(netdev_, NETDEV_UP, &sf); + update_flags(netdev, NETDEV_UP, 0, &old_flags); } error = set_etheraddr(netdev_get_name(netdev_), mac); if (!error || error == ENODEV) { @@ -977,8 +1008,12 @@ netdev_linux_set_etheraddr(struct netdev *netdev_, } } - netdev_restore_flags(sf); + if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) { + update_flags(netdev, 0, NETDEV_UP, &old_flags); + } +exit: + ovs_mutex_unlock(&netdev->mutex); return error; } @@ -988,20 +1023,22 @@ netdev_linux_get_etheraddr(const struct netdev *netdev_, uint8_t mac[ETH_ADDR_LEN]) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_ETHERADDR)) { - int error = get_etheraddr(netdev_get_name(netdev_), - netdev->etheraddr); - - netdev->ether_addr_error = error; + netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_), + netdev->etheraddr); netdev->cache_valid |= VALID_ETHERADDR; } - if (!netdev->ether_addr_error) { + error = netdev->ether_addr_error; + if (!error) { memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN); } + ovs_mutex_unlock(&netdev->mutex); - return netdev->ether_addr_error; + return error; } /* Returns the maximum size of transmitted (and received) packets on 'netdev', @@ -1011,22 +1048,25 @@ static int netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_MTU)) { struct ifreq ifr; - int error; - - error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr, - SIOCGIFMTU, "SIOCGIFMTU"); - netdev->netdev_mtu_error = error; + netdev->netdev_mtu_error = af_inet_ifreq_ioctl( + netdev_get_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU"); netdev->mtu = ifr.ifr_mtu; netdev->cache_valid |= VALID_MTU; } - if (!netdev->netdev_mtu_error) { + error = netdev->netdev_mtu_error; + if (!error) { *mtup = netdev->mtu; } - return netdev->netdev_mtu_error; + ovs_mutex_unlock(&netdev->mutex); + + return error; } /* Sets the maximum size of transmitted (MTU) for given device using linux @@ -1039,12 +1079,11 @@ netdev_linux_set_mtu(const struct netdev *netdev_, int mtu) struct ifreq ifr; int error; + ovs_mutex_lock(&netdev->mutex); if (netdev->cache_valid & VALID_MTU) { - if (netdev->netdev_mtu_error) { - return netdev->netdev_mtu_error; - } - if (netdev->mtu == mtu) { - return 0; + error = netdev->netdev_mtu_error; + if (error || netdev->mtu == mtu) { + goto exit; } netdev->cache_valid &= ~VALID_MTU; } @@ -1056,17 +1095,23 @@ netdev_linux_set_mtu(const struct netdev *netdev_, int mtu) netdev->mtu = ifr.ifr_mtu; netdev->cache_valid |= VALID_MTU; } +exit: + ovs_mutex_unlock(&netdev->mutex); return error; } /* Returns the ifindex of 'netdev', if successful, as a positive number. * On failure, returns a negative errno value. */ static int -netdev_linux_get_ifindex(const struct netdev *netdev) +netdev_linux_get_ifindex(const struct netdev *netdev_) { + struct netdev_linux *netdev = netdev_linux_cast(netdev_); int ifindex, error; - error = get_ifindex(netdev, &ifindex); + ovs_mutex_lock(&netdev->mutex); + error = get_ifindex(netdev_, &ifindex); + ovs_mutex_unlock(&netdev->mutex); + return error ? 
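
For tap devices, netdev_linux_set_etheraddr() above now brings the interface down, changes the address, and restores NETDEV_UP only if it was set before, all under one mutex, replacing the old netdev_turn_flags_off()/netdev_restore_flags() pair. A sketch of the save-and-conditionally-restore shape (illustrative names; the address write stands in for set_etheraddr()):

    #include <pthread.h>

    enum { DEV_UP = 1 << 0 };

    struct dev {
        pthread_mutex_t mutex;
        unsigned int flags;            /* Protected by 'mutex'. */
        unsigned char addr[6];
    };

    /* Caller holds d->mutex; reports the prior flags through 'old'. */
    static void
    update_flags(struct dev *d, unsigned int off, unsigned int on,
                 unsigned int *old)
    {
        *old = d->flags;
        d->flags = (d->flags & ~off) | on;
    }

    static int
    dev_set_addr(struct dev *d, const unsigned char mac[6], int is_tap)
    {
        unsigned int old_flags = 0;
        int i;

        pthread_mutex_lock(&d->mutex);
        if (is_tap) {
            update_flags(d, DEV_UP, 0, &old_flags);  /* Take it down. */
        }
        for (i = 0; i < 6; i++) {
            d->addr[i] = mac[i];       /* Stand-in for set_etheraddr(). */
        }
        if (is_tap && (old_flags & DEV_UP)) {
            update_flags(d, 0, DEV_UP, &old_flags);  /* Restore if was up. */
        }
        pthread_mutex_unlock(&d->mutex);

        return 0;
    }

    int
    main(void)
    {
        static const unsigned char mac[6] = { 0xaa, 0x55, 0, 0, 0, 1 };
        struct dev d = { PTHREAD_MUTEX_INITIALIZER, DEV_UP, { 0 } };

        return dev_set_addr(&d, mac, 1);
    }
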
-error : ifindex; } @@ -1075,19 +1120,28 @@ netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + ovs_mutex_lock(&netdev->mutex); if (netdev->miimon_interval > 0) { *carrier = netdev->miimon; } else { *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0; } + ovs_mutex_unlock(&netdev->mutex); return 0; } static long long int -netdev_linux_get_carrier_resets(const struct netdev *netdev) +netdev_linux_get_carrier_resets(const struct netdev *netdev_) { - return netdev_linux_cast(netdev)->carrier_resets; + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + long long int carrier_resets; + + ovs_mutex_lock(&netdev->mutex); + carrier_resets = netdev->carrier_resets; + ovs_mutex_unlock(&netdev->mutex); + + return carrier_resets; } static int @@ -1155,11 +1209,13 @@ netdev_linux_set_miimon_interval(struct netdev *netdev_, { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + ovs_mutex_lock(&netdev->mutex); interval = interval > 0 ? MAX(interval, 100) : 0; if (netdev->miimon_interval != interval) { netdev->miimon_interval = interval; timer_set_expired(&netdev->miimon_timer); } + ovs_mutex_unlock(&netdev->mutex); return 0; } @@ -1177,18 +1233,17 @@ netdev_linux_miimon_run(void) struct netdev_linux *dev = netdev_linux_cast(netdev); bool miimon; - if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) { - netdev_close(netdev); - continue; - } + ovs_mutex_lock(&dev->mutex); + if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) { + netdev_linux_get_miimon(dev->up.name, &miimon); + if (miimon != dev->miimon) { + dev->miimon = miimon; + netdev_linux_changed(dev, dev->ifi_flags, 0); + } - netdev_linux_get_miimon(dev->up.name, &miimon); - if (miimon != dev->miimon) { - dev->miimon = miimon; - netdev_linux_changed(dev, dev->ifi_flags, 0); + timer_set_duration(&dev->miimon_timer, dev->miimon_interval); } - - timer_set_duration(&dev->miimon_timer, dev->miimon_interval); + ovs_mutex_unlock(&dev->mutex); netdev_close(netdev); } @@ -1207,9 +1262,11 @@ netdev_linux_miimon_wait(void) struct netdev *netdev = node->data; struct netdev_linux *dev = netdev_linux_cast(netdev); + ovs_mutex_lock(&dev->mutex); if (dev->miimon_interval > 0) { timer_wait(&dev->miimon_timer); } + ovs_mutex_unlock(&dev->mutex); netdev_close(netdev); } shash_destroy(&device_shash); @@ -1326,7 +1383,7 @@ get_stats_via_vport(const struct netdev *netdev_, static int netdev_linux_sys_get_stats(const struct netdev *netdev_, - struct netdev_stats *stats) + struct netdev_stats *stats) { static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; static int use_netlink_stats; @@ -1365,19 +1422,14 @@ netdev_linux_get_stats(const struct netdev *netdev_, struct netdev_stats dev_stats; int error; + ovs_mutex_lock(&netdev->mutex); get_stats_via_vport(netdev_, stats); - error = netdev_linux_sys_get_stats(netdev_, &dev_stats); - if (error) { - if (netdev->vport_stats_error) { - return error; - } else { - return 0; + if (!netdev->vport_stats_error) { + error = 0; } - } - - if (netdev->vport_stats_error) { + } else if (netdev->vport_stats_error) { /* stats not available from OVS then use ioctl stats. 
*/ *stats = dev_stats; } else { @@ -1399,7 +1451,9 @@ netdev_linux_get_stats(const struct netdev *netdev_, stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors; stats->tx_window_errors += dev_stats.tx_window_errors; } - return 0; + ovs_mutex_unlock(&netdev->mutex); + + return error; } /* Retrieves current device stats for 'netdev-tap' netdev or @@ -1411,24 +1465,20 @@ netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats) struct netdev_stats dev_stats; int error; + ovs_mutex_lock(&netdev->mutex); get_stats_via_vport(netdev_, stats); - error = netdev_linux_sys_get_stats(netdev_, &dev_stats); if (error) { - if (netdev->vport_stats_error) { - return error; - } else { - return 0; + if (!netdev->vport_stats_error) { + error = 0; } - } + } else if (netdev->vport_stats_error) { + /* Transmit and receive stats will appear to be swapped relative to the + * other ports since we are the one sending the data, not a remote + * computer. For consistency, we swap them back here. This does not + * apply if we are getting stats from the vport layer because it always + * tracks stats from the perspective of the switch. */ - /* If this port is an internal port then the transmit and receive stats - * will appear to be swapped relative to the other ports since we are the - * one sending the data, not a remote computer. For consistency, we swap - * them back here. This does not apply if we are getting stats from the - * vport layer because it always tracks stats from the perspective of the - * switch. */ - if (netdev->vport_stats_error) { *stats = dev_stats; swap_uint64(&stats->rx_packets, &stats->tx_packets); swap_uint64(&stats->rx_bytes, &stats->tx_bytes); @@ -1455,7 +1505,9 @@ netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats) stats->multicast += dev_stats.multicast; stats->collisions += dev_stats.collisions; } - return 0; + ovs_mutex_unlock(&netdev->mutex); + + return error; } static int @@ -1463,9 +1515,14 @@ netdev_internal_get_stats(const struct netdev *netdev_, struct netdev_stats *stats) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + ovs_mutex_lock(&netdev->mutex); get_stats_via_vport(netdev_, stats); - return netdev->vport_stats_error; + error = netdev->vport_stats_error; + ovs_mutex_unlock(&netdev->mutex); + + return error; } static int @@ -1505,6 +1562,7 @@ netdev_internal_set_stats(struct netdev *netdev, static void netdev_linux_read_features(struct netdev_linux *netdev) + OVS_REQUIRES(netdev->mutex) { struct ethtool_cmd ecmd; uint32_t speed; @@ -1646,32 +1704,39 @@ netdev_linux_get_features(const struct netdev *netdev_, enum netdev_features *peer) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + ovs_mutex_lock(&netdev->mutex); netdev_linux_read_features(netdev); - if (!netdev->get_features_error) { *current = netdev->current; *advertised = netdev->advertised; *supported = netdev->supported; *peer = 0; /* XXX */ } - return netdev->get_features_error; + error = netdev->get_features_error; + ovs_mutex_unlock(&netdev->mutex); + + return error; } /* Set the features advertised by 'netdev' to 'advertise'. 
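
The relocated comment in netdev_tap_get_stats() above explains the swap: for a tap or internal port the host is the sender, so the kernel interface reports rx and tx reversed relative to the switch's point of view, and the stats are swapped back for consistency (vport-layer stats need no fixup because they already take the switch's perspective). A tiny standalone sketch of that fixup:

    #include <stdint.h>

    struct stats {
        uint64_t rx_packets, tx_packets;
        uint64_t rx_bytes, tx_bytes;
    };

    static void
    swap_u64(uint64_t *a, uint64_t *b)
    {
        uint64_t t = *a;
        *a = *b;
        *b = t;
    }

    /* Restore the switch's perspective on a tap/internal port's stats. */
    static void
    fixup_tap_stats(struct stats *s)
    {
        swap_u64(&s->rx_packets, &s->tx_packets);
        swap_u64(&s->rx_bytes, &s->tx_bytes);
    }

    int
    main(void)
    {
        struct stats s = { 1, 2, 100, 200 };

        fixup_tap_stats(&s);     /* Now rx_packets == 2, tx_packets == 1. */
        return 0;
    }
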
*/ static int -netdev_linux_set_advertisements(struct netdev *netdev, +netdev_linux_set_advertisements(struct netdev *netdev_, enum netdev_features advertise) { + struct netdev_linux *netdev = netdev_linux_cast(netdev_); struct ethtool_cmd ecmd; int error; + ovs_mutex_lock(&netdev->mutex); + COVERAGE_INC(netdev_get_ethtool); memset(&ecmd, 0, sizeof ecmd); - error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd, + error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd, ETHTOOL_GSET, "ETHTOOL_GSET"); if (error) { - return error; + goto exit; } ecmd.advertising = 0; @@ -1712,8 +1777,12 @@ netdev_linux_set_advertisements(struct netdev *netdev, ecmd.advertising |= ADVERTISED_Asym_Pause; } COVERAGE_INC(netdev_set_ethtool); - return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd, - ETHTOOL_SSET, "ETHTOOL_SSET"); + error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd, + ETHTOOL_SSET, "ETHTOOL_SSET"); + +exit: + ovs_mutex_unlock(&netdev->mutex); + return error; } /* Attempts to set input rate limiting (policing) policy. Returns 0 if @@ -1726,20 +1795,17 @@ netdev_linux_set_policing(struct netdev *netdev_, const char *netdev_name = netdev_get_name(netdev_); int error; - kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */ : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */ : kbits_burst); /* Stick with user-specified value. */ + ovs_mutex_lock(&netdev->mutex); if (netdev->cache_valid & VALID_POLICING) { - if (netdev->netdev_policing_error) { - return netdev->netdev_policing_error; - } - - if (netdev->kbits_rate == kbits_rate && - netdev->kbits_burst == kbits_burst) { + error = netdev->netdev_policing_error; + if (error || (netdev->kbits_rate == kbits_rate && + netdev->kbits_burst == kbits_burst)) { /* Assume that settings haven't changed since we last set them. */ - return 0; + goto out; } netdev->cache_valid &= ~VALID_POLICING; } @@ -1777,6 +1843,7 @@ out: netdev->netdev_policing_error = error; netdev->cache_valid |= VALID_POLICING; } + ovs_mutex_unlock(&netdev->mutex); return error; } @@ -1864,15 +1931,17 @@ netdev_linux_get_qos(const struct netdev *netdev_, struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); - if (error) { - return error; + if (!error) { + *typep = netdev->tc->ops->ovs_name; + error = (netdev->tc->ops->qdisc_get + ? netdev->tc->ops->qdisc_get(netdev_, details) + : 0); } + ovs_mutex_unlock(&netdev->mutex); - *typep = netdev->tc->ops->ovs_name; - return (netdev->tc->ops->qdisc_get - ? netdev->tc->ops->qdisc_get(netdev_, details) - : 0); + return error; } static int @@ -1888,27 +1957,30 @@ netdev_linux_set_qos(struct netdev *netdev_, return EOPNOTSUPP; } + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); if (error) { - return error; + goto exit; } if (new_ops == netdev->tc->ops) { - return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0; + error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0; } else { /* Delete existing qdisc. */ error = tc_del_qdisc(netdev_); if (error) { - return error; + goto exit; } ovs_assert(netdev->tc == NULL); /* Install new qdisc. 
*/ error = new_ops->tc_install(netdev_, details); ovs_assert((error == 0) == (netdev->tc != NULL)); - - return error; } + +exit: + ovs_mutex_unlock(&netdev->mutex); + return error; } static int @@ -1918,15 +1990,17 @@ netdev_linux_get_queue(const struct netdev *netdev_, struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); - if (error) { - return error; - } else { + if (!error) { struct tc_queue *queue = tc_find_queue(netdev_, queue_id); - return (queue + error = (queue ? netdev->tc->ops->class_get(netdev_, queue, details) : ENOENT); } + ovs_mutex_unlock(&netdev->mutex); + + return error; } static int @@ -1936,15 +2010,17 @@ netdev_linux_set_queue(struct netdev *netdev_, struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); - if (error) { - return error; - } else if (queue_id >= netdev->tc->ops->n_queues - || !netdev->tc->ops->class_set) { - return EINVAL; + if (!error) { + error = (queue_id < netdev->tc->ops->n_queues + && netdev->tc->ops->class_set + ? netdev->tc->ops->class_set(netdev_, queue_id, details) + : EINVAL); } + ovs_mutex_unlock(&netdev->mutex); - return netdev->tc->ops->class_set(netdev_, queue_id, details); + return error; } static int @@ -1953,17 +2029,21 @@ netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id) struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); - if (error) { - return error; - } else if (!netdev->tc->ops->class_delete) { - return EINVAL; - } else { - struct tc_queue *queue = tc_find_queue(netdev_, queue_id); - return (queue - ? netdev->tc->ops->class_delete(netdev_, queue) - : ENOENT); + if (!error) { + if (netdev->tc->ops->class_delete) { + struct tc_queue *queue = tc_find_queue(netdev_, queue_id); + error = (queue + ? 
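
The qos/queue functions above all convert early returns into a single-exit shape so the newly introduced mutex is released on exactly one path. A minimal sketch of that style (illustrative names; step1/step2 stand in for tc_query_qdisc() and friends):

    #include <pthread.h>

    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

    static int step1(void) { return 0; }
    static int step2(void) { return 0; }

    /* Every failure becomes "goto exit", so lock and unlock pair up on
     * one path no matter where the work stops. */
    static int
    do_work(void)
    {
        int error;

        pthread_mutex_lock(&mutex);
        error = step1();
        if (error) {
            goto exit;
        }
        error = step2();

    exit:
        pthread_mutex_unlock(&mutex);
        return error;
    }

    int
    main(void)
    {
        return do_work();
    }
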
netdev->tc->ops->class_delete(netdev_, queue) + : ENOENT); + } else { + error = EINVAL; + } } + ovs_mutex_unlock(&netdev->mutex); + + return error; } static int @@ -1974,19 +2054,25 @@ netdev_linux_get_queue_stats(const struct netdev *netdev_, struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); - if (error) { - return error; - } else if (!netdev->tc->ops->class_get_stats) { - return EOPNOTSUPP; - } else { - const struct tc_queue *queue = tc_find_queue(netdev_, queue_id); - if (!queue) { - return ENOENT; + if (!error) { + if (netdev->tc->ops->class_get_stats) { + const struct tc_queue *queue = tc_find_queue(netdev_, queue_id); + if (queue) { + stats->created = queue->created; + error = netdev->tc->ops->class_get_stats(netdev_, queue, + stats); + } else { + error = ENOENT; + } + } else { + error = EOPNOTSUPP; } - stats->created = queue->created; - return netdev->tc->ops->class_get_stats(netdev_, queue, stats); } + ovs_mutex_unlock(&netdev->mutex); + + return error; } static bool @@ -2010,34 +2096,37 @@ netdev_linux_dump_queues(const struct netdev *netdev_, netdev_dump_queues_cb *cb, void *aux) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); - struct tc_queue *queue, *next_queue; - struct smap details; - int last_error; int error; + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); - if (error) { - return error; - } else if (!netdev->tc->ops->class_get) { - return EOPNOTSUPP; - } + if (!error) { + if (netdev->tc->ops->class_get) { + struct tc_queue *queue, *next_queue; + struct smap details; - last_error = 0; - smap_init(&details); - HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node, - &netdev->tc->queues) { - smap_clear(&details); + smap_init(&details); + HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node, + &netdev->tc->queues) { + int retval; - error = netdev->tc->ops->class_get(netdev_, queue, &details); - if (!error) { - (*cb)(queue->queue_id, &details, aux); + smap_clear(&details); + + retval = netdev->tc->ops->class_get(netdev_, queue, &details); + if (!retval) { + (*cb)(queue->queue_id, &details, aux); + } else { + error = retval; + } + } + smap_destroy(&details); } else { - last_error = error; + error = EOPNOTSUPP; } } - smap_destroy(&details); + ovs_mutex_unlock(&netdev->mutex); - return last_error; + return error; } static int @@ -2045,31 +2134,38 @@ netdev_linux_dump_queue_stats(const struct netdev *netdev_, netdev_dump_queue_stats_cb *cb, void *aux) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); - struct nl_dump dump; - struct ofpbuf msg; - int last_error; int error; + ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); - if (error) { - return error; - } else if (!netdev->tc->ops->class_dump_stats) { - return EOPNOTSUPP; - } + if (!error) { + struct nl_dump dump; - last_error = 0; - if (!start_queue_dump(netdev_, &dump)) { - return ENODEV; - } - while (nl_dump_next(&dump, &msg)) { - error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux); - if (error) { - last_error = error; + if (!netdev->tc->ops->class_dump_stats) { + error = EOPNOTSUPP; + } else if (!start_queue_dump(netdev_, &dump)) { + error = ENODEV; + } else { + struct ofpbuf msg; + int retval; + + while (nl_dump_next(&dump, &msg)) { + retval = netdev->tc->ops->class_dump_stats(netdev_, &msg, + cb, aux); + if (retval) { + error = retval; + } + } + + retval = nl_dump_done(&dump); + if (retval) { + error = retval; + } } } + ovs_mutex_unlock(&netdev->mutex); - error = 
nl_dump_done(&dump); - return error ? error : last_error; + return error; } static int @@ -2077,27 +2173,34 @@ netdev_linux_get_in4(const struct netdev *netdev_, struct in_addr *address, struct in_addr *netmask) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_IN4)) { - int error; - error = netdev_linux_get_ipv4(netdev_, &netdev->address, SIOCGIFADDR, "SIOCGIFADDR"); - if (error) { - return error; + if (!error) { + error = netdev_linux_get_ipv4(netdev_, &netdev->netmask, + SIOCGIFNETMASK, "SIOCGIFNETMASK"); + if (!error) { + netdev->cache_valid |= VALID_IN4; + } } + } else { + error = 0; + } - error = netdev_linux_get_ipv4(netdev_, &netdev->netmask, - SIOCGIFNETMASK, "SIOCGIFNETMASK"); - if (error) { - return error; + if (!error) { + if (netdev->address.s_addr != INADDR_ANY) { + *address = netdev->address; + *netmask = netdev->netmask; + } else { + error = EADDRNOTAVAIL; } - - netdev->cache_valid |= VALID_IN4; } - *address = netdev->address; - *netmask = netdev->netmask; - return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0; + ovs_mutex_unlock(&netdev->mutex); + + return error; } static int @@ -2107,6 +2210,7 @@ netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address, struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + ovs_mutex_lock(&netdev->mutex); error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address); if (!error) { netdev->cache_valid |= VALID_IN4; @@ -2117,6 +2221,8 @@ netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address, "SIOCSIFNETMASK", netmask); } } + ovs_mutex_unlock(&netdev->mutex); + return error; } @@ -2142,6 +2248,8 @@ static int netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); + + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_IN6)) { FILE *file; char line[128]; @@ -2166,6 +2274,8 @@ netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6) netdev->cache_valid |= VALID_IN6; } *in6 = netdev->in6; + ovs_mutex_unlock(&netdev->mutex); + return 0; } @@ -2280,6 +2390,7 @@ netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap) struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error = 0; + ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_DRVINFO)) { struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo; @@ -2299,6 +2410,8 @@ netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap) smap_add(smap, "driver_version", netdev->drvinfo.version); smap_add(smap, "firmware_version", netdev->drvinfo.fw_version); } + ovs_mutex_unlock(&netdev->mutex); + return error; } @@ -2370,10 +2483,10 @@ iff_to_nd_flags(int iff) } static int -netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, - enum netdev_flags on, enum netdev_flags *old_flagsp) +update_flags(struct netdev_linux *netdev, enum netdev_flags off, + enum netdev_flags on, enum netdev_flags *old_flagsp) + OVS_REQUIRES(netdev->mutex) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); int old_flags, new_flags; int error = 0; @@ -2381,16 +2494,38 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, *old_flagsp = iff_to_nd_flags(old_flags); new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on); if (new_flags != old_flags) { - error = set_flags(netdev_get_name(netdev_), new_flags); - get_flags(netdev_, &netdev->ifi_flags); + error = 
set_flags(netdev_get_name(&netdev->up), new_flags); + get_flags(&netdev->up, &netdev->ifi_flags); } + + return error; +} + +static int +netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, + enum netdev_flags on, enum netdev_flags *old_flagsp) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + error = update_flags(netdev, off, on, old_flagsp); + ovs_mutex_unlock(&netdev->mutex); + return error; } static unsigned int -netdev_linux_change_seq(const struct netdev *netdev) +netdev_linux_change_seq(const struct netdev *netdev_) { - return netdev_linux_cast(netdev)->change_seq; + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + unsigned int change_seq; + + ovs_mutex_lock(&netdev->mutex); + change_seq = netdev->change_seq; + ovs_mutex_unlock(&netdev->mutex); + + return change_seq; } #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \ diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index 9457c171b..23905d413 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -33,9 +33,12 @@ extern "C" { * Network device implementations may read these members but should not modify * them. */ struct netdev { + /* The following do not change during the lifetime of a struct netdev. */ char *name; /* Name of network device. */ const struct netdev_class *netdev_class; /* Functions to control this device. */ + + /* The following are protected by 'netdev_mutex' (internal to netdev.c). */ int ref_cnt; /* Times this devices was opened. */ struct shash_node *node; /* Pointer to element in global map. */ struct list saved_flags_list; /* Contains "struct netdev_saved_flags". */ @@ -636,7 +639,6 @@ struct netdev_class { int netdev_register_provider(const struct netdev_class *); int netdev_unregister_provider(const char *type); -const struct netdev_class *netdev_lookup_provider(const char *type); extern const struct netdev_class netdev_linux_class; extern const struct netdev_class netdev_internal_class; diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index ac3da6345..76aa148cc 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -48,6 +48,10 @@ VLOG_DEFINE_THIS_MODULE(netdev_vport); struct netdev_vport { struct netdev up; + + /* Protects all members below. 
*/ + struct ovs_mutex mutex; + unsigned int change_seq; uint8_t etheraddr[ETH_ADDR_LEN]; struct netdev_stats stats; @@ -65,9 +69,10 @@ struct vport_class { }; static int netdev_vport_construct(struct netdev *); -static int get_patch_config(const struct netdev *, struct smap *args); +static int get_patch_config(const struct netdev *netdev, struct smap *args); static int get_tunnel_config(const struct netdev *, struct smap *args); -static void netdev_vport_poll_notify(struct netdev_vport *); +static void netdev_vport_poll_notify(struct netdev_vport *netdev) + OVS_REQUIRES(netdev->mutex); static bool is_vport_class(const struct netdev_class *class) @@ -166,6 +171,7 @@ netdev_vport_construct(struct netdev *netdev_) { struct netdev_vport *netdev = netdev_vport_cast(netdev_); + ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL); netdev->change_seq = 1; eth_addr_random(netdev->etheraddr); @@ -181,6 +187,7 @@ netdev_vport_destruct(struct netdev *netdev_) route_table_unregister(); free(netdev->peer); + ovs_mutex_destroy(&netdev->mutex); } static void @@ -195,26 +202,39 @@ netdev_vport_set_etheraddr(struct netdev *netdev_, const uint8_t mac[ETH_ADDR_LEN]) { struct netdev_vport *netdev = netdev_vport_cast(netdev_); + + ovs_mutex_lock(&netdev->mutex); memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN); netdev_vport_poll_notify(netdev); + ovs_mutex_unlock(&netdev->mutex); + return 0; } static int -netdev_vport_get_etheraddr(const struct netdev *netdev, +netdev_vport_get_etheraddr(const struct netdev *netdev_, uint8_t mac[ETH_ADDR_LEN]) { - memcpy(mac, netdev_vport_cast(netdev)->etheraddr, ETH_ADDR_LEN); + struct netdev_vport *netdev = netdev_vport_cast(netdev_); + + ovs_mutex_lock(&netdev->mutex); + memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN); + ovs_mutex_unlock(&netdev->mutex); + return 0; } static int -tunnel_get_status(const struct netdev *netdev, struct smap *smap) +tunnel_get_status(const struct netdev *netdev_, struct smap *smap) { + struct netdev_vport *netdev = netdev_vport_cast(netdev_); char iface[IFNAMSIZ]; ovs_be32 route; - route = netdev_vport_cast(netdev)->tnl_cfg.ip_dst; + ovs_mutex_lock(&netdev->mutex); + route = netdev->tnl_cfg.ip_dst; + ovs_mutex_unlock(&netdev->mutex); + if (route_table_get_name(route, iface)) { struct netdev *egress_netdev; @@ -473,8 +493,10 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args) &tnl_cfg.out_key_present, &tnl_cfg.out_key_flow); + ovs_mutex_lock(&dev->mutex); dev->tnl_cfg = tnl_cfg; netdev_vport_poll_notify(dev); + ovs_mutex_unlock(&dev->mutex); return 0; } @@ -482,56 +504,60 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args) static int get_tunnel_config(const struct netdev *dev, struct smap *args) { - const struct netdev_tunnel_config *tnl_cfg = - &netdev_vport_cast(dev)->tnl_cfg; + struct netdev_vport *netdev = netdev_vport_cast(dev); + struct netdev_tunnel_config tnl_cfg; - if (tnl_cfg->ip_dst) { - smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(tnl_cfg->ip_dst)); - } else if (tnl_cfg->ip_dst_flow) { + ovs_mutex_lock(&netdev->mutex); + tnl_cfg = netdev->tnl_cfg; + ovs_mutex_unlock(&netdev->mutex); + + if (tnl_cfg.ip_dst) { + smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_dst)); + } else if (tnl_cfg.ip_dst_flow) { smap_add(args, "remote_ip", "flow"); } - if (tnl_cfg->ip_src) { - smap_add_format(args, "local_ip", IP_FMT, IP_ARGS(tnl_cfg->ip_src)); - } else if (tnl_cfg->ip_src_flow) { + if (tnl_cfg.ip_src) { + smap_add_format(args, "local_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_src)); + } else if 
(tnl_cfg.ip_src_flow) { smap_add(args, "local_ip", "flow"); } - if (tnl_cfg->in_key_flow && tnl_cfg->out_key_flow) { + if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) { smap_add(args, "key", "flow"); - } else if (tnl_cfg->in_key_present && tnl_cfg->out_key_present - && tnl_cfg->in_key == tnl_cfg->out_key) { - smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg->in_key)); + } else if (tnl_cfg.in_key_present && tnl_cfg.out_key_present + && tnl_cfg.in_key == tnl_cfg.out_key) { + smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key)); } else { - if (tnl_cfg->in_key_flow) { + if (tnl_cfg.in_key_flow) { smap_add(args, "in_key", "flow"); - } else if (tnl_cfg->in_key_present) { + } else if (tnl_cfg.in_key_present) { smap_add_format(args, "in_key", "%"PRIu64, - ntohll(tnl_cfg->in_key)); + ntohll(tnl_cfg.in_key)); } - if (tnl_cfg->out_key_flow) { + if (tnl_cfg.out_key_flow) { smap_add(args, "out_key", "flow"); - } else if (tnl_cfg->out_key_present) { + } else if (tnl_cfg.out_key_present) { smap_add_format(args, "out_key", "%"PRIu64, - ntohll(tnl_cfg->out_key)); + ntohll(tnl_cfg.out_key)); } } - if (tnl_cfg->ttl_inherit) { + if (tnl_cfg.ttl_inherit) { smap_add(args, "ttl", "inherit"); - } else if (tnl_cfg->ttl != DEFAULT_TTL) { - smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg->ttl); + } else if (tnl_cfg.ttl != DEFAULT_TTL) { + smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl); } - if (tnl_cfg->tos_inherit) { + if (tnl_cfg.tos_inherit) { smap_add(args, "tos", "inherit"); - } else if (tnl_cfg->tos) { - smap_add_format(args, "tos", "0x%x", tnl_cfg->tos); + } else if (tnl_cfg.tos) { + smap_add_format(args, "tos", "0x%x", tnl_cfg.tos); } - if (tnl_cfg->dst_port) { - uint16_t dst_port = ntohs(tnl_cfg->dst_port); + if (tnl_cfg.dst_port) { + uint16_t dst_port = ntohs(tnl_cfg.dst_port); const char *type = netdev_get_type(dev); if ((!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) || @@ -540,11 +566,11 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) } } - if (tnl_cfg->csum) { + if (tnl_cfg.csum) { smap_add(args, "csum", "true"); } - if (!tnl_cfg->dont_fragment) { + if (!tnl_cfg.dont_fragment) { smap_add(args, "df_default", "false"); } @@ -553,12 +579,26 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) /* Code specific to patch ports. */ -const char * -netdev_vport_patch_peer(const struct netdev *netdev) +/* If 'netdev' is a patch port, returns the name of its peer as a malloc()'d + * string that the caller must free. + * + * If 'netdev' is not a patch port, returns NULL. */ +char * +netdev_vport_patch_peer(const struct netdev *netdev_) { - return (netdev_vport_is_patch(netdev) - ? 
netdev_vport_cast(netdev)->peer - : NULL); + char *peer = NULL; + + if (netdev_vport_is_patch(netdev_)) { + struct netdev_vport *netdev = netdev_vport_cast(netdev_); + + ovs_mutex_lock(&netdev->mutex); + if (netdev->peer) { + peer = xstrdup(netdev->peer); + } + ovs_mutex_unlock(&netdev->mutex); + } + + return peer; } void @@ -567,8 +607,11 @@ netdev_vport_inc_rx(const struct netdev *netdev, { if (is_vport_class(netdev_get_class(netdev))) { struct netdev_vport *dev = netdev_vport_cast(netdev); + + ovs_mutex_lock(&dev->mutex); dev->stats.rx_packets += stats->n_packets; dev->stats.rx_bytes += stats->n_bytes; + ovs_mutex_unlock(&dev->mutex); } } @@ -578,8 +621,11 @@ netdev_vport_inc_tx(const struct netdev *netdev, { if (is_vport_class(netdev_get_class(netdev))) { struct netdev_vport *dev = netdev_vport_cast(netdev); + + ovs_mutex_lock(&dev->mutex); dev->stats.tx_packets += stats->n_packets; dev->stats.tx_bytes += stats->n_bytes; + ovs_mutex_unlock(&dev->mutex); } } @@ -588,9 +634,12 @@ get_patch_config(const struct netdev *dev_, struct smap *args) { struct netdev_vport *dev = netdev_vport_cast(dev_); + ovs_mutex_lock(&dev->mutex); if (dev->peer) { smap_add(args, "peer", dev->peer); } + ovs_mutex_unlock(&dev->mutex); + return 0; } @@ -617,9 +666,12 @@ set_patch_config(struct netdev *dev_, const struct smap *args) return EINVAL; } + ovs_mutex_lock(&dev->mutex); free(dev->peer); dev->peer = xstrdup(peer); netdev_vport_poll_notify(dev); + ovs_mutex_unlock(&dev->mutex); + return 0; } @@ -627,7 +679,11 @@ static int get_stats(const struct netdev *netdev, struct netdev_stats *stats) { struct netdev_vport *dev = netdev_vport_cast(netdev); - memcpy(stats, &dev->stats, sizeof *stats); + + ovs_mutex_lock(&dev->mutex); + *stats = dev->stats; + ovs_mutex_unlock(&dev->mutex); + return 0; } @@ -712,15 +768,15 @@ netdev_vport_tunnel_register(void) TUNNEL_CLASS("vxlan", "vxlan_system"), TUNNEL_CLASS("lisp", "lisp_system") }; - static bool inited; + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; - int i; + if (ovsthread_once_start(&once)) { + int i; - if (!inited) { - inited = true; for (i = 0; i < ARRAY_SIZE(vport_classes); i++) { netdev_register_provider(&vport_classes[i].netdev_class); } + ovsthread_once_done(&once); } } diff --git a/lib/netdev-vport.h b/lib/netdev-vport.h index 53949666e..dc490970c 100644 --- a/lib/netdev-vport.h +++ b/lib/netdev-vport.h @@ -31,7 +31,7 @@ void netdev_vport_patch_register(void); bool netdev_vport_is_patch(const struct netdev *); -const char *netdev_vport_patch_peer(const struct netdev *netdev); +char *netdev_vport_patch_peer(const struct netdev *netdev); void netdev_vport_inc_rx(const struct netdev *, const struct dpif_flow_stats *); diff --git a/lib/netdev.c b/lib/netdev.c index 5f4345aff..c70105ba0 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -56,10 +56,31 @@ struct netdev_saved_flags { enum netdev_flags saved_values; }; -static struct shash netdev_classes = SHASH_INITIALIZER(&netdev_classes); +/* Protects 'netdev_shash' and the mutable members of struct netdev. */ +static struct ovs_mutex netdev_mutex = OVS_MUTEX_INITIALIZER; /* All created network devices. */ -static struct shash netdev_shash = SHASH_INITIALIZER(&netdev_shash); +static struct shash netdev_shash OVS_GUARDED_BY(netdev_mutex) + = SHASH_INITIALIZER(&netdev_shash); + +/* Protects 'netdev_classes' against insertions or deletions. + * + * This is not an rwlock for performance reasons but to allow recursive + * acquisition when calling into providers. 
For example, netdev_run() calls + * into provider 'run' functions, which might reasonably want to call one of + * the netdev functions that takes netdev_class_rwlock read-only. */ +static struct ovs_rwlock netdev_class_rwlock OVS_ACQ_BEFORE(netdev_mutex) + = OVS_RWLOCK_INITIALIZER; + +/* Contains 'struct netdev_registered_class'es. */ +static struct hmap netdev_classes OVS_GUARDED_BY(netdev_class_rwlock) + = HMAP_INITIALIZER(&netdev_classes); + +struct netdev_registered_class { + struct hmap_node hmap_node; /* In 'netdev_classes', by class->type. */ + const struct netdev_class *class; + atomic_int ref_cnt; /* Number of 'struct netdev's of this class. */ +}; /* This is set pretty low because we probably won't learn anything from the * additional log messages. */ @@ -70,12 +91,11 @@ void update_device_args(struct netdev *, const struct shash *args); static void netdev_initialize(void) + OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex) { - static bool inited; - - if (!inited) { - inited = true; + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + if (ovsthread_once_start(&once)) { fatal_signal_add_hook(restore_all_flags, NULL, NULL, true); netdev_vport_patch_register(); @@ -89,8 +109,10 @@ netdev_initialize(void) netdev_register_provider(&netdev_tap_class); netdev_register_provider(&netdev_bsd_class); #endif - netdev_register_provider(&netdev_tunnel_class); - netdev_register_provider(&netdev_pltap_class); + netdev_register_provider(&netdev_tunnel_class); + netdev_register_provider(&netdev_pltap_class); + + ovsthread_once_done(&once); } } @@ -100,14 +122,15 @@ netdev_initialize(void) * main poll loop. */ void netdev_run(void) + OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex) { - struct shash_node *node; - SHASH_FOR_EACH(node, &netdev_classes) { - const struct netdev_class *netdev_class = node->data; - if (netdev_class->run) { - netdev_class->run(); - } + struct netdev_registered_class *rc; + + ovs_rwlock_rdlock(&netdev_class_rwlock); + HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) { + rc->class->run(); } + ovs_rwlock_unlock(&netdev_class_rwlock); } /* Arranges for poll_block() to wake up when netdev_run() needs to be called. @@ -116,39 +139,63 @@ netdev_run(void) * main poll loop. */ void netdev_wait(void) + OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex) { - struct shash_node *node; - SHASH_FOR_EACH(node, &netdev_classes) { - const struct netdev_class *netdev_class = node->data; - if (netdev_class->wait) { - netdev_class->wait(); + struct netdev_registered_class *rc; + + ovs_rwlock_rdlock(&netdev_class_rwlock); + HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) { + rc->class->wait(); + } + ovs_rwlock_unlock(&netdev_class_rwlock); +} + +static struct netdev_registered_class * +netdev_lookup_class(const char *type) + OVS_REQ_RDLOCK(netdev_class_rwlock) +{ + struct netdev_registered_class *rc; + + HMAP_FOR_EACH_WITH_HASH (rc, hmap_node, hash_string(type, 0), + &netdev_classes) { + if (!strcmp(type, rc->class->type)) { + return rc; } } + return NULL; } /* Initializes and registers a new netdev provider. After successful * registration, new netdevs of that type can be opened using netdev_open(). 
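* * For illustration only (a sketch, where 'my_netdev_class' stands for a hypothetical, fully populated provider), registration at startup might look like: * * int error = netdev_register_provider(&my_netdev_class); * if (error) { * VLOG_ERR("cannot register netdev provider: %s", ovs_strerror(error)); * } * * (As the code below shows, this returns 0 on success, EEXIST for a duplicate type, or the provider's own init() error.)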
*/ int netdev_register_provider(const struct netdev_class *new_class) + OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex) { - if (shash_find(&netdev_classes, new_class->type)) { + int error; + + ovs_rwlock_wrlock(&netdev_class_rwlock); + if (netdev_lookup_class(new_class->type)) { VLOG_WARN("attempted to register duplicate netdev provider: %s", new_class->type); - return EEXIST; - } - - if (new_class->init) { - int error = new_class->init(); - if (error) { + error = EEXIST; + } else { + error = new_class->init ? new_class->init() : 0; + if (!error) { + struct netdev_registered_class *rc; + + rc = xmalloc(sizeof *rc); + hmap_insert(&netdev_classes, &rc->hmap_node, + hash_string(new_class->type, 0)); + rc->class = new_class; + atomic_init(&rc->ref_cnt, 0); + } else { VLOG_ERR("failed to initialize %s network device class: %s", new_class->type, ovs_strerror(error)); - return error; } } + ovs_rwlock_unlock(&netdev_class_rwlock); - shash_add(&netdev_classes, new_class->type, new_class); - - return 0; + return error; } /* Unregisters a netdev provider. 'type' must have been previously @@ -156,51 +203,52 @@ netdev_register_provider(const struct netdev_class *new_class) * new netdevs of that type cannot be opened using netdev_open(). */ int netdev_unregister_provider(const char *type) + OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex) { - struct shash_node *del_node, *netdev_node; + struct netdev_registered_class *rc; + int error; - del_node = shash_find(&netdev_classes, type); - if (!del_node) { + ovs_rwlock_wrlock(&netdev_class_rwlock); + rc = netdev_lookup_class(type); + if (!rc) { VLOG_WARN("attempted to unregister a netdev provider that is not " "registered: %s", type); - return EAFNOSUPPORT; - } + error = EAFNOSUPPORT; + } else { + int ref_cnt; - SHASH_FOR_EACH (netdev_node, &netdev_shash) { - struct netdev *netdev = netdev_node->data; - if (!strcmp(netdev->netdev_class->type, type)) { + atomic_read(&rc->ref_cnt, &ref_cnt); + if (!ref_cnt) { + hmap_remove(&netdev_classes, &rc->hmap_node); + free(rc); + error = 0; + } else { VLOG_WARN("attempted to unregister in use netdev provider: %s", type); - return EBUSY; + error = EBUSY; } } + ovs_rwlock_unlock(&netdev_class_rwlock); - shash_delete(&netdev_classes, del_node); - - return 0; -} - -const struct netdev_class * -netdev_lookup_provider(const char *type) -{ - netdev_initialize(); - return shash_find_data(&netdev_classes, type && type[0] ? type : "system"); + return error; } /* Clears 'types' and enumerates the types of all currently registered netdev * providers into it. The caller must first initialize the sset. */ void netdev_enumerate_types(struct sset *types) + OVS_EXCLUDED(netdev_mutex) { - struct shash_node *node; + struct netdev_registered_class *rc; netdev_initialize(); sset_clear(types); - SHASH_FOR_EACH(node, &netdev_classes) { - const struct netdev_class *netdev_class = node->data; - sset_add(types, netdev_class->type); + ovs_rwlock_rdlock(&netdev_class_rwlock); + HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) { + sset_add(types, rc->class->type); } + ovs_rwlock_unlock(&netdev_class_rwlock); } /* Check that the network device name is not the same as any of the registered @@ -210,17 +258,21 @@ netdev_enumerate_types(struct sset *types) * Returns true if there is a name conflict, false otherwise. 
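* * For example (an illustrative sketch only), code validating a user-supplied device name might do: * * if (netdev_is_reserved_name(name)) { * return EINVAL; // collides with a vport's dpif_port or an "ovs-"-prefixed type name * }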
*/ bool netdev_is_reserved_name(const char *name) + OVS_EXCLUDED(netdev_mutex) { - struct shash_node *node; + struct netdev_registered_class *rc; netdev_initialize(); - SHASH_FOR_EACH (node, &netdev_classes) { - const char *dpif_port; - dpif_port = netdev_vport_class_get_dpif_port(node->data); + + ovs_rwlock_rdlock(&netdev_class_rwlock); + HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) { + const char *dpif_port = netdev_vport_class_get_dpif_port(rc->class); if (dpif_port && !strcmp(dpif_port, name)) { + ovs_rwlock_unlock(&netdev_class_rwlock); return true; } } + ovs_rwlock_unlock(&netdev_class_rwlock); if (!strncmp(name, "ovs-", 4)) { struct sset types; @@ -249,29 +301,39 @@ netdev_is_reserved_name(const char *name) * before they can be used. */ int netdev_open(const char *name, const char *type, struct netdev **netdevp) + OVS_EXCLUDED(netdev_mutex) { struct netdev *netdev; int error; netdev_initialize(); + ovs_rwlock_rdlock(&netdev_class_rwlock); + ovs_mutex_lock(&netdev_mutex); netdev = shash_find_data(&netdev_shash, name); if (!netdev) { - const struct netdev_class *class; + struct netdev_registered_class *rc; - class = netdev_lookup_provider(type); - if (class) { - netdev = class->alloc(); + rc = netdev_lookup_class(type && type[0] ? type : "system"); + if (rc) { + netdev = rc->class->alloc(); if (netdev) { memset(netdev, 0, sizeof *netdev); - netdev->netdev_class = class; + netdev->netdev_class = rc->class; netdev->name = xstrdup(name); netdev->node = shash_add(&netdev_shash, name, netdev); list_init(&netdev->saved_flags_list); - error = class->construct(netdev); - if (error) { - class->dealloc(netdev); + error = rc->class->construct(netdev); + if (!error) { + int old_ref_cnt; + + atomic_add(&rc->ref_cnt, 1, &old_ref_cnt); + } else { + free(netdev->name); + ovs_assert(list_is_empty(&netdev->saved_flags_list)); + shash_delete(&netdev_shash, netdev->node); + rc->class->dealloc(netdev); } } else { error = ENOMEM; @@ -285,6 +347,9 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp) error = 0; } + ovs_mutex_unlock(&netdev_mutex); + ovs_rwlock_unlock(&netdev_class_rwlock); + if (!error) { netdev->ref_cnt++; *netdevp = netdev; @@ -298,12 +363,15 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp) * 'netdev_' is null. */ struct netdev * netdev_ref(const struct netdev *netdev_) + OVS_EXCLUDED(netdev_mutex) { struct netdev *netdev = CONST_CAST(struct netdev *, netdev_); if (netdev) { + ovs_mutex_lock(&netdev_mutex); ovs_assert(netdev->ref_cnt > 0); netdev->ref_cnt++; + ovs_mutex_unlock(&netdev_mutex); } return netdev; } @@ -312,9 +380,10 @@ netdev_ref(const struct netdev *netdev_) * or NULL if none are needed. */ int netdev_set_config(struct netdev *netdev, const struct smap *args) + OVS_EXCLUDED(netdev_mutex) { if (netdev->netdev_class->set_config) { - struct smap no_args = SMAP_INITIALIZER(&no_args); + const struct smap no_args = SMAP_INITIALIZER(&no_args); return netdev->netdev_class->set_config(netdev, args ? args : &no_args); } else if (args && !smap_is_empty(args)) { @@ -334,6 +403,7 @@ netdev_set_config(struct netdev *netdev, const struct smap *args) * smap_destroy(). 
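* * Typical usage (a sketch only; "remote_ip" is just an example key from the tunnel configuration above): * * struct smap config = SMAP_INITIALIZER(&config); * if (!netdev_get_config(netdev, &config)) { * const char *remote_ip = smap_get(&config, "remote_ip"); * ...examine the key-value pairs... * } * smap_destroy(&config);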
*/ int netdev_get_config(const struct netdev *netdev, struct smap *args) + OVS_EXCLUDED(netdev_mutex) { int error; @@ -352,6 +422,7 @@ netdev_get_config(const struct netdev *netdev, struct smap *args) const struct netdev_tunnel_config * netdev_get_tunnel_config(const struct netdev *netdev) + OVS_EXCLUDED(netdev_mutex) { if (netdev->netdev_class->get_tunnel_config) { return netdev->netdev_class->get_tunnel_config(netdev); @@ -362,22 +433,38 @@ netdev_get_tunnel_config(const struct netdev *netdev) static void netdev_unref(struct netdev *dev) + OVS_RELEASES(netdev_mutex) { ovs_assert(dev->ref_cnt); if (!--dev->ref_cnt) { + const struct netdev_class *class = dev->netdev_class; + struct netdev_registered_class *rc; + int old_ref_cnt; + dev->netdev_class->destruct(dev); shash_delete(&netdev_shash, dev->node); free(dev->name); dev->netdev_class->dealloc(dev); + ovs_mutex_unlock(&netdev_mutex); + + ovs_rwlock_rdlock(&netdev_class_rwlock); + rc = netdev_lookup_class(class->type); + atomic_sub(&rc->ref_cnt, 1, &old_ref_cnt); + ovs_assert(old_ref_cnt > 0); + ovs_rwlock_unlock(&netdev_class_rwlock); + } else { + ovs_mutex_unlock(&netdev_mutex); } } /* Closes and destroys 'netdev'. */ void netdev_close(struct netdev *netdev) + OVS_EXCLUDED(netdev_mutex) { if (netdev) { + ovs_mutex_lock(&netdev_mutex); netdev_unref(netdev); } } @@ -403,6 +490,7 @@ netdev_parse_name(const char *netdev_name_, char **name, char **type) int netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp) + OVS_EXCLUDED(netdev_mutex) { int error; @@ -412,7 +500,10 @@ netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp) rx->netdev = netdev; error = netdev->netdev_class->rx_construct(rx); if (!error) { + ovs_mutex_lock(&netdev_mutex); netdev->ref_cnt++; + ovs_mutex_unlock(&netdev_mutex); + *rxp = rx; return 0; } @@ -430,6 +521,7 @@ netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp) void netdev_rx_close(struct netdev_rx *rx) + OVS_EXCLUDED(netdev_mutex) { if (rx) { struct netdev *netdev = rx->netdev; @@ -849,6 +941,7 @@ static int do_update_flags(struct netdev *netdev, enum netdev_flags off, enum netdev_flags on, enum netdev_flags *old_flagsp, struct netdev_saved_flags **sfp) + OVS_EXCLUDED(netdev_mutex) { struct netdev_saved_flags *sf = NULL; enum netdev_flags old_flags; @@ -865,6 +958,7 @@ do_update_flags(struct netdev *netdev, enum netdev_flags off, enum netdev_flags new_flags = (old_flags & ~off) | on; enum netdev_flags changed_flags = old_flags ^ new_flags; if (changed_flags) { + ovs_mutex_lock(&netdev_mutex); *sfp = sf = xmalloc(sizeof *sf); sf->netdev = netdev; list_push_front(&netdev->saved_flags_list, &sf->node); @@ -872,6 +966,7 @@ do_update_flags(struct netdev *netdev, enum netdev_flags off, sf->saved_values = changed_flags & new_flags; netdev->ref_cnt++; + ovs_mutex_unlock(&netdev_mutex); } } @@ -935,6 +1030,7 @@ netdev_turn_flags_off(struct netdev *netdev, enum netdev_flags flags, * Does nothing if 'sf' is NULL. */ void netdev_restore_flags(struct netdev_saved_flags *sf) + OVS_EXCLUDED(netdev_mutex) { if (sf) { struct netdev *netdev = sf->netdev; @@ -944,9 +1040,10 @@ netdev_restore_flags(struct netdev_saved_flags *sf) sf->saved_flags & sf->saved_values, sf->saved_flags & ~sf->saved_values, &old_flags); + + ovs_mutex_lock(&netdev_mutex); list_remove(&sf->node); free(sf); - netdev_unref(netdev); } } @@ -1381,13 +1478,16 @@ netdev_get_class(const struct netdev *netdev) * The caller must free the returned netdev with netdev_close(). 
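* * For example (a sketch only, where "eth0" stands in for any device name): * * struct netdev *netdev = netdev_from_name("eth0"); * if (netdev) { * ...use 'netdev'... * netdev_close(netdev); * }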
*/ struct netdev * netdev_from_name(const char *name) + OVS_EXCLUDED(netdev_mutex) { struct netdev *netdev; + ovs_mutex_lock(&netdev_mutex); netdev = shash_find_data(&netdev_shash, name); if (netdev) { - netdev_ref(netdev); + netdev->ref_cnt++; } + ovs_mutex_unlock(&netdev_mutex); return netdev; } @@ -1399,8 +1499,11 @@ netdev_from_name(const char *name) void netdev_get_devices(const struct netdev_class *netdev_class, struct shash *device_list) + OVS_EXCLUDED(netdev_mutex) { struct shash_node *node; + + ovs_mutex_lock(&netdev_mutex); SHASH_FOR_EACH (node, &netdev_shash) { struct netdev *dev = node->data; @@ -1409,6 +1512,7 @@ netdev_get_devices(const struct netdev_class *netdev_class, shash_add(device_list, node->name, node->data); } } + ovs_mutex_unlock(&netdev_mutex); } const char * diff --git a/lib/netdev.h b/lib/netdev.h index eb1870b4e..287f6cc55 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -31,7 +31,23 @@ extern "C" { * Every port on a switch must have a corresponding netdev that must minimally * support a few operations, such as the ability to read the netdev's MTU. * The PORTING file at the top of the source tree has more information in the - * "Writing a netdev Provider" section. */ + * "Writing a netdev Provider" section. + * + * Thread-safety + * ============= + * + * Most of the netdev functions are fully thread-safe: they may be called from + * any number of threads on the same or different netdev objects. The + * exceptions are: + * + * netdev_rx_recv() + * netdev_rx_wait() + * netdev_rx_drain() + * + * These functions are conditionally thread-safe: they may be called from + * different threads only on different netdev_rx objects. (The client may + * create multiple netdev_rx objects for a single netdev and access each + * of those from a different thread.) */ struct netdev; struct netdev_class; diff --git a/lib/nx-match.c b/lib/nx-match.c index 940dd9a44..09f7f548b 100644 --- a/lib/nx-match.c +++ b/lib/nx-match.c @@ -693,6 +693,10 @@ nx_put_raw(struct ofpbuf *b, bool oxm, const struct match *match, htonl(flow->regs[i]), htonl(match->wc.masks.regs[i])); } + /* Mark. */ + nxm_put_32m(b, NXM_NX_PKT_MARK, htonl(flow->pkt_mark), + htonl(match->wc.masks.pkt_mark)); + /* OpenFlow 1.1+ Metadata. */ nxm_put_64m(b, OXM_OF_METADATA, flow->metadata, match->wc.masks.metadata); diff --git a/lib/odp-execute.c b/lib/odp-execute.c index e6e8c918f..d505c6034 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -65,7 +65,7 @@ odp_execute_set_action(struct ofpbuf *packet, const struct nlattr *a, break; case OVS_KEY_ATTR_SKB_MARK: - flow->skb_mark = nl_attr_get_u32(a); + flow->pkt_mark = nl_attr_get_u32(a); break; case OVS_KEY_ATTR_ETHERNET: diff --git a/lib/odp-util.c b/lib/odp-util.c index 78d5a1b44..a09042e98 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -2361,7 +2361,7 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *data, tun_key_to_attr(buf, &data->tunnel); } - nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->skb_mark); + nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark); /* Add an ingress port attribute if this is a mask or 'odp_in_port' * is not the magical value "ODPP_NONE". 
*/ @@ -2932,7 +2932,7 @@ odp_flow_key_to_flow(const struct nlattr *key, size_t key_len, } if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK)) { - flow->skb_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]); + flow->pkt_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]); expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK; } @@ -3044,11 +3044,11 @@ commit_set_action(struct ofpbuf *odp_actions, enum ovs_key_attr key_type, } void -odp_put_skb_mark_action(const uint32_t skb_mark, +odp_put_pkt_mark_action(const uint32_t pkt_mark, struct ofpbuf *odp_actions) { - commit_set_action(odp_actions, OVS_KEY_ATTR_SKB_MARK, &skb_mark, - sizeof(skb_mark)); + commit_set_action(odp_actions, OVS_KEY_ATTR_SKB_MARK, &pkt_mark, + sizeof(pkt_mark)); } /* If any of the flow key data that ODP actions can modify are different in @@ -3306,18 +3306,18 @@ commit_set_priority_action(const struct flow *flow, struct flow *base, } static void -commit_set_skb_mark_action(const struct flow *flow, struct flow *base, +commit_set_pkt_mark_action(const struct flow *flow, struct flow *base, struct ofpbuf *odp_actions, struct flow_wildcards *wc) { - if (base->skb_mark == flow->skb_mark) { + if (base->pkt_mark == flow->pkt_mark) { return; } - memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark); - base->skb_mark = flow->skb_mark; + memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark); + base->pkt_mark = flow->pkt_mark; - odp_put_skb_mark_action(base->skb_mark, odp_actions); + odp_put_pkt_mark_action(base->pkt_mark, odp_actions); } /* If any of the flow key data that ODP actions can modify are different in * 'base' and 'flow', appends ODP actions to 'odp_actions' that change the flow @@ -3339,5 +3339,5 @@ commit_odp_actions(const struct flow *flow, struct flow *base, */ commit_mpls_action(flow, base, odp_actions, wc); commit_set_priority_action(flow, base, odp_actions, wc); - commit_set_skb_mark_action(flow, base, odp_actions, wc); + commit_set_pkt_mark_action(flow, base, odp_actions, wc); } diff --git a/lib/odp-util.h b/lib/odp-util.h index 7e2788861..0c40f3822 100644 --- a/lib/odp-util.h +++ b/lib/odp-util.h @@ -181,7 +181,7 @@ size_t odp_put_userspace_action(uint32_t pid, struct ofpbuf *odp_actions); void odp_put_tunnel_action(const struct flow_tnl *tunnel, struct ofpbuf *odp_actions); -void odp_put_skb_mark_action(const uint32_t skb_mark, +void odp_put_pkt_mark_action(const uint32_t pkt_mark, struct ofpbuf *odp_actions); /* Reasons why a subfacet might not be fast-pathable. 
*/ diff --git a/lib/ofp-print.c b/lib/ofp-print.c index 1a4dd9ce8..21989a95a 100644 --- a/lib/ofp-print.c +++ b/lib/ofp-print.c @@ -130,6 +130,10 @@ ofp_print_packet_in(struct ds *string, const struct ofp_header *oh, } } + if (pin.fmd.pkt_mark != 0) { + ds_put_format(string, " pkt_mark=0x%"PRIx32, pin.fmd.pkt_mark); + } + ds_put_format(string, " (via %s)", ofputil_packet_in_reason_to_string(pin.reason, reasonbuf, sizeof reasonbuf)); diff --git a/lib/ofp-util.c b/lib/ofp-util.c index d1bcf9c7d..45ff0a149 100644 --- a/lib/ofp-util.c +++ b/lib/ofp-util.c @@ -1134,11 +1134,17 @@ ofputil_usable_protocols(const struct match *match) return OFPUTIL_P_NONE; } - /* skb_mark and skb_priority can't be sent in a flow_mod */ - if (wc->masks.skb_mark || wc->masks.skb_priority) { + /* skb_priority can't be sent in a flow_mod */ + if (wc->masks.skb_priority) { return OFPUTIL_P_NONE; } + /* NXM and OXM support pkt_mark */ + if (wc->masks.pkt_mark) { + return OFPUTIL_P_OF10_NXM_ANY | OFPUTIL_P_OF12_OXM + | OFPUTIL_P_OF13_OXM; + } + /* NXM, OXM, and OF1.1 support bitwise matching on ethernet addresses. */ if (!eth_mask_is_exact(wc->masks.dl_src) && !eth_addr_is_zero(wc->masks.dl_src)) { @@ -2917,6 +2923,7 @@ ofputil_decode_packet_in_finish(struct ofputil_packet_in *pin, pin->fmd.tun_dst = match->flow.tunnel.ip_dst; pin->fmd.metadata = match->flow.metadata; memcpy(pin->fmd.regs, match->flow.regs, sizeof pin->fmd.regs); + pin->fmd.pkt_mark = match->flow.pkt_mark; } enum ofperr @@ -3031,6 +3038,10 @@ ofputil_packet_in_to_match(const struct ofputil_packet_in *pin, } } + if (pin->fmd.pkt_mark != 0) { + match_set_pkt_mark(match, pin->fmd.pkt_mark); + } + match_set_in_port(match, pin->fmd.in_port); } diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h index 35476867e..b7bc5d198 100644 --- a/lib/ovs-thread.h +++ b/lib/ovs-thread.h @@ -467,12 +467,12 @@ struct ovsthread_once { } static inline bool ovsthread_once_start(struct ovsthread_once *once) - OVS_TRY_LOCK(true, &once->mutex); + OVS_TRY_LOCK(true, once->mutex); void ovsthread_once_done(struct ovsthread_once *once) - OVS_RELEASES(&once->mutex); + OVS_RELEASES(once->mutex); bool ovsthread_once_start__(struct ovsthread_once *once) - OVS_TRY_LOCK(false, &once->mutex); + OVS_TRY_LOCK(false, once->mutex); static inline bool ovsthread_once_is_done__(const struct ovsthread_once *once) @@ -496,11 +496,6 @@ ovsthread_once_start(struct ovsthread_once *once) return OVS_UNLIKELY(!ovsthread_once_is_done__(once) && !ovsthread_once_start__(once)); } - -#ifdef __CHECKER__ -#define ovsthread_once_start(ONCE) \ - ((ONCE)->done ? false : ({ OVS_MACRO_LOCK((&ONCE->mutex)); true; })) -#endif /* Thread ID. * diff --git a/lib/poll-loop.c b/lib/poll-loop.c index 5f9b9cdfd..4eb118701 100644 --- a/lib/poll-loop.c +++ b/lib/poll-loop.c @@ -26,6 +26,7 @@ #include "fatal-signal.h" #include "list.h" #include "ovs-thread.h" +#include "seq.h" #include "socket-util.h" #include "timeval.h" #include "vlog.h" @@ -248,6 +249,8 @@ poll_block(void) /* Handle any pending signals before doing anything else. */ fatal_signal_run(); + + seq_woke(); } static void diff --git a/lib/seq.c b/lib/seq.c new file mode 100644 index 000000000..36e506570 --- /dev/null +++ b/lib/seq.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <config.h> + +#include "seq.h" + +#include <stdlib.h> + +#include "hash.h" +#include "hmap.h" +#include "latch.h" +#include "list.h" +#include "ovs-thread.h" +#include "poll-loop.h" + +/* A sequence number object. */ +struct seq { + uint64_t value OVS_GUARDED; + struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */ +}; + +/* A thread waiting on a particular seq. */ +struct seq_waiter { + struct seq *seq OVS_GUARDED; /* Seq being waited for. */ + struct hmap_node hmap_node OVS_GUARDED; /* In 'seq->waiters'. */ + unsigned int ovsthread_id OVS_GUARDED; /* Key in 'waiters' hmap. */ + + struct seq_thread *thread OVS_GUARDED; /* Thread preparing to wait. */ + struct list list_node OVS_GUARDED; /* In 'thread->waiters'. */ + + uint64_t value OVS_GUARDED; /* seq->value we're waiting to change. */ +}; + +/* A thread that might be waiting on one or more seqs. */ +struct seq_thread { + struct list waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */ + struct latch latch OVS_GUARDED; /* Wakeup latch for this thread. */ + bool waiting OVS_GUARDED; /* True if latch_wait() already called. */ +}; + +static struct ovs_mutex seq_mutex = OVS_ADAPTIVE_MUTEX_INITIALIZER; + +static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1; + +static pthread_key_t seq_thread_key; + +static void seq_init(void); +static struct seq_thread *seq_thread_get(void) OVS_REQUIRES(seq_mutex); +static void seq_thread_exit(void *thread_) OVS_EXCLUDED(seq_mutex); +static void seq_thread_woke(struct seq_thread *) OVS_REQUIRES(seq_mutex); +static void seq_waiter_destroy(struct seq_waiter *) OVS_REQUIRES(seq_mutex); +static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex); + +/* Creates and returns a new 'seq' object. */ +struct seq * OVS_EXCLUDED(seq_mutex) +seq_create(void) +{ + struct seq *seq; + + seq_init(); + + seq = xmalloc(sizeof *seq); + ovs_mutex_lock(&seq_mutex); + seq->value = seq_next++; + hmap_init(&seq->waiters); + ovs_mutex_unlock(&seq_mutex); + + return seq; +} + +/* Destroys 'seq', waking up threads that were waiting on it, if any. */ +void +seq_destroy(struct seq *seq) + OVS_EXCLUDED(seq_mutex) +{ + ovs_mutex_lock(&seq_mutex); + seq_wake_waiters(seq); + hmap_destroy(&seq->waiters); + free(seq); + ovs_mutex_unlock(&seq_mutex); +} + +/* Increments 'seq''s sequence number, waking up any threads that are waiting + * on 'seq'. */ +void +seq_change(struct seq *seq) + OVS_EXCLUDED(seq_mutex) +{ + ovs_mutex_lock(&seq_mutex); + seq->value = seq_next++; + seq_wake_waiters(seq); + ovs_mutex_unlock(&seq_mutex); +} + +/* Returns 'seq''s current sequence number (which could change immediately). + * + * seq_read() and seq_wait() can be used together to yield a race-free wakeup + * when an object changes, even without an ability to lock the object. See + * Usage in seq.h for details. 
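+ * + * A condensed observer loop, restating the Usage pattern from seq.h (sketch only; 'last_seq' is the observer's saved copy of the value): + * + * uint64_t new_seq = seq_read(seq); + * if (new_seq != last_seq) { + * ...process changes... + * last_seq = new_seq; + * } + * seq_wait(seq, new_seq); + * poll_block();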
*/ +uint64_t +seq_read(const struct seq *seq) + OVS_EXCLUDED(seq_mutex) +{ + uint64_t value; + + ovs_mutex_lock(&seq_mutex); + value = seq->value; + ovs_mutex_unlock(&seq_mutex); + + return value; +} + +static void +seq_wait__(struct seq *seq, uint64_t value) + OVS_REQUIRES(seq_mutex) +{ + unsigned int id = ovsthread_id_self(); + uint32_t hash = hash_int(id, 0); + struct seq_waiter *waiter; + + HMAP_FOR_EACH_IN_BUCKET (waiter, hmap_node, hash, &seq->waiters) { + if (waiter->ovsthread_id == id) { + if (waiter->value != value) { + /* The current value is different from the value we've already + * waited for, so wake up immediately. */ + poll_immediate_wake(); + } else { + /* Already waiting on 'value', nothing more to do. */ + } + return; + } + } + + waiter = xmalloc(sizeof *waiter); + waiter->seq = seq; + hmap_insert(&seq->waiters, &waiter->hmap_node, hash); + waiter->value = value; + waiter->thread = seq_thread_get(); + list_push_back(&waiter->thread->waiters, &waiter->list_node); + + if (!waiter->thread->waiting) { + latch_wait(&waiter->thread->latch); + waiter->thread->waiting = true; + } +} + +/* Causes the following poll_block() to wake up when 'seq''s sequence number + * changes from 'value'. (If 'seq''s sequence number isn't 'value', then + * poll_block() won't block at all.) + * + * seq_read() and seq_wait() can be used together to yield a race-free wakeup + * when an object changes, even without an ability to lock the object. See + * Usage in seq.h for details. */ +void +seq_wait(const struct seq *seq_, uint64_t value) + OVS_EXCLUDED(seq_mutex) +{ + struct seq *seq = CONST_CAST(struct seq *, seq_); + + ovs_mutex_lock(&seq_mutex); + if (value == seq->value) { + seq_wait__(seq, value); + } else { + poll_immediate_wake(); + } + ovs_mutex_unlock(&seq_mutex); +} + +/* Called by poll_block() just before it returns, this function destroys any + * seq_waiter objects associated with the current thread. 
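+ * + * Clients do not call this directly: poll_block() invokes it just before returning, as the poll-loop.c hunk in this commit shows: + * + * ...at the end of poll_block()... + * fatal_signal_run(); + * seq_woke();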
*/ +void +seq_woke(void) + OVS_EXCLUDED(seq_mutex) +{ + struct seq_thread *thread; + + seq_init(); + + thread = pthread_getspecific(seq_thread_key); + if (thread) { + ovs_mutex_lock(&seq_mutex); + seq_thread_woke(thread); + thread->waiting = false; + ovs_mutex_unlock(&seq_mutex); + } +} + +static void +seq_init(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + + if (ovsthread_once_start(&once)) { + xpthread_key_create(&seq_thread_key, seq_thread_exit); + ovsthread_once_done(&once); + } +} + +static struct seq_thread * +seq_thread_get(void) + OVS_REQUIRES(seq_mutex) +{ + struct seq_thread *thread = pthread_getspecific(seq_thread_key); + if (!thread) { + thread = xmalloc(sizeof *thread); + list_init(&thread->waiters); + latch_init(&thread->latch); + thread->waiting = false; + + xpthread_setspecific(seq_thread_key, thread); + } + return thread; +} + +static void +seq_thread_exit(void *thread_) + OVS_EXCLUDED(seq_mutex) +{ + struct seq_thread *thread = thread_; + + ovs_mutex_lock(&seq_mutex); + seq_thread_woke(thread); + latch_destroy(&thread->latch); + free(thread); + ovs_mutex_unlock(&seq_mutex); +} + +static void +seq_thread_woke(struct seq_thread *thread) + OVS_REQUIRES(seq_mutex) +{ + struct seq_waiter *waiter, *next_waiter; + + LIST_FOR_EACH_SAFE (waiter, next_waiter, list_node, &thread->waiters) { + ovs_assert(waiter->thread == thread); + seq_waiter_destroy(waiter); + } + latch_poll(&thread->latch); +} + +static void +seq_waiter_destroy(struct seq_waiter *waiter) + OVS_REQUIRES(seq_mutex) +{ + hmap_remove(&waiter->seq->waiters, &waiter->hmap_node); + list_remove(&waiter->list_node); + free(waiter); +} + +static void +seq_wake_waiters(struct seq *seq) + OVS_REQUIRES(seq_mutex) +{ + struct seq_waiter *waiter, *next_waiter; + + HMAP_FOR_EACH_SAFE (waiter, next_waiter, hmap_node, &seq->waiters) { + latch_set(&waiter->thread->latch); + seq_waiter_destroy(waiter); + } +} diff --git a/lib/seq.h b/lib/seq.h new file mode 100644 index 000000000..c764809e5 --- /dev/null +++ b/lib/seq.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SEQ_H +#define SEQ_H 1 + +/* Thread-safe, pollable sequence number. + * + * + * Motivation + * ========== + * + * It is sometimes desirable to take an action whenever an object changes. + * Suppose we associate a sequence number with an object and increment the + * sequence number whenever we change the object. An observer can then record + * the sequence number it sees. Later on, if the current sequence number + * differs from the one it saw last, then the observer knows to examine the + * object for changes. + * + * Code that wants to run when a sequence number changes is challenging to + * implement in a multithreaded environment. 
A naive implementation that + * simply checks whether the sequence number changed and, if so, calls + * poll_immediate_wake(), will fail when another thread increments the sequence + * number after the check (including during poll_block()). + * + * struct seq is a solution. It implements a sequence number along with enough + * internal infrastructure so that a thread waiting on a particular value will + * wake up if the sequence number changes, or even if the "struct seq" is + * destroyed. + * + * + * Usage + * ===== + * + * The object that includes a sequence number should use seq_create() and + * seq_destroy() at creation and destruction, and seq_change() whenever the + * object's observable state changes. + * + * An observer may seq_read() to read the current sequence number and + * seq_wait() to cause poll_block() to wake up when the sequence number changes + * from a specified value. + * + * To avoid races, observers should use seq_read() to check for changes, + * process any changes, and then use seq_wait() to wait for a change from the + * previously read value. That is, a correct usage looks something like this: + * + * new_seq = seq_read(seq); + * if (new_seq != last_seq) { + * ...process changes... + * last_seq = new_seq; + * } + * seq_wait(seq, new_seq); + * poll_block(); + * + * + * Alternate Usage + * =============== + * + * struct seq can also be used as a sort of pollable condition variable. + * Suppose that we want a thread to process items in a queue, and thus to be + * able to wake up whenever the queue is nonempty. This requires a lock to + * protect the queue and a seq to signal that the queue has become nonempty, + * e.g.: + * + * struct ovs_mutex mutex; + * struct list queue OVS_GUARDED_BY(mutex); + * struct seq nonempty_seq; + * + * To add an element to the queue: + * + * ovs_mutex_lock(&mutex); + * list_push_back(&queue, ...element...); + * if (list_is_singleton(&queue)) { // The 'if' test here is optional. + * seq_change(&nonempty_seq); + * } + * ovs_mutex_unlock(&mutex); + * + * To wait for the queue to become nonempty: + * + * ovs_mutex_lock(&mutex); + * if (list_is_empty(&queue)) { + * seq_wait(&nonempty_seq, seq_read(&nonempty_seq)); + * } else { + * poll_immediate_wake(); + * } + * ovs_mutex_unlock(&mutex); + * + * (In the above code 'mutex' prevents the queue from changing between + * seq_read() and seq_wait(). Otherwise, it would be necessary to seq_read(), + * check for a nonempty queue, and then seq_wait() on the previously read + * sequence number, as under Usage above.) + * + * + * Thread-safety + * ============= + * + * Fully thread safe. + */ + +#include <stdint.h> + +/* For implementation of an object with a sequence number attached. */ +struct seq *seq_create(void); +void seq_destroy(struct seq *); +void seq_change(struct seq *); + +/* For observers. */ +uint64_t seq_read(const struct seq *); +void seq_wait(const struct seq *, uint64_t value); + +/* For poll_block() internal use. 
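+ * (poll_block() calls seq_woke() itself just before returning, as the poll-loop.c change in this commit shows, so client code never calls it directly.)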
*/ +void seq_woke(void); + +#endif /* seq.h */ diff --git a/ofproto/automake.mk b/ofproto/automake.mk index af9a12a3e..47ca1b81f 100644 --- a/ofproto/automake.mk +++ b/ofproto/automake.mk @@ -30,6 +30,8 @@ ofproto_libofproto_a_SOURCES = \ ofproto/ofproto-dpif-mirror.h \ ofproto/ofproto-dpif-sflow.c \ ofproto/ofproto-dpif-sflow.h \ + ofproto/ofproto-dpif-upcall.c \ + ofproto/ofproto-dpif-upcall.h \ ofproto/ofproto-dpif-xlate.c \ ofproto/ofproto-dpif-xlate.h \ ofproto/ofproto-provider.h \ diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c new file mode 100644 index 000000000..ff9b2d5f2 --- /dev/null +++ b/ofproto/ofproto-dpif-upcall.c @@ -0,0 +1,831 @@ +/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include +#include "ofproto-dpif-upcall.h" + +#include +#include +#include + +#include "coverage.h" +#include "dynamic-string.h" +#include "dpif.h" +#include "fail-open.h" +#include "latch.h" +#include "seq.h" +#include "list.h" +#include "netlink.h" +#include "ofpbuf.h" +#include "ofproto-dpif.h" +#include "packets.h" +#include "poll-loop.h" +#include "vlog.h" + +#define MAX_QUEUE_LENGTH 512 + +VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall); + +COVERAGE_DEFINE(upcall_queue_overflow); +COVERAGE_DEFINE(drop_queue_overflow); +COVERAGE_DEFINE(miss_queue_overflow); +COVERAGE_DEFINE(fmb_queue_overflow); + +/* A thread that processes each upcall handed to it by the dispatcher thread, + * forwards the upcall's packet, and then queues it to the main ofproto_dpif + * to possibly set up a kernel flow as a cache. */ +struct handler { + struct udpif *udpif; /* Parent udpif. */ + pthread_t thread; /* Thread ID. */ + + struct ovs_mutex mutex; /* Mutex guarding the following. */ + + /* Atomic queue of unprocessed miss upcalls. */ + struct list upcalls OVS_GUARDED; + size_t n_upcalls OVS_GUARDED; + + pthread_cond_t wake_cond; /* Wakes 'thread' while holding + 'mutex'. */ +}; + +/* An upcall handler for ofproto_dpif. + * + * udpif is implemented as a "dispatcher" thread that reads upcalls from the + * kernel. It processes each upcall just enough to figure out its next + * destination. For a "miss" upcall (MISS_UPCALL), this is one of several + * "handler" threads (see struct handler). Other upcalls are queued to the + * main ofproto_dpif. */ +struct udpif { + struct dpif *dpif; /* Datapath handle. */ + struct dpif_backer *backer; /* Opaque dpif_backer pointer. */ + + uint32_t secret; /* Random seed for upcall hash. */ + + pthread_t dispatcher; /* Dispatcher thread ID. */ + + struct handler *handlers; /* Miss handlers. */ + size_t n_handlers; + + /* Atomic queue of unprocessed drop keys. */ + struct ovs_mutex drop_key_mutex; + struct list drop_keys OVS_GUARDED; + size_t n_drop_keys OVS_GUARDED; + + /* Atomic queue of special upcalls for ofproto-dpif to process. */ + struct ovs_mutex upcall_mutex; + struct list upcalls OVS_GUARDED; + size_t n_upcalls OVS_GUARDED; + + /* Atomic queue of flow_miss_batches. 
*/ + struct ovs_mutex fmb_mutex; + struct list fmbs OVS_GUARDED; + size_t n_fmbs OVS_GUARDED; + + /* Number of times udpif_revalidate() has been called. */ + atomic_uint reval_seq; + + struct seq *wait_seq; + uint64_t last_seq; + + struct latch exit_latch; /* Tells child threads to exit. */ +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + +static void recv_upcalls(struct udpif *); +static void handle_miss_upcalls(struct udpif *, struct list *upcalls); +static void miss_destroy(struct flow_miss *); +static void *udpif_dispatcher(void *); +static void *udpif_miss_handler(void *); + +struct udpif * +udpif_create(struct dpif_backer *backer, struct dpif *dpif) +{ + struct udpif *udpif = xzalloc(sizeof *udpif); + + udpif->dpif = dpif; + udpif->backer = backer; + udpif->secret = random_uint32(); + udpif->wait_seq = seq_create(); + latch_init(&udpif->exit_latch); + list_init(&udpif->drop_keys); + list_init(&udpif->upcalls); + list_init(&udpif->fmbs); + atomic_init(&udpif->reval_seq, 0); + ovs_mutex_init(&udpif->drop_key_mutex, PTHREAD_MUTEX_NORMAL); + ovs_mutex_init(&udpif->upcall_mutex, PTHREAD_MUTEX_NORMAL); + ovs_mutex_init(&udpif->fmb_mutex, PTHREAD_MUTEX_NORMAL); + + return udpif; +} + +void +udpif_destroy(struct udpif *udpif) +{ + struct flow_miss_batch *fmb; + struct drop_key *drop_key; + struct upcall *upcall; + + udpif_recv_set(udpif, 0, false); + + while ((drop_key = drop_key_next(udpif))) { + drop_key_destroy(drop_key); + } + + while ((upcall = upcall_next(udpif))) { + upcall_destroy(upcall); + } + + while ((fmb = flow_miss_batch_next(udpif))) { + flow_miss_batch_destroy(fmb); + } + + ovs_mutex_destroy(&udpif->drop_key_mutex); + ovs_mutex_destroy(&udpif->upcall_mutex); + ovs_mutex_destroy(&udpif->fmb_mutex); + latch_destroy(&udpif->exit_latch); + seq_destroy(udpif->wait_seq); + free(udpif); +} + +/* Tells 'udpif' to begin or stop handling flow misses depending on the value + * of 'enable'. 'n_handlers' is the number of miss_handler threads to create. + * Passing 'n_handlers' as zero is equivalent to passing 'enable' as false. */ +void +udpif_recv_set(struct udpif *udpif, size_t n_handlers, bool enable) +{ + n_handlers = enable ? n_handlers : 0; + n_handlers = MIN(n_handlers, 64); + + /* Stop the old threads (if any). */ + if (udpif->handlers && udpif->n_handlers != n_handlers) { + size_t i; + + latch_set(&udpif->exit_latch); + + /* Wake the handlers so they can exit. */ + for (i = 0; i < udpif->n_handlers; i++) { + struct handler *handler = &udpif->handlers[i]; + + ovs_mutex_lock(&handler->mutex); + xpthread_cond_signal(&handler->wake_cond); + ovs_mutex_unlock(&handler->mutex); + } + + xpthread_join(udpif->dispatcher, NULL); + for (i = 0; i < udpif->n_handlers; i++) { + struct handler *handler = &udpif->handlers[i]; + struct upcall *miss, *next; + + xpthread_join(handler->thread, NULL); + + ovs_mutex_lock(&handler->mutex); + LIST_FOR_EACH_SAFE (miss, next, list_node, &handler->upcalls) { + list_remove(&miss->list_node); + upcall_destroy(miss); + } + ovs_mutex_unlock(&handler->mutex); + ovs_mutex_destroy(&handler->mutex); + + xpthread_cond_destroy(&handler->wake_cond); + } + latch_poll(&udpif->exit_latch); + + free(udpif->handlers); + udpif->handlers = NULL; + udpif->n_handlers = 0; + } + + /* Start new threads (if necessary). 
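+ * + * (The handler threads are created before the dispatcher thread, so by the time the dispatcher hashes an upcall to udpif->handlers[i], that handler's mutex, condition variable, and queue already exist.)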
*/ + if (!udpif->handlers && n_handlers) { + size_t i; + + udpif->n_handlers = n_handlers; + udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers); + for (i = 0; i < udpif->n_handlers; i++) { + struct handler *handler = &udpif->handlers[i]; + + handler->udpif = udpif; + list_init(&handler->upcalls); + xpthread_cond_init(&handler->wake_cond, NULL); + ovs_mutex_init(&handler->mutex, PTHREAD_MUTEX_NORMAL); + xpthread_create(&handler->thread, NULL, udpif_miss_handler, handler); + } + xpthread_create(&udpif->dispatcher, NULL, udpif_dispatcher, udpif); + } +} + +void +udpif_run(struct udpif *udpif) +{ + udpif->last_seq = seq_read(udpif->wait_seq); +} + +void +udpif_wait(struct udpif *udpif) +{ + ovs_mutex_lock(&udpif->drop_key_mutex); + if (udpif->n_drop_keys) { + poll_immediate_wake(); + } + ovs_mutex_unlock(&udpif->drop_key_mutex); + + ovs_mutex_lock(&udpif->upcall_mutex); + if (udpif->n_upcalls) { + poll_immediate_wake(); + } + ovs_mutex_unlock(&udpif->upcall_mutex); + + ovs_mutex_lock(&udpif->fmb_mutex); + if (udpif->n_fmbs) { + poll_immediate_wake(); + } + ovs_mutex_unlock(&udpif->fmb_mutex); + + seq_wait(udpif->wait_seq, udpif->last_seq); +} + +/* Notifies 'udpif' that something changed which may render previous + * xlate_actions() results invalid. */ +void +udpif_revalidate(struct udpif *udpif) +{ + struct flow_miss_batch *fmb, *next_fmb; + unsigned int junk; + + /* Since we remove each miss on revalidation, their statistics won't be + * accounted to the appropriate 'facet's in the upper layer. In most + * cases, this is alright because we've already pushed the stats to the + * relevant rules. However, NetFlow requires absolute packet counts on + * 'facet's which could now be incorrect. */ + ovs_mutex_lock(&udpif->fmb_mutex); + atomic_add(&udpif->reval_seq, 1, &junk); + LIST_FOR_EACH_SAFE (fmb, next_fmb, list_node, &udpif->fmbs) { + list_remove(&fmb->list_node); + flow_miss_batch_destroy(fmb); + udpif->n_fmbs--; + } + ovs_mutex_unlock(&udpif->fmb_mutex); + udpif_drop_key_clear(udpif); +} + +/* Retrieves the next upcall which ofproto-dpif is responsible for handling. + * The caller is responsible for destroying the returned upcall with + * upcall_destroy(). */ +struct upcall * +upcall_next(struct udpif *udpif) +{ + struct upcall *next = NULL; + + ovs_mutex_lock(&udpif->upcall_mutex); + if (udpif->n_upcalls) { + udpif->n_upcalls--; + next = CONTAINER_OF(list_pop_front(&udpif->upcalls), struct upcall, + list_node); + } + ovs_mutex_unlock(&udpif->upcall_mutex); + return next; +} + +/* Destroys and deallocates 'upcall'. */ +void +upcall_destroy(struct upcall *upcall) +{ + if (upcall) { + ofpbuf_uninit(&upcall->upcall_buf); + free(upcall); + } +} + +/* Retrieves the next batch of processed flow misses for 'udpif' to install. + * The caller is responsible for destroying it with flow_miss_batch_destroy(). + */ +struct flow_miss_batch * +flow_miss_batch_next(struct udpif *udpif) +{ + struct flow_miss_batch *next = NULL; + + ovs_mutex_lock(&udpif->fmb_mutex); + if (udpif->n_fmbs) { + udpif->n_fmbs--; + next = CONTAINER_OF(list_pop_front(&udpif->fmbs), + struct flow_miss_batch, list_node); + } + ovs_mutex_unlock(&udpif->fmb_mutex); + return next; +} + +/* Destroys and deallocates 'fmb'. 
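+ * + * A consumer drains the queue with the _next()/_destroy() pair, as udpif_destroy() above does: + * + * while ((fmb = flow_miss_batch_next(udpif))) { + * flow_miss_batch_destroy(fmb); + * }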
*/ +void +flow_miss_batch_destroy(struct flow_miss_batch *fmb) +{ + struct flow_miss *miss, *next; + + if (!fmb) { + return; + } + + HMAP_FOR_EACH_SAFE (miss, next, hmap_node, &fmb->misses) { + hmap_remove(&fmb->misses, &miss->hmap_node); + miss_destroy(miss); + } + + hmap_destroy(&fmb->misses); + free(fmb); +} + +/* Retrieves the next drop key which ofproto-dpif needs to process. The caller + * is responsible for destroying it with drop_key_destroy(). */ +struct drop_key * +drop_key_next(struct udpif *udpif) +{ + struct drop_key *next = NULL; + + ovs_mutex_lock(&udpif->drop_key_mutex); + if (udpif->n_drop_keys) { + udpif->n_drop_keys--; + next = CONTAINER_OF(list_pop_front(&udpif->drop_keys), struct drop_key, + list_node); + } + ovs_mutex_unlock(&udpif->drop_key_mutex); + return next; +} + +/* Destroys and deallocates 'drop_key'. */ +void +drop_key_destroy(struct drop_key *drop_key) +{ + if (drop_key) { + free(drop_key->key); + free(drop_key); + } +} + +/* Clears all drop keys waiting to be processed by drop_key_next(). */ +void +udpif_drop_key_clear(struct udpif *udpif) +{ + struct drop_key *drop_key, *next; + + ovs_mutex_lock(&udpif->drop_key_mutex); + LIST_FOR_EACH_SAFE (drop_key, next, list_node, &udpif->drop_keys) { + list_remove(&drop_key->list_node); + drop_key_destroy(drop_key); + udpif->n_drop_keys--; + } + ovs_mutex_unlock(&udpif->drop_key_mutex); +} + +/* The dispatcher thread is responsible for receiving upcalls from the kernel, + * assigning the miss upcalls to a miss_handler thread, and assigning the more + * complex ones to ofproto-dpif directly. */ +static void * +udpif_dispatcher(void *arg) +{ + struct udpif *udpif = arg; + + set_subprogram_name("dispatcher"); + while (!latch_is_set(&udpif->exit_latch)) { + recv_upcalls(udpif); + dpif_recv_wait(udpif->dpif); + latch_wait(&udpif->exit_latch); + poll_block(); + } + + return NULL; +} + +/* The miss handler thread is responsible for processing miss upcalls retrieved + * by the dispatcher thread. Once finished, it passes the processed miss + * upcalls to ofproto-dpif where they're installed in the datapath. */ +static void * +udpif_miss_handler(void *arg) +{ + struct list misses = LIST_INITIALIZER(&misses); + struct handler *handler = arg; + + set_subprogram_name("miss_handler"); + for (;;) { + size_t i; + + ovs_mutex_lock(&handler->mutex); + + if (latch_is_set(&handler->udpif->exit_latch)) { + ovs_mutex_unlock(&handler->mutex); + return NULL; + } + + if (!handler->n_upcalls) { + ovs_mutex_cond_wait(&handler->wake_cond, &handler->mutex); + } + + for (i = 0; i < FLOW_MISS_MAX_BATCH; i++) { + if (handler->n_upcalls) { + handler->n_upcalls--; + list_push_back(&misses, list_pop_front(&handler->upcalls)); + } else { + break; + } + } + ovs_mutex_unlock(&handler->mutex); + + handle_miss_upcalls(handler->udpif, &misses); + } +} + +static void +miss_destroy(struct flow_miss *miss) +{ + struct upcall *upcall, *next; + + LIST_FOR_EACH_SAFE (upcall, next, list_node, &miss->upcalls) { + list_remove(&upcall->list_node); + upcall_destroy(upcall); + } + xlate_out_uninit(&miss->xout); +} + +static enum upcall_type +classify_upcall(const struct upcall *upcall) +{ + const struct dpif_upcall *dpif_upcall = &upcall->dpif_upcall; + union user_action_cookie cookie; + size_t userdata_len; + + /* First look at the upcall type. 
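+ * + * DPIF_UC_MISS upcalls classify immediately as MISS_UPCALL; DPIF_UC_ACTION upcalls are classified below by the user-action cookie they carry (SFLOW_UPCALL, FLOW_SAMPLE_UPCALL, IPFIX_UPCALL, or MISS_UPCALL for slow-path actions).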
*/ + switch (dpif_upcall->type) { + case DPIF_UC_ACTION: + break; + + case DPIF_UC_MISS: + return MISS_UPCALL; + + case DPIF_N_UC_TYPES: + default: + VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32, + dpif_upcall->type); + return BAD_UPCALL; + } + + /* "action" upcalls need a closer look. */ + if (!dpif_upcall->userdata) { + VLOG_WARN_RL(&rl, "action upcall missing cookie"); + return BAD_UPCALL; + } + userdata_len = nl_attr_get_size(dpif_upcall->userdata); + if (userdata_len < sizeof cookie.type + || userdata_len > sizeof cookie) { + VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %zu", + userdata_len); + return BAD_UPCALL; + } + memset(&cookie, 0, sizeof cookie); + memcpy(&cookie, nl_attr_get(dpif_upcall->userdata), userdata_len); + if (userdata_len == sizeof cookie.sflow + && cookie.type == USER_ACTION_COOKIE_SFLOW) { + return SFLOW_UPCALL; + } else if (userdata_len == sizeof cookie.slow_path + && cookie.type == USER_ACTION_COOKIE_SLOW_PATH) { + return MISS_UPCALL; + } else if (userdata_len == sizeof cookie.flow_sample + && cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE) { + return FLOW_SAMPLE_UPCALL; + } else if (userdata_len == sizeof cookie.ipfix + && cookie.type == USER_ACTION_COOKIE_IPFIX) { + return IPFIX_UPCALL; + } else { + VLOG_WARN_RL(&rl, "invalid user cookie of type %"PRIu16 + " and size %zu", cookie.type, userdata_len); + return BAD_UPCALL; + } +} + +static void +recv_upcalls(struct udpif *udpif) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); + for (;;) { + struct upcall *upcall; + int error; + + upcall = xmalloc(sizeof *upcall); + ofpbuf_use_stub(&upcall->upcall_buf, upcall->upcall_stub, + sizeof upcall->upcall_stub); + error = dpif_recv(udpif->dpif, &upcall->dpif_upcall, + &upcall->upcall_buf); + if (error) { + upcall_destroy(upcall); + break; + } + + upcall->type = classify_upcall(upcall); + if (upcall->type == BAD_UPCALL) { + upcall_destroy(upcall); + } else if (upcall->type == MISS_UPCALL) { + struct dpif_upcall *dupcall = &upcall->dpif_upcall; + uint32_t hash = udpif->secret; + struct handler *handler; + struct nlattr *nla; + size_t n_bytes, left; + + n_bytes = 0; + NL_ATTR_FOR_EACH (nla, left, dupcall->key, dupcall->key_len) { + enum ovs_key_attr type = nl_attr_type(nla); + if (type == OVS_KEY_ATTR_IN_PORT + || type == OVS_KEY_ATTR_TCP + || type == OVS_KEY_ATTR_UDP) { + if (nl_attr_get_size(nla) == 4) { + ovs_be32 attr = nl_attr_get_be32(nla); + hash = mhash_add(hash, (OVS_FORCE uint32_t) attr); + n_bytes += 4; + } else { + VLOG_WARN("Netlink attribute with incorrect size."); + } + } + } + hash = mhash_finish(hash, n_bytes); + + handler = &udpif->handlers[hash % udpif->n_handlers]; + + ovs_mutex_lock(&handler->mutex); + if (handler->n_upcalls < MAX_QUEUE_LENGTH) { + list_push_back(&handler->upcalls, &upcall->list_node); + handler->n_upcalls++; + xpthread_cond_signal(&handler->wake_cond); + ovs_mutex_unlock(&handler->mutex); + if (!VLOG_DROP_DBG(&rl)) { + struct ds ds = DS_EMPTY_INITIALIZER; + + odp_flow_key_format(upcall->dpif_upcall.key, + upcall->dpif_upcall.key_len, + &ds); + VLOG_DBG("dispatcher: miss enqueue (%s)", ds_cstr(&ds)); + ds_destroy(&ds); + } + } else { + ovs_mutex_unlock(&handler->mutex); + COVERAGE_INC(miss_queue_overflow); + upcall_destroy(upcall); + } + } else { + ovs_mutex_lock(&udpif->upcall_mutex); + if (udpif->n_upcalls < MAX_QUEUE_LENGTH) { + udpif->n_upcalls++; + list_push_back(&udpif->upcalls, &upcall->list_node); + ovs_mutex_unlock(&udpif->upcall_mutex); + seq_change(udpif->wait_seq); + } else { + 
ovs_mutex_unlock(&udpif->upcall_mutex); + COVERAGE_INC(upcall_queue_overflow); + upcall_destroy(upcall); + } + } + } +} + +static struct flow_miss * +flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto, + const struct flow *flow, uint32_t hash) +{ + struct flow_miss *miss; + + HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) { + if (miss->ofproto == ofproto && flow_equal(&miss->flow, flow)) { + return miss; + } + } + + return NULL; +} + +/* Executes flow miss 'miss'. May add any required datapath operations + * to 'ops', incrementing '*n_ops' for each new op. */ +static void +execute_flow_miss(struct flow_miss *miss, struct dpif_op *ops, size_t *n_ops) +{ + struct ofproto_dpif *ofproto = miss->ofproto; + struct flow_wildcards wc; + struct rule_dpif *rule; + struct ofpbuf *packet; + struct xlate_in xin; + + memset(&miss->stats, 0, sizeof miss->stats); + miss->stats.used = time_msec(); + LIST_FOR_EACH (packet, list_node, &miss->packets) { + miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow); + miss->stats.n_bytes += packet->size; + miss->stats.n_packets++; + } + + flow_wildcards_init_catchall(&wc); + rule_dpif_lookup(ofproto, &miss->flow, &wc, &rule); + rule_credit_stats(rule, &miss->stats); + xlate_in_init(&xin, ofproto, &miss->flow, rule, miss->stats.tcp_flags, + NULL); + xin.may_learn = true; + xin.resubmit_stats = &miss->stats; + xlate_actions(&xin, &miss->xout); + flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc); + + if (rule->up.cr.priority == FAIL_OPEN_PRIORITY) { + struct ofputil_packet_in pin; + + /* Extra-special case for fail-open mode. + * + * We are in fail-open mode and the packet matched the fail-open + * rule, but we are connected to a controller too. We should send + * the packet up to the controller in the hope that it will try to + * set up a flow and thereby allow us to exit fail-open. + * + * See the top-level comment in fail-open.c for more information. */ + pin.packet = packet->data; + pin.packet_len = packet->size; + pin.reason = OFPR_NO_MATCH; + pin.controller_id = 0; + pin.table_id = 0; + pin.cookie = 0; + pin.send_len = 0; /* Not used for flow table misses. */ + flow_get_metadata(&miss->flow, &pin.fmd); + ofproto_dpif_send_packet_in(ofproto, &pin); + } + + if (miss->xout.slow) { + LIST_FOR_EACH (packet, list_node, &miss->packets) { + struct xlate_in xin; + + xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); + xlate_actions_for_side_effects(&xin); + } + } + rule_release(rule); + + if (miss->xout.odp_actions.size) { + LIST_FOR_EACH (packet, list_node, &miss->packets) { + struct dpif_op *op = &ops[*n_ops]; + struct dpif_execute *execute = &op->u.execute; + + if (miss->flow.in_port.ofp_port + != vsp_realdev_to_vlandev(miss->ofproto, + miss->flow.in_port.ofp_port, + miss->flow.vlan_tci)) { + /* This packet was received on a VLAN splinter port. We + * added a VLAN to the packet to make the packet resemble + * the flow, but the actions were composed assuming that + * the packet contained no VLAN. So, we must remove the + * VLAN header from the packet before trying to execute the + * actions. 
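(VLAN splinters are a compatibility workaround for devices that cannot handle VLANs natively, so the tag stripped here is one that OVS itself added on receive.) 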
*/ + eth_pop_vlan(packet); + } + + op->type = DPIF_OP_EXECUTE; + execute->key = miss->key; + execute->key_len = miss->key_len; + execute->packet = packet; + execute->actions = miss->xout.odp_actions.data; + execute->actions_len = miss->xout.odp_actions.size; + + (*n_ops)++; + } + } +} + +static void +handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) +{ + struct dpif_op *opsp[FLOW_MISS_MAX_BATCH]; + struct dpif_op ops[FLOW_MISS_MAX_BATCH]; + unsigned int old_reval_seq, new_reval_seq; + struct upcall *upcall, *next; + struct flow_miss_batch *fmb; + size_t n_upcalls, n_ops, i; + struct flow_miss *miss; + + atomic_read(&udpif->reval_seq, &old_reval_seq); + + /* Construct the to-do list. + * + * This just amounts to extracting the flow from each packet and sticking + * the packets that have the same flow in the same "flow_miss" structure so + * that we can process them together. */ + fmb = xmalloc(sizeof *fmb); + hmap_init(&fmb->misses); + n_upcalls = 0; + LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { + struct dpif_upcall *dupcall = &upcall->dpif_upcall; + struct flow_miss *miss = &fmb->miss_buf[n_upcalls]; + struct flow_miss *existing_miss; + struct ofproto_dpif *ofproto; + odp_port_t odp_in_port; + struct flow flow; + uint32_t hash; + int error; + + error = xlate_receive(udpif->backer, dupcall->packet, dupcall->key, + dupcall->key_len, &flow, &miss->key_fitness, + &ofproto, &odp_in_port); + + if (error == ENODEV) { + struct drop_key *drop_key; + + /* Received packet on datapath port for which we couldn't + * associate an ofproto. This can happen if a port is removed + * while traffic is being received. Print a rate-limited message + * in case it happens frequently. Install a drop flow so + * that future packets of the flow are inexpensively dropped + * in the kernel. */ + VLOG_INFO_RL(&rl, "received packet on unassociated datapath port " + "%"PRIu32, odp_in_port); + + drop_key = xmalloc(sizeof *drop_key); + drop_key->key = xmemdup(dupcall->key, dupcall->key_len); + drop_key->key_len = dupcall->key_len; + + ovs_mutex_lock(&udpif->drop_key_mutex); + if (udpif->n_drop_keys < MAX_QUEUE_LENGTH) { + udpif->n_drop_keys++; + list_push_back(&udpif->drop_keys, &drop_key->list_node); + ovs_mutex_unlock(&udpif->drop_key_mutex); + seq_change(udpif->wait_seq); + } else { + ovs_mutex_unlock(&udpif->drop_key_mutex); + COVERAGE_INC(drop_queue_overflow); + drop_key_destroy(drop_key); + } + continue; + } else if (error) { + continue; + } + + flow_extract(dupcall->packet, flow.skb_priority, flow.pkt_mark, + &flow.tunnel, &flow.in_port, &miss->flow); + + /* Add other packets to a to-do list. */ + hash = flow_hash(&miss->flow, 0); + existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, hash); + if (!existing_miss) { + hmap_insert(&fmb->misses, &miss->hmap_node, hash); + miss->ofproto = ofproto; + miss->key = dupcall->key; + miss->key_len = dupcall->key_len; + miss->upcall_type = dupcall->type; + list_init(&miss->packets); + list_init(&miss->upcalls); + + n_upcalls++; + } else { + miss = existing_miss; + } + list_push_back(&miss->packets, &dupcall->packet->list_node); + + list_remove(&upcall->list_node); + list_push_back(&miss->upcalls, &upcall->list_node); + } + + LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { + list_remove(&upcall->list_node); + upcall_destroy(upcall); + } + + /* Process each element in the to-do list, constructing the set of + * operations to batch. 
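Batching matters because dpif_operate() can hand the whole array of operations to the datapath at once, which is much cheaper than issuing a separate execute for every packet. 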
*/ + n_ops = 0; + HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { + execute_flow_miss(miss, ops, &n_ops); + } + ovs_assert(n_ops <= ARRAY_SIZE(ops)); + + /* Execute batch. */ + for (i = 0; i < n_ops; i++) { + opsp[i] = &ops[i]; + } + dpif_operate(udpif->dpif, opsp, n_ops); + + ovs_mutex_lock(&udpif->fmb_mutex); + atomic_read(&udpif->reval_seq, &new_reval_seq); + if (old_reval_seq != new_reval_seq) { + /* udpif_revalidate() was called as we were calculating the actions. + * To be safe, we need to assume all the misses need revalidation. */ + ovs_mutex_unlock(&udpif->fmb_mutex); + flow_miss_batch_destroy(fmb); + } else if (udpif->n_fmbs < MAX_QUEUE_LENGTH) { + udpif->n_fmbs++; + list_push_back(&udpif->fmbs, &fmb->list_node); + ovs_mutex_unlock(&udpif->fmb_mutex); + seq_change(udpif->wait_seq); + } else { + COVERAGE_INC(fmb_queue_overflow); + ovs_mutex_unlock(&udpif->fmb_mutex); + flow_miss_batch_destroy(fmb); + } +} diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h new file mode 100644 index 000000000..f74206031 --- /dev/null +++ b/ofproto/ofproto-dpif-upcall.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2013 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#ifndef OFPROTO_DPIF_UPCALL_H +#define OFPROTO_DPIF_UPCALL_H + +#define FLOW_MISS_MAX_BATCH 50 + +#include "dpif.h" +#include "flow.h" +#include "hmap.h" +#include "list.h" +#include "odp-util.h" +#include "ofpbuf.h" +#include "ofproto-dpif-xlate.h" + +struct dpif; +struct dpif_backer; + +/* udpif is responsible for retrieving upcalls from the kernel, processing miss + * upcalls, and handing more complex ones up to the main ofproto-dpif + * module. */ + +struct udpif *udpif_create(struct dpif_backer *, struct dpif *); +void udpif_recv_set(struct udpif *, size_t n_workers, bool enable); +void udpif_destroy(struct udpif *); + +void udpif_run(struct udpif *); +void udpif_wait(struct udpif *); + +void udpif_revalidate(struct udpif *); + +/* udpif can handle some upcalls on its own. Others need the main ofproto_dpif + * code to handle them. This interface passes upcalls not handled by udpif up + * to the ofproto_dpif main thread. */ + +/* Type of an upcall. */ +enum upcall_type { + /* Handled internally by udpif code. Not returned by upcall_next(). */ + BAD_UPCALL, /* Some kind of bug somewhere. */ + MISS_UPCALL, /* A flow miss. */ + + /* Require main thread's involvement. May be returned by upcall_next(). */ + SFLOW_UPCALL, /* sFlow sample. */ + FLOW_SAMPLE_UPCALL, /* Per-flow sampling. */ + IPFIX_UPCALL /* Per-bridge sampling. */ +}; + +/* An upcall. */ +struct upcall { + struct list list_node; /* For queuing upcalls. */ + + enum upcall_type type; /* Classification. */ + + /* Raw upcall plus data for keeping track of the memory backing it. */ + struct dpif_upcall dpif_upcall; /* As returned by dpif_recv(). */ + struct ofpbuf upcall_buf; /* Owns some data in 'dpif_upcall'. */ + uint64_t upcall_stub[256 / 8]; /* Buffer to reduce need for malloc(). 
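ofpbuf_use_stub() points 'upcall_buf' at this stub, so dpif_recv() falls back to malloc() only for upcalls that outgrow it. 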
*/ +}; + +struct upcall *upcall_next(struct udpif *); +void upcall_destroy(struct upcall *); + +/* udpif figures out how to forward packets, and does forward them, but it + * can't set up datapath flows on its own. This interface passes packet + * forwarding data from udpif to the higher-level ofproto_dpif to allow the + * latter to set up datapath flows. */ + +/* Flow miss batching. + * + * Some dpifs implement operations faster when you hand them off in a batch. + * To allow batching, "struct flow_miss" queues the dpif-related work needed + * for a given flow. Each "struct flow_miss" corresponds to sending one or + * more packets, plus possibly installing the flow in the dpif. */ +struct flow_miss { + struct hmap_node hmap_node; + struct ofproto_dpif *ofproto; + + struct flow flow; + enum odp_key_fitness key_fitness; + const struct nlattr *key; + size_t key_len; + struct list packets; + enum dpif_upcall_type upcall_type; + struct dpif_flow_stats stats; + + struct xlate_out xout; + + struct list upcalls; +}; + +struct flow_miss_batch { + struct list list_node; + + struct flow_miss miss_buf[FLOW_MISS_MAX_BATCH]; + struct hmap misses; +}; + +struct flow_miss_batch *flow_miss_batch_next(struct udpif *); +void flow_miss_batch_destroy(struct flow_miss_batch *); + +/* Drop keys are odp flow keys which have drop flows installed in the kernel. + * These are datapath flows which have no associated ofproto; if they did, we + * would use facets. + * + * udpif can't install drop flows by itself. This interface allows udpif to + * pass the drop flows up to ofproto_dpif to get it to install them. */ +struct drop_key { + struct hmap_node hmap_node; + struct list list_node; + struct nlattr *key; + size_t key_len; +}; + +struct drop_key *drop_key_next(struct udpif *); +void drop_key_destroy(struct drop_key *); +void udpif_drop_key_clear(struct udpif *); + +#endif /* ofproto-dpif-upcall.h */ diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 30697ac7b..8be808827 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -201,8 +201,6 @@ static void output_normal(struct xlate_ctx *, const struct xbundle *, uint16_t vlan); static void compose_output_action(struct xlate_ctx *, ofp_port_t ofp_port); -static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); - static struct xbridge *xbridge_lookup(const struct ofproto_dpif *); static struct xbundle *xbundle_lookup(const struct ofbundle *); static struct xport *xport_lookup(const struct ofport_dpif *); @@ -1519,7 +1517,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, struct flow_wildcards *wc = &ctx->xout->wc; struct flow *flow = &ctx->xin->flow; ovs_be16 flow_vlan_tci; - uint32_t flow_skb_mark; + uint32_t flow_pkt_mark; uint8_t flow_nw_tos; odp_port_t out_port, odp_port; uint8_t dscp; @@ -1587,7 +1585,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, } flow_vlan_tci = flow->vlan_tci; - flow_skb_mark = flow->skb_mark; + flow_pkt_mark = flow->pkt_mark; flow_nw_tos = flow->nw_tos; if (dscp_from_skb_priority(xport, flow->skb_priority, &dscp)) { @@ -1633,7 +1631,6 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, out_port = ofp_port_to_odp_port(ctx->xbridge, vlandev_port); flow->vlan_tci = htons(0); } - flow->skb_mark &= ~IPSEC_MARK; } if (out_port != ODPP_NONE) { @@ -1650,7 +1647,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, out: /* Restore flow */ flow->vlan_tci = flow_vlan_tci; - flow->skb_mark = flow_skb_mark; + 
flow->pkt_mark = flow_pkt_mark; flow->nw_tos = flow_nw_tos; } @@ -1660,34 +1657,6 @@ compose_output_action(struct xlate_ctx *ctx, ofp_port_t ofp_port) compose_output_action__(ctx, ofp_port, true); } -/* Common rule processing in one place to avoid duplicating code. */ -static struct rule_dpif * -ctx_rule_hooks(struct xlate_ctx *ctx, struct rule_dpif *rule, - bool may_packet_in) -{ - if (ctx->xin->resubmit_hook) { - ctx->xin->resubmit_hook(ctx->xin, rule, ctx->recurse); - } - if (rule == NULL && may_packet_in) { - struct xport *xport; - - /* XXX - * check if table configuration flags - * OFPTC_TABLE_MISS_CONTROLLER, default. - * OFPTC_TABLE_MISS_CONTINUE, - * OFPTC_TABLE_MISS_DROP - * When OF1.0, OFPTC_TABLE_MISS_CONTINUE is used. What to do? */ - xport = get_ofp_port(ctx->xbridge, ctx->xin->flow.in_port.ofp_port); - rule = choose_miss_rule(xport ? xport->config : 0, - ctx->xbridge->miss_rule, - ctx->xbridge->no_packet_in_rule); - } - if (rule && ctx->xin->resubmit_stats) { - rule_credit_stats(rule, ctx->xin->resubmit_stats); - } - return rule; -} - static void xlate_table_action(struct xlate_ctx *ctx, ofp_port_t in_port, uint8_t table_id, bool may_packet_in) @@ -1701,15 +1670,39 @@ xlate_table_action(struct xlate_ctx *ctx, /* Look up a flow with 'in_port' as the input port. */ ctx->xin->flow.in_port.ofp_port = in_port; - rule = rule_dpif_lookup_in_table(ctx->xbridge->ofproto, - &ctx->xin->flow, &ctx->xout->wc, - table_id); + rule_dpif_lookup_in_table(ctx->xbridge->ofproto, &ctx->xin->flow, + &ctx->xout->wc, table_id, &rule); /* Restore the original input port. Otherwise OFPP_NORMAL and * OFPP_IN_PORT will have surprising behavior. */ ctx->xin->flow.in_port.ofp_port = old_in_port; - rule = ctx_rule_hooks(ctx, rule, may_packet_in); + if (ctx->xin->resubmit_hook) { + ctx->xin->resubmit_hook(ctx->xin, rule, ctx->recurse); + } + + if (rule == NULL && may_packet_in) { + struct xport *xport; + + /* Makes clang's thread safety analysis happy. */ + rule_release(rule); + + /* XXX + * check if table configuration flags + * OFPTC_TABLE_MISS_CONTROLLER, default. + * OFPTC_TABLE_MISS_CONTINUE, + * OFPTC_TABLE_MISS_DROP + * When OF1.0, OFPTC_TABLE_MISS_CONTINUE is used. What to do? */ + xport = get_ofp_port(ctx->xbridge, ctx->xin->flow.in_port.ofp_port); + rule = choose_miss_rule(xport ? xport->config : 0, + ctx->xbridge->miss_rule, + ctx->xbridge->no_packet_in_rule); + ovs_rwlock_rdlock(&rule->up.evict); + } + + if (rule && ctx->xin->resubmit_stats) { + rule_credit_stats(rule, ctx->xin->resubmit_stats); + } if (rule) { struct rule_dpif *old_rule = ctx->rule; @@ -1720,6 +1713,7 @@ xlate_table_action(struct xlate_ctx *ctx, ctx->rule = old_rule; ctx->recurse--; } + rule_release(rule); ctx->table_id = old_table_id; } else { @@ -1788,7 +1782,7 @@ execute_controller_action(struct xlate_ctx *ctx, int len, packet = ofpbuf_clone(ctx->xin->packet); key.skb_priority = 0; - key.skb_mark = 0; + key.pkt_mark = 0; memset(&key.tunnel, 0, sizeof key.tunnel); commit_odp_actions(&ctx->xin->flow, &ctx->base_flow, @@ -2174,39 +2168,14 @@ may_receive(const struct xport *xport, struct xlate_ctx *ctx) return true; } -static bool -tunnel_ecn_ok(struct xlate_ctx *ctx) -{ - if (is_ip_any(&ctx->base_flow) - && (ctx->xin->flow.tunnel.ip_tos & IP_ECN_MASK) == IP_ECN_CE) { - if ((ctx->base_flow.nw_tos & IP_ECN_MASK) == IP_ECN_NOT_ECT) { - VLOG_WARN_RL(&rl, "dropping tunnel packet marked ECN CE" - " but is not ECN capable"); - return false; - } else { - /* Set the ECN CE value in the tunneled packet. 
*/ - ctx->xin->flow.nw_tos |= IP_ECN_CE; - } - } - - return true; -} - static void do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, struct xlate_ctx *ctx) { struct flow_wildcards *wc = &ctx->xout->wc; struct flow *flow = &ctx->xin->flow; - bool was_evictable = true; const struct ofpact *a; - if (ctx->rule) { - /* Don't let the rule we're working on get evicted underneath us. */ - was_evictable = ctx->rule->up.evictable; - ctx->rule->up.evictable = false; - } - OFPACT_FOR_EACH (a, ofpacts, ofpacts_len) { struct ofpact_controller *controller; const struct ofpact_metadata *metadata; @@ -2352,20 +2321,20 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_MPLS_TTL: if (compose_set_mpls_ttl_action(ctx, ofpact_get_SET_MPLS_TTL(a)->ttl)) { - goto out; + return; } break; case OFPACT_DEC_MPLS_TTL: if (compose_dec_mpls_ttl_action(ctx)) { - goto out; + return; } break; case OFPACT_DEC_TTL: wc->masks.nw_ttl = 0xff; if (compose_dec_ttl(ctx, ofpact_get_DEC_TTL(a))) { - goto out; + return; } break; @@ -2432,11 +2401,6 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, break; } } - -out: - if (ctx->rule) { - ctx->rule->up.evictable = was_evictable; - } } void @@ -2567,6 +2531,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) struct flow orig_flow; struct xlate_ctx ctx; size_t ofpacts_len; + bool tnl_may_send; COVERAGE_INC(xlate_actions); @@ -2622,12 +2587,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) memset(&wc->masks.dl_type, 0xff, sizeof wc->masks.dl_type); wc->masks.nw_frag |= FLOW_NW_FRAG_MASK; - if (tnl_port_should_receive(&ctx.xin->flow)) { - memset(&wc->masks.tunnel, 0xff, sizeof wc->masks.tunnel); - /* skb_mark is currently used only by tunnels but that will likely - * change in the future. */ - memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark); - } + tnl_may_send = tnl_xlate_init(&ctx.base_flow, flow, wc); if (ctx.xbridge->has_netflow) { netflow_mask_wc(flow, wc); } @@ -2696,7 +2656,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) add_ipfix_action(&ctx); sample_actions_len = ctx.xout->odp_actions.size; - if (tunnel_ecn_ok(&ctx) && (!in_port || may_receive(in_port, &ctx))) { + if (tnl_may_send && (!in_port || may_receive(in_port, &ctx))) { do_xlate_actions(ofpacts, ofpacts_len, &ctx); /* We've let OFPP_NORMAL and the learning action look at the diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index 1c37bc335..ba24e926f 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -12,8 +12,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef OFPROT_DPIF_XLATE_H -#define OFPROT_DPIF_XLATE_H 1 +#ifndef OFPROTO_DPIF_XLATE_H +#define OFPROTO_DPIF_XLATE_H 1 #include "flow.h" #include "meta-flow.h" diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 15826193f..229b16cc7 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -52,6 +52,7 @@ #include "ofproto-dpif-ipfix.h" #include "ofproto-dpif-mirror.h" #include "ofproto-dpif-sflow.h" +#include "ofproto-dpif-upcall.h" #include "ofproto-dpif-xlate.h" #include "poll-loop.h" #include "simap.h" @@ -74,6 +75,8 @@ COVERAGE_DEFINE(subfacet_install_fail); COVERAGE_DEFINE(packet_in_overflow); COVERAGE_DEFINE(flow_mod_overflow); +#define N_THREADS 16 + /* Number of implemented OpenFlow tables. */ enum { N_TABLES = 255 }; enum { TBL_INTERNAL = N_TABLES - 1 }; /* Used for internal hidden rules. 
*/ @@ -82,10 +85,6 @@ BUILD_ASSERT_DECL(N_TABLES >= 2 && N_TABLES <= 255); struct flow_miss; struct facet; -static struct rule_dpif *rule_dpif_lookup(struct ofproto_dpif *, - const struct flow *, - struct flow_wildcards *wc); - static void rule_get_stats(struct rule *, uint64_t *packets, uint64_t *bytes); struct ofbundle { @@ -170,8 +169,7 @@ struct subfacet { #define SUBFACET_DESTROY_MAX_BATCH 50 -static struct subfacet *subfacet_create(struct facet *, struct flow_miss *miss, - long long int now); +static struct subfacet *subfacet_create(struct facet *, struct flow_miss *); static struct subfacet *subfacet_find(struct dpif_backer *, const struct nlattr *key, size_t key_len, uint32_t key_hash); @@ -247,7 +245,6 @@ struct facet { uint8_t tcp_flags; /* TCP flags seen for this 'rule'. */ struct xlate_out xout; - bool fail_open; /* Facet matched the fail open rule. */ /* Storage for a single subfacet, to reduce malloc() time and space * overhead. (A facet always has at least one subfacet and in the common @@ -259,9 +256,7 @@ struct facet { long long int learn_rl; /* Rate limiter for facet_learn(). */ }; -static struct facet *facet_create(const struct flow_miss *, struct rule_dpif *, - struct xlate_out *, - struct dpif_flow_stats *); +static struct facet *facet_create(const struct flow_miss *); static void facet_remove(struct facet *); static void facet_free(struct facet *); @@ -274,6 +269,8 @@ static bool facet_check_consistency(struct facet *); static void facet_flush_stats(struct facet *); static void facet_reset_counters(struct facet *); +static void flow_push_stats(struct ofproto_dpif *, struct flow *, + struct dpif_flow_stats *, bool may_learn); static void facet_push_stats(struct facet *, bool may_learn); static void facet_learn(struct facet *); static void facet_account(struct facet *); @@ -382,15 +379,6 @@ COVERAGE_DEFINE(rev_flow_table); COVERAGE_DEFINE(rev_mac_learning); COVERAGE_DEFINE(rev_inconsistency); -/* Drop keys are odp flow keys which have drop flows installed in the kernel. - * These are datapath flows which have no associated ofproto, if they did we - * would use facets. */ -struct drop_key { - struct hmap_node hmap_node; - struct nlattr *key; - size_t key_len; -}; - struct avg_subfacet_rates { double add_rate; /* Moving average of new flows created per minute. */ double del_rate; /* Moving average of flows deleted per minute. */ @@ -401,6 +389,7 @@ struct dpif_backer { char *type; int refcount; struct dpif *dpif; + struct udpif *udpif; struct timer next_expiration; struct ovs_rwlock odp_to_ofport_lock; @@ -534,8 +523,7 @@ static void ofproto_trace(struct ofproto_dpif *, const struct flow *, const struct ofpbuf *packet, struct ds *); /* Upcalls. */ -#define FLOW_MISS_MAX_BATCH 50 -static int handle_upcalls(struct dpif_backer *, unsigned int max_batch); +static void handle_upcalls(struct dpif_backer *); /* Flow expiration. 
*/ static int expire(struct dpif_backer *); @@ -708,9 +696,11 @@ type_run(const char *type) error = dpif_recv_set(backer->dpif, backer->recv_set_enable); if (error) { + udpif_recv_set(backer->udpif, 0, false); VLOG_ERR("Failed to enable receiving packets in dpif."); return error; } + udpif_recv_set(backer->udpif, N_THREADS, backer->recv_set_enable); dpif_flow_flush(backer->dpif); backer->need_revalidate = REV_RECONFIGURE; } @@ -841,6 +831,8 @@ type_run(const char *type) run_fast_rl(); } } + + udpif_revalidate(backer->udpif); } if (!backer->recv_set_enable) { @@ -1004,32 +996,10 @@ process_dpif_port_error(struct dpif_backer *backer, int error) } static int -dpif_backer_run_fast(struct dpif_backer *backer, int max_batch) +dpif_backer_run_fast(struct dpif_backer *backer) { - unsigned int work; - - /* If recv_set_enable is false, we should not handle upcalls. */ - if (!backer->recv_set_enable) { - return 0; - } - - /* Handle one or more batches of upcalls, until there's nothing left to do - * or until we do a fixed total amount of work. - * - * We do work in batches because it can be much cheaper to set up a number - * of flows and fire off their patches all at once. We do multiple batches - * because in some cases handling a packet can cause another packet to be - * queued almost immediately as part of the return flow. Both - * optimizations can make major improvements on some benchmarks and - * presumably for real traffic as well. */ - work = 0; - while (work < max_batch) { - int retval = handle_upcalls(backer, max_batch - work); - if (retval <= 0) { - return -retval; - } - work += retval; - } + udpif_run(backer->udpif); + handle_upcalls(backer); return 0; } @@ -1046,14 +1016,13 @@ type_run_fast(const char *type) return 0; } - return dpif_backer_run_fast(backer, FLOW_MISS_MAX_BATCH); + return dpif_backer_run_fast(backer); } static void run_fast_rl(void) { static long long int port_rl = LLONG_MIN; - static unsigned int backer_rl = 0; if (time_msec() >= port_rl) { struct ofproto_dpif *ofproto; @@ -1063,23 +1032,6 @@ run_fast_rl(void) } port_rl = time_msec() + 200; } - - /* XXX: We have to be careful not to do too much work in this function. If - * we call dpif_backer_run_fast() too often, or with too large a batch, - * performance improves signifcantly, but at a cost. It's possible for the - * number of flows in the datapath to increase without bound, and for poll - * loops to take 10s of seconds. The correct solution to this problem, - * long term, is to separate flow miss handling into it's own thread so it - * isn't affected by revalidations, and expirations. Until then, this is - * the best we can do. 
*/ - if (++backer_rl >= 10) { - struct shash_node *node; - - backer_rl = 0; - SHASH_FOR_EACH (node, &all_dpif_backers) { - dpif_backer_run_fast(node->data, 1); - } - } } static void @@ -1139,6 +1091,7 @@ close_dpif_backer(struct dpif_backer *backer) node = shash_find(&all_dpif_backers, backer->type); free(backer->type); shash_delete(&all_dpif_backers, node); + udpif_destroy(backer->udpif); dpif_close(backer->dpif); ovs_assert(hmap_is_empty(&backer->subfacets)); @@ -1208,6 +1161,7 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) free(backer); return error; } + backer->udpif = udpif_create(backer, backer->dpif); backer->type = xstrdup(type); backer->governor = NULL; @@ -1255,6 +1209,7 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) close_dpif_backer(backer); return error; } + udpif_recv_set(backer->udpif, N_THREADS, backer->recv_set_enable); backer->max_n_subfacet = 0; backer->created = time_msec(); @@ -1387,9 +1342,12 @@ add_internal_flow(struct ofproto_dpif *ofproto, int id, return error; } - *rulep = rule_dpif_lookup_in_table(ofproto, &fm.match.flow, NULL, - TBL_INTERNAL); - ovs_assert(*rulep != NULL); + if (rule_dpif_lookup_in_table(ofproto, &fm.match.flow, NULL, TBL_INTERNAL, + rulep)) { + ovs_rwlock_unlock(&(*rulep)->up.evict); + } else { + NOT_REACHED(); + } return 0; } @@ -1668,7 +1626,7 @@ wait(struct ofproto *ofproto_) } dpif_wait(ofproto->backer->dpif); - dpif_recv_wait(ofproto->backer->dpif); + udpif_wait(ofproto->backer->udpif); if (ofproto->sflow) { dpif_sflow_wait(ofproto->sflow); } @@ -2442,7 +2400,7 @@ bundle_add_port(struct ofbundle *bundle, ofp_port_t ofp_port, if (port->bundle != bundle) { bundle->ofproto->backer->need_revalidate = REV_RECONFIGURE; if (port->bundle) { - bundle_del_port(port); + bundle_remove(&port->up); } port->bundle = bundle; @@ -2905,7 +2863,7 @@ ofport_update_peer(struct ofport_dpif *ofport) { const struct ofproto_dpif *ofproto; struct dpif_backer *backer; - const char *peer_name; + char *peer_name; if (!netdev_vport_is_patch(ofport->up.netdev)) { return; @@ -2927,7 +2885,7 @@ ofport_update_peer(struct ofport_dpif *ofport) HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { struct ofport *peer_ofport; struct ofport_dpif *peer; - const char *peer_peer; + char *peer_peer; if (ofproto->backer != backer) { continue; @@ -2945,9 +2903,11 @@ ofport_update_peer(struct ofport_dpif *ofport) ofport->peer = peer; ofport->peer->peer = ofport; } + free(peer_peer); - return; + break; } + free(peer_name); } static void @@ -3270,26 +3230,6 @@ port_is_lacp_current(const struct ofport *ofport_) /* Upcall handling. */ -/* Flow miss batching. - * - * Some dpifs implement operations faster when you hand them off in a batch. - * To allow batching, "struct flow_miss" queues the dpif-related work needed - * for a given flow. Each "struct flow_miss" corresponds to sending one or - * more packets, plus possibly installing the flow in the dpif. - * - * So far we only batch the operations that affect flow setup time the most. - * It's possible to batch more than that, but the benefit might be minimal. 
*/ -struct flow_miss { - struct hmap_node hmap_node; - struct ofproto_dpif *ofproto; - struct flow flow; - enum odp_key_fitness key_fitness; - const struct nlattr *key; - size_t key_len; - struct list packets; - enum dpif_upcall_type upcall_type; -}; - struct flow_miss_op { struct dpif_op dpif_op; @@ -3305,96 +3245,6 @@ struct flow_miss_op { struct subfacet *subfacet; }; -/* Sends an OFPT_PACKET_IN message for 'packet' of type OFPR_NO_MATCH to each - * OpenFlow controller as necessary according to their individual - * configurations. */ -static void -send_packet_in_miss(struct ofproto_dpif *ofproto, const struct ofpbuf *packet, - const struct flow *flow) -{ - struct ofputil_packet_in pin; - - pin.packet = packet->data; - pin.packet_len = packet->size; - pin.reason = OFPR_NO_MATCH; - pin.controller_id = 0; - - pin.table_id = 0; - pin.cookie = 0; - - pin.send_len = 0; /* not used for flow table misses */ - - flow_get_metadata(flow, &pin.fmd); - - connmgr_send_packet_in(ofproto->up.connmgr, &pin); -} - -static struct flow_miss * -flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto, - const struct flow *flow, uint32_t hash) -{ - struct flow_miss *miss; - - HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) { - if (miss->ofproto == ofproto && flow_equal(&miss->flow, flow)) { - return miss; - } - } - - return NULL; -} - -/* Partially Initializes 'op' as an "execute" operation for 'miss' and - * 'packet'. The caller must initialize op->actions and op->actions_len. If - * 'miss' is associated with a subfacet the caller must also initialize the - * returned op->subfacet, and if anything needs to be freed after processing - * the op, the caller must initialize op->garbage also. */ -static void -init_flow_miss_execute_op(struct flow_miss *miss, struct ofpbuf *packet, - struct flow_miss_op *op) -{ - if (miss->flow.in_port.ofp_port - != vsp_realdev_to_vlandev(miss->ofproto, miss->flow.in_port.ofp_port, - miss->flow.vlan_tci)) { - /* This packet was received on a VLAN splinter port. We - * added a VLAN to the packet to make the packet resemble - * the flow, but the actions were composed assuming that - * the packet contained no VLAN. So, we must remove the - * VLAN header from the packet before trying to execute the - * actions. */ - eth_pop_vlan(packet); - } - - op->subfacet = NULL; - op->xout_garbage = false; - op->dpif_op.type = DPIF_OP_EXECUTE; - op->dpif_op.u.execute.key = miss->key; - op->dpif_op.u.execute.key_len = miss->key_len; - op->dpif_op.u.execute.packet = packet; - ofpbuf_use_stack(&op->mask, &op->maskbuf, sizeof op->maskbuf); -} - -/* Helper for handle_flow_miss_without_facet() and - * handle_flow_miss_with_facet(). */ -static void -handle_flow_miss_common(struct ofproto_dpif *ofproto, struct ofpbuf *packet, - const struct flow *flow, bool fail_open) -{ - if (fail_open) { - /* - * Extra-special case for fail-open mode. - * - * We are in fail-open mode and the packet matched the fail-open - * rule, but we are connected to a controller too. We should send - * the packet up to the controller in the hope that it will try to - * set up a flow and thereby allow us to exit fail-open. - * - * See the top-level comment in fail-open.c for more information. - */ - send_packet_in_miss(ofproto, packet, flow); - } -} - /* Figures out whether a flow that missed in 'ofproto', whose details are in * 'miss' masked by 'wc', is likely to be worth tracking in detail in userspace * and (usually) installing a datapath flow. 
The answer is usually "yes" (a @@ -3403,7 +3253,7 @@ handle_flow_miss_common(struct ofproto_dpif *ofproto, struct ofpbuf *packet, * flows we impose some heuristics to decide which flows are likely to be worth * tracking. */ static bool -flow_miss_should_make_facet(struct flow_miss *miss, struct flow_wildcards *wc) +flow_miss_should_make_facet(struct flow_miss *miss) { struct dpif_backer *backer = miss->ofproto->backer; uint32_t hash; @@ -3428,96 +3278,34 @@ flow_miss_should_make_facet(struct flow_miss *miss, struct flow_wildcards *wc) backer->governor = governor_create(); } - hash = flow_hash_in_wildcards(&miss->flow, wc, 0); + hash = flow_hash_in_wildcards(&miss->flow, &miss->xout.wc, 0); return governor_should_install_flow(backer->governor, hash, list_size(&miss->packets)); } -/* Handles 'miss' without creating a facet or subfacet or creating any datapath - * flow. 'miss->flow' must have matched 'rule' and been xlated into 'xout'. - * May add an "execute" operation to 'ops' and increment '*n_ops'. */ -static void -handle_flow_miss_without_facet(struct rule_dpif *rule, struct xlate_out *xout, - struct flow_miss *miss, - struct flow_miss_op *ops, size_t *n_ops) -{ - struct ofpbuf *packet; - - LIST_FOR_EACH (packet, list_node, &miss->packets) { - - COVERAGE_INC(facet_suppress); - - handle_flow_miss_common(miss->ofproto, packet, &miss->flow, - rule->up.cr.priority == FAIL_OPEN_PRIORITY); - - if (xout->slow) { - struct xlate_in xin; - - xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); - xlate_actions_for_side_effects(&xin); - } - - if (xout->odp_actions.size) { - struct flow_miss_op *op = &ops[*n_ops]; - struct dpif_execute *execute = &op->dpif_op.u.execute; - - init_flow_miss_execute_op(miss, packet, op); - xlate_out_copy(&op->xout, xout); - execute->actions = op->xout.odp_actions.data; - execute->actions_len = op->xout.odp_actions.size; - op->xout_garbage = true; - - (*n_ops)++; - } - } -} - /* Handles 'miss', which matches 'facet'. May add any required datapath * operations to 'ops', incrementing '*n_ops' for each new op. * - * All of the packets in 'miss' are considered to have arrived at time 'now'. - * This is really important only for new facets: if we just called time_msec() - * here, then the new subfacet or its packets could look (occasionally) as - * though it was used some time after the facet was used. That can make a - * one-packet flow look like it has a nonzero duration, which looks odd in - * e.g. NetFlow statistics. - * - * If non-null, 'stats' will be folded into 'facet'. */ + * All of the packets in 'miss' are considered to have arrived at time + * 'miss->stats.used'. This is really important only for new facets: if we + * just called time_msec() here, then the new subfacet or its packets could + * look (occasionally) as though it was used some time after the facet was + * used. That can make a one-packet flow look like it has a nonzero duration, + * which looks odd in e.g. NetFlow statistics. */ static void handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet, - long long int now, struct dpif_flow_stats *stats, struct flow_miss_op *ops, size_t *n_ops) { enum subfacet_path want_path; struct subfacet *subfacet; - struct ofpbuf *packet; - want_path = facet->xout.slow ? 
SF_SLOW_PATH : SF_FAST_PATH; - - LIST_FOR_EACH (packet, list_node, &miss->packets) { - struct flow_miss_op *op = &ops[*n_ops]; - - handle_flow_miss_common(miss->ofproto, packet, &miss->flow, - facet->fail_open); - - if (want_path != SF_FAST_PATH) { - struct rule_dpif *rule; - struct xlate_in xin; - - rule = rule_dpif_lookup(facet->ofproto, &facet->flow, NULL); - xlate_in_init(&xin, facet->ofproto, &miss->flow, rule, 0, packet); - xlate_actions_for_side_effects(&xin); - } - - if (facet->xout.odp_actions.size) { - struct dpif_execute *execute = &op->dpif_op.u.execute; + facet->packet_count += miss->stats.n_packets; + facet->prev_packet_count += miss->stats.n_packets; + facet->byte_count += miss->stats.n_bytes; + facet->prev_byte_count += miss->stats.n_bytes; - init_flow_miss_execute_op(miss, packet, op); - execute->actions = facet->xout.odp_actions.data, - execute->actions_len = facet->xout.odp_actions.size; - (*n_ops)++; - } - } + subfacet = subfacet_create(facet, miss); + want_path = facet->xout.slow ? SF_SLOW_PATH : SF_FAST_PATH; /* Don't install the flow if it's the result of the "userspace" * action for an already installed facet. This can occur when a @@ -3526,20 +3314,10 @@ handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet, * be rejected as overlapping by the datapath. */ if (miss->upcall_type == DPIF_UC_ACTION && !list_is_empty(&facet->subfacets)) { - if (stats) { - facet->used = MAX(facet->used, stats->used); - facet->packet_count += stats->n_packets; - facet->byte_count += stats->n_bytes; - facet->tcp_flags |= stats->tcp_flags; - } return; } - subfacet = subfacet_create(facet, miss, now); - if (stats) { - subfacet_update_stats(subfacet, stats); - } - + subfacet = subfacet_create(facet, miss); if (subfacet->path != want_path) { struct flow_miss_op *op = &ops[(*n_ops)++]; struct dpif_flow_put *put = &op->dpif_op.u.flow_put; @@ -3579,55 +3357,25 @@ static void handle_flow_miss(struct flow_miss *miss, struct flow_miss_op *ops, size_t *n_ops) { - struct ofproto_dpif *ofproto = miss->ofproto; - struct dpif_flow_stats stats__; - struct dpif_flow_stats *stats = &stats__; - struct ofpbuf *packet; struct facet *facet; - long long int now; - now = time_msec(); - memset(stats, 0, sizeof *stats); - stats->used = now; - LIST_FOR_EACH (packet, list_node, &miss->packets) { - stats->tcp_flags |= packet_get_tcp_flags(packet, &miss->flow); - stats->n_bytes += packet->size; - stats->n_packets++; - } + miss->ofproto->n_missed += list_size(&miss->packets); - facet = facet_lookup_valid(ofproto, &miss->flow); + facet = facet_lookup_valid(miss->ofproto, &miss->flow); if (!facet) { - struct flow_wildcards wc; - struct rule_dpif *rule; - struct xlate_out xout; - struct xlate_in xin; - - flow_wildcards_init_catchall(&wc); - rule = rule_dpif_lookup(ofproto, &miss->flow, &wc); - rule_credit_stats(rule, stats); - - xlate_in_init(&xin, ofproto, &miss->flow, rule, stats->tcp_flags, - NULL); - xin.resubmit_stats = stats; - xin.may_learn = true; - xlate_actions(&xin, &xout); - flow_wildcards_or(&xout.wc, &xout.wc, &wc); - /* There does not exist a bijection between 'struct flow' and datapath * flow keys with fitness ODP_FIT_TO_LITTLE. This breaks a fundamental * assumption used throughout the facet and subfacet handling code. * Since we have to handle these misses in userspace anyway, we simply * skip facet creation, avoiding the problem altogether. 
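Even when the key does fit, flow_miss_should_make_facet() also consults the backer's governor, which uses the flow's hash and packet count to judge whether a short-lived flow is worth a facet at all. 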
*/ if (miss->key_fitness == ODP_FIT_TOO_LITTLE - || !flow_miss_should_make_facet(miss, &xout.wc)) { - handle_flow_miss_without_facet(rule, &xout, miss, ops, n_ops); + || !flow_miss_should_make_facet(miss)) { return; } - facet = facet_create(miss, rule, &xout, stats); - stats = NULL; + facet = facet_create(miss); } - handle_flow_miss_with_facet(miss, facet, now, stats, ops, n_ops); + handle_flow_miss_with_facet(miss, facet, ops, n_ops); } static struct drop_key * @@ -3666,109 +3414,24 @@ drop_key_clear(struct dpif_backer *backer) } hmap_remove(&backer->drop_keys, &drop_key->hmap_node); - free(drop_key->key); - free(drop_key); + drop_key_destroy(drop_key); } + + udpif_drop_key_clear(backer->udpif); } static void -handle_miss_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls, - size_t n_upcalls) +handle_flow_misses(struct dpif_backer *backer, struct flow_miss_batch *fmb) { - struct dpif_upcall *upcall; + struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH]; + struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH]; struct flow_miss *miss; - struct flow_miss misses[FLOW_MISS_MAX_BATCH]; - struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2]; - struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2]; - struct hmap todo; - int n_misses; - size_t n_ops; - size_t i; - - if (!n_upcalls) { - return; - } - - /* Construct the to-do list. - * - * This just amounts to extracting the flow from each packet and sticking - * the packets that have the same flow in the same "flow_miss" structure so - * that we can process them together. */ - hmap_init(&todo); - n_misses = 0; - for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) { - struct flow_miss *miss = &misses[n_misses]; - struct flow_miss *existing_miss; - struct ofproto_dpif *ofproto; - odp_port_t odp_in_port; - struct flow flow; - uint32_t hash; - int error; - - error = xlate_receive(backer, upcall->packet, upcall->key, - upcall->key_len, &flow, &miss->key_fitness, - &ofproto, &odp_in_port); - if (error == ENODEV) { - struct drop_key *drop_key; - - /* Received packet on datapath port for which we couldn't - * associate an ofproto. This can happen if a port is removed - * while traffic is being received. Print a rate-limited message - * in case it happens frequently. Install a drop flow so - * that future packets of the flow are inexpensively dropped - * in the kernel. */ - VLOG_INFO_RL(&rl, "received packet on unassociated datapath port " - "%"PRIu32, odp_in_port); - - drop_key = drop_key_lookup(backer, upcall->key, upcall->key_len); - if (!drop_key) { - int ret; - ret = dpif_flow_put(backer->dpif, - DPIF_FP_CREATE | DPIF_FP_MODIFY, - upcall->key, upcall->key_len, - NULL, 0, NULL, 0, NULL); - - if (!ret) { - drop_key = xmalloc(sizeof *drop_key); - drop_key->key = xmemdup(upcall->key, upcall->key_len); - drop_key->key_len = upcall->key_len; - - hmap_insert(&backer->drop_keys, &drop_key->hmap_node, - hash_bytes(drop_key->key, drop_key->key_len, 0)); - } - } - continue; - } - if (error) { - continue; - } - - ofproto->n_missed++; - flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark, - &flow.tunnel, &flow.in_port, &miss->flow); - - /* Add other packets to a to-do list. 
*/ - hash = flow_hash(&miss->flow, 0); - existing_miss = flow_miss_find(&todo, ofproto, &miss->flow, hash); - if (!existing_miss) { - hmap_insert(&todo, &miss->hmap_node, hash); - miss->ofproto = ofproto; - miss->key = upcall->key; - miss->key_len = upcall->key_len; - miss->upcall_type = upcall->type; - list_init(&miss->packets); - - n_misses++; - } else { - miss = existing_miss; - } - list_push_back(&miss->packets, &upcall->packet->list_node); - } + size_t n_ops, i; /* Process each element in the to-do list, constructing the set of * operations to batch. */ n_ops = 0; - HMAP_FOR_EACH (miss, hmap_node, &todo) { + HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { handle_flow_miss(miss, flow_miss_ops, &n_ops); } ovs_assert(n_ops <= ARRAY_SIZE(flow_miss_ops)); @@ -3801,66 +3464,6 @@ handle_miss_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls, subfacet->path = SF_NOT_INSTALLED; } - - /* Free memory. */ - if (flow_miss_ops[i].xout_garbage) { - xlate_out_uninit(&flow_miss_ops[i].xout); - } - } - hmap_destroy(&todo); -} - -static enum { SFLOW_UPCALL, MISS_UPCALL, BAD_UPCALL, FLOW_SAMPLE_UPCALL, - IPFIX_UPCALL } -classify_upcall(const struct dpif_upcall *upcall) -{ - size_t userdata_len; - union user_action_cookie cookie; - - /* First look at the upcall type. */ - switch (upcall->type) { - case DPIF_UC_ACTION: - break; - - case DPIF_UC_MISS: - return MISS_UPCALL; - - case DPIF_N_UC_TYPES: - default: - VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32, upcall->type); - return BAD_UPCALL; - } - - /* "action" upcalls need a closer look. */ - if (!upcall->userdata) { - VLOG_WARN_RL(&rl, "action upcall missing cookie"); - return BAD_UPCALL; - } - userdata_len = nl_attr_get_size(upcall->userdata); - if (userdata_len < sizeof cookie.type - || userdata_len > sizeof cookie) { - VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %zu", - userdata_len); - return BAD_UPCALL; - } - memset(&cookie, 0, sizeof cookie); - memcpy(&cookie, nl_attr_get(upcall->userdata), userdata_len); - if (userdata_len == sizeof cookie.sflow - && cookie.type == USER_ACTION_COOKIE_SFLOW) { - return SFLOW_UPCALL; - } else if (userdata_len == sizeof cookie.slow_path - && cookie.type == USER_ACTION_COOKIE_SLOW_PATH) { - return MISS_UPCALL; - } else if (userdata_len == sizeof cookie.flow_sample - && cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE) { - return FLOW_SAMPLE_UPCALL; - } else if (userdata_len == sizeof cookie.ipfix - && cookie.type == USER_ACTION_COOKIE_IPFIX) { - return IPFIX_UPCALL; - } else { - VLOG_WARN_RL(&rl, "invalid user cookie of type %"PRIu16 - " and size %zu", cookie.type, userdata_len); - return BAD_UPCALL; } } @@ -3929,66 +3532,64 @@ handle_ipfix_upcall(struct dpif_backer *backer, dpif_ipfix_bridge_sample(ofproto->ipfix, upcall->packet, &flow); } -static int -handle_upcalls(struct dpif_backer *backer, unsigned int max_batch) +static void +handle_upcalls(struct dpif_backer *backer) { - struct dpif_upcall misses[FLOW_MISS_MAX_BATCH]; - struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH]; - uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8]; + struct flow_miss_batch *fmb; int n_processed; - int n_misses; - int i; - - ovs_assert(max_batch <= FLOW_MISS_MAX_BATCH); - n_misses = 0; - for (n_processed = 0; n_processed < max_batch; n_processed++) { - struct dpif_upcall *upcall = &misses[n_misses]; - struct ofpbuf *buf = &miss_bufs[n_misses]; - int error; + for (n_processed = 0; n_processed < FLOW_MISS_MAX_BATCH; n_processed++) { + struct upcall *upcall = upcall_next(backer->udpif); - 
ofpbuf_use_stub(buf, miss_buf_stubs[n_misses], - sizeof miss_buf_stubs[n_misses]); - error = dpif_recv(backer->dpif, upcall, buf); - if (error) { - ofpbuf_uninit(buf); + if (!upcall) { break; } - switch (classify_upcall(upcall)) { - case MISS_UPCALL: - /* Handle it later. */ - n_misses++; - break; - + switch (upcall->type) { case SFLOW_UPCALL: - handle_sflow_upcall(backer, upcall); - ofpbuf_uninit(buf); + handle_sflow_upcall(backer, &upcall->dpif_upcall); break; case FLOW_SAMPLE_UPCALL: - handle_flow_sample_upcall(backer, upcall); - ofpbuf_uninit(buf); + handle_flow_sample_upcall(backer, &upcall->dpif_upcall); break; case IPFIX_UPCALL: - handle_ipfix_upcall(backer, upcall); - ofpbuf_uninit(buf); + handle_ipfix_upcall(backer, &upcall->dpif_upcall); break; case BAD_UPCALL: - ofpbuf_uninit(buf); break; + + case MISS_UPCALL: + NOT_REACHED(); } + + upcall_destroy(upcall); } - /* Handle deferred MISS_UPCALL processing. */ - handle_miss_upcalls(backer, misses, n_misses); - for (i = 0; i < n_misses; i++) { - ofpbuf_uninit(&miss_bufs[i]); + for (n_processed = 0; n_processed < FLOW_MISS_MAX_BATCH; n_processed++) { + struct drop_key *drop_key = drop_key_next(backer->udpif); + if (!drop_key) { + break; + } + + if (!drop_key_lookup(backer, drop_key->key, drop_key->key_len)) { + hmap_insert(&backer->drop_keys, &drop_key->hmap_node, + hash_bytes(drop_key->key, drop_key->key_len, 0)); + dpif_flow_put(backer->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY, + drop_key->key, drop_key->key_len, + NULL, 0, NULL, 0, NULL); + } else { + drop_key_destroy(drop_key); + } } - return n_processed; + fmb = flow_miss_batch_next(backer->udpif); + if (fmb) { + handle_flow_misses(backer, fmb); + flow_miss_batch_destroy(fmb); + } } /* Flow expiration. */ @@ -4340,10 +3941,12 @@ rule_expire(struct rule_dpif *rule) return; } - COVERAGE_INC(ofproto_dpif_expired); + if (!ovs_rwlock_trywrlock(&rule->up.evict)) { + COVERAGE_INC(ofproto_dpif_expired); - /* Get rid of the rule. */ - ofproto_rule_expire(&rule->up, reason); + /* Get rid of the rule. */ + ofproto_rule_expire(&rule->up, reason); + } } /* Facets. */ @@ -4360,8 +3963,7 @@ rule_expire(struct rule_dpif *rule) * The facet will initially have no subfacets. The caller should create (at * least) one subfacet with subfacet_create(). 
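In the udpif design the caller is handle_flow_miss(), which follows facet_create() with handle_flow_miss_with_facet() and therefore with subfacet_create(). 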
*/ static struct facet * -facet_create(const struct flow_miss *miss, struct rule_dpif *rule, - struct xlate_out *xout, struct dpif_flow_stats *stats) +facet_create(const struct flow_miss *miss) { struct ofproto_dpif *ofproto = miss->ofproto; struct facet *facet; @@ -4369,10 +3971,7 @@ facet_create(const struct flow_miss *miss, struct rule_dpif *rule, facet = xzalloc(sizeof *facet); facet->ofproto = miss->ofproto; - facet->packet_count = facet->prev_packet_count = stats->n_packets; - facet->byte_count = facet->prev_byte_count = stats->n_bytes; - facet->tcp_flags = stats->tcp_flags; - facet->used = stats->used; + facet->used = miss->stats.used; facet->flow = miss->flow; facet->learn_rl = time_msec() + 500; @@ -4380,7 +3979,7 @@ facet_create(const struct flow_miss *miss, struct rule_dpif *rule, netflow_flow_init(&facet->nf_flow); netflow_flow_update_time(ofproto->netflow, &facet->nf_flow, facet->used); - xlate_out_copy(&facet->xout, xout); + xlate_out_copy(&facet->xout, &miss->xout); match_init(&match, &facet->flow, &facet->xout.wc); cls_rule_init(&facet->cr, &match, OFP_DEFAULT_PRIORITY); @@ -4389,8 +3988,6 @@ facet_create(const struct flow_miss *miss, struct rule_dpif *rule, ovs_rwlock_unlock(&ofproto->facets.rwlock); facet->nf_flow.output_iface = facet->xout.nf_output_iface; - facet->fail_open = rule->up.cr.priority == FAIL_OPEN_PRIORITY; - return facet; } @@ -4540,16 +4137,19 @@ facet_is_controller_flow(struct facet *facet) { if (facet) { struct ofproto_dpif *ofproto = facet->ofproto; - const struct rule_dpif *rule = rule_dpif_lookup(ofproto, &facet->flow, - NULL); - const struct ofpact *ofpacts = rule->up.ofpacts; - size_t ofpacts_len = rule->up.ofpacts_len; - - if (ofpacts_len > 0 && - ofpacts->type == OFPACT_CONTROLLER && - ofpact_next(ofpacts) >= ofpact_end(ofpacts, ofpacts_len)) { - return true; - } + const struct ofpact *ofpacts; + struct rule_dpif *rule; + size_t ofpacts_len; + bool is_controller; + + rule_dpif_lookup(ofproto, &facet->flow, NULL, &rule); + ofpacts_len = rule->up.ofpacts_len; + ofpacts = rule->up.ofpacts; + is_controller = ofpacts_len > 0 + && ofpacts->type == OFPACT_CONTROLLER + && ofpact_next(ofpacts) >= ofpact_end(ofpacts, ofpacts_len); + rule_release(rule); + return is_controller; } return false; } @@ -4636,17 +4236,16 @@ facet_check_consistency(struct facet *facet) struct xlate_in xin; struct rule_dpif *rule; - bool ok, fail_open; + bool ok; /* Check the datapath actions for consistency. */ - rule = rule_dpif_lookup(facet->ofproto, &facet->flow, NULL); + rule_dpif_lookup(facet->ofproto, &facet->flow, NULL, &rule); xlate_in_init(&xin, facet->ofproto, &facet->flow, rule, 0, NULL); xlate_actions(&xin, &xout); + rule_release(rule); - fail_open = rule->up.cr.priority == FAIL_OPEN_PRIORITY; ok = ofpbuf_equal(&facet->xout.odp_actions, &xout.odp_actions) - && facet->xout.slow == xout.slow - && facet->fail_open == fail_open; + && facet->xout.slow == xout.slow; if (!ok && !VLOG_DROP_WARN(&rl)) { struct ds s = DS_EMPTY_INITIALIZER; @@ -4667,10 +4266,6 @@ facet_check_consistency(struct facet *facet) ds_put_format(&s, " slow path incorrect. should be %d", xout.slow); } - if (facet->fail_open != fail_open) { - ds_put_format(&s, " fail open incorrect. should be %s", - fail_open ? 
"true" : "false"); - } ds_destroy(&s); } xlate_out_uninit(&xout); @@ -4722,7 +4317,7 @@ facet_revalidate(struct facet *facet) } flow_wildcards_init_catchall(&wc); - new_rule = rule_dpif_lookup(ofproto, &facet->flow, &wc); + rule_dpif_lookup(ofproto, &facet->flow, &wc, &new_rule); /* Calculate new datapath actions. * @@ -4745,6 +4340,7 @@ facet_revalidate(struct facet *facet) || memcmp(&facet->xout.wc, &xout.wc, sizeof xout.wc)) { facet_remove(facet); xlate_out_uninit(&xout); + rule_release(new_rule); return false; } @@ -4774,9 +4370,9 @@ facet_revalidate(struct facet *facet) facet->xout.mirrors = xout.mirrors; facet->nf_flow.output_iface = facet->xout.nf_output_iface; facet->used = MAX(facet->used, new_rule->up.created); - facet->fail_open = new_rule->up.cr.priority == FAIL_OPEN_PRIORITY; xlate_out_uninit(&xout); + rule_release(new_rule); return true; } @@ -4790,6 +4386,28 @@ facet_reset_counters(struct facet *facet) facet->accounted_bytes = 0; } +static void +flow_push_stats(struct ofproto_dpif *ofproto, struct flow *flow, + struct dpif_flow_stats *stats, bool may_learn) +{ + struct ofport_dpif *in_port; + struct rule_dpif *rule; + struct xlate_in xin; + + in_port = get_ofp_port(ofproto, flow->in_port.ofp_port); + if (in_port && in_port->is_tunnel) { + netdev_vport_inc_rx(in_port->up.netdev, stats); + } + + rule_dpif_lookup(ofproto, flow, NULL, &rule); + rule_credit_stats(rule, stats); + xlate_in_init(&xin, ofproto, flow, rule, stats->tcp_flags, NULL); + xin.resubmit_stats = stats; + xin.may_learn = may_learn; + xlate_actions_for_side_effects(&xin); + rule_release(rule); +} + static void facet_push_stats(struct facet *facet, bool may_learn) { @@ -4805,33 +4423,16 @@ facet_push_stats(struct facet *facet, bool may_learn) stats.tcp_flags = facet->tcp_flags; if (may_learn || stats.n_packets || facet->used > facet->prev_used) { - struct ofproto_dpif *ofproto = facet->ofproto; - struct ofport_dpif *in_port; - struct rule_dpif *rule; - struct xlate_in xin; - facet->prev_packet_count = facet->packet_count; facet->prev_byte_count = facet->byte_count; facet->prev_used = facet->used; - in_port = get_ofp_port(ofproto, facet->flow.in_port.ofp_port); - if (in_port && in_port->is_tunnel) { - netdev_vport_inc_rx(in_port->up.netdev, &stats); - } - - rule = rule_dpif_lookup(ofproto, &facet->flow, NULL); - rule_credit_stats(rule, &stats); - netflow_flow_update_time(ofproto->netflow, &facet->nf_flow, + netflow_flow_update_time(facet->ofproto->netflow, &facet->nf_flow, facet->used); netflow_flow_update_flags(&facet->nf_flow, facet->tcp_flags); - mirror_update_stats(ofproto->mbridge, facet->xout.mirrors, + mirror_update_stats(facet->ofproto->mbridge, facet->xout.mirrors, stats.n_packets, stats.n_bytes); - - xlate_in_init(&xin, ofproto, &facet->flow, rule, stats.tcp_flags, - NULL); - xin.resubmit_stats = &stats; - xin.may_learn = may_learn; - xlate_actions_for_side_effects(&xin); + flow_push_stats(facet->ofproto, &facet->flow, &stats, may_learn); } } @@ -4903,8 +4504,7 @@ subfacet_find(struct dpif_backer *backer, const struct nlattr *key, * existing subfacet if there is one, otherwise creates and returns a * new subfacet. 
*/ static struct subfacet * -subfacet_create(struct facet *facet, struct flow_miss *miss, - long long int now) +subfacet_create(struct facet *facet, struct flow_miss *miss) { struct dpif_backer *backer = miss->ofproto->backer; enum odp_key_fitness key_fitness = miss->key_fitness; @@ -4938,8 +4538,8 @@ subfacet_create(struct facet *facet, struct flow_miss *miss, subfacet->key_fitness = key_fitness; subfacet->key = xmemdup(key, key_len); subfacet->key_len = key_len; - subfacet->used = now; - subfacet->created = now; + subfacet->used = miss->stats.used; + subfacet->created = subfacet->used; subfacet->dp_packet_count = 0; subfacet->dp_byte_count = 0; subfacet->path = SF_NOT_INSTALLED; @@ -5132,16 +4732,14 @@ subfacet_update_stats(struct subfacet *subfacet, /* Lookup 'flow' in 'ofproto''s classifier. If 'wc' is non-null, sets * the fields that were relevant as part of the lookup. */ -static struct rule_dpif * +void rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow, - struct flow_wildcards *wc) + struct flow_wildcards *wc, struct rule_dpif **rule) { struct ofport_dpif *port; - struct rule_dpif *rule; - rule = rule_dpif_lookup_in_table(ofproto, flow, wc, 0); - if (rule) { - return rule; + if (rule_dpif_lookup_in_table(ofproto, flow, wc, 0, rule)) { + return; } port = get_ofp_port(ofproto, flow->in_port.ofp_port); if (!port) { @@ -5149,21 +4747,24 @@ rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow, flow->in_port.ofp_port); } - return choose_miss_rule(port ? port->up.pp.config : 0, ofproto->miss_rule, - ofproto->no_packet_in_rule); + *rule = choose_miss_rule(port ? port->up.pp.config : 0, ofproto->miss_rule, + ofproto->no_packet_in_rule); + ovs_rwlock_rdlock(&(*rule)->up.evict); } -struct rule_dpif * +bool rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, const struct flow *flow, struct flow_wildcards *wc, - uint8_t table_id) + uint8_t table_id, struct rule_dpif **rule) + OVS_ACQ_RDLOCK((*rule)->up.evict) { struct cls_rule *cls_rule; struct classifier *cls; bool frag; + *rule = NULL; if (table_id >= N_TABLES) { - return NULL; + return false; } if (wc) { @@ -5172,26 +4773,32 @@ rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, } cls = &ofproto->up.tables[table_id].cls; + ovs_rwlock_rdlock(&cls->rwlock); frag = (flow->nw_frag & FLOW_NW_FRAG_ANY) != 0; if (frag && ofproto->up.frag_handling == OFPC_FRAG_NORMAL) { /* We must pretend that transport ports are unavailable. */ struct flow ofpc_normal_flow = *flow; ofpc_normal_flow.tp_src = htons(0); ofpc_normal_flow.tp_dst = htons(0); - ovs_rwlock_rdlock(&cls->rwlock); cls_rule = classifier_lookup(cls, &ofpc_normal_flow, wc); - ovs_rwlock_unlock(&cls->rwlock); } else if (frag && ofproto->up.frag_handling == OFPC_FRAG_DROP) { cls_rule = &ofproto->drop_frags_rule->up.cr; if (wc) { flow_wildcards_init_exact(wc); } } else { - ovs_rwlock_rdlock(&cls->rwlock); cls_rule = classifier_lookup(cls, flow, wc); - ovs_rwlock_unlock(&cls->rwlock); } - return rule_dpif_cast(rule_from_cls_rule(cls_rule)); + + *rule = rule_dpif_cast(rule_from_cls_rule(cls_rule)); + if (*rule && ovs_rwlock_tryrdlock(&(*rule)->up.evict)) { + /* The rule is in the process of being removed. Best we can do is + * pretend it isn't there. */ + *rule = NULL; + } + ovs_rwlock_unlock(&cls->rwlock); + + return *rule != NULL; } /* Given a port configuration (specified as zero if there's no port), chooses @@ -5204,6 +4811,14 @@ choose_miss_rule(enum ofputil_port_config config, struct rule_dpif *miss_rule, return config & OFPUTIL_PC_NO_PACKET_IN ? 
no_packet_in_rule : miss_rule;
 }
 
+void
+rule_release(struct rule_dpif *rule)
+{
+    if (rule) {
+        ovs_rwlock_unlock(&rule->up.evict);
+    }
+}
+
 static void
 complete_operation(struct rule_dpif *rule)
 {
@@ -5798,7 +5413,7 @@ ofproto_unixctl_trace(struct unixctl_conn *conn, int argc, const char *argv[],
 
             /* Use the metadata from the flow and the packet argument
              * to reconstruct the flow. */
-            flow_extract(packet, flow.skb_priority, flow.skb_mark, NULL,
+            flow_extract(packet, flow.skb_priority, flow.pkt_mark, NULL,
                          &in_port_, &flow);
         }
     }
@@ -5818,12 +5433,14 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow,
               const struct ofpbuf *packet, struct ds *ds)
 {
     struct rule_dpif *rule;
+    struct flow_wildcards wc;
 
     ds_put_cstr(ds, "Flow: ");
     flow_format(ds, flow);
     ds_put_char(ds, '\n');
 
-    rule = rule_dpif_lookup(ofproto, flow, NULL);
+    flow_wildcards_init_catchall(&wc);
+    rule_dpif_lookup(ofproto, flow, &wc, &rule);
+
     trace_format_rule(ds, 0, rule);
     if (rule == ofproto->miss_rule) {
@@ -5853,6 +5470,7 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow,
         trace.xin.report_hook = trace_report;
 
         xlate_actions(&trace.xin, &trace.xout);
+        flow_wildcards_or(&trace.xout.wc, &trace.xout.wc, &wc);
 
         ds_put_char(ds, '\n');
         trace_format_flow(ds, 0, "Final flow", &trace);
@@ -5893,6 +5511,8 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow,
         xlate_out_uninit(&trace.xout);
     }
+
+    rule_release(rule);
 }
 
 static void
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
index a74146b80..6a4ae078b 100644
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -18,6 +18,7 @@
 #include 
 #include "hmapx.h"
+#include "odp-util.h"
 #include "ofproto/ofproto-provider.h"
 #include "ovs-thread.h"
 #include "timer.h"
@@ -29,6 +30,34 @@ struct ofproto_dpif;
 struct ofport_dpif;
 struct dpif_backer;
 
+/* Ofproto-dpif -- DPIF based ofproto implementation.
+ *
+ * Ofproto-dpif provides an ofproto implementation for those platforms which
+ * implement the netdev and dpif interface defined in netdev.h and dpif.h,
+ * the most important of which is the Linux Kernel Module (dpif-linux), but
+ * alternatives are supported such as a userspace only implementation
+ * (dpif-netdev), and a dummy implementation used for unit testing.
+ *
+ * Ofproto-dpif is divided into three major chunks.
+ *
+ * - ofproto-dpif.c
+ *   The main ofproto-dpif module is responsible for implementing the
+ *   provider interface, installing and removing datapath flows, maintaining
+ *   packet statistics, running protocols (BFD, LACP, STP, etc), and
+ *   configuring relevant submodules.
+ *
+ * - ofproto-dpif-upcall.c
+ *   Ofproto-dpif-upcall is responsible for retrieving upcalls from the
+ *   kernel, processing miss upcalls, and handing more complex ones up to the
+ *   main ofproto-dpif module.  Miss upcall processing boils down to figuring
+ *   out what each packet's actions are, executing them (i.e. asking the
+ *   kernel to forward it), and handing it up to ofproto-dpif to decide
+ *   whether or not to install a kernel flow.
+ *
+ * - ofproto-dpif-xlate.c
+ *   Ofproto-dpif-xlate is responsible for translating OpenFlow actions into
+ *   datapath actions. */
+
 struct rule_dpif {
     struct rule up;
 
@@ -55,10 +84,16 @@ static inline struct rule_dpif *rule_dpif_cast(const struct rule *rule)
     return rule ? CONTAINER_OF(rule, struct rule_dpif, up) : NULL;
 }
 
-struct rule_dpif *rule_dpif_lookup_in_table(struct ofproto_dpif *,
-                                            const struct flow *,
-                                            struct flow_wildcards *,
-                                            uint8_t table_id);
+void rule_dpif_lookup(struct ofproto_dpif *, const struct flow *,
+                      struct flow_wildcards *, struct rule_dpif **rule)
+    OVS_ACQ_RDLOCK((*rule)->up.evict);
+
+bool rule_dpif_lookup_in_table(struct ofproto_dpif *, const struct flow *,
+                               struct flow_wildcards *, uint8_t table_id,
+                               struct rule_dpif **rule)
+    OVS_ACQ_RDLOCK((*rule)->up.evict);
+
+void rule_release(struct rule_dpif *rule) OVS_RELEASES(rule->up.evict);
 
 void rule_credit_stats(struct rule_dpif *, const struct dpif_flow_stats *);
 
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index f081482dc..aa262bc07 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -230,10 +230,18 @@ struct rule {
     uint16_t idle_timeout OVS_GUARDED;   /* In seconds from ->used. */
 
     /* Eviction groups. */
-    bool evictable;              /* If false, prevents eviction. */
     struct heap_node evg_node;   /* In eviction_group's "rules" heap. */
     struct eviction_group *eviction_group; /* NULL if not in any group. */
 
+    /* The evict lock is used to prevent rules from being evicted while child
+     * threads are using them to xlate flows.  A read lock means the rule is
+     * currently being used.  A write lock means the rule is in the process
+     * of being evicted and should be considered gone.  A rule will not be
+     * evicted unless both its own and its classifier's write locks are held.
+     * Therefore, while holding a classifier readlock, one can be assured
+     * that even write-locked rules are safe. */
+    struct ovs_rwlock evict;
+
     struct ofpact *ofpacts;      /* Sequence of "struct ofpacts". */
     unsigned int ofpacts_len;    /* Size of 'ofpacts', in bytes. */
 
@@ -265,7 +273,8 @@ rule_from_cls_rule(const struct cls_rule *cls_rule)
 }
 
 void ofproto_rule_update_used(struct rule *, long long int used);
-void ofproto_rule_expire(struct rule *, uint8_t reason);
+void ofproto_rule_expire(struct rule *rule, uint8_t reason)
+    OVS_RELEASES(rule->evict);
 void ofproto_rule_destroy(struct ofproto *, struct classifier *cls,
                           struct rule *) OVS_REQ_WRLOCK(cls->rwlock);
 
diff --git a/ofproto/ofproto-unixctl.man b/ofproto/ofproto-unixctl.man
index 8141de9b2..dd8e8d8a5 100644
--- a/ofproto/ofproto-unixctl.man
+++ b/ofproto/ofproto-unixctl.man
@@ -86,8 +86,8 @@ only metadata.  The metadata can be:
 .RS
 .IP \fIskb_priority\fR
 Packet QoS priority.
-.IP \fIskb_mark\fR
-SKB mark of the packet.
+.IP \fIpkt_mark\fR
+Mark of the packet.
 .IP \fItun_id\fR
 The tunnel ID on which the packet arrived.
 .IP \fIin_port\fR
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index 3cdc72c18..bbdb2d208 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -152,10 +152,10 @@ static void oftable_enable_eviction(struct oftable *,
                                     const struct mf_subfield *fields,
                                     size_t n_fields);
 
-static void oftable_remove_rule(struct rule *);
+static void oftable_remove_rule(struct rule *rule) OVS_RELEASES(rule->evict);
 static void oftable_remove_rule__(struct ofproto *ofproto,
                                   struct classifier *cls, struct rule *rule)
-    OVS_REQ_WRLOCK(cls->rwlock);
+    OVS_REQ_WRLOCK(cls->rwlock) OVS_RELEASES(rule->evict);
 static struct rule *oftable_replace_rule(struct rule *);
 static void oftable_substitute_rule(struct rule *old, struct rule *new);
 
@@ -181,7 +181,8 @@ struct eviction_group {
     struct heap rules;           /* Contains "struct rule"s.
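[Editorial note: the OVS_ACQ_RDLOCK and OVS_RELEASES annotations on the prototypes above feed Clang's -Wthread-safety analysis: a function annotated OVS_ACQ_RDLOCK(x) returns with 'x' read-locked, OVS_RELEASES(x) unlocks it, and OVS_REQ_WRLOCK(x) demands that the caller already hold the write lock. A minimal illustration under those assumptions, using OVS's ovs-thread.h macros and a hypothetical counter type:]

#include "ovs-thread.h"

struct counter {
    struct ovs_rwlock lock;
    int value OVS_GUARDED;               /* only touch with 'lock' held */
};

static void counter_bump(struct counter *c) OVS_REQ_WRLOCK(c->lock);

static void
counter_bump(struct counter *c)
{
    c->value++;                          /* analysis knows the lock is held */
}

void
counter_example(struct counter *c)
{
    ovs_rwlock_wrlock(&c->lock);
    counter_bump(c);                     /* OK: write lock held */
    ovs_rwlock_unlock(&c->lock);
    /* Calling counter_bump(c) here would draw a compile-time warning. */
}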
*/ }; -static struct rule *choose_rule_to_evict(struct oftable *); +static bool choose_rule_to_evict(struct oftable *table, struct rule **rulep) + OVS_TRY_WRLOCK(true, (*rulep)->evict); static void ofproto_evict(struct ofproto *); static uint32_t rule_eviction_priority(struct rule *); @@ -202,8 +203,9 @@ static bool rule_is_modifiable(const struct rule *); static enum ofperr add_flow(struct ofproto *, struct ofconn *, struct ofputil_flow_mod *, const struct ofp_header *); -static void delete_flow__(struct rule *, struct ofopgroup *, - enum ofp_flow_removed_reason); +static void delete_flow__(struct rule *rule, struct ofopgroup *, + enum ofp_flow_removed_reason) + OVS_RELEASES(rule->evict); static bool handle_openflow(struct ofconn *, const struct ofpbuf *); static enum ofperr handle_flow_mod__(struct ofproto *, struct ofconn *, struct ofputil_flow_mod *, @@ -1077,6 +1079,7 @@ ofproto_flush__(struct ofproto *ofproto) if (!rule->pending) { ofoperation_create(group, rule, OFOPERATION_DELETE, OFPRR_DELETE); + ovs_rwlock_wrlock(&rule->evict); oftable_remove_rule__(ofproto, &table->cls, rule); ofproto->ofproto_class->rule_destruct(rule); } @@ -1678,6 +1681,7 @@ ofproto_delete_flow(struct ofproto *ofproto, /* Initiate deletion -> success. */ struct ofopgroup *group = ofopgroup_create_unattached(ofproto); ofoperation_create(group, rule, OFOPERATION_DELETE, OFPRR_DELETE); + ovs_rwlock_wrlock(&rule->evict); oftable_remove_rule(rule); ofproto->ofproto_class->rule_destruct(rule); ofopgroup_submit(group); @@ -2181,6 +2185,7 @@ ofproto_rule_destroy__(struct rule *rule) cls_rule_destroy(&rule->cr); free(rule->ofpacts); ovs_mutex_destroy(&rule->timeout_mutex); + ovs_rwlock_destroy(&rule->evict); rule->ofproto->ofproto_class->rule_dealloc(rule); } } @@ -2198,7 +2203,11 @@ ofproto_rule_destroy(struct ofproto *ofproto, struct classifier *cls, struct rule *rule) OVS_REQ_WRLOCK(cls->rwlock) { ovs_assert(!rule->pending); - oftable_remove_rule__(ofproto, cls, rule); + if (!ovs_rwlock_trywrlock(&rule->evict)) { + oftable_remove_rule__(ofproto, cls, rule); + } else { + NOT_REACHED(); + } ofproto_rule_destroy__(rule); } @@ -3423,12 +3432,12 @@ add_flow(struct ofproto *ofproto, struct ofconn *ofconn, rule->ofpacts_len = fm->ofpacts_len; rule->meter_id = find_meter(rule->ofpacts, rule->ofpacts_len); list_init(&rule->meter_list_node); - rule->evictable = true; rule->eviction_group = NULL; list_init(&rule->expirable); rule->monitor_flags = 0; rule->add_seqno = 0; rule->modify_seqno = 0; + ovs_rwlock_init(&rule->evict); /* Insert new rule. 
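[Editorial note: both ofproto_flush__() and ofproto_delete_flow() below follow the same shape: write-lock the rule's evict lock before pulling it out of the table, so no translation thread can still be reading the rule when it is destructed. A sketch of that removal pattern, using only functions shown in this patch:]

/* Sketch only: the write lock blocks until every reader has called
 * rule_release(); oftable_remove_rule() is annotated
 * OVS_RELEASES(rule->evict) and drops the lock on the way out. */
static void
remove_rule_pattern(struct ofproto *ofproto, struct rule *rule)
{
    ovs_rwlock_wrlock(&rule->evict);   /* waits out all current readers */
    oftable_remove_rule(rule);         /* unlinks rule; releases 'evict' */
    ofproto->ofproto_class->rule_destruct(rule);
}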
*/ victim = oftable_replace_rule(rule); @@ -3445,19 +3454,18 @@ add_flow(struct ofproto *ofproto, struct ofconn *ofconn, n_rules = classifier_count(&table->cls); ovs_rwlock_unlock(&table->cls.rwlock); if (n_rules > table->max_flows) { - bool was_evictable; - - was_evictable = rule->evictable; - rule->evictable = false; - evict = choose_rule_to_evict(table); - rule->evictable = was_evictable; - - if (!evict) { + ovs_rwlock_rdlock(&rule->evict); + if (choose_rule_to_evict(table, &evict)) { + ovs_rwlock_unlock(&rule->evict); + ovs_rwlock_unlock(&evict->evict); + if (evict->pending) { + error = OFPROTO_POSTPONE; + goto exit; + } + } else { + ovs_rwlock_unlock(&rule->evict); error = OFPERR_OFPFMFC_TABLE_FULL; goto exit; - } else if (evict->pending) { - error = OFPROTO_POSTPONE; - goto exit; } } else { evict = NULL; @@ -3472,6 +3480,13 @@ add_flow(struct ofproto *ofproto, struct ofconn *ofconn, op->group->n_running--; ofoperation_destroy(rule->pending); } else if (evict) { + /* It would be better if we maintained the lock we took in + * choose_rule_to_evict() earlier, but that confuses the thread + * safety analysis, and this code is fragile enough that we really + * need it. In the worst case, we'll have to block a little while + * before we perform the eviction, which doesn't seem like a big + * problem. */ + ovs_rwlock_wrlock(&evict->evict); delete_flow__(evict, group, OFPRR_EVICTION); } ofopgroup_submit(group); @@ -3642,6 +3657,7 @@ delete_flows__(struct ofproto *ofproto, struct ofconn *ofconn, group = ofopgroup_create(ofproto, ofconn, request, UINT32_MAX); LIST_FOR_EACH_SAFE (rule, next, ofproto_node, rules) { + ovs_rwlock_wrlock(&rule->evict); delete_flow__(rule, group, reason); } ofopgroup_submit(group); @@ -5065,17 +5081,18 @@ pick_fallback_dpid(void) /* Table overflow policy. */ -/* Chooses and returns a rule to evict from 'table'. Returns NULL if the table - * is not configured to evict rules or if the table contains no evictable - * rules. (Rules with 'evictable' set to false or with no timeouts are not - * evictable.) */ -static struct rule * -choose_rule_to_evict(struct oftable *table) +/* Chooses and updates 'rulep' with a rule to evict from 'table'. Sets 'rulep' + * to NULL if the table is not configured to evict rules or if the table + * contains no evictable rules. (Rules with a readlock on their evict rwlock, + * or with no timeouts are not evictable.) */ +static bool +choose_rule_to_evict(struct oftable *table, struct rule **rulep) { struct eviction_group *evg; + *rulep = NULL; if (!table->eviction_fields) { - return NULL; + return false; } /* In the common case, the outer and inner loops here will each be entered @@ -5094,13 +5111,14 @@ choose_rule_to_evict(struct oftable *table) struct rule *rule; HEAP_FOR_EACH (rule, evg_node, &evg->rules) { - if (rule->evictable) { - return rule; + if (!ovs_rwlock_trywrlock(&rule->evict)) { + *rulep = rule; + return true; } } } - return NULL; + return false; } /* Searches 'ofproto' for tables that have more flows than their configured @@ -5129,8 +5147,12 @@ ofproto_evict(struct ofproto *ofproto) break; } - rule = choose_rule_to_evict(table); - if (!rule || rule->pending) { + if (!choose_rule_to_evict(table, &rule)) { + break; + } + + if (rule->pending) { + ovs_rwlock_unlock(&rule->evict); break; } @@ -5437,7 +5459,8 @@ oftable_enable_eviction(struct oftable *table, /* Removes 'rule' from the oftable that contains it. 
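[Editorial note: choose_rule_to_evict() above relies on ovs_rwlock_trywrlock() to double as the evictability test that the old boolean 'evictable' flag used to provide. A condensed sketch of the idea, assuming OVS's heap iteration macro as used in this patch:]

/* Sketch only: a rule whose 'evict' lock cannot be write-locked immediately
 * is either in use by a translation thread or already being removed; either
 * way it is not a valid victim, so the scan simply moves on.  The chosen
 * rule is returned still write-locked, as in the patch. */
static struct rule *
pick_victim(struct heap *rules)
{
    struct rule *rule;

    HEAP_FOR_EACH (rule, evg_node, rules) {
        if (!ovs_rwlock_trywrlock(&rule->evict)) {
            return rule;
        }
    }
    return NULL;
}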
*/ static void oftable_remove_rule__(struct ofproto *ofproto, struct classifier *cls, - struct rule *rule) OVS_REQ_WRLOCK(cls->rwlock) + struct rule *rule) + OVS_REQ_WRLOCK(cls->rwlock) OVS_RELEASES(rule->evict) { classifier_remove(cls, &rule->cr); if (rule->meter_id) { @@ -5453,6 +5476,7 @@ oftable_remove_rule__(struct ofproto *ofproto, struct classifier *cls, if (!list_is_empty(&rule->meter_list_node)) { list_remove(&rule->meter_list_node); } + ovs_rwlock_unlock(&rule->evict); } static void @@ -5518,6 +5542,7 @@ oftable_substitute_rule(struct rule *old, struct rule *new) if (new) { oftable_replace_rule(new); } else { + ovs_rwlock_wrlock(&old->evict); oftable_remove_rule(old); } } diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c index c23e2d769..0ba0066fe 100644 --- a/ofproto/tunnel.c +++ b/ofproto/tunnel.c @@ -31,12 +31,15 @@ VLOG_DEFINE_THIS_MODULE(tunnel); +/* skb mark used for IPsec tunnel packets */ +#define IPSEC_MARK 1 + struct tnl_match { ovs_be64 in_key; ovs_be32 ip_src; ovs_be32 ip_dst; odp_port_t odp_port; - uint32_t skb_mark; + uint32_t pkt_mark; bool in_key_flow; bool ip_src_flow; bool ip_dst_flow; @@ -101,7 +104,7 @@ tnl_port_add__(const struct ofport_dpif *ofport, const struct netdev *netdev, tnl_port->match.ip_dst = cfg->ip_dst; tnl_port->match.ip_src_flow = cfg->ip_src_flow; tnl_port->match.ip_dst_flow = cfg->ip_dst_flow; - tnl_port->match.skb_mark = cfg->ipsec ? IPSEC_MARK : 0; + tnl_port->match.pkt_mark = cfg->ipsec ? IPSEC_MARK : 0; tnl_port->match.in_key_flow = cfg->in_key_flow; tnl_port->match.odp_port = odp_port; @@ -213,7 +216,7 @@ tnl_port_receive(const struct flow *flow) OVS_EXCLUDED(rwlock) match.ip_src = flow->tunnel.ip_dst; match.ip_dst = flow->tunnel.ip_src; match.in_key = flow->tunnel.tun_id; - match.skb_mark = flow->skb_mark; + match.pkt_mark = flow->pkt_mark; ovs_rwlock_rdlock(&rwlock); tnl_port = tnl_find(&match); @@ -249,6 +252,46 @@ out: return ofport; } +static bool +tnl_ecn_ok(const struct flow *base_flow, struct flow *flow) +{ + if (is_ip_any(base_flow) + && (flow->tunnel.ip_tos & IP_ECN_MASK) == IP_ECN_CE) { + if ((base_flow->nw_tos & IP_ECN_MASK) == IP_ECN_NOT_ECT) { + VLOG_WARN_RL(&rl, "dropping tunnel packet marked ECN CE" + " but is not ECN capable"); + return false; + } else { + /* Set the ECN CE value in the tunneled packet. */ + flow->nw_tos |= IP_ECN_CE; + } + } + + return true; +} + +/* Should be called at the beginning of action translation to initialize + * wildcards and perform any actions based on receiving on tunnel port. + * + * Returns false if the packet must be dropped. */ +bool +tnl_xlate_init(const struct flow *base_flow, struct flow *flow, + struct flow_wildcards *wc) +{ + if (tnl_port_should_receive(flow)) { + memset(&wc->masks.tunnel, 0xff, sizeof wc->masks.tunnel); + memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark); + + if (!tnl_ecn_ok(base_flow, flow)) { + return false; + } + + flow->pkt_mark &= ~IPSEC_MARK; + } + + return true; +} + /* Given that 'flow' should be output to the ofport corresponding to * 'tnl_port', updates 'flow''s tunnel headers and returns the actual datapath * port that the output should happen on. 
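[Editorial note: tnl_ecn_ok() above implements the standard decapsulation rule for Explicit Congestion Notification (RFC 6040): an outer Congestion Experienced mark is folded into the inner header, except that a non-ECN-capable inner packet must be dropped because it could never deliver the congestion signal. A standalone sketch, with constants assumed to match OVS's IP_ECN_* values:]

#include <stdbool.h>
#include <stdint.h>

#define ECN_MASK    0x03   /* low two TOS bits carry the ECN field */
#define ECN_NOT_ECT 0x00   /* sender is not ECN-capable */
#define ECN_CE      0x03   /* congestion experienced */

static bool
propagate_ce(uint8_t outer_tos, uint8_t *inner_tos)
{
    if ((outer_tos & ECN_MASK) == ECN_CE) {
        if ((*inner_tos & ECN_MASK) == ECN_NOT_ECT) {
            return false;          /* drop: receiver cannot see the signal */
        }
        *inner_tos |= ECN_CE;      /* carry the congestion mark inward */
    }
    return true;
}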
May return ODPP_NONE if the output @@ -282,7 +325,7 @@ tnl_port_send(const struct ofport_dpif *ofport, struct flow *flow, if (!cfg->ip_dst_flow) { flow->tunnel.ip_dst = tnl_port->match.ip_dst; } - flow->skb_mark = tnl_port->match.skb_mark; + flow->pkt_mark = tnl_port->match.pkt_mark; if (!cfg->out_key_flow) { flow->tunnel.tun_id = cfg->out_key; @@ -444,7 +487,7 @@ tnl_match_fmt(const struct tnl_match *match, struct ds *ds) } ds_put_format(ds, ", dp port=%"PRIu32, match->odp_port); - ds_put_format(ds, ", skb mark=%"PRIu32, match->skb_mark); + ds_put_format(ds, ", pkt mark=%"PRIu32, match->pkt_mark); } static void diff --git a/ofproto/tunnel.h b/ofproto/tunnel.h index f175f1a15..27a2f7dbc 100644 --- a/ofproto/tunnel.h +++ b/ofproto/tunnel.h @@ -20,9 +20,6 @@ #include #include "flow.h" -/* skb mark used for IPsec tunnel packets */ -#define IPSEC_MARK 1 - /* Tunnel port emulation layer. * * These functions emulate tunnel virtual ports based on the outer @@ -39,6 +36,8 @@ void tnl_port_add(const struct ofport_dpif *, const struct netdev *, void tnl_port_del(const struct ofport_dpif *); const struct ofport_dpif *tnl_port_receive(const struct flow *); +bool tnl_xlate_init(const struct flow *base_flow, struct flow *flow, + struct flow_wildcards *); odp_port_t tnl_port_send(const struct ofport_dpif *, struct flow *, struct flow_wildcards *wc); diff --git a/ovsdb/ovsdbmonitor/.gitignore b/ovsdb/ovsdbmonitor/.gitignore index d6f433b6e..e02ced007 100644 --- a/ovsdb/ovsdbmonitor/.gitignore +++ b/ovsdb/ovsdbmonitor/.gitignore @@ -1 +1 @@ -/ovsdbmonitor.py +/ovsdbmonitor diff --git a/tests/bfd.at b/tests/bfd.at index c54fff034..fb8b1d3d7 100644 --- a/tests/bfd.at +++ b/tests/bfd.at @@ -213,3 +213,37 @@ BFD_CHECK_RX([p0], [1000ms], [300ms]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([bfd - check_tnl_key]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=gre \ + options:remote_ip=2.2.2.2 options:key=1 ofport_request=1 -- \ + set interface p1 bfd:enable=true -- \ + set bridge br0 fail-mode=standalone]) + +# by default check_tnl_key is false. so we should process a bfd packet with tun_id=1. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout]) +# check that the packet should be handled as BFD packet. +AT_CHECK([tail -2 stdout], [0], [dnl +This flow is handled by the userspace slow path because it: + - Consists of BFD packets. +], []) + +# turn on the check_tnl_key. +AT_CHECK([ovs-vsctl set interface p1 bfd:check_tnl_key=true]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout]) +# check that the packet should be handled as normal packet. +AT_CHECK([tail -1 stdout], [0],[dnl +Datapath actions: 100 +], []) + +# set the tunnel key to 0. 
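[Editorial note: the bfd.at test below pivots on the check_tnl_key option: when it is set, a BFD frame is accepted as BFD only if it arrived with the tunnel key the port is configured with, and otherwise falls through to normal forwarding. A hypothetical helper, not the actual OVS function, capturing just that key comparison; ntohll() is assumed available as in OVS's byte-order.h:]

#include <stdbool.h>
#include <stdint.h>
#include "byte-order.h"

/* Hypothetical: real OVS folds this check into its BFD packet filter. */
static bool
bfd_key_matches(bool check_tnl_key, uint64_t configured_key,
                ovs_be64 flow_tun_id)
{
    return !check_tnl_key || ntohll(flow_tun_id) == configured_key;
}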
+AT_CHECK([ovs-vsctl set interface p1 options:key=0]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x0,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout]) +# check that the packet should be handled as BFD packet. +AT_CHECK([tail -2 stdout], [0], [dnl +This flow is handled by the userspace slow path because it: + - Consists of BFD packets. +], []) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/learn.at b/tests/learn.at index 7e781c377..fc8d07199 100644 --- a/tests/learn.at +++ b/tests/learn.at @@ -291,12 +291,14 @@ AT_CHECK([[ovs-ofctl add-flow br0 'actions=load:3->NXM_NX_REG0[0..15],learn(tabl # Trace some packets arriving. The particular packets don't matter. for i in 1 2 3 4 5 6 7 8 9 10; do ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9)' + ovs-appctl time/warp 10 done # Check for the learning entry. +ovs-appctl time/warp 1000 AT_CHECK([ovs-ofctl dump-flows br0 | ofctl_strip | sort], [0], [[ n_packets=1, n_bytes=60, actions=load:0x3->NXM_NX_REG0[0..15],learn(table=0,priority=65535,NXM_OF_ETH_SRC[],NXM_OF_VLAN_TCI[0..11],output:NXM_NX_REG0[0..15]),output:2 - priority=65535,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:05 actions=output:3 + n_packets=9, n_bytes=540, priority=65535,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:05 actions=output:3 NXST_FLOW reply: ]]) diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 46e1deaa7..b09399875 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -1196,7 +1196,7 @@ Datapath actions: 2 AT_CHECK([head -n 3 stdout], [0], [dnl Bridge: br0 Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 -Flow: skb_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 +Flow: pkt_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 ]) # Test command: ofproto/trace dp_name odp_flow packet @@ -1208,18 +1208,18 @@ Datapath actions: 2 AT_CHECK([head -n 3 stdout], [0], [dnl Bridge: br0 Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 -Flow: skb_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 +Flow: pkt_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 ]) # Test command: ofproto/trace br_name br_flow packet AT_CHECK([ovs-appctl ofproto/trace br0 \ - "in_port=2,skb_priority=2,skb_mark=1" "$pkt2to1"], [0], [stdout],[stderr]) + "in_port=2,skb_priority=2,pkt_mark=1" "$pkt2to1"], [0], 
[stdout],[stderr]) AT_CHECK([tail -1 stdout], [0], [dnl -Datapath actions: set(skb_mark(0)),1 +Datapath actions: 1 ]) AT_CHECK([head -n 2 stdout], [0], [dnl Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 -Flow: skb_mark=0x1,skb_priority=0x2,arp,metadata=0,in_port=2,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 +Flow: pkt_mark=0x1,skb_priority=0x2,arp,metadata=0,in_port=2,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00 ]) OVS_VSWITCHD_STOP @@ -2581,8 +2581,15 @@ AT_DATA([flows.txt], [dnl table=0 in_port=1 actions=load:2->NXM_NX_REG0[[0..15]],learn(table=1,priority=65535,NXM_OF_ETH_SRC[[]],NXM_OF_VLAN_TCI[[0..11]],output:NXM_NX_REG0[[0..15]]),output:2 ]) AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) -AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) +# We send each packet twice because the first packet in each flow causes the +# flow table to change and thus revalidations, which (depending on timing) +# can keep a megaflow from being installed. The revalidations are done by +# the second iteration, allowing the flows to be installed. +for i in 1 2; do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + ovs-appctl time/warp 100 +done dnl The original flow is missing due to a revalidation. AT_CHECK([ovs-appctl dpif/dump-megaflows br0 | STRIP_XOUT], [0], [dnl skb_priority=0,ip,in_port=1,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:09,nw_frag=no, n_subfacets:1, used:0.0s, Datapath actions: diff --git a/tests/ofproto.at b/tests/ofproto.at index e2e6f1b0f..38bfb0225 100644 --- a/tests/ofproto.at +++ b/tests/ofproto.at @@ -1563,14 +1563,14 @@ ovs-appctl -t ovs-ofctl ofctl/set-output-file monitor.log AT_CAPTURE_FILE([monitor.log]) # Send a packet-out with a load action to set some metadata, and forward to controller -AT_CHECK([ovs-ofctl packet-out br0 controller 'load(0xfafafafa5a5a5a5a->OXM_OF_METADATA[[0..63]]), controller' '0001020304050010203040501234']) +AT_CHECK([ovs-ofctl packet-out br0 controller 'load(0xfafafafa5a5a5a5a->OXM_OF_METADATA[[0..63]]), load(0xaa->NXM_NX_PKT_MARK[[]]), controller' '0001020304050010203040501234']) # Stop the monitor and check its output. 
ovs-appctl -t ovs-ofctl ofctl/barrier ovs-appctl -t ovs-ofctl exit AT_CHECK([sed 's/ (xid=0x[[0-9a-fA-F]]*)//' monitor.log], [0], [dnl -NXT_PACKET_IN: total_len=14 in_port=CONTROLLER metadata=0xfafafafa5a5a5a5a (via action) data_len=14 (unbuffered) +NXT_PACKET_IN: total_len=14 in_port=CONTROLLER metadata=0xfafafafa5a5a5a5a pkt_mark=0xaa (via action) data_len=14 (unbuffered) metadata=0,in_port=0,vlan_tci=0x0000,dl_src=00:10:20:30:40:50,dl_dst=00:01:02:03:04:05,dl_type=0x1234 OFPT_BARRIER_REPLY: ]) diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at index 18f915243..996ea0699 100644 --- a/tests/ovs-ofctl.at +++ b/tests/ovs-ofctl.at @@ -12,7 +12,7 @@ for test_case in \ 'metadata=0 NXM,OXM' \ 'in_port=1 any' \ 'skb_priority=0 none' \ - 'skb_mark=1 none' \ + 'pkt_mark=1 NXM,OXM' \ 'reg0=0 NXM,OXM' \ 'reg1=1 NXM,OXM' \ 'reg2=2 NXM,OXM' \ @@ -180,9 +180,9 @@ AT_CHECK([ovs-ofctl parse-flows flows.txt AT_CLEANUP -AT_SETUP([ovs-ofctl parse-flows (skb_mark and skb_priority)]) +AT_SETUP([ovs-ofctl parse-flows (skb_priority)]) AT_DATA([flows.txt], [[ -skb_mark=0x12345678,skb_priority=0x12341234,tcp,tp_src=123,actions=flood +skb_priority=0x12341234,tcp,tp_src=123,actions=flood ]]) AT_CHECK([ovs-ofctl parse-flows flows.txt @@ -197,6 +197,7 @@ AT_DATA([flows.txt], [[ # comment tcp,tp_src=123,actions=flood in_port=LOCAL dl_vlan=9 dl_src=00:0A:E4:25:6B:B0 actions=drop +pkt_mark=0xbb,actions=set_field:0xaa->pkt_mark udp dl_vlan_pcp=7 idle_timeout=5 actions=strip_vlan output:0 tcp,nw_src=192.168.0.3,tp_dst=80 actions=set_queue:37,output:1 udp,nw_src=192.168.0.3,tp_dst=53 actions=pop_queue,output:1 @@ -232,6 +233,7 @@ AT_CHECK([[sed 's/ (xid=0x[0-9a-fA-F]*)//' stdout]], [0], chosen protocol: NXM+table_id NXT_FLOW_MOD: ADD table:255 tcp,tp_src=123 actions=FLOOD NXT_FLOW_MOD: ADD table:255 in_port=LOCAL,dl_vlan=9,dl_src=00:0a:e4:25:6b:b0 actions=drop +NXT_FLOW_MOD: ADD table:255 pkt_mark=0xbb actions=load:0xaa->NXM_NX_PKT_MARK[] NXT_FLOW_MOD: ADD table:255 udp,dl_vlan_pcp=7 idle:5 actions=strip_vlan,output:0 NXT_FLOW_MOD: ADD table:255 tcp,nw_src=192.168.0.3,tp_dst=80 actions=set_queue:37,output:1 NXT_FLOW_MOD: ADD table:255 udp,nw_src=192.168.0.3,tp_dst=53 actions=pop_queue,output:1 diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in index 3e6c7fe48..47b591a9b 100644 --- a/utilities/ovs-ofctl.8.in +++ b/utilities/ovs-ofctl.8.in @@ -806,6 +806,13 @@ exactly, and a 0-bit wildcards that bit. When a packet enters an OpenFlow switch, all of the registers are set to 0. Only explicit Nicira extension actions change register values. . +.IP \fBpkt_mark=\fIvalue\fR[\fB/\fImask\fR] +Matches packet metadata mark \fIvalue\fR either exactly or with optional +\fImask\fR. The mark is associated data that may be passed into other +system components in order to facilitate interaction between subsystems. +On Linux this corresponds to the skb mark but the exact implementation is +platform-dependent. +. .PP Defining IPv6 flows (those with \fBdl_type\fR equal to 0x86dd) requires support for NXM. The following shorthand notations are available for @@ -1107,7 +1114,7 @@ be specified as a name used for matching. (This is similar to Open Flow 1.2 and above.) . .IP -Example: \fBset_field:fe80:0123:4567:890a:a6ba:dbff:fefe:59fa\->ipv6_src\fR +Example: \fBset_field:00:11:22:33:44:55->eth_src\fR. . .IP "\fBmultipath(\fIfields\fB, \fIbasis\fB, \fIalgorithm\fB, \fIn_links\fB, \fIarg\fB, \fIdst\fB[\fIstart\fB..\fIend\fB])\fR" Hashes \fIfields\fR using \fIbasis\fR as a universal hash parameter,