+v0.90.6 - 6 Oct 2009
+--------------------
+ - Bug fixes
+
v0.90.5 - 21 Sep 2009
---------------------
- Generalize in-band control to more diverse network setups
[OVS_DEFINE([HAVE_NLA_NUL_STRING])])
OVS_GREP_IFELSE([$KSRC26/include/linux/err.h], [ERR_CAST],
[OVS_DEFINE([HAVE_ERR_CAST])])
+ OVS_GREP_IFELSE([$KSRC26/include/net/checksum.h], [csum_unfold],
+ [OVS_DEFINE([HAVE_CSUM_UNFOLD])])
+ OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h], [skb_cow],
+ [OVS_DEFINE([HAVE_SKB_COW])])
OVS_CHECK_LOG2_H
OVS_CHECK_VETH
if cmp -s datapath/linux-2.6/kcompat.h.new \
# limitations under the License.
AC_PREREQ(2.63)
-AC_INIT(openvswitch, 0.90.5, ovs-bugs@openvswitch.org)
+AC_INIT(openvswitch, 0.90.6, ovs-bugs@openvswitch.org)
NX_BUILDNR
AC_CONFIG_SRCDIR([datapath/datapath.c])
AC_CONFIG_MACRO_DIR([m4])
break;
case ODPAT_SET_VLAN_PCP:
- if (a->vlan_pcp.vlan_pcp & ~VLAN_PCP_MASK)
+ if (a->vlan_pcp.vlan_pcp
+ & ~(VLAN_PCP_MASK >> VLAN_PCP_SHIFT))
return -EINVAL;
break;
return 0;
}
+static int get_listen_mask(const struct file *f)
+{
+ return (long)f->private_data;
+}
+
+static void set_listen_mask(struct file *f, int listen_mask)
+{
+ f->private_data = (void*)(long)listen_mask;
+}
+
static long openvswitch_ioctl(struct file *f, unsigned int cmd,
unsigned long argp)
{
break;
case ODP_GET_LISTEN_MASK:
- err = put_user((int)f->private_data, (int __user *)argp);
+ err = put_user(get_listen_mask(f), (int __user *)argp);
break;
case ODP_SET_LISTEN_MASK:
if (listeners & ~ODPL_ALL)
break;
err = 0;
- f->private_data = (void*)listeners;
+ set_listen_mask(f, listeners);
break;
case ODP_PORT_QUERY:
loff_t *ppos)
{
/* XXX is there sufficient synchronization here? */
- int listeners = (int) f->private_data;
+ int listeners = get_listen_mask(f);
int dp_idx = iminor(f->f_dentry->d_inode);
struct datapath *dp = get_dp(dp_idx);
struct sk_buff *skb;
}
}
success:
- copy_bytes = min(skb->len, nbytes);
+ copy_bytes = min_t(size_t, skb->len, nbytes);
iov.iov_base = buf;
iov.iov_len = copy_bytes;
retval = skb_copy_datagram_iovec(skb, 0, &iov, iov.iov_len);
if (dp) {
mask = 0;
poll_wait(file, &dp->waitqueue, wait);
- if (dp_has_packet_of_interest(dp, (int)file->private_data))
+ if (dp_has_packet_of_interest(dp, get_listen_mask(file)))
mask |= POLLIN | POLLRDNORM;
} else {
mask = POLLIN | POLLRDNORM | POLLHUP;
/* Mask for the priority bits in a vlan header. If we ever merge upstream
* then this should go into include/linux/if_vlan.h. */
#define VLAN_PCP_MASK 0xe000
+#define VLAN_PCP_SHIFT 13
#define DP_MAX_PORTS 1024
#define DP_MAX_GROUPS 16
free_netdev(netdev);
}
+static int dp_dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ if (dp_ioctl_hook)
+ return dp_ioctl_hook(dev, ifr, cmd);
+ return -EOPNOTSUPP;
+}
+
+#ifdef HAVE_NET_DEVICE_OPS
+static const struct net_device_ops dp_dev_netdev_ops = {
+ .ndo_init = dp_dev_init,
+ .ndo_open = dp_dev_open,
+ .ndo_stop = dp_dev_stop,
+ .ndo_start_xmit = dp_dev_xmit,
+ .ndo_set_mac_address = dp_dev_mac_addr,
+ .ndo_do_ioctl = dp_dev_do_ioctl,
+ .ndo_change_mtu = dp_dev_change_mtu,
+ .ndo_get_stats = dp_dev_get_stats,
+};
+#endif
+
static void
do_setup(struct net_device *netdev)
{
ether_setup(netdev);
- netdev->do_ioctl = dp_ioctl_hook;
+#ifdef HAVE_NET_DEVICE_OPS
+ netdev->netdev_ops = &dp_dev_netdev_ops;
+#else
+ netdev->do_ioctl = dp_dev_do_ioctl;
netdev->get_stats = dp_dev_get_stats;
netdev->hard_start_xmit = dp_dev_xmit;
netdev->open = dp_dev_open;
- SET_ETHTOOL_OPS(netdev, &dp_ethtool_ops);
netdev->stop = dp_dev_stop;
- netdev->tx_queue_len = 0;
netdev->set_mac_address = dp_dev_mac_addr;
netdev->change_mtu = dp_dev_change_mtu;
netdev->init = dp_dev_init;
+#endif
+
netdev->destructor = dp_dev_free;
+ SET_ETHTOOL_OPS(netdev, &dp_ethtool_ops);
+ netdev->tx_queue_len = 0;
netdev->flags = IFF_BROADCAST | IFF_MULTICAST;
netdev->features = NETIF_F_LLTX; /* XXX other features? */
int is_dp_dev(struct net_device *netdev)
{
+#ifdef HAVE_NET_DEVICE_OPS
+ return netdev->netdev_ops == &dp_dev_netdev_ops;
+#else
return netdev->open == dp_dev_open;
+#endif
}
openvswitch_headers += \
linux-2.6/compat-2.6/compat26.h \
linux-2.6/compat-2.6/include/asm-generic/bug.h \
+ linux-2.6/compat-2.6/include/linux/cpumask.h \
linux-2.6/compat-2.6/include/linux/dmi.h \
linux-2.6/compat-2.6/include/linux/err.h \
linux-2.6/compat-2.6/include/linux/icmp.h \
linux-2.6/compat-2.6/include/linux/jiffies.h \
linux-2.6/compat-2.6/include/linux/kernel.h \
linux-2.6/compat-2.6/include/linux/kobject.h \
- linux-2.6/compat-2.6/include/linux/log2.h \
linux-2.6/compat-2.6/include/linux/lockdep.h \
+ linux-2.6/compat-2.6/include/linux/log2.h \
linux-2.6/compat-2.6/include/linux/mutex.h \
linux-2.6/compat-2.6/include/linux/netdevice.h \
linux-2.6/compat-2.6/include/linux/netfilter_bridge.h \
#define NET_SKB_PAD 16
#endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+#ifndef HAVE_SKB_COW
static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
int cloned)
{
{
return __skb_cow(skb, headroom, skb_header_cloned(skb));
}
-#endif /* linux < 2.6.23 */
+#endif /* !HAVE_SKB_COW */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
#include_next <net/checksum.h>
-#include <linux/version.h>
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
-
+#ifndef HAVE_CSUM_UNFOLD
static inline __wsum csum_unfold(__sum16 n)
{
return (__force __wsum)n;
}
-
-#endif /* linux kernel < 2.6.20 */
+#endif /* !HAVE_CSUM_UNFOLD */
#endif /* checksum.h */
#include <unistd.h>
#include "fatal-signal.h"
#include "dirs.h"
+#include "timeval.h"
#include "util.h"
#define THIS_MODULE VLM_daemon
if (chdir_) {
chdir("/");
}
+ time_postfork();
break;
case -1:
/* Time at which to die with SIGALRM (if not TIME_MIN). */
static time_t deadline = TIME_MIN;
+static void setup_timer(void);
static void sigalrm_handler(int);
static void refresh_if_ticked(void);
static time_t time_add(time_t, time_t);
time_init(void)
{
struct sigaction sa;
- struct itimerval itimer;
-
if (inited) {
return;
}
ovs_fatal(errno, "sigaction(SIGALRM) failed");
}
- /* Set up periodic timer. */
+ /* Set up periodic signal. */
+ setup_timer();
+}
+
+static void
+setup_timer(void)
+{
+ struct itimerval itimer;
+
itimer.it_interval.tv_sec = 0;
itimer.it_interval.tv_usec = TIME_UPDATE_INTERVAL * 1000;
itimer.it_value = itimer.it_interval;
}
}
+/* Set up the interval timer, to ensure that time advances even without calling
+ * time_refresh().
+ *
+ * A child created with fork() does not inherit the parent's interval timer, so
+ * this function needs to be called from the child after fork(). */
+void
+time_postfork(void)
+{
+ setup_timer();
+}
+
/* Forces a refresh of the current time from the kernel. It is not usually
* necessary to call this function, since the time will be refreshed
* automatically at least every TIME_UPDATE_INTERVAL milliseconds. */
#define TIME_UPDATE_INTERVAL 100
void time_init(void);
+void time_postfork(void);
void time_refresh(void);
time_t time_now(void);
long long int time_msec(void);
struct odp_flow *odp_flows;
size_t n_odp_flows;
+ packet_count = rule->packet_count;
+ byte_count = rule->byte_count;
+
n_odp_flows = rule->cr.wc.wildcards ? list_size(&rule->list) : 1;
odp_flows = xcalloc(1, n_odp_flows * sizeof *odp_flows);
if (rule->cr.wc.wildcards) {
size_t i = 0;
LIST_FOR_EACH (subrule, struct rule, list, &rule->list) {
odp_flows[i++].key = subrule->cr.flow;
+ packet_count += subrule->packet_count;
+ byte_count += subrule->byte_count;
}
} else {
odp_flows[0].key = rule->cr.flow;
struct vconn *vconn;
struct ofpbuf *buffer;
struct ofp_flow_mod *ofm;
+ struct ofp_match match;
- /* Parse and send. */
- ofm = make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer);
- str_to_flow(argv[2], &ofm->match, buffer,
+ /* Parse and send. str_to_flow() will expand and reallocate the data in
+ * 'buffer', so we can't keep pointers to it across the str_to_flow() call. */
+ make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer);
+ str_to_flow(argv[2], &match, buffer,
NULL, NULL, &priority, &idle_timeout, &hard_timeout);
+ ofm = buffer->data;
+ ofm->match = match;
if (s->strict) {
ofm->command = htons(OFPFC_MODIFY_STRICT);
} else {
packet would be duplicated, once for every bond slave, because the
physical switch attached to the bond will flood those packets.
-Bonding also drops some multicast packets received on the active
-slave: those for the vswitch has learned that the packet's MAC is on a
-port other than the bond port itself. This is because it is likely
-that the vswitch itself sent the multicast packet out the bond port,
-on a slave other than the active slave, and is now receiving the
-packet back on the active slave. However, the vswitch makes an
+Bonding also drops received packets when the vswitch has learned that
+the packet's source MAC is on a port other than the bond port itself. This is
+because it is likely that the vswitch itself sent the packet out the
+bond port on a different slave and is now receiving the packet back.
+This occurs when the packet is multicast or the physical switch has not
+yet learned the MAC and is flooding it. However, the vswitch makes an
exception to this rule for broadcast ARP replies, which indicate that
the MAC has moved to another switch, probably due to VM migration.
(ARP replies are normally unicast, so this exception does not match
slaves in order from most-loaded to least-loaded. If highly loaded
slave H is significantly more heavily loaded than the least-loaded
slave L, and slave H carries at least two hashes, then vswitchd shifts
-one of H's hashes to L. However, vswitchd will not shift a hash from
-H to L if that will cause L's load to exceed H's load.
+one of H's hashes to L. However, vswitchd will only shift a hash from
+H to L if it will decrease the ratio of the load between H and L by at
+least 0.1.
Currently, "significantly more loaded" means that H must carry at
least 1 Mbps more traffic, and that traffic must be at least 3%
if (port_includes_vlan(port, m->out_vlan)
&& set_dst(dst, flow, in_port, port, tags))
{
+ int flow_vlan;
+
if (port->vlan < 0) {
dst->vlan = m->out_vlan;
}
if (dst_is_duplicate(dsts, dst - dsts, dst)) {
continue;
}
- if (dst->dp_ifidx == flow->in_port
- && dst->vlan == vlan) {
+
+ /* Use the vlan tag on the original flow instead of
+ * the one passed in the vlan parameter. This ensures
+ * that we compare the vlan from before any implicit
+ * tagging takes place. This is necessary because
+ * dst->vlan is the final vlan, after removing implicit
+ * tags. */
+ flow_vlan = ntohs(flow->dl_vlan);
+ if (flow_vlan == 0) {
+ flow_vlan = OFP_VLAN_NONE;
+ }
+ if (port == in_port && dst->vlan == flow_vlan) {
/* Don't send out input port on same VLAN. */
continue;
}
goto done;
}
- /* Multicast (and broadcast) packets on bonds need special attention, to
- * avoid receiving duplicates. */
- if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) {
- *tags |= in_port->active_iface_tag;
- if (in_port->active_iface != in_iface->port_ifidx) {
- /* Drop all multicast packets on inactive slaves. */
- goto done;
- } else {
- /* Drop all multicast packets for which we have learned a different
- * input port, because we probably sent the packet on one slave
- * and got it back on the active slave. Broadcast ARP replies are
- * an exception to this rule: the host has moved to another
- * switch. */
- int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
- if (src_idx != -1 && src_idx != in_port->port_idx) {
- if (packet) {
- if (!is_bcast_arp_reply(flow, packet)) {
- goto done;
- }
- } else {
- /* No way to know whether it's an ARP reply, because the
- * flow entry doesn't include enough information and we
- * don't have a packet. Punt. */
- return false;
- }
+ /* Packets received on bonds need special attention to avoid duplicates. */
+ if (in_port->n_ifaces > 1) {
+ int src_idx;
+
+ if (eth_addr_is_multicast(flow->dl_dst)) {
+ *tags |= in_port->active_iface_tag;
+ if (in_port->active_iface != in_iface->port_ifidx) {
+ /* Drop all multicast packets on inactive slaves. */
+ goto done;
}
}
+
+ /* Drop all packets for which we have learned a different input
+ * port, because we probably sent the packet on one slave and got
+ * it back on the other. Broadcast ARP replies are an exception
+ * to this rule: the host has moved to another switch. */
+ src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
+ if (src_idx != -1 && src_idx != in_port->port_idx &&
+ (!packet || !is_bcast_arp_reply(flow, packet))) {
+ goto done;
+ }
}
/* MAC learning. */
tags);
if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
out_port = br->ports[out_port_idx];
+ } else if (!packet) {
+ /* If we are revalidating but don't have a learning entry then
+ * eject the flow. Installing a flow that floods packets will
+ * prevent us from seeing future packets and learning properly. */
+ return false;
}
}
/* Shifts 'hash' from 'from' to 'to' within 'port'. */
static void
bond_shift_load(struct slave_balance *from, struct slave_balance *to,
- struct bond_entry *hash)
+ int hash_idx)
{
+ struct bond_entry *hash = from->hashes[hash_idx];
struct port *port = from->iface->port;
uint64_t delta = hash->tx_bytes;
* it require more work, its only purpose would be to allow that hash to
* be migrated to another slave in this rebalancing run, and there is no
* point in doing that. */
- if (from->hashes[0] == hash) {
+ if (hash_idx == 0) {
from->hashes++;
} else {
- int i = hash - from->hashes[0];
- memmove(from->hashes + i, from->hashes + i + 1,
- (from->n_hashes - (i + 1)) * sizeof *from->hashes);
+ memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1,
+ (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes);
}
from->n_hashes--;
/* 'from' is carrying significantly more load than 'to', and that
* load is split across at least two different hashes. Pick a hash
* to migrate to 'to' (the least-loaded slave), given that doing so
- * must not cause 'to''s load to exceed 'from''s load.
+ * must decrease the ratio of the load on the two slaves by at
+ * least 0.1.
*
* The sort order we use means that we prefer to shift away the
* smallest hashes instead of the biggest ones. There is little
* reason behind this decision; we could use the opposite sort
* order to shift away big hashes ahead of small ones. */
size_t i;
+ bool order_swapped;
for (i = 0; i < from->n_hashes; i++) {
+ double old_ratio, new_ratio;
uint64_t delta = from->hashes[i]->tx_bytes;
- if (to->tx_bytes + delta < from->tx_bytes - delta) {
+
+ if (delta == 0 || from->tx_bytes - delta == 0) {
+ /* Pointless move. */
+ continue;
+ }
+
+ order_swapped = from->tx_bytes - delta < to->tx_bytes + delta;
+
+ if (to->tx_bytes == 0) {
+ /* Nothing on the new slave, move it. */
+ break;
+ }
+
+ old_ratio = (double)from->tx_bytes / to->tx_bytes;
+ new_ratio = (double)(from->tx_bytes - delta) /
+ (to->tx_bytes + delta);
+
+ if (new_ratio == 0) {
+ /* Should already be covered but check to prevent division
+ * by zero. */
+ continue;
+ }
+
+ if (new_ratio < 1) {
+ new_ratio = 1 / new_ratio;
+ }
+
+ if (old_ratio - new_ratio > 0.1) {
+ /* Would decrease the ratio, move it. */
break;
}
}
if (i < from->n_hashes) {
- bond_shift_load(from, to, from->hashes[i]);
+ bond_shift_load(from, to, i);
+ port->bond_compat_is_stale = true;
+
+ /* If the result of the migration changed the relative order of
+ * 'from' and 'to', swap them back to maintain invariants. */
+ if (order_swapped) {
+ swap_bals(from, to);
+ }
/* Re-sort 'bals'. Note that this may make 'from' and 'to'
* point to different slave_balance structures. It is only
} else {
from++;
}
- port->bond_compat_is_stale = true;
}
}
needed by the controller. This is called by the "vif" script,
which is run when virtual interfaces are added and removed.
+ usr_share_vswitch_scripts_refresh-xs-network-uuids
+
+ Script to refresh bridge.<bridge>.xs-network-uuids keys, which
+ can get out-of-sync following a pool join. Running this script
+ is an alternative to rebooting the host.
+
root_vswitch_scripts_sysconfig.template
Template for vswitch's /etc/sysconfig/vswitch configuration
xenserver/usr_sbin_xen-bugtool \
xenserver/usr_share_vswitch_scripts_sysconfig.template \
xenserver/usr_share_vswitch_scripts_dump-vif-details \
+ xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids \
xenserver/vswitch-xen.spec
--- /dev/null
+#! /bin/sh
+
+. /etc/xensource-inventory
+
+for pif in $(xe pif-list --minimal host-uuid=${INSTALLATION_UUID} currently-attached=true VLAN=-1 | sed 's/,/ /g'); do
+ printf "Refreshing PIF %s... " $pif
+ if /opt/xensource/libexec/interface-reconfigure --pif-uuid=$pif up; then
+ printf "done\n"
+ else
+ printf "error!\n"
+ fi
+done
$RPM_BUILD_ROOT/usr/share/vswitch/scripts/vif
install -m 755 xenserver/usr_share_vswitch_scripts_dump-vif-details \
$RPM_BUILD_ROOT/usr/share/vswitch/scripts/dump-vif-details
+install -m 755 xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids \
+ $RPM_BUILD_ROOT/usr/share/vswitch/scripts/refresh-xs-network-uuids
install -m 755 xenserver/usr_sbin_xen-bugtool \
$RPM_BUILD_ROOT/usr/share/vswitch/scripts/xen-bugtool
install -m 755 xenserver/usr_sbin_brctl \
/lib/modules/%{xen_version}/kernel/net/vswitch/openvswitch_mod.ko
/lib/modules/%{xen_version}/kernel/net/vswitch/brcompat_mod.ko
/usr/share/vswitch/scripts/dump-vif-details
+/usr/share/vswitch/scripts/refresh-xs-network-uuids
/usr/share/vswitch/scripts/interface-reconfigure
/usr/share/vswitch/scripts/vif
/usr/share/vswitch/scripts/xen-bugtool