Merge "citrix" into "master".

author Ben Pfaff <blp@nicira.com>

Fri, 23 Oct 2009 00:43:28 +0000 (17:43 -0700)

committer Ben Pfaff <blp@nicira.com>

Fri, 23 Oct 2009 00:43:28 +0000 (17:43 -0700)
author Ben Pfaff <blp@nicira.com>
Fri, 23 Oct 2009 00:43:28 +0000 (17:43 -0700)
committer Ben Pfaff <blp@nicira.com>
Fri, 23 Oct 2009 00:43:28 +0000 (17:43 -0700)
diff --git a/ChangeLog b/ChangeLog

index f2f56a3..eff97aa 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+v0.90.6 - 6 Oct 2009
+--------------------
+    - Bug fixes
+
  v0.90.5 - 21 Sep 2009
  ---------------------
      - Generalize in-band control to more diverse network setups
diff --git a/acinclude.m4 b/acinclude.m4

index 2f38997..6ba647a 100644 (file)
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -119,6 +119,10 @@ AC_DEFUN([OVS_CHECK_LINUX26_COMPAT], [
                    [OVS_DEFINE([HAVE_NLA_NUL_STRING])])
    OVS_GREP_IFELSE([$KSRC26/include/linux/err.h], [ERR_CAST],
                    [OVS_DEFINE([HAVE_ERR_CAST])])
+  OVS_GREP_IFELSE([$KSRC26/include/net/checksum.h], [csum_unfold],
+                  [OVS_DEFINE([HAVE_CSUM_UNFOLD])])
+  OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h], [skb_cow],
+                  [OVS_DEFINE([HAVE_SKB_COW])])
    OVS_CHECK_LOG2_H
    OVS_CHECK_VETH
    if cmp -s datapath/linux-2.6/kcompat.h.new \
diff --git a/configure.ac b/configure.ac

index 400ea78..c8eed7c 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -13,7 +13,7 @@
  # limitations under the License.
  
  AC_PREREQ(2.63)
-AC_INIT(openvswitch, 0.90.5, ovs-bugs@openvswitch.org)
+AC_INIT(openvswitch, 0.90.6, ovs-bugs@openvswitch.org)
  NX_BUILDNR
  AC_CONFIG_SRCDIR([datapath/datapath.c])
  AC_CONFIG_MACRO_DIR([m4])
diff --git a/datapath/datapath.c b/datapath/datapath.c

index b8ef104..1ae1d77 100644 (file)
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -782,7 +782,8 @@ static int validate_actions(const struct sw_flow_actions *actions)
                         break;
  
                 case ODPAT_SET_VLAN_PCP:
-                       if (a->vlan_pcp.vlan_pcp & ~VLAN_PCP_MASK)
+                       if (a->vlan_pcp.vlan_pcp
+                           & ~(VLAN_PCP_MASK >> VLAN_PCP_SHIFT))
                                 return -EINVAL;
                         break;
  
@@ -1371,6 +1372,16 @@ get_port_group(struct datapath *dp, struct odp_port_group *upg)
         return 0;
  }
  
+static int get_listen_mask(const struct file *f)
+{
+       return (long)f->private_data;
+}
+
+static void set_listen_mask(struct file *f, int listen_mask)
+{
+       f->private_data = (void*)(long)listen_mask;
+}
+
  static long openvswitch_ioctl(struct file *f, unsigned int cmd,
                            unsigned long argp)
  {
@@ -1426,7 +1437,7 @@ static long openvswitch_ioctl(struct file *f, unsigned int cmd,
                 break;
  
         case ODP_GET_LISTEN_MASK:
-               err = put_user((int)f->private_data, (int __user *)argp);
+               err = put_user(get_listen_mask(f), (int __user *)argp);
                 break;
  
         case ODP_SET_LISTEN_MASK:
@@ -1437,7 +1448,7 @@ static long openvswitch_ioctl(struct file *f, unsigned int cmd,
                 if (listeners & ~ODPL_ALL)
                         break;
                 err = 0;
-               f->private_data = (void*)listeners;
+               set_listen_mask(f, listeners);
                 break;
  
         case ODP_PORT_QUERY:
@@ -1503,7 +1514,7 @@ ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes,
                       loff_t *ppos)
  {
         /* XXX is there sufficient synchronization here? */
-       int listeners = (int) f->private_data;
+       int listeners = get_listen_mask(f);
         int dp_idx = iminor(f->f_dentry->d_inode);
         struct datapath *dp = get_dp(dp_idx);
         struct sk_buff *skb;
@@ -1543,7 +1554,7 @@ ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes,
                 }
         }
  success:
-       copy_bytes = min(skb->len, nbytes);
+       copy_bytes = min_t(size_t, skb->len, nbytes);
         iov.iov_base = buf;
         iov.iov_len = copy_bytes;
         retval = skb_copy_datagram_iovec(skb, 0, &iov, iov.iov_len);
@@ -1565,7 +1576,7 @@ static unsigned int openvswitch_poll(struct file *file, poll_table *wait)
         if (dp) {
                 mask = 0;
                 poll_wait(file, &dp->waitqueue, wait);
-               if (dp_has_packet_of_interest(dp, (int)file->private_data))
+               if (dp_has_packet_of_interest(dp, get_listen_mask(file)))
                         mask |= POLLIN | POLLRDNORM;
         } else {
                 mask = POLLIN | POLLRDNORM | POLLHUP;
diff --git a/datapath/datapath.h b/datapath/datapath.h

index d28250a..9b4c438 100644 (file)
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -24,6 +24,7 @@
  /* Mask for the priority bits in a vlan header.  If we ever merge upstream
   * then this should go into include/linux/if_vlan.h. */
  #define VLAN_PCP_MASK 0xe000
+#define VLAN_PCP_SHIFT 13
  
  #define DP_MAX_PORTS 1024
  #define DP_MAX_GROUPS 16
diff --git a/datapath/dp_dev.c b/datapath/dp_dev.c

index 008f3f6..284a6b5 100644 (file)
--- a/datapath/dp_dev.c
+++ b/datapath/dp_dev.c
@@ -157,22 +157,47 @@ static void dp_dev_free(struct net_device *netdev)
         free_netdev(netdev);
  }
  
+static int dp_dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+       if (dp_ioctl_hook)
+               return dp_ioctl_hook(dev, ifr, cmd);
+       return -EOPNOTSUPP;
+}
+
+#ifdef HAVE_NET_DEVICE_OPS
+static const struct net_device_ops dp_dev_netdev_ops = {
+       .ndo_init = dp_dev_init,
+       .ndo_open = dp_dev_open,
+       .ndo_stop = dp_dev_stop,
+       .ndo_start_xmit = dp_dev_xmit,
+       .ndo_set_mac_address = dp_dev_mac_addr,
+       .ndo_do_ioctl = dp_dev_do_ioctl,
+       .ndo_change_mtu = dp_dev_change_mtu,
+       .ndo_get_stats = dp_dev_get_stats,
+};
+#endif
+
  static void
  do_setup(struct net_device *netdev)
  {
         ether_setup(netdev);
  
-       netdev->do_ioctl = dp_ioctl_hook;
+#ifdef HAVE_NET_DEVICE_OPS
+       netdev->netdev_ops = &dp_dev_netdev_ops;
+#else
+       netdev->do_ioctl = dp_dev_do_ioctl;
         netdev->get_stats = dp_dev_get_stats;
         netdev->hard_start_xmit = dp_dev_xmit;
         netdev->open = dp_dev_open;
-       SET_ETHTOOL_OPS(netdev, &dp_ethtool_ops);
         netdev->stop = dp_dev_stop;
-       netdev->tx_queue_len = 0;
         netdev->set_mac_address = dp_dev_mac_addr;
         netdev->change_mtu = dp_dev_change_mtu;
         netdev->init = dp_dev_init;
+#endif
+
         netdev->destructor = dp_dev_free;
+       SET_ETHTOOL_OPS(netdev, &dp_ethtool_ops);
+       netdev->tx_queue_len = 0;
  
         netdev->flags = IFF_BROADCAST | IFF_MULTICAST;
         netdev->features = NETIF_F_LLTX; /* XXX other features? */
@@ -233,5 +258,9 @@ void dp_dev_destroy(struct net_device *netdev)
  
  int is_dp_dev(struct net_device *netdev) 
  {
+#ifdef HAVE_NET_DEVICE_OPS
+       return netdev->netdev_ops == &dp_dev_netdev_ops;
+#else
         return netdev->open == dp_dev_open;
+#endif
  }
diff --git a/datapath/linux-2.6/Modules.mk b/datapath/linux-2.6/Modules.mk

index e5aa51d..7892583 100644 (file)
--- a/datapath/linux-2.6/Modules.mk
+++ b/datapath/linux-2.6/Modules.mk
@@ -4,6 +4,7 @@ openvswitch_sources += \
  openvswitch_headers += \
         linux-2.6/compat-2.6/compat26.h \
         linux-2.6/compat-2.6/include/asm-generic/bug.h \
+       linux-2.6/compat-2.6/include/linux/cpumask.h \
         linux-2.6/compat-2.6/include/linux/dmi.h \
         linux-2.6/compat-2.6/include/linux/err.h \
         linux-2.6/compat-2.6/include/linux/icmp.h \
@@ -13,8 +14,8 @@ openvswitch_headers += \
         linux-2.6/compat-2.6/include/linux/jiffies.h \
         linux-2.6/compat-2.6/include/linux/kernel.h \
         linux-2.6/compat-2.6/include/linux/kobject.h \
-       linux-2.6/compat-2.6/include/linux/log2.h \
         linux-2.6/compat-2.6/include/linux/lockdep.h \
+       linux-2.6/compat-2.6/include/linux/log2.h \
         linux-2.6/compat-2.6/include/linux/mutex.h \
         linux-2.6/compat-2.6/include/linux/netdevice.h \
         linux-2.6/compat-2.6/include/linux/netfilter_bridge.h \
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h b/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h

index 666ef85..2831721 100644 (file)
--- a/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h
+++ b/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h
@@ -42,7 +42,7 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
  #define NET_SKB_PAD    16
  #endif
  
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+#ifndef HAVE_SKB_COW
  static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
                              int cloned)
  {
@@ -63,7 +63,7 @@ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
  {
         return __skb_cow(skb, headroom, skb_header_cloned(skb));
  }
-#endif  /* linux < 2.6.23 */
+#endif  /* !HAVE_SKB_COW */
  
  
  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
diff --git a/datapath/linux-2.6/compat-2.6/include/net/checksum.h b/datapath/linux-2.6/compat-2.6/include/net/checksum.h

index c64c6bd..3b6debb 100644 (file)
--- a/datapath/linux-2.6/compat-2.6/include/net/checksum.h
+++ b/datapath/linux-2.6/compat-2.6/include/net/checksum.h
@@ -3,14 +3,11 @@
  
  #include_next <net/checksum.h>
  
-#include <linux/version.h>
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
-
+#ifndef HAVE_CSUM_UNFOLD
  static inline __wsum csum_unfold(__sum16 n)
  {
         return (__force __wsum)n;
  }
-
-#endif /* linux kernel < 2.6.20 */
+#endif /* !HAVE_CSUM_UNFOLD */
  
  #endif /* checksum.h */
diff --git a/lib/daemon.c b/lib/daemon.c

index 1e3f002..a35c639 100644 (file)
--- a/lib/daemon.c
+++ b/lib/daemon.c
@@ -23,6 +23,7 @@
  #include <unistd.h>
  #include "fatal-signal.h"
  #include "dirs.h"
+#include "timeval.h"
  #include "util.h"
  
  #define THIS_MODULE VLM_daemon
@@ -222,6 +223,7 @@ daemonize(void)
              if (chdir_) {
                  chdir("/");
              }
+            time_postfork();
              break;
  
          case -1:
diff --git a/lib/timeval.c b/lib/timeval.c

index 3cca338..314b3f4 100644 (file)
--- a/lib/timeval.c
+++ b/lib/timeval.c
@@ -43,6 +43,7 @@ static struct timeval now;
  /* Time at which to die with SIGALRM (if not TIME_MIN). */
  static time_t deadline = TIME_MIN;
  
+static void setup_timer(void);
  static void sigalrm_handler(int);
  static void refresh_if_ticked(void);
  static time_t time_add(time_t, time_t);
@@ -57,8 +58,6 @@ void
  time_init(void)
  {
      struct sigaction sa;
-    struct itimerval itimer;
-
      if (inited) {
          return;
      }
@@ -78,7 +77,15 @@ time_init(void)
          ovs_fatal(errno, "sigaction(SIGALRM) failed");
      }
  
-    /* Set up periodic timer. */
+    /* Set up periodic signal. */
+    setup_timer();
+}
+
+static void
+setup_timer(void)
+{
+    struct itimerval itimer;
+
      itimer.it_interval.tv_sec = 0;
      itimer.it_interval.tv_usec = TIME_UPDATE_INTERVAL * 1000;
      itimer.it_value = itimer.it_interval;
@@ -87,6 +94,17 @@ time_init(void)
      }
  }
  
+/* Set up the interval timer, to ensure that time advances even without calling
+ * time_refresh().
+ *
+ * A child created with fork() does not inherit the parent's interval timer, so
+ * this function needs to be called from the child after fork(). */
+void
+time_postfork(void)
+{
+    setup_timer();
+}
+
  /* Forces a refresh of the current time from the kernel.  It is not usually
   * necessary to call this function, since the time will be refreshed
   * automatically at least every TIME_UPDATE_INTERVAL milliseconds. */
diff --git a/lib/timeval.h b/lib/timeval.h

index 660a207..8567d75 100644 (file)
--- a/lib/timeval.h
+++ b/lib/timeval.h
@@ -41,6 +41,7 @@ BUILD_ASSERT_DECL(TYPE_IS_SIGNED(time_t));
  #define TIME_UPDATE_INTERVAL 100
  
  void time_init(void);
+void time_postfork(void);
  void time_refresh(void);
  time_t time_now(void);
  long long int time_msec(void);
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c

index f31794f..eb8a7a9 100644 (file)
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -2423,12 +2423,17 @@ query_stats(struct ofproto *p, struct rule *rule,
      struct odp_flow *odp_flows;
      size_t n_odp_flows;
  
+    packet_count = rule->packet_count;
+    byte_count = rule->byte_count;
+
      n_odp_flows = rule->cr.wc.wildcards ? list_size(&rule->list) : 1;
      odp_flows = xcalloc(1, n_odp_flows * sizeof *odp_flows);
      if (rule->cr.wc.wildcards) {
          size_t i = 0;
          LIST_FOR_EACH (subrule, struct rule, list, &rule->list) {
              odp_flows[i++].key = subrule->cr.flow;
+            packet_count += subrule->packet_count;
+            byte_count += subrule->byte_count;
          }
      } else {
          odp_flows[0].key = rule->cr.flow;
diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c

index e873ed7..665a2c7 100644 (file)
--- a/utilities/ovs-ofctl.c
+++ b/utilities/ovs-ofctl.c
@@ -918,11 +918,15 @@ do_mod_flows(const struct settings *s, int argc UNUSED, char *argv[])
      struct vconn *vconn;
      struct ofpbuf *buffer;
      struct ofp_flow_mod *ofm;
+    struct ofp_match match;
  
-    /* Parse and send. */
-    ofm = make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer);
-    str_to_flow(argv[2], &ofm->match, buffer,
+    /* Parse and send.  str_to_flow() will expand and reallocate the data in
+     * 'buffer', so we can't keep pointers into it across that call. */
+    make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer);
+    str_to_flow(argv[2], &match, buffer,
                  NULL, NULL, &priority, &idle_timeout, &hard_timeout);
+    ofm = buffer->data;
+    ofm->match = match;
      if (s->strict) {
          ofm->command = htons(OFPFC_MODIFY_STRICT);
      } else {
diff --git a/vswitchd/INTERNALS b/vswitchd/INTERNALS

index 49a4158..3001756 100644 (file)
--- a/vswitchd/INTERNALS
+++ b/vswitchd/INTERNALS
@@ -83,12 +83,12 @@ received on other slaves are dropped.  Otherwise, every multicast
  packet would be duplicated, once for every bond slave, because the
  physical switch attached to the bond will flood those packets.
  
-Bonding also drops some multicast packets received on the active
-slave: those for the vswitch has learned that the packet's MAC is on a
-port other than the bond port itself.  This is because it is likely
-that the vswitch itself sent the multicast packet out the bond port,
-on a slave other than the active slave, and is now receiving the
-packet back on the active slave.  However, the vswitch makes an
+Bonding also drops received packets when the vswitch has learned that
+the packet's MAC is on a port other than the bond port itself.  This is
+because it is likely that the vswitch itself sent the packet out the
+bond port on a different slave and is now receiving the packet back.
+This occurs when the packet is multicast or the physical switch has not
+yet learned the MAC and is flooding it.  However, the vswitch makes an
  exception to this rule for broadcast ARP replies, which indicate that
  the MAC has moved to another switch, probably due to VM migration.
  (ARP replies are normally unicast, so this exception does not match
@@ -121,8 +121,9 @@ more heavily than data sent less recently.  It considers each of the
  slaves in order from most-loaded to least-loaded.  If highly loaded
  slave H is significantly more heavily loaded than the least-loaded
  slave L, and slave H carries at least two hashes, then vswitchd shifts
-one of H's hashes to L.  However, vswitchd will not shift a hash from
-H to L if that will cause L's load to exceed H's load.
+one of H's hashes to L.  However, vswitchd will only shift a hash from
+H to L if it will decrease the ratio of the load between H and L by at
+least 0.1.
  
  Currently, "significantly more loaded" means that H must carry at
  least 1 Mbps more traffic, and that traffic must be at least 3%
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c

index bece252..b2c051d 100644 (file)
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -1783,14 +1783,26 @@ compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan,
                      if (port_includes_vlan(port, m->out_vlan)
                          && set_dst(dst, flow, in_port, port, tags))
                      {
+                        int flow_vlan;
+
                          if (port->vlan < 0) {
                              dst->vlan = m->out_vlan;
                          }
                          if (dst_is_duplicate(dsts, dst - dsts, dst)) {
                              continue;
                          }
-                        if (dst->dp_ifidx == flow->in_port
-                            && dst->vlan == vlan) {
+
+                        /* Use the vlan tag on the original flow instead of
+                         * the one passed in the vlan parameter.  This ensures
+                         * that we compare the vlan from before any implicit
+                         * tagging takes place. This is necessary because
+                         * dst->vlan is the final vlan, after removing implicit
+                         * tags. */
+                        flow_vlan = ntohs(flow->dl_vlan);
+                        if (flow_vlan == 0) {
+                            flow_vlan = OFP_VLAN_NONE;
+                        }
+                        if (port == in_port && dst->vlan == flow_vlan) {
                              /* Don't send out input port on same VLAN. */
                              continue;
                          }
@@ -1953,33 +1965,27 @@ process_flow(struct bridge *br, const flow_t *flow,
          goto done;
      }
  
-    /* Multicast (and broadcast) packets on bonds need special attention, to
-     * avoid receiving duplicates. */
-    if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) {
-        *tags |= in_port->active_iface_tag;
-        if (in_port->active_iface != in_iface->port_ifidx) {
-            /* Drop all multicast packets on inactive slaves. */
-            goto done;
-        } else {
-            /* Drop all multicast packets for which we have learned a different
-             * input port, because we probably sent the packet on one slave
-             * and got it back on the active slave.  Broadcast ARP replies are
-             * an exception to this rule: the host has moved to another
-             * switch. */
-            int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
-            if (src_idx != -1 && src_idx != in_port->port_idx) {
-                if (packet) {
-                    if (!is_bcast_arp_reply(flow, packet)) {
-                        goto done;
-                    }
-                } else {
-                    /* No way to know whether it's an ARP reply, because the
-                     * flow entry doesn't include enough information and we
-                     * don't have a packet.  Punt. */
-                    return false;
-                }
+    /* Packets received on bonds need special attention to avoid duplicates. */
+    if (in_port->n_ifaces > 1) {
+        int src_idx;
+
+        if (eth_addr_is_multicast(flow->dl_dst)) {
+            *tags |= in_port->active_iface_tag;
+            if (in_port->active_iface != in_iface->port_ifidx) {
+                /* Drop all multicast packets on inactive slaves. */
+                goto done;
              }
          }
+
+        /* Drop all packets for which we have learned a different input
+         * port, because we probably sent the packet on one slave and got
+         * it back on the other.  Broadcast ARP replies are an exception
+         * to this rule: the host has moved to another switch. */
+        src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
+        if (src_idx != -1 && src_idx != in_port->port_idx &&
+            (!packet || !is_bcast_arp_reply(flow, packet))) {
+                goto done;
+        }
      }
  
      /* MAC learning. */
@@ -2009,6 +2015,11 @@ process_flow(struct bridge *br, const flow_t *flow,
                                                 tags);
          if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
              out_port = br->ports[out_port_idx];
+        } else if (!packet) {
+            /* If we are revalidating but don't have a learning entry then
+             * eject the flow.  Installing a flow that floods packets will
+             * prevent us from seeing future packets and learning properly. */
+            return false;
          }
      }
  
@@ -2259,8 +2270,9 @@ log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port)
  /* Shifts 'hash' from 'from' to 'to' within 'port'. */
  static void
  bond_shift_load(struct slave_balance *from, struct slave_balance *to,
-                struct bond_entry *hash)
+                int hash_idx)
  {
+    struct bond_entry *hash = from->hashes[hash_idx];
      struct port *port = from->iface->port;
      uint64_t delta = hash->tx_bytes;
  
@@ -2278,12 +2290,11 @@ bond_shift_load(struct slave_balance *from, struct slave_balance *to,
       * it require more work, the only purpose would be to allow that hash to
       * be migrated to another slave in this rebalancing run, and there is no
       * point in doing that.  */
-    if (from->hashes[0] == hash) {
+    if (hash_idx == 0) {
          from->hashes++;
      } else {
-        int i = hash - from->hashes[0];
-        memmove(from->hashes + i, from->hashes + i + 1,
-                (from->n_hashes - (i + 1)) * sizeof *from->hashes);
+        memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1,
+                (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes);
      }
      from->n_hashes--;
  
@@ -2368,22 +2379,60 @@ bond_rebalance_port(struct port *port)
              /* 'from' is carrying significantly more load than 'to', and that
               * load is split across at least two different hashes.  Pick a hash
               * to migrate to 'to' (the least-loaded slave), given that doing so
-             * must not cause 'to''s load to exceed 'from''s load.
+             * must decrease the ratio of the load on the two slaves by at
+             * least 0.1.
               *
               * The sort order we use means that we prefer to shift away the
               * smallest hashes instead of the biggest ones.  There is little
               * reason behind this decision; we could use the opposite sort
               * order to shift away big hashes ahead of small ones. */
              size_t i;
+            bool order_swapped;
  
              for (i = 0; i < from->n_hashes; i++) {
+                double old_ratio, new_ratio;
                  uint64_t delta = from->hashes[i]->tx_bytes;
-                if (to->tx_bytes + delta < from->tx_bytes - delta) {
+
+                if (delta == 0 || from->tx_bytes - delta == 0) {
+                    /* Pointless move. */
+                    continue;
+                }
+
+                order_swapped = from->tx_bytes - delta < to->tx_bytes + delta;
+
+                if (to->tx_bytes == 0) {
+                    /* Nothing on the new slave, move it. */
+                    break;
+                }
+
+                old_ratio = (double)from->tx_bytes / to->tx_bytes;
+                new_ratio = (double)(from->tx_bytes - delta) /
+                            (to->tx_bytes + delta);
+
+                if (new_ratio == 0) {
+                    /* Should already be covered but check to prevent division
+                     * by zero. */
+                    continue;
+                }
+
+                if (new_ratio < 1) {
+                    new_ratio = 1 / new_ratio;
+                }
+
+                if (old_ratio - new_ratio > 0.1) {
+                    /* Would decrease the ratio, move it. */
                      break;
                  }
              }
              if (i < from->n_hashes) {
-                bond_shift_load(from, to, from->hashes[i]);
+                bond_shift_load(from, to, i);
+                port->bond_compat_is_stale = true;
+
+                /* If the result of the migration changed the relative order of
+                 * 'from' and 'to' swap them back to maintain invariants. */
+                if (order_swapped) {
+                    swap_bals(from, to);
+                }
  
                  /* Re-sort 'bals'.  Note that this may make 'from' and 'to'
                   * point to different slave_balance structures.  It is only
@@ -2394,7 +2443,6 @@ bond_rebalance_port(struct port *port)
              } else {
                  from++;
              }
-            port->bond_compat_is_stale = true;
          }
      }
  
diff --git a/xenserver/README b/xenserver/README

index 276cd6c..ff692fd 100644 (file)
--- a/xenserver/README
+++ b/xenserver/README
@@ -50,6 +50,12 @@ files are:
          needed by the controller.  This is called by the "vif" script,
          which is run when virtual interfaces are added and removed.
  
+    usr_share_vswitch_scripts_refresh-xs-network-uuids
+
+        Script to refresh bridge.<bridge>.xs-network-uuids keys, which
+        can get out-of-sync following a pool join.  Running this script
+        is an alternative to rebooting the host.
+
      root_vswitch_scripts_sysconfig.template
  
          Template for vswitch's /etc/sysconfig/vswitch configuration
diff --git a/xenserver/automake.mk b/xenserver/automake.mk

index 3275434..221925c 100644 (file)
--- a/xenserver/automake.mk
+++ b/xenserver/automake.mk
@@ -19,4 +19,5 @@ EXTRA_DIST += \
         xenserver/usr_sbin_xen-bugtool \
         xenserver/usr_share_vswitch_scripts_sysconfig.template \
         xenserver/usr_share_vswitch_scripts_dump-vif-details \
+       xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids \
         xenserver/vswitch-xen.spec
diff --git a/xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids b/xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids

new file mode 100755 (executable)

index 0000000..34fe1e7
--- /dev/null
+++ b/xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids
@@ -0,0 +1,12 @@
+#! /bin/sh
+
+. /etc/xensource-inventory
+
+for pif in $(xe pif-list --minimal host-uuid=${INSTALLATION_UUID} currently-attached=true VLAN=-1 | sed 's/,/ /g'); do
+    printf "Refreshing PIF %s...  " $pif
+    if /opt/xensource/libexec/interface-reconfigure --pif-uuid=$pif up; then
+        printf "done\n"
+    else
+        printf "error!\n"
+    fi
+done
diff --git a/xenserver/vswitch-xen.spec b/xenserver/vswitch-xen.spec

index d9a18bd..d8e32e4 100644 (file)
--- a/xenserver/vswitch-xen.spec
+++ b/xenserver/vswitch-xen.spec
@@ -73,6 +73,8 @@ install -m 755 xenserver/etc_xensource_scripts_vif \
               $RPM_BUILD_ROOT/usr/share/vswitch/scripts/vif
  install -m 755 xenserver/usr_share_vswitch_scripts_dump-vif-details \
                 $RPM_BUILD_ROOT/usr/share/vswitch/scripts/dump-vif-details
+install -m 755 xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids \
+               $RPM_BUILD_ROOT/usr/share/vswitch/scripts/refresh-xs-network-uuids
  install -m 755 xenserver/usr_sbin_xen-bugtool \
               $RPM_BUILD_ROOT/usr/share/vswitch/scripts/xen-bugtool
  install -m 755 xenserver/usr_sbin_brctl \
@@ -304,6 +306,7 @@ fi
  /lib/modules/%{xen_version}/kernel/net/vswitch/openvswitch_mod.ko
  /lib/modules/%{xen_version}/kernel/net/vswitch/brcompat_mod.ko
  /usr/share/vswitch/scripts/dump-vif-details
+/usr/share/vswitch/scripts/refresh-xs-network-uuids
  /usr/share/vswitch/scripts/interface-reconfigure
  /usr/share/vswitch/scripts/vif
  /usr/share/vswitch/scripts/xen-bugtool
author	Ben Pfaff <blp@nicira.com>
	Fri, 23 Oct 2009 00:43:28 +0000 (17:43 -0700)
committer	Ben Pfaff <blp@nicira.com>
	Fri, 23 Oct 2009 00:43:28 +0000 (17:43 -0700)
ChangeLog		patch \| blob \| history
acinclude.m4		patch \| blob \| history
configure.ac		patch \| blob \| history
datapath/datapath.c		patch \| blob \| history
datapath/datapath.h		patch \| blob \| history
datapath/dp_dev.c		patch \| blob \| history
datapath/linux-2.6/Modules.mk		patch \| blob \| history
datapath/linux-2.6/compat-2.6/include/linux/skbuff.h		patch \| blob \| history
datapath/linux-2.6/compat-2.6/include/net/checksum.h		patch \| blob \| history
lib/daemon.c		patch \| blob \| history
lib/timeval.c		patch \| blob \| history
lib/timeval.h		patch \| blob \| history
ofproto/ofproto.c		patch \| blob \| history
utilities/ovs-ofctl.c		patch \| blob \| history
vswitchd/INTERNALS		patch \| blob \| history
vswitchd/bridge.c		patch \| blob \| history
xenserver/README		patch \| blob \| history
xenserver/automake.mk		patch \| blob \| history
xenserver/usr_share_vswitch_scripts_refresh-xs-network-uuids	[new file with mode: 0755]	patch \| blob
xenserver/vswitch-xen.spec		patch \| blob \| history