- struct slave_balance tmp = *a;
- *a = *b;
- *b = tmp;
-}
-
-/* Re-establishes descending 'tx_bytes' order in the 'n_bals'-element array
- * 'bals', assuming that 'p' is the only element that may be out of place.
- *
- * 'p' is moved by swapping it with its neighbours, so after this call the
- * caller's pointer may no longer refer to the same slave and must be treated
- * as invalid. */
-static void
-resort_bals(struct slave_balance *p,
-            struct slave_balance bals[], size_t n_bals)
-{
-    struct slave_balance *last;
-
-    if (n_bals <= 1) {
-        /* Nothing can be out of order. */
-        return;
-    }
-    last = &bals[n_bals - 1];
-
-    /* Bubble towards the front while heavier than the predecessor. */
-    while (p > bals && p->tx_bytes > p[-1].tx_bytes) {
-        swap_bals(p, p - 1);
-        p--;
-    }
-
-    /* Bubble towards the back while lighter than the successor. */
-    while (p < last && p->tx_bytes < p[1].tx_bytes) {
-        swap_bals(p, p + 1);
-        p++;
-    }
-}
-
-/* Emits one debug-level log line for 'port' that summarizes each of the
- * 'n_bals' entries of 'bals': the interface name, its load in kB, a
- * "(disabled)" marker where applicable, and the load of every hash assigned
- * to it.  Does nothing unless debug logging is enabled. */
-static void
-log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port)
-{
-    struct ds ds = DS_EMPTY_INITIALIZER;
-    size_t slave_idx;
-
-    if (!VLOG_IS_DBG_ENABLED()) {
-        return;
-    }
-
-    for (slave_idx = 0; slave_idx < n_bals; slave_idx++) {
-        const struct slave_balance *b = &bals[slave_idx];
-        size_t i;
-
-        /* Entries after the first are comma-separated. */
-        if (slave_idx > 0) {
-            ds_put_char(&ds, ',');
-        }
-        ds_put_format(&ds, " %s %"PRIu64"kB",
-                      b->iface->name, b->tx_bytes / 1024);
-
-        if (!b->iface->enabled) {
-            ds_put_cstr(&ds, " (disabled)");
-        }
-
-        if (b->n_hashes > 0) {
-            ds_put_cstr(&ds, " (");
-            for (i = 0; i < b->n_hashes; i++) {
-                const struct bond_entry *e = b->hashes[i];
-
-                if (i > 0) {
-                    ds_put_cstr(&ds, " + ");
-                }
-                /* The hash number is the entry's index in port->bond_hash. */
-                ds_put_format(&ds, "h%td: %"PRIu64"kB",
-                              e - port->bond_hash, e->tx_bytes / 1024);
-            }
-            ds_put_cstr(&ds, ")");
-        }
-    }
-
-    VLOG_DBG("bond %s:%s", port->name, ds_cstr(&ds));
-    ds_destroy(&ds);
-}
-
-/* Moves the bond hash found at index 'hash_idx' of 'from->hashes' away from
- * slave 'from' and onto slave 'to'.
- *
- * The entry is removed from 'from->hashes', the byte counts of both
- * slave_balance structures are adjusted by the entry's 'tx_bytes', flows
- * carrying the entry's old tag are revalidated, and the entry is given 'to's
- * interface and a fresh random tag.  The port (taken from 'from->iface') must
- * not be in active-backup mode; this is asserted. */
-static void
-bond_shift_load(struct slave_balance *from, struct slave_balance *to,
-                int hash_idx)
-{
-    struct bond_entry *entry = from->hashes[hash_idx];
-    struct port *port = from->iface->port;
-    uint64_t moved = entry->tx_bytes;
-    uint64_t from_load = from->tx_bytes - moved;
-    uint64_t to_load = to->tx_bytes + moved;
-
-    assert(port->bond_mode != BM_AB);
-
-    VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
-              "from %s to %s (now carrying %"PRIu64"kB and "
-              "%"PRIu64"kB load, respectively)",
-              port->name, moved / 1024, entry - port->bond_hash,
-              from->iface->name, to->iface->name,
-              from_load / 1024,
-              to_load / 1024);
-
-    /* Drop the entry from from->hashes.
-     *
-     * It is deliberately not appended to to->hashes: that would take extra
-     * work and would only make the hash eligible for yet another migration
-     * within the same rebalancing run, which serves no purpose. */
-    if (hash_idx != 0) {
-        /* Close the gap by sliding the tail down one slot. */
-        memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1,
-                (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes);
-    } else {
-        /* Removing the head only requires advancing the start pointer. */
-        from->hashes++;
-    }
-    from->n_hashes--;
-
-    /* Account for the moved load on both slaves. */
-    from->tx_bytes = from_load;
-    to->tx_bytes = to_load;
-
-    /* Revalidate flows using the old tag, then retarget the entry. */
-    ofproto_revalidate(port->bridge->ofproto, entry->tag);
-    entry->iface = to->iface;
-    entry->tag = tag_create_random();
-}
-
-/* Rebalances the load on 'port', which must not be an active-backup bond
- * (this is asserted).
- *
- * Builds a per-interface summary of the bytes carried by each bond hash,
- * repeatedly moves one hash at a time from a more loaded enabled slave to the
- * least loaded enabled slave for as long as doing so meaningfully improves
- * the balance, and finally halves every hash's 'tx_bytes' so that older
- * traffic counts for less in later runs.
- *
- * NOTE(review): if every slave turns out to be disabled, the function jumps
- * straight to 'exit', so the halving step is skipped on that path. */
-static void
-bond_rebalance_port(struct port *port)
-{
- struct slave_balance *bals;
- size_t n_bals;
- struct bond_entry *hashes[BOND_MASK + 1];
- struct slave_balance *b, *from, *to;
- struct bond_entry *e;
- struct iface *iface;
- size_t i;
-
- assert(port->bond_mode != BM_AB);
-
- /* Sets up 'bals' to describe each of the port's interfaces, sorted in
- * descending order of tx_bytes, so that bals[0] represents the most
- * heavily loaded slave and bals[n_bals - 1] represents the least heavily
- * loaded slave.
- *
- * The code is a bit tricky: to avoid dynamically allocating a 'hashes'
- * array for each slave_balance structure, we sort our local array of
- * hashes in order by slave, so that all of the hashes for a given slave
- * become contiguous in memory, and then we point each 'hashes' members of
- * a slave_balance structure to the start of a contiguous group. */
- n_bals = port->n_ifaces;
- b = bals = xmalloc(n_bals * sizeof *bals);
- LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
- b->iface = iface;
- b->tx_bytes = 0;
- b->hashes = NULL;
- b->n_hashes = 0;
- b++;
- }
- assert(b == &bals[n_bals]);
- for (i = 0; i <= BOND_MASK; i++) {
- hashes[i] = &port->bond_hash[i];
- }
- /* compare_bond_entries() is defined elsewhere; presumably it orders the
- * entries by interface as the comment above requires -- TODO confirm. */
- qsort(hashes, BOND_MASK + 1, sizeof *hashes, compare_bond_entries);
- for (i = 0; i <= BOND_MASK; i++) {
- e = hashes[i];
- if (!e->iface) {
- continue;
- }
-
- /* Credit this hash to the slave_balance of the interface it uses. */
- for (b = bals; b < &bals[n_bals]; b++) {
- if (b->iface == e->iface) {
- b->tx_bytes += e->tx_bytes;
- if (!b->hashes) {
- b->hashes = &hashes[i];
- }
- b->n_hashes++;
- break;
- }
- }
- }
- qsort(bals, n_bals, sizeof *bals, compare_slave_balance);
- log_bals(bals, n_bals, port);
-
- /* Discard slaves that aren't enabled (which were sorted to the back of the
- * array earlier). */
- /* NOTE(review): bals[n_bals - 1] is read before n_bals is tested, so this
- * relies on the port having at least one interface -- confirm with
- * callers. */
- while (!bals[n_bals - 1].iface->enabled) {
- n_bals--;
- if (!n_bals) {
- goto exit;
- }
- }
-
- /* Shift load from the most-loaded slaves to the least-loaded slaves. */
- to = &bals[n_bals - 1];
- for (from = bals; from < to; ) {
- uint64_t overload = from->tx_bytes - to->tx_bytes;
- if (overload < to->tx_bytes >> 5 || overload < 100000) {
- /* The extra load on 'from' (and all less-loaded slaves), compared
- * to that of 'to' (the least-loaded slave), is less than ~3%, or
- * it is less than ~1Mbps. No point in rebalancing. */
- break;
- } else if (from->n_hashes == 1) {
- /* 'from' only carries a single MAC hash, so we can't shift any
- * load away from it, even though we want to. */
- from++;
- } else {
- /* 'from' is carrying significantly more load than 'to', and that
- * load is split across at least two different hashes. Pick a hash
- * to migrate to 'to' (the least-loaded slave), given that doing so
- * must decrease the ratio of the load on the two slaves by at
- * least 0.1.
- *
- * The sort order we use means that we prefer to shift away the
- * smallest hashes instead of the biggest ones. There is little
- * reason behind this decision; we could use the opposite sort
- * order to shift away big hashes ahead of small ones. */
- /* 'order_swapped' is only read below when a hash was selected
- * (i < from->n_hashes); both 'break' statements in the loop come
- * after its assignment in the same iteration. */
- bool order_swapped;
-
- for (i = 0; i < from->n_hashes; i++) {
- double old_ratio, new_ratio;
- uint64_t delta = from->hashes[i]->tx_bytes;
-
- if (delta == 0 || from->tx_bytes - delta == 0) {
- /* Pointless move. */
- continue;
- }
-
- order_swapped = from->tx_bytes - delta < to->tx_bytes + delta;
-
- if (to->tx_bytes == 0) {
- /* Nothing on the new slave, move it. */
- break;
- }
-
- old_ratio = (double)from->tx_bytes / to->tx_bytes;
- new_ratio = (double)(from->tx_bytes - delta) /
- (to->tx_bytes + delta);
-
- if (new_ratio == 0) {
- /* Should already be covered but check to prevent division
- * by zero. */
- continue;
- }
-
- /* Compare ratios as "heavier over lighter" regardless of which
- * slave would end up heavier after the move. */
- if (new_ratio < 1) {
- new_ratio = 1 / new_ratio;
- }
-
- if (old_ratio - new_ratio > 0.1) {
- /* Would decrease the ratio, move it. */
- break;
- }
- }
- if (i < from->n_hashes) {
- bond_shift_load(from, to, i);
-
- /* If the result of the migration changed the relative order of
- * 'from' and 'to' swap them back to maintain invariants. */
- if (order_swapped) {
- swap_bals(from, to);
- }
-
- /* Re-sort 'bals'. Note that this may make 'from' and 'to'
- * point to different slave_balance structures. It is only
- * valid to do these two operations in a row at all because we
- * know that 'from' will not move past 'to' and vice versa. */
- resort_bals(from, bals, n_bals);
- resort_bals(to, bals, n_bals);
- } else {
- /* No hash on 'from' is worth moving; try the next slave. */
- from++;
- }
- }
- }
-
- /* Implement exponentially weighted moving average. A weight of 1/2 causes
- * historical data to decay to <1% in 7 rebalancing runs. */
- for (e = &port->bond_hash[0]; e <= &port->bond_hash[BOND_MASK]; e++) {
- e->tx_bytes /= 2;
- if (!e->tx_bytes) {
- /* A hash whose load decayed to zero is detached from its slave. */
- e->iface = NULL;
- }
- }
-
-exit:
- free(bals);
-}
-
-/* Sends one gratuitous learning packet through 'port' for every MAC in the
- * bridge's learning table that was not learned on 'port' itself, using each
- * entry's MAC as the packet's source and its VLAN for output selection.
- *
- * Does nothing if 'port' has no interfaces, has no active interface, or
- * bond_is_tcp_hash() is true for it.  Send failures are counted and reported
- * in a single rate-limited warning; otherwise a debug message gives the
- * number of packets sent. */
-static void
-bond_send_learning_packets(struct port *port)
-{
-    struct bridge *br = port->bridge;
-    struct ofpbuf packet;
-    struct mac_entry *e;
-    int n_packets = 0;
-    int n_errors = 0;
-    int error = 0;
-
-    if (!port->n_ifaces || !port->active_iface || bond_is_tcp_hash(port)) {
-        return;
-    }
-
-    ofpbuf_init(&packet, 128);
-    LIST_FOR_EACH (e, lru_node, &br->ml->lrus) {
-        struct flow flow;
-        uint16_t dp_ifidx;
-        tag_type tags = 0;
-        int retval;
-
-        /* MACs learned on the bond itself need no announcement. */
-        if (e->port.p == port) {
-            continue;
-        }
-
-        compose_benign_packet(&packet, "Open vSwitch Bond Failover", 0xf177,
-                              e->mac);
-        flow_extract(&packet, 0, ODPP_NONE, &flow);
-
-        if (choose_output_iface(port, &flow, e->vlan, &dp_ifidx, &tags)) {
-            /* Send packet, remembering the most recent failure. */
-            n_packets++;
-            retval = ofproto_send_packet(br->ofproto, dp_ifidx, e->vlan,
-                                         &packet);
-            if (retval) {
-                error = retval;
-                n_errors++;
-            }
-        }
-    }
-    ofpbuf_uninit(&packet);
-
-    if (!n_errors) {
-        VLOG_DBG("bond %s: sent %d gratuitous learning packets",
-                 port->name, n_packets);
-    } else {
-        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-        VLOG_WARN_RL(&rl, "bond %s: %d errors sending %d gratuitous learning "
-                     "packets, last error was: %s",
-                     port->name, n_errors, n_packets, strerror(error));
-    }
-}
-\f
-/* Bonding unixctl user interface functions. */
-
-/* unixctl handler that replies on 'conn' with a tab-separated table listing,
- * for every port that has more than one interface on any bridge, the bridge
- * name, the port name, the bond mode and a comma-separated list of the
- * port's interfaces. */
-static void
-bond_unixctl_list(struct unixctl_conn *conn,
-                  const char *args OVS_UNUSED, void *aux OVS_UNUSED)
-{
-    struct ds reply = DS_EMPTY_INITIALIZER;
-    const struct bridge *br;
-
-    ds_put_cstr(&reply, "bridge\tbond\ttype\tslaves\n");
-
-    LIST_FOR_EACH (br, node, &all_bridges) {
-        struct port *port;
-
-        HMAP_FOR_EACH (port, hmap_node, &br->ports) {
-            /* Only ports with several interfaces are bonds. */
-            if (port->n_ifaces > 1) {
-                struct iface *iface;
-                bool first = true;
-
-                ds_put_format(&reply, "%s\t%s\t%s\t", br->name, port->name,
-                              bond_mode_to_string(port->bond_mode));
-                LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
-                    /* Separate every interface but the first. */
-                    if (!first) {
-                        ds_put_cstr(&reply, ", ");
-                    }
-                    first = false;
-                    ds_put_cstr(&reply, iface->name);
-                }
-                ds_put_char(&reply, '\n');
-            }
-        }
-    }
-
-    unixctl_command_reply(conn, 200, ds_cstr(&reply));
-    ds_destroy(&reply);
-}
-
-/* Searches every bridge in 'all_bridges' for a port called 'name' that has
- * more than one interface.  Returns the first such port, or NULL if there is
- * none. */
-static struct port *
-bond_find(const char *name)
-{
-    const struct bridge *bridge;
-
-    LIST_FOR_EACH (bridge, node, &all_bridges) {
-        struct port *candidate;
-
-        HMAP_FOR_EACH (candidate, hmap_node, &bridge->ports) {
-            if (strcmp(candidate->name, name) != 0) {
-                continue;
-            }
-            if (candidate->n_ifaces > 1) {
-                return candidate;
-            }
-        }
-    }
-
-    return NULL;
-}
-
-/* unixctl handler that replies on 'conn' with a human-readable description of
- * the bond named by 'args': its mode, LACP status, hash algorithm (when not
- * active-backup), link detection mode, up/down delays, time to the next
- * rebalance and, for each slave, its state and the hashes currently assigned
- * to it (plus, for SLB bonds, the learned MACs that map to each hash).
- *
- * Replies with code 501 and "no such bond" if bond_find() does not find
- * 'args'; otherwise replies with code 200. */
-static void
-bond_unixctl_show(struct unixctl_conn *conn,
- const char *args, void *aux OVS_UNUSED)
-{
- struct ds ds = DS_EMPTY_INITIALIZER;
- const struct port *port;
- struct iface *iface;
-
- port = bond_find(args);
- if (!port) {
- unixctl_command_reply(conn, 501, "no such bond");
- return;
- }
-
- ds_put_format(&ds, "bond_mode: %s\n",
- bond_mode_to_string(port->bond_mode));
-
- if (port->lacp) {
- ds_put_format(&ds, "lacp: %s\n",
- port->lacp_active ? "active" : "passive");
- } else {
- ds_put_cstr(&ds, "lacp: off\n");
- }
-
- if (port->bond_mode != BM_AB) {
- ds_put_format(&ds, "bond-hash-algorithm: %s\n",
- bond_is_tcp_hash(port) ? "balance-tcp" : "balance-slb");
- }
-
-
- /* A non-null port->monitor is reported as carrier detection; otherwise
- * the port is reported as using miimon, along with its interval. */
- ds_put_format(&ds, "bond-detect-mode: %s\n",
- port->monitor ? "carrier" : "miimon");
-
- if (!port->monitor) {
- ds_put_format(&ds, "bond-miimon-interval: %lld\n",
- port->miimon_interval);
- }
-
- ds_put_format(&ds, "updelay: %d ms\n", port->updelay);
- ds_put_format(&ds, "downdelay: %d ms\n", port->downdelay);
-
- if (port->bond_mode != BM_AB) {
- ds_put_format(&ds, "next rebalance: %lld ms\n",
- port->bond_next_rebalance - time_msec());
- }
-
- LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
- struct bond_entry *be;
- struct flow flow;
-
- /* Basic info. */
- ds_put_format(&ds, "\nslave %s: %s\n",
- iface->name, iface->enabled ? "enabled" : "disabled");
- if (iface == port->active_iface) {
- ds_put_cstr(&ds, "\tactive slave\n");
- }
- /* LLONG_MAX in delay_expires is treated as "no delay pending". */
- if (iface->delay_expires != LLONG_MAX) {
- ds_put_format(&ds, "\t%s expires in %lld ms\n",
- iface->enabled ? "downdelay" : "updelay",
- iface->delay_expires - time_msec());
- }
-
- /* Active-backup bonds get no per-hash output. */
- if (port->bond_mode == BM_AB) {
- continue;
- }
-
- /* Hashes. */
- /* Only 'dl_src' of 'flow' is filled in (per MAC, below) before it is
- * handed to choose_output_iface(); everything else stays zeroed. */
- memset(&flow, 0, sizeof flow);
- for (be = port->bond_hash; be <= &port->bond_hash[BOND_MASK]; be++) {
- int hash = be - port->bond_hash;
- struct mac_entry *me;
-
- if (be->iface != iface) {
- continue;
- }
-
- ds_put_format(&ds, "\thash %d: %"PRIu64" kB load\n",
- hash, be->tx_bytes / 1024);
-
- /* The MAC listing is produced for SLB bonds only. */
- if (port->bond_mode != BM_SLB) {
- continue;
- }
-
- /* MACs. */
- LIST_FOR_EACH (me, lru_node, &port->bridge->ml->lrus) {
- uint16_t dp_ifidx;
- tag_type tags = 0;
-
- memcpy(flow.dl_src, me->mac, ETH_ADDR_LEN);
- /* List the MAC if it hashes to this entry, was not learned on
- * the bond itself, and would be output on this interface. */
- if (bond_hash_src(me->mac, me->vlan) == hash
- && me->port.p != port
- && choose_output_iface(port, &flow, me->vlan,
- &dp_ifidx, &tags)
- && dp_ifidx == iface->dp_ifidx)
- {
- ds_put_format(&ds, "\t\t"ETH_ADDR_FMT"\n",
- ETH_ADDR_ARGS(me->mac));
- }
- }
- }
- }
- unixctl_command_reply(conn, 200, ds_cstr(&ds));
- ds_destroy(&ds);
-}
-
-/* unixctl handler for "bond/migrate BOND HASH SLAVE": assigns bond hash HASH
- * (masked with BOND_MASK) of SLB bond BOND to the enabled slave SLAVE,
- * revalidates flows carrying the hash's previous tag and gives the hash a
- * new random tag.
- *
- * Replies with code 501 and an explanatory message when an argument is
- * missing or invalid, and with code 200 on success.  'args_' is tokenized in
- * place with strtok_r() and is therefore modified. */
-static void
-bond_unixctl_migrate(struct unixctl_conn *conn, const char *args_,
-                     void *aux OVS_UNUSED)
-{
-    char *tok_state = NULL;
-    char *bond_name = strtok_r((char *) args_, " ", &tok_state);
-    char *hash_str = strtok_r(NULL, " ", &tok_state);
-    char *slave_name = strtok_r(NULL, " ", &tok_state);
-    struct bond_entry *be;
-    struct iface *target;
-    struct port *port;
-    int hash;
-
-    /* A missing third token implies at least one argument is absent. */
-    if (!slave_name) {
-        unixctl_command_reply(conn, 501,
-                              "usage: bond/migrate BOND HASH SLAVE");
-        return;
-    }
-
-    port = bond_find(bond_name);
-    if (!port) {
-        unixctl_command_reply(conn, 501, "no such bond");
-        return;
-    }
-
-    if (port->bond_mode != BM_SLB) {
-        unixctl_command_reply(conn, 501, "not an SLB bond");
-        return;
-    }
-
-    /* Accept decimal digits only. */
-    if (strspn(hash_str, "0123456789") != strlen(hash_str)) {
-        unixctl_command_reply(conn, 501, "bad hash");
-        return;
-    }
-    hash = atoi(hash_str) & BOND_MASK;
-
-    target = port_lookup_iface(port, slave_name);
-    if (!target) {
-        unixctl_command_reply(conn, 501, "no such slave");
-        return;
-    }
-
-    if (!target->enabled) {
-        unixctl_command_reply(conn, 501, "cannot migrate to disabled slave");
-        return;
-    }
-
-    be = &port->bond_hash[hash];
-    ofproto_revalidate(port->bridge->ofproto, be->tag);
-    be->iface = target;
-    be->tag = tag_create_random();
-    unixctl_command_reply(conn, 200, "migrated");
-}
-
-/* unixctl handler for "bond/set-active-slave BOND SLAVE": makes the enabled
- * slave SLAVE the active interface of bond BOND.
- *
- * When the active interface actually changes, flows carrying the port's
- * active-interface tag are revalidated and learning packets are sent, and the
- * reply is "done"; when SLAVE is already active the reply is "no change".
- * Invalid arguments are answered with code 501.  'args_' is tokenized in
- * place with strtok_r() and is therefore modified. */
-static void
-bond_unixctl_set_active_slave(struct unixctl_conn *conn, const char *args_,
-                              void *aux OVS_UNUSED)
-{
-    char *tok_state = NULL;
-    char *bond_name = strtok_r((char *) args_, " ", &tok_state);
-    char *slave_name = strtok_r(NULL, " ", &tok_state);
-    struct iface *wanted;
-    struct port *port;
-
-    if (!slave_name) {
-        unixctl_command_reply(conn, 501,
-                              "usage: bond/set-active-slave BOND SLAVE");
-        return;
-    }
-
-    port = bond_find(bond_name);
-    if (!port) {
-        unixctl_command_reply(conn, 501, "no such bond");
-        return;
-    }
-
-    wanted = port_lookup_iface(port, slave_name);
-    if (!wanted) {
-        unixctl_command_reply(conn, 501, "no such slave");
-        return;
-    }
-
-    if (!wanted->enabled) {
-        unixctl_command_reply(conn, 501, "cannot make disabled slave active");
-        return;
-    }
-
-    if (port->active_iface == wanted) {
-        unixctl_command_reply(conn, 200, "no change");
-        return;
-    }
-
-    /* Invalidate flows tied to the old active interface before switching. */
-    ofproto_revalidate(port->bridge->ofproto,
-                       port_get_active_iface_tag(port));
-    port->active_iface = wanted;
-    VLOG_INFO("port %s: active interface is now %s",
-              port->name, wanted->name);
-    bond_send_learning_packets(port);
-    unixctl_command_reply(conn, 200, "done");
-}
-
-/* Shared implementation of the "bond/enable-slave" and "bond/disable-slave"
- * unixctl commands.
- *
- * 'args_' must hold "BOND SLAVE"; it is tokenized in place with strtok_r()
- * and is therefore modified.  The named slave of the named bond is passed to
- * bond_enable_slave() together with 'enable'.
- *
- * Replies on 'conn' with code 501 and an explanatory message if an argument
- * is missing or names no bond/slave, and with code 200 and "enabled" or
- * "disabled" on success. */
-static void
-enable_slave(struct unixctl_conn *conn, const char *args_, bool enable)
-{
-    char *args = (char *) args_;
-    char *save_ptr = NULL;
-    char *bond_s, *slave_s;
-    struct port *port;
-    struct iface *iface;
-
-    bond_s = strtok_r(args, " ", &save_ptr);
-    slave_s = strtok_r(NULL, " ", &save_ptr);
-    if (!slave_s) {
-        unixctl_command_reply(conn, 501,
-                              "usage: bond/enable/disable-slave BOND SLAVE");
-        return;
-    }
-
-    port = bond_find(bond_s);
-    if (!port) {
-        unixctl_command_reply(conn, 501, "no such bond");
-        return;
-    }
-
-    iface = port_lookup_iface(port, slave_s);
-    if (!iface) {
-        unixctl_command_reply(conn, 501, "no such slave");
-        return;
-    }
-
-    bond_enable_slave(iface, enable);
-
-    /* Success is reported with 200, matching every other bond/ command; the
-     * previous 501 made a successful change look like a failure. */
-    unixctl_command_reply(conn, 200, enable ? "enabled" : "disabled");
-}
-
-/* unixctl handler registered as "bond/enable-slave": forwards 'conn' and
- * 'args' to enable_slave() with 'enable' set to true. */
-static void
-bond_unixctl_enable_slave(struct unixctl_conn *conn, const char *args,
- void *aux OVS_UNUSED)
-{
- enable_slave(conn, args, true);
-}
-
-/* unixctl handler registered as "bond/disable-slave": forwards 'conn' and
- * 'args' to enable_slave() with 'enable' set to false. */
-static void
-bond_unixctl_disable_slave(struct unixctl_conn *conn, const char *args,
- void *aux OVS_UNUSED)
-{
- enable_slave(conn, args, false);
-}
-
-/* unixctl handler for "bond/hash MAC [VLAN]": replies on 'conn' with the
- * value of bond_hash_src() for the given Ethernet address and VLAN, printed
- * as a decimal number.  When VLAN is omitted, OFP_VLAN_NONE is used.
- *
- * Replies with code 501 if MAC is missing or unparsable, or if VLAN is given
- * but is not an unsigned number.  'args_' is tokenized in place with
- * strtok_r() and is therefore modified. */
-static void
-bond_unixctl_hash(struct unixctl_conn *conn, const char *args_,
-                  void *aux OVS_UNUSED)
-{
-    char *args = (char *) args_;
-    uint8_t mac[ETH_ADDR_LEN];
-    uint8_t hash;
-    char *hash_cstr;
-    unsigned int vlan;
-    char *mac_s, *vlan_s;
-    char *save_ptr = NULL;
-
-    mac_s = strtok_r(args, " ", &save_ptr);
-    vlan_s = strtok_r(NULL, " ", &save_ptr);
-
-    /* strtok_r() yields NULL when 'args' is empty or all spaces; passing that
-     * to sscanf() below would be undefined behaviour, so reject it here. */
-    if (!mac_s) {
-        unixctl_command_reply(conn, 501, "usage: bond/hash MAC [VLAN]");
-        return;
-    }
-
-    if (vlan_s) {
-        if (sscanf(vlan_s, "%u", &vlan) != 1) {
-            unixctl_command_reply(conn, 501, "invalid vlan");
-            return;
-        }
-    } else {
-        vlan = OFP_VLAN_NONE;
-    }
-
-    if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
-        == ETH_ADDR_SCAN_COUNT) {
-        hash = bond_hash_src(mac, vlan);
-
-        hash_cstr = xasprintf("%u", hash);
-        unixctl_command_reply(conn, 200, hash_cstr);
-        free(hash_cstr);
-    } else {
-        unixctl_command_reply(conn, 501, "invalid mac");
-    }
-}
-
-/* Registers the bonding unixctl commands ("bond/list", "bond/show",
- * "bond/migrate", "bond/set-active-slave", "bond/enable-slave",
- * "bond/disable-slave" and "bond/hash") with their handlers.  Every handler
- * is registered with a NULL aux pointer. */
-static void
-bond_init(void)
-{
- unixctl_command_register("bond/list", bond_unixctl_list, NULL);
- unixctl_command_register("bond/show", bond_unixctl_show, NULL);
- unixctl_command_register("bond/migrate", bond_unixctl_migrate, NULL);
- unixctl_command_register("bond/set-active-slave",
- bond_unixctl_set_active_slave, NULL);
- unixctl_command_register("bond/enable-slave", bond_unixctl_enable_slave,
- NULL);
- unixctl_command_register("bond/disable-slave", bond_unixctl_disable_slave,
- NULL);
- unixctl_command_register("bond/hash", bond_unixctl_hash, NULL);
-}
-\f
-/* Port functions. */
-
-/* Callback handed to lacp_run(): transmits the LACP PDU 'pdu' on the
- * interface passed as 'aux' (a struct iface *), using the Ethernet address
- * reported by the interface's netdev as the source.
- *
- * If the address cannot be obtained, nothing is sent and a rate-limited
- * error is logged. */
-static void
-lacp_send_pdu_cb(void *aux, const struct lacp_pdu *pdu)
-{
-    struct iface *iface = aux;
-    uint8_t src_mac[ETH_ADDR_LEN];
-    struct ofpbuf packet;
-    int error = netdev_get_etheraddr(iface->netdev, src_mac);
-
-    if (error) {
-        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 10);
-        VLOG_ERR_RL(&rl, "iface %s: failed to obtain Ethernet address "
-                    "(%s)", iface->name, strerror(error));
-        return;
-    }
-
-    ofpbuf_init(&packet, ETH_HEADER_LEN + LACP_PDU_LEN);
-    compose_lacp_packet(&packet, src_mac, pdu);
-    ofproto_send_packet(iface->port->bridge->ofproto,
-                        iface->dp_ifidx, 0, &packet);
-    ofpbuf_uninit(&packet);
-}
-
-/* Performs periodic processing for 'port': refreshes the carrier state of
- * its interfaces (driven by the netdev monitor when the port has one,
- * otherwise by the miimon timer), feeds each interface's enabled state to
- * LACP and runs LACP when it is configured, and finally runs the bond
- * logic. */
-static void
-port_run(struct port *port)
-{
-    struct iface *iface;
-
-    if (!port->monitor) {
-        /* No monitor: poll every interface once the miimon timer is due. */
-        if (time_msec() >= port->miimon_next_update) {
-            LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
-                iface_update_carrier(iface);
-            }
-            port->miimon_next_update = time_msec() + port->miimon_interval;
-        }
-    } else {
-        char *changed_dev;
-
-        /* Track carrier going up and down on interfaces. */
-        while (!netdev_monitor_poll(port->monitor, &changed_dev)) {
-            iface = port_lookup_iface(port, changed_dev);
-            if (iface) {
-                iface_update_carrier(iface);
-            }
-            free(changed_dev);
-        }
-    }
-
-    if (port->lacp) {
-        LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
-            lacp_slave_enable(port->lacp, iface, iface->enabled);
-        }
-
-        lacp_run(port->lacp, lacp_send_pdu_cb);
-    }
-
-    bond_run(port);
-}
-
-/* Arranges for the poll loop to wake up when 'port' needs port_run() again:
- * on a netdev monitor event if the port has a monitor, otherwise when the
- * miimon timer expires; plus whatever LACP (if configured) and the bond logic
- * are waiting for. */
-static void
-port_wait(struct port *port)
-{
-    if (!port->monitor) {
-        poll_timer_wait_until(port->miimon_next_update);
-    } else {
-        netdev_monitor_poll_wait(port->monitor);
-    }
-
-    if (port->lacp) {
-        lacp_wait(port->lacp);
-    }
-
-    bond_wait(port);
-}
-
-static struct port *
-port_create(struct bridge *br, const char *name)
-{
- struct port *port;