2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "dynamic-string.h"
36 #include "poll-loop.h"
42 VLOG_DEFINE_THIS_MODULE(bond);
44 /* Bit-mask for hashing a flow down to a bucket.
45 * There are (BOND_MASK + 1) buckets. */
46 #define BOND_MASK 0xff
48 /* A hash bucket for mapping a flow to a slave.
49 * "struct bond" has an array of (BOND_MASK + 1) of these. */
/* Entries live in the 'hash' array of "struct bond", which bond_entry_reset()
 * zero-fills. */
51 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
52 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
53 struct list list_node; /* In bond_slave's 'entries' list. */
56 /* A bond slave, that is, one of the links comprising a bond. */
/* Instances are zero-allocated and added to the bond's 'slaves' hmap by
 * bond_slave_register(). */
58 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
59 struct bond *bond; /* The bond that contains this slave. */
60 void *aux; /* Client-provided handle for this slave; the key that bond_slave_lookup() matches on. */
62 struct netdev *netdev; /* Network device, owned by the client. */
63 unsigned int change_seq; /* Tracks changes in 'netdev'. */
64 char *name; /* Name (a copy of netdev_get_name(netdev)). */
67 long long delay_expires; /* Time after which 'enabled' may change; LLONG_MAX when no change is pending. */
68 bool enabled; /* May be chosen for flows? */
69 bool may_enable; /* Client considers this slave bondable. */
71 /* Rebalancing info. Used only by bond_rebalance(). */
72 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
73 struct list entries; /* 'struct bond_entry's assigned here. */
74 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
77 /* A bond, that is, a set of network devices grouped to improve performance or
80 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
81 char *name; /* Name provided by client. */
87 enum bond_mode balance; /* Balancing mode, one of BM_*. */
88 struct bond_slave *active_slave; /* Slave currently treated as active, or NULL if none. */
89 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
90 enum lacp_status lacp_status; /* Status of LACP negotiations. */
91 bool bond_revalidate; /* True if flows need revalidation. */
92 uint32_t basis; /* Basis for flow hash function. */
94 /* SLB specific bonding info. */
95 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
96 int rebalance_interval; /* Interval between rebalances, in ms. */
97 long long int next_rebalance; /* Next rebalancing time. */
98 bool send_learning_packets; /* Request for learning packets; read and cleared by bond_should_send_learning_packets(). */
100 /* Legacy compatibility. */
101 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
/* One file-wide rwlock is taken by every public entry point in this file.
 * 'all_bonds' indexes bonds by hash_string(name, 0); see bond_find(). */
106 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
107 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
108 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
/* Forward declarations of internal helpers.  The OVS_REQ_RDLOCK/OVS_REQ_WRLOCK
 * annotations presumably state which mode of 'rwlock' the caller must already
 * hold. */
110 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
111 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
112 OVS_REQ_RDLOCK(rwlock);
113 static void bond_enable_slave(struct bond_slave *, bool enable)
114 OVS_REQ_WRLOCK(rwlock);
115 static void bond_link_status_update(struct bond_slave *)
116 OVS_REQ_WRLOCK(rwlock);
117 static void bond_choose_active_slave(struct bond *)
118 OVS_REQ_WRLOCK(rwlock);;
/* NOTE(review): the second ';' on the declaration above is a stray empty
 * declaration; harmless to most compilers but worth removing. */
119 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
120 uint16_t vlan, uint32_t basis);
121 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
123 static struct bond_entry *lookup_bond_entry(const struct bond *,
126 OVS_REQ_RDLOCK(rwlock);
127 static struct bond_slave *choose_output_slave(const struct bond *,
129 struct flow_wildcards *,
131 OVS_REQ_RDLOCK(rwlock);
132 static void bond_update_fake_slave_stats(struct bond *)
133 OVS_REQ_RDLOCK(rwlock);
135 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
136 * stores the mode in '*balance' and returns true. Otherwise returns false
137 * without modifying '*balance'. */
/* Matches 's' against the names produced by bond_mode_to_string(), so the
 * parser and the printer share a single set of mode strings. */
139 bond_mode_from_string(enum bond_mode *balance, const char *s)
141 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
143 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
145 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
153 /* Returns a string representing 'balance'. */
155 bond_mode_to_string(enum bond_mode balance) {
/* These are the same strings that bond_mode_from_string() accepts.
 * Presumably they correspond to BM_TCP, BM_SLB and BM_AB in that order; the
 * case labels are not visible here -- confirm. */
158 return "balance-tcp";
160 return "balance-slb";
162 return "active-backup";
168 /* Creates and returns a new bond whose configuration is initially taken from
171 * The caller should register each slave on the new bond by calling
172 * bond_slave_register(). */
174 bond_create(const struct bond_settings *s)
/* Zero-filled allocation: every field not set below starts out 0/NULL/false. */
178 bond = xzalloc(sizeof *bond);
179 hmap_init(&bond->slaves);
/* LLONG_MAX marks the fake-interface stats update as disabled. */
180 bond->next_fake_iface_update = LLONG_MAX;
/* The reference count starts at 1. */
181 atomic_init(&bond->ref_cnt, 1);
/* Applies 's'; bond_reconfigure() also inserts the bond into 'all_bonds'
 * because the name is still NULL at this point. */
183 bond_reconfigure(bond, s);
/* Adds one to the reference count of 'bond_'. */
188 bond_ref(const struct bond *bond_)
/* The const qualifier is cast away so the reference count can be modified. */
190 struct bond *bond = CONST_CAST(struct bond *, bond_);
194 atomic_add(&bond->ref_cnt, 1, &orig);
/* 'orig' is presumably the count before the increment; it must be positive,
 * otherwise the bond would already have been released. */
195 ovs_assert(orig > 0);
/* Subtracts one from the reference count of 'bond' and tears the bond down:
 * it is removed from 'all_bonds' and its slaves are detached. */
202 bond_unref(struct bond *bond)
204 struct bond_slave *slave, *next_slave;
211 atomic_sub(&bond->ref_cnt, 1, &orig);
212 ovs_assert(orig > 0);
/* NOTE(review): the check that decides whether this was the last reference is
 * not visible in this extract -- confirm that the teardown below only runs
 * when the count drops to zero. */
/* Only the removal from the global table is done under the write lock. */
217 ovs_rwlock_wrlock(&rwlock);
218 hmap_remove(all_bonds, &bond->hmap_node);
219 ovs_rwlock_unlock(&rwlock);
/* The _SAFE iterator is used because each slave is unlinked while iterating. */
221 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
222 hmap_remove(&bond->slaves, &slave->hmap_node);
223 /* Client owns 'slave->netdev'. */
227 hmap_destroy(&bond->slaves);
234 /* Updates 'bond''s overall configuration to 's'.
236 * The caller should register each slave on 'bond' by calling
237 * bond_slave_register(). This is optional if none of the slaves'
238 * configuration has changed. In any case it can't hurt.
240 * Returns true if the configuration has changed in such a way that requires
244 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
/* Accumulates whether any setting change requires flows to be revalidated. */
246 bool revalidate = false;
248 ovs_rwlock_wrlock(&rwlock);
/* A new or changed name requires re-indexing in 'all_bonds', which is keyed
 * by the hash of the name. */
249 if (!bond->name || strcmp(bond->name, s->name)) {
251 hmap_remove(all_bonds, &bond->hmap_node);
254 bond->name = xstrdup(s->name);
255 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
/* The delays are copied unconditionally. */
258 bond->updelay = s->up_delay;
259 bond->downdelay = s->down_delay;
/* The following settings are only written when they actually differ. */
261 if (bond->rebalance_interval != s->rebalance_interval) {
262 bond->rebalance_interval = s->rebalance_interval;
266 if (bond->balance != s->balance) {
267 bond->balance = s->balance;
271 if (bond->basis != s->basis) {
272 bond->basis = s->basis;
/* Fake-interface stats timer: started at the current time if it was disabled
 * (LLONG_MAX), or set back to LLONG_MAX.  The condition selecting between the
 * two is not visible in this extract. */
277 if (bond->next_fake_iface_update == LLONG_MAX) {
278 bond->next_fake_iface_update = time_msec();
281 bond->next_fake_iface_update = LLONG_MAX;
/* A pending revalidation request on the bond is consumed here. */
284 if (bond->bond_revalidate) {
286 bond->bond_revalidate = false;
/* Reset the hash table in active-backup mode, when no table has been
 * allocated yet, or when revalidation is needed. */
289 if (bond->balance == BM_AB || !bond->hash || revalidate) {
290 bond_entry_reset(bond);
293 ovs_rwlock_unlock(&rwlock);
/* Points 'slave' at 'netdev' if that is a change.  The caller holds the write
 * lock, per the annotation. */
298 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
299 OVS_REQ_WRLOCK(rwlock)
301 if (slave->netdev != netdev) {
302 slave->netdev = netdev;
/* Reset the recorded sequence number.  bond_wait() compares it with
 * netdev_change_seq(), so this presumably forces the new device to be
 * examined on the next run -- confirm. */
303 slave->change_seq = 0;
307 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
308 * arbitrary client-provided pointer that uniquely identifies a slave within a
309 * bond. If 'slave_' already exists within 'bond' then this function
310 * reconfigures the existing slave.
312 * 'netdev' must be the network device that 'slave_' represents. It is owned
313 * by the client, so the client must not close it before either unregistering
314 * 'slave_' or destroying 'bond'.
317 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
319 struct bond_slave *slave;
321 ovs_rwlock_wrlock(&rwlock);
322 slave = bond_slave_lookup(bond, slave_);
/* New slave: allocate it zeroed and index it by the hash of the client
 * pointer, matching the hash used by bond_slave_lookup(). */
324 slave = xzalloc(sizeof *slave);
326 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
/* LLONG_MAX: no enable/disable transition is pending. */
329 slave->delay_expires = LLONG_MAX;
330 slave->name = xstrdup(netdev_get_name(netdev));
331 bond->bond_revalidate = true;
/* Start from "disabled" and let bond_enable_slave() apply the current carrier
 * state. */
333 slave->enabled = false;
334 bond_enable_slave(slave, netdev_get_carrier(netdev));
337 bond_slave_set_netdev__(slave, netdev);
/* NOTE(review): 'name' is duplicated a second time here; confirm that the
 * previous copy is freed first (no free() is visible in this extract). */
340 slave->name = xstrdup(netdev_get_name(netdev));
341 ovs_rwlock_unlock(&rwlock);
344 /* Updates the network device to be used with 'slave_' to 'netdev'.
346 * This is useful if the caller closes and re-opens the network device
347 * registered with bond_slave_register() but doesn't need to change anything
350 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
352 struct bond_slave *slave;
/* Write lock: bond_slave_set_netdev__() may modify the slave. */
354 ovs_rwlock_wrlock(&rwlock);
355 slave = bond_slave_lookup(bond, slave_);
/* NOTE(review): any guard against a failed lookup is not visible in this
 * extract -- confirm 'slave' is checked before this call. */
357 bond_slave_set_netdev__(slave, netdev);
359 ovs_rwlock_unlock(&rwlock);
362 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
363 * then this function has no effect.
365 * Unregistering a slave invalidates all flows. */
367 bond_slave_unregister(struct bond *bond, const void *slave_)
369 struct bond_slave *slave;
372 ovs_rwlock_wrlock(&rwlock);
373 slave = bond_slave_lookup(bond, slave_);
/* Request revalidation and disable the slave. */
378 bond->bond_revalidate = true;
379 bond_enable_slave(slave, false);
/* Remember whether the slave being removed was the active one. */
381 del_active = bond->active_slave == slave;
383 struct bond_entry *e;
/* Visit all BOND_MASK + 1 hash entries (the upper bound is inclusive) looking
 * for entries still assigned to the departing slave.
 * NOTE(review): this dereferences bond->hash; a guard for a NULL table is
 * presumably on a line not shown -- confirm. */
384 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
385 if (e->slave == slave) {
393 hmap_remove(&bond->slaves, &slave->hmap_node);
394 /* Client owns 'slave->netdev'. */
/* Pick a new active slave and ask for learning packets to be sent. */
398 bond_choose_active_slave(bond);
399 bond->send_learning_packets = true;
402 ovs_rwlock_unlock(&rwlock);
405 /* Should be called on each slave in 'bond' before bond_run() to indicate
406 * whether or not 'slave_' may be enabled. This function is intended to allow
407 * other protocols to have some impact on bonding decisions. For example LACP
408 * or high level link monitoring protocols may decide that a given slave should
409 * not be able to send traffic. */
411 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
413 ovs_rwlock_wrlock(&rwlock);
/* NOTE(review): the lookup result is dereferenced without a NULL check, unlike
 * the other bond_slave_*() entry points.  This relies on the caller only
 * passing slaves that are registered on 'bond' -- confirm or add a guard. */
414 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
415 ovs_rwlock_unlock(&rwlock);
418 /* Performs periodic maintenance on 'bond'.
420 * Returns true if the caller should revalidate its flows.
422 * The caller should check bond_should_send_learning_packets() afterward. */
424 bond_run(struct bond *bond, enum lacp_status lacp_status)
426 struct bond_slave *slave;
/* Write lock: slave state, the active slave and the revalidate flag may all
 * change below. */
429 ovs_rwlock_wrlock(&rwlock);
/* A change of LACP status requests revalidation; bond_check_admissibility()
 * decides on this status. */
430 if (bond->lacp_status != lacp_status) {
431 bond->lacp_status = lacp_status;
432 bond->bond_revalidate = true;
435 /* Enable slaves based on link status and LACP feedback. */
436 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
437 bond_link_status_update(slave);
/* Record the sequence number just processed; bond_wait() wakes up when the
 * netdev's sequence number no longer matches it. */
438 slave->change_seq = netdev_change_seq(slave->netdev);
/* No active slave, or the active one is disabled: choose again. */
440 if (!bond->active_slave || !bond->active_slave->enabled) {
441 bond_choose_active_slave(bond);
444 /* Update fake bond interface stats. */
445 if (time_msec() >= bond->next_fake_iface_update) {
446 bond_update_fake_slave_stats(bond);
/* Schedule the next stats update 1000 ms from now. */
447 bond->next_fake_iface_update = time_msec() + 1000;
/* Capture the pending revalidation request and clear it on the bond. */
450 revalidate = bond->bond_revalidate;
451 bond->bond_revalidate = false;
452 ovs_rwlock_unlock(&rwlock);
457 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
459 bond_wait(struct bond *bond)
461 struct bond_slave *slave;
/* Only reads bond state, so the read lock is taken. */
463 ovs_rwlock_rdlock(&rwlock);
464 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* A pending enable/disable transition: wake when its delay expires. */
465 if (slave->delay_expires != LLONG_MAX) {
466 poll_timer_wait_until(slave->delay_expires);
/* The netdev changed since bond_run() last recorded its sequence number. */
469 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
470 poll_immediate_wake();
/* Fake-interface stats updates are enabled: wake for the next one. */
474 if (bond->next_fake_iface_update != LLONG_MAX) {
475 poll_timer_wait_until(bond->next_fake_iface_update);
/* A revalidation request is pending: wake immediately. */
478 if (bond->bond_revalidate) {
479 poll_immediate_wake();
481 ovs_rwlock_unlock(&rwlock);
483 /* We don't wait for bond->next_rebalance because rebalancing can only run
484 * at a flow account checkpoint. ofproto does checkpointing on its own
485 * schedule and bond_rebalance() gets called afterward, so we'd just be
486 * waking up for no purpose. */
489 /* MAC learning table interaction. */
/* Returns true only if all three hold: LACP is disabled, the bond is in SLB or
 * active-backup mode, and an active slave exists. */
492 may_send_learning_packets(const struct bond *bond)
494 return bond->lacp_status == LACP_DISABLED
495 && (bond->balance == BM_SLB || bond->balance == BM_AB)
496 && bond->active_slave;
499 /* Returns true if 'bond' needs the client to send out packets to assist with
500 * MAC learning on 'bond'. If this function returns true, then the client
501 * should iterate through its MAC learning table for the bridge on which 'bond'
502 * is located. For each MAC that has been learned on a port other than 'bond',
503 * it should call bond_compose_learning_packet().
505 * This function will only return true if 'bond' is in SLB or active-backup
506 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
509 * Calling this function resets the state that it checks. */
511 bond_should_send_learning_packets(struct bond *bond)
/* Write lock, because the request flag is cleared below. */
515 ovs_rwlock_wrlock(&rwlock);
516 send = bond->send_learning_packets && may_send_learning_packets(bond);
/* The flag is cleared unconditionally, even when 'send' ended up false. */
517 bond->send_learning_packets = false;
518 ovs_rwlock_unlock(&rwlock);
522 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
524 * See bond_should_send_learning_packets() for description of usage. The
525 * caller should send the composed packet on the port associated with
526 * port_aux and takes ownership of the returned ofpbuf. */
528 bond_compose_learning_packet(struct bond *bond,
529 const uint8_t eth_src[ETH_ADDR_LEN],
530 uint16_t vlan, void **port_aux)
532 struct bond_slave *slave;
533 struct ofpbuf *packet;
536 ovs_rwlock_rdlock(&rwlock);
/* Callers must only get here after bond_should_send_learning_packets()
 * said learning packets are appropriate for this bond. */
537 ovs_assert(may_send_learning_packets(bond));
/* Build a flow that carries only the source MAC, then let the regular
 * slave-selection code pick the output slave for it. */
538 memset(&flow, 0, sizeof flow);
539 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
540 slave = choose_output_slave(bond, &flow, NULL, vlan);
/* Compose a RARP from 'eth_src'; a VLAN tag is pushed as well (the condition
 * guarding the push is not visible in this extract). */
542 packet = ofpbuf_new(0);
543 compose_rarp(packet, eth_src);
545 eth_push_vlan(packet, htons(vlan));
/* NOTE(review): 'slave' is dereferenced without a NULL check.  The assertion
 * above guarantees bond->active_slave is set, which presumably makes
 * choose_output_slave() return non-NULL -- confirm. */
548 *port_aux = slave->aux;
549 ovs_rwlock_unlock(&rwlock);
553 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
554 * Ethernet destination address of 'eth_dst', should be admitted.
556 * The return value is one of the following:
558 * - BV_ACCEPT: Admit the packet.
560 * - BV_DROP: Drop the packet.
562 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
563 * Ethernet source address and VLAN. If there is none, or if the packet
564 * is on the learned port, then admit the packet. If a different port has
565 * been learned, however, drop the packet (and do not use it for MAC
569 bond_check_admissibility(struct bond *bond, const void *slave_,
570 const uint8_t eth_dst[ETH_ADDR_LEN])
/* 'verdict' starts as BV_DROP, so any path that does not assign it drops the
 * packet. */
572 enum bond_verdict verdict = BV_DROP;
573 struct bond_slave *slave;
575 ovs_rwlock_rdlock(&rwlock);
576 slave = bond_slave_lookup(bond, slave_);
/* NOTE(review): handling of a failed lookup is not visible in this extract;
 * 'slave' is dereferenced below -- confirm it is checked. */
581 /* LACP bonds have very loose admissibility restrictions because we can
582 * assume the remote switch is aware of the bond and will "do the right
583 * thing". However, as a precaution we drop packets on disabled slaves
584 * because no correctly implemented partner switch should be sending
587 * If LACP is configured, but LACP negotiations have been unsuccessful, we
588 * drop all incoming traffic. */
589 switch (bond->lacp_status) {
590 case LACP_NEGOTIATED:
591 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
593 case LACP_CONFIGURED:
599 /* Drop all multicast packets on inactive slaves. */
600 if (eth_addr_is_multicast(eth_dst)) {
601 if (bond->active_slave != slave) {
/* The remaining decisions depend on the balancing mode. */
606 switch (bond->balance) {
608 /* Drop all packets which arrive on backup slaves. This is similar to
609 * how Linux bonding handles active-backup bonds. */
610 if (bond->active_slave != slave) {
/* Rate-limit the debug message. */
611 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
613 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
614 " slave (%s) destined for " ETH_ADDR_FMT,
615 slave->name, ETH_ADDR_ARGS(eth_dst));
622 /* TCP balanced bonds require successful LACP negotiation. Based on the
623 * above check, LACP is off on this bond. Therefore, we drop all
624 * incoming traffic. */
628 /* Drop all packets for which we have learned a different input port,
629 * because we probably sent the packet on one slave and got it back on
630 * the other. Gratuitous ARP packets are an exception to this rule:
631 * the host has moved to another switch. The exception to the
632 * exception is if we locked the learning table to avoid reflections on
634 verdict = BV_DROP_IF_MOVED;
640 ovs_rwlock_unlock(&rwlock);
645 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
646 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
647 * NULL if the packet should be dropped because no slaves are enabled.
649 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
650 * should be a VID only (i.e. excluding the PCP bits). Second,
651 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
652 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
653 * packet belongs to (so for an access port it will be the access port's VLAN).
655 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
656 * significant in the selection. At some point earlier, 'wc' should
657 * have been initialized (e.g., by flow_wildcards_init_catchall()).
660 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
661 struct flow_wildcards *wc, uint16_t vlan)
663 struct bond_slave *slave;
/* Locked wrapper around choose_output_slave(). */
666 ovs_rwlock_rdlock(&rwlock);
667 slave = choose_output_slave(bond, flow, wc, vlan);
/* Translate the internal slave into the client's handle while still holding
 * the lock; no slave chosen maps to NULL. */
668 aux = slave ? slave->aux : NULL;
669 ovs_rwlock_unlock(&rwlock);
/* Returns true if 'bond' has a nonzero rebalance interval and is in SLB or
 * TCP balancing mode. */
677 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
679 return bond->rebalance_interval
680 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
683 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
685 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
/* Write lock: a hash entry's byte counter is updated. */
688 ovs_rwlock_wrlock(&rwlock);
/* Byte counts are only kept for bonds that bond_is_balanced() accepts. */
689 if (bond_is_balanced(bond)) {
/* NOTE(review): the result of lookup_bond_entry() is dereferenced directly;
 * presumably it never returns NULL for a balanced bond -- confirm. */
690 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
692 ovs_rwlock_unlock(&rwlock);
/* Converts a pointer to a slave's 'bal_node' list member back into a pointer
 * to the enclosing struct bond_slave. */
695 static struct bond_slave *
696 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
698 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Logs, at debug level, one line for 'bond' listing each slave in 'bals' with
 * its load and the hash entries assigned to it. */
702 log_bals(struct bond *bond, const struct list *bals)
/* The string is only built when debug logging is enabled. */
704 if (VLOG_IS_DBG_ENABLED()) {
705 struct ds ds = DS_EMPTY_INITIALIZER;
706 const struct bond_slave *slave;
708 LIST_FOR_EACH (slave, bal_node, bals) {
/* Comma separator between slaves (its guarding condition is not visible in
 * this extract). */
710 ds_put_char(&ds, ',');
/* Loads are printed as tx_bytes / 1024. */
712 ds_put_format(&ds, " %s %"PRIu64"kB",
713 slave->name, slave->tx_bytes / 1024);
715 if (!slave->enabled) {
716 ds_put_cstr(&ds, " (disabled)");
718 if (!list_is_empty(&slave->entries)) {
719 struct bond_entry *e;
721 ds_put_cstr(&ds, " (");
722 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Every entry except the first one is preceded by " + ". */
723 if (&e->list_node != list_front(&slave->entries)) {
724 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index within the bond's hash array. */
726 ds_put_format(&ds, "h%td: %"PRIu64"kB",
727 e - bond->hash, e->tx_bytes / 1024);
729 ds_put_cstr(&ds, ")");
732 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
737 /* Shifts 'hash' from its current slave to 'to'. */
739 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
/* 'hash' must currently be assigned to a slave: 'from' is dereferenced
 * immediately to reach the bond. */
741 struct bond_slave *from = hash->slave;
742 struct bond *bond = from->bond;
/* The whole byte count of the entry moves with it. */
743 uint64_t delta = hash->tx_bytes;
/* The log message reports the loads the two slaves will have after the
 * shift. */
745 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
746 "from %s to %s (now carrying %"PRIu64"kB and "
747 "%"PRIu64"kB load, respectively)",
748 bond->name, delta / 1024, hash - bond->hash,
749 from->name, to->name,
750 (from->tx_bytes - delta) / 1024,
751 (to->tx_bytes + delta) / 1024);
753 /* Shift load away from 'from' to 'to'. */
754 from->tx_bytes -= delta;
755 to->tx_bytes += delta;
757 /* Arrange for flows to be revalidated. */
759 bond->bond_revalidate = true;
762 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
763 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
764 * given that doing so must decrease the ratio of the load on the two slaves by
765 * at least 0.1. Returns NULL if there is no appropriate entry.
767 * The list of entries isn't sorted. I don't know of a reason to prefer to
768 * shift away small hashes or large hashes. */
769 static struct bond_entry *
770 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
772 struct bond_entry *e;
774 if (list_is_short(&from->entries)) {
775 /* 'from' carries no more than one MAC hash, so shifting load away from
776 * it would be pointless. */
780 LIST_FOR_EACH (e, list_node, &from->entries) {
781 double old_ratio, new_ratio;
/* This case is tested before the division by 'to_tx_bytes' below; presumably
 * it returns 'e' so the division never sees zero -- confirm. */
784 if (to_tx_bytes == 0) {
785 /* Nothing on the new slave, move it. */
/* Load ratio from/to before and after moving this entry's bytes ('delta'). */
790 old_ratio = (double)from->tx_bytes / to_tx_bytes;
791 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
/* Require an improvement of more than 0.1 that also ends closer to 1.0. */
792 if (old_ratio - new_ratio > 0.1
793 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
794 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
795 and 'to' slave have the same load. Therefore, we only move an
796 entry if it decreases the load on 'from', and brings us closer
797 to equal traffic load. */
805 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
808 insert_bal(struct list *bals, struct bond_slave *slave)
810 struct bond_slave *pos;
/* Scan for the first element that carries less load than 'slave'; the loop
 * presumably stops there (the break is not visible in this extract). */
812 LIST_FOR_EACH (pos, bal_node, bals) {
813 if (slave->tx_bytes > pos->tx_bytes) {
/* Link 'slave' into the list at 'pos' (presumably just before it -- confirm
 * against list_insert()). */
817 list_insert(&pos->bal_node, &slave->bal_node);
820 /* Removes 'slave' from its current list and then inserts it into 'bals' so
821 * that descending order of 'tx_bytes' is maintained. */
823 reinsert_bal(struct list *bals, struct bond_slave *slave)
/* Unlink first, then let insert_bal() find the position that matches the
 * slave's current tx_bytes. */
825 list_remove(&slave->bal_node);
826 insert_bal(bals, slave);
829 /* If 'bond' needs rebalancing, does so.
831 * The caller should have called bond_account() for each active flow, to ensure
832 * that flow data is consistently accounted at this point. */
834 bond_rebalance(struct bond *bond)
836 struct bond_slave *slave;
837 struct bond_entry *e;
840 ovs_rwlock_wrlock(&rwlock);
/* Nothing to do if the bond is not balanced or the next rebalance time has
 * not been reached: the lock is released (and presumably the function
 * returns). */
841 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
842 ovs_rwlock_unlock(&rwlock);
/* Schedule the following rebalance one interval from now. */
845 bond->next_rebalance = time_msec() + bond->rebalance_interval;
847 /* Add each bond_entry to its slave's 'entries' list.
848 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
849 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
851 list_init(&slave->entries);
/* Inclusive upper bound: the hash array has BOND_MASK + 1 elements. */
853 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* Unassigned entries and entries with no recorded bytes are ignored. */
854 if (e->slave && e->tx_bytes) {
855 e->slave->tx_bytes += e->tx_bytes;
856 list_push_back(&e->slave->entries, &e->list_node);
860 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
862 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
863 * with a proper list sort algorithm. */
865 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
866 if (slave->enabled) {
867 insert_bal(&bals, slave);
870 log_bals(bond, &bals);
872 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
873 while (!list_is_short(&bals)) {
/* 'bals' is sorted in descending order, so the front is the most loaded slave
 * and the back the least loaded one. */
874 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
875 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
878 overload = from->tx_bytes - to->tx_bytes;
/* 'to->tx_bytes >> 5' is 1/32 of the least-loaded slave's load. */
879 if (overload < to->tx_bytes >> 5 || overload < 100000) {
880 /* The extra load on 'from' (and all less-loaded slaves), compared
881 * to that of 'to' (the least-loaded slave), is less than ~3%, or
882 * it is less than ~1Mbps. No point in rebalancing. */
886 /* 'from' is carrying significantly more load than 'to'. Pick a hash
887 * to move from 'from' to 'to'. */
888 e = choose_entry_to_migrate(from, to->tx_bytes);
890 bond_shift_load(e, to);
892 /* Delete element from from->entries.
894 * We don't add the element to to->entries. That would only allow
895 * 'e' to be migrated to another slave in this rebalancing run, and
896 * there is no point in doing that. */
897 list_remove(&e->list_node);
899 /* Re-sort 'bals'. */
900 reinsert_bal(&bals, from);
901 reinsert_bal(&bals, to);
903 /* Can't usefully migrate anything away from 'from'.
904 * Don't reconsider it. */
905 list_remove(&from->bal_node);
909 /* Implement exponentially weighted moving average. A weight of 1/2 causes
910 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
911 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
912 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
918 ovs_rwlock_unlock(&rwlock);
921 /* Bonding unixctl user interface functions. */
/* Looks up a bond by 'name' in 'all_bonds'. */
924 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
/* Only nodes with a matching hash are visited; the hash function and basis
 * match the hmap_insert() call in bond_reconfigure(). */
928 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
/* A matching hash is not proof of a match, so compare the names as well. */
930 if (!strcmp(bond->name, name)) {
/* Finds the slave of 'bond' whose name equals 'slave_name'.  The slaves hmap
 * is keyed by the client pointer, not the name, so every slave is visited. */
937 static struct bond_slave *
938 bond_lookup_slave(struct bond *bond, const char *slave_name)
940 struct bond_slave *slave;
942 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
943 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler registered as "bond/list": replies with a tab-separated
 * table containing one line per bond. */
951 bond_unixctl_list(struct unixctl_conn *conn,
952 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
953 void *aux OVS_UNUSED)
955 struct ds ds = DS_EMPTY_INITIALIZER;
956 const struct bond *bond;
/* Header row. */
958 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
960 ovs_rwlock_rdlock(&rwlock);
961 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
962 const struct bond_slave *slave;
/* Columns: bond name, balancing mode name, then the slave names. */
965 ds_put_format(&ds, "%s\t%s\t",
966 bond->name, bond_mode_to_string(bond->balance));
969 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* ", " separator between slave names (its guarding condition is not visible
 * in this extract). */
971 ds_put_cstr(&ds, ", ");
973 ds_put_cstr(&ds, slave->name);
975 ds_put_char(&ds, '\n');
/* The reply is sent after the lock has been released. */
977 ovs_rwlock_unlock(&rwlock);
978 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' and its slaves to 'ds'. */
983 bond_print_details(struct ds *ds, const struct bond *bond)
984 OVS_REQ_RDLOCK(rwlock)
/* Slaves are collected in an shash keyed by name so that they can be printed
 * in sorted order. */
986 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
987 const struct shash_node **sorted_slaves = NULL;
988 const struct bond_slave *slave;
/* Bond-wide settings. */
991 ds_put_format(ds, "---- %s ----\n", bond->name);
992 ds_put_format(ds, "bond_mode: %s\n",
993 bond_mode_to_string(bond->balance));
995 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
997 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
998 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
/* The time remaining until the next rebalance is shown only for balanced
 * bonds. */
1000 if (bond_is_balanced(bond)) {
1001 ds_put_format(ds, "next rebalance: %lld ms\n",
1002 bond->next_rebalance - time_msec());
1005 ds_put_cstr(ds, "lacp_status: ");
1006 switch (bond->lacp_status) {
1007 case LACP_NEGOTIATED:
1008 ds_put_cstr(ds, "negotiated\n");
1010 case LACP_CONFIGURED:
1011 ds_put_cstr(ds, "configured\n");
1014 ds_put_cstr(ds, "off\n");
1017 ds_put_cstr(ds, "<unknown>\n");
1021 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1022 shash_add(&slave_shash, slave->name, slave);
1024 sorted_slaves = shash_sort(&slave_shash);
1026 for (i = 0; i < shash_count(&slave_shash); i++) {
1027 struct bond_entry *be;
1029 slave = sorted_slaves[i]->data;
/* Per-slave status. */
1032 ds_put_format(ds, "\nslave %s: %s\n",
1033 slave->name, slave->enabled ? "enabled" : "disabled");
1034 if (slave == bond->active_slave) {
1035 ds_put_cstr(ds, "\tactive slave\n");
/* A pending transition is labelled "downdelay" when the slave is currently
 * enabled and "updelay" when it is currently disabled. */
1037 if (slave->delay_expires != LLONG_MAX) {
1038 ds_put_format(ds, "\t%s expires in %lld ms\n",
1039 slave->enabled ? "downdelay" : "updelay",
1040 slave->delay_expires - time_msec());
1043 ds_put_format(ds, "\tmay_enable: %s\n",
1044 slave->may_enable ? "true" : "false");
/* Hash entries are only examined for balanced bonds. */
1046 if (!bond_is_balanced(bond)) {
/* Walk every hash entry; entries that belong to another slave are tested for
 * just below. */
1051 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1052 int hash = be - bond->hash;
1054 if (be->slave != slave) {
1058 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1059 hash, be->tx_bytes / 1024);
1061 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* Release the temporary sorting structures. */
1064 shash_destroy(&slave_shash);
1065 free(sorted_slaves);
1066 ds_put_cstr(ds, "\n");
/* unixctl handler registered as "bond/show" with an optional "[port]"
 * argument: prints the details of one named bond, or of every bond. */
1070 bond_unixctl_show(struct unixctl_conn *conn,
1071 int argc, const char *argv[],
1072 void *aux OVS_UNUSED)
1074 struct ds ds = DS_EMPTY_INITIALIZER;
1076 ovs_rwlock_rdlock(&rwlock);
/* Single-bond form: argv[1] names the bond. */
1078 const struct bond *bond = bond_find(argv[1]);
1081 unixctl_command_reply_error(conn, "no such bond");
1084 bond_print_details(&ds, bond);
/* All-bonds form. */
1086 const struct bond *bond;
1088 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1089 bond_print_details(&ds, bond);
1093 unixctl_command_reply(conn, ds_cstr(&ds));
1097 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/migrate" with arguments
 * "port hash slave": reassigns one hash entry of an SLB bond to the named
 * slave. */
1101 bond_unixctl_migrate(struct unixctl_conn *conn,
1102 int argc OVS_UNUSED, const char *argv[],
1103 void *aux OVS_UNUSED)
1105 const char *bond_s = argv[1];
1106 const char *hash_s = argv[2];
1107 const char *slave_s = argv[3];
1109 struct bond_slave *slave;
1110 struct bond_entry *entry;
1113 ovs_rwlock_wrlock(&rwlock);
1114 bond = bond_find(bond_s);
1116 unixctl_command_reply_error(conn, "no such bond");
/* Manual migration is supported for SLB bonds only. */
1120 if (bond->balance != BM_SLB) {
1121 unixctl_command_reply_error(conn, "not an SLB bond");
/* Accept strings made of decimal digits only and fold the value into the
 * table's index range with BOND_MASK.
 * NOTE(review): an empty string also passes the strspn() test (atoi() yields
 * 0), and atoi() does not detect overflow on very long digit strings. */
1125 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1126 hash = atoi(hash_s) & BOND_MASK;
1128 unixctl_command_reply_error(conn, "bad hash");
1132 slave = bond_lookup_slave(bond, slave_s);
1134 unixctl_command_reply_error(conn, "no such slave");
1138 if (!slave->enabled) {
1139 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
/* Reassign the entry and request flow revalidation. */
1143 entry = &bond->hash[hash];
1144 bond->bond_revalidate = true;
1145 entry->slave = slave;
1146 unixctl_command_reply(conn, "migrated");
1149 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/set-active-slave" with arguments
 * "port slave": makes the named, enabled slave the bond's active slave. */
1153 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1154 int argc OVS_UNUSED, const char *argv[],
1155 void *aux OVS_UNUSED)
1157 const char *bond_s = argv[1];
1158 const char *slave_s = argv[2];
1160 struct bond_slave *slave;
1162 ovs_rwlock_wrlock(&rwlock);
1163 bond = bond_find(bond_s);
1165 unixctl_command_reply_error(conn, "no such bond");
1169 slave = bond_lookup_slave(bond, slave_s);
1171 unixctl_command_reply_error(conn, "no such slave");
1175 if (!slave->enabled) {
1176 unixctl_command_reply_error(conn, "cannot make disabled slave active");
/* Only a real change requests revalidation and learning packets; otherwise
 * the reply is "no change". */
1180 if (bond->active_slave != slave) {
1181 bond->bond_revalidate = true;
1182 bond->active_slave = slave;
1183 VLOG_INFO("bond %s: active interface is now %s",
1184 bond->name, slave->name);
1185 bond->send_learning_packets = true;
1186 unixctl_command_reply(conn, "done");
1188 unixctl_command_reply(conn, "no change");
1191 ovs_rwlock_unlock(&rwlock);
/* Shared implementation of the enable-slave and disable-slave unixctl
 * handlers: argv[1] names the bond, argv[2] the slave, and 'enable' is the
 * state to set. */
1195 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1197 const char *bond_s = argv[1];
1198 const char *slave_s = argv[2];
1200 struct bond_slave *slave;
1202 ovs_rwlock_wrlock(&rwlock);
1203 bond = bond_find(bond_s);
1205 unixctl_command_reply_error(conn, "no such bond");
1209 slave = bond_lookup_slave(bond, slave_s);
1211 unixctl_command_reply_error(conn, "no such slave");
/* bond_enable_slave() also clears any pending up/down delay on the slave. */
1215 bond_enable_slave(slave, enable);
1216 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1219 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/enable-slave": delegates to
 * enable_slave() with enable=true. */
1223 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1224 int argc OVS_UNUSED, const char *argv[],
1225 void *aux OVS_UNUSED)
1227 enable_slave(conn, argv, true);
/* unixctl handler registered as "bond/disable-slave": delegates to
 * enable_slave() with enable=false. */
1231 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1232 int argc OVS_UNUSED, const char *argv[],
1233 void *aux OVS_UNUSED)
1235 enable_slave(conn, argv, false);
/* unixctl handler registered as "bond/hash" with arguments
 * "mac [vlan] [basis]": replies with the hash bucket number that
 * bond_hash_src() yields for the given source MAC, VLAN and basis. */
1239 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1240 void *aux OVS_UNUSED)
1242 const char *mac_s = argv[1];
/* The optional arguments are NULL when they were not supplied. */
1243 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1244 const char *basis_s = argc > 3 ? argv[3] : NULL;
1245 uint8_t mac[ETH_ADDR_LEN];
/* NOTE(review): sscanf() would be undefined on a NULL 'vlan_s'/'basis_s';
 * the guards are presumably on lines not shown -- confirm. */
1252 if (sscanf(vlan_s, "%u", &vlan) != 1) {
1253 unixctl_command_reply_error(conn, "invalid vlan");
/* NOTE(review): PRIu32 is a printf-family macro; the matching scanf-family
 * macro is SCNu32.  The two commonly expand to the same text, but SCNu32 is
 * the portable choice here. */
1261 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
1262 unixctl_command_reply_error(conn, "invalid basis");
1269 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1270 == ETH_ADDR_SCAN_COUNT) {
/* Reduce the hash to a bucket number in the range 0..BOND_MASK. */
1271 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1273 hash_cstr = xasprintf("%u", hash);
1274 unixctl_command_reply(conn, hash_cstr);
1277 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the "bond/..." unixctl commands.  Each call passes the command
 * name, its usage string, two argument counts (presumably the minimum and
 * maximum -- they agree with the bracketed optional arguments in each usage
 * string), the handler function, and a NULL aux pointer. */
1284 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1285 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1287 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1288 bond_unixctl_migrate, NULL);
1289 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1290 bond_unixctl_set_active_slave, NULL);
1291 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1292 bond_unixctl_enable_slave, NULL);
1293 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1294 bond_unixctl_disable_slave, NULL);
1295 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1296 bond_unixctl_hash, NULL);
1300 bond_entry_reset(struct bond *bond)
1302 if (bond->balance != BM_AB) {
1303 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1306 bond->hash = xmalloc(hash_len);
1308 memset(bond->hash, 0, hash_len);
1310 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1317 static struct bond_slave *
1318 bond_slave_lookup(struct bond *bond, const void *slave_)
1320 struct bond_slave *slave;
1322 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1324 if (slave->aux == slave_) {
1333 bond_enable_slave(struct bond_slave *slave, bool enable)
1335 slave->delay_expires = LLONG_MAX;
1336 if (enable != slave->enabled) {
1337 slave->bond->bond_revalidate = true;
1338 slave->enabled = enable;
1339 VLOG_INFO("interface %s: %s", slave->name,
1340 slave->enabled ? "enabled" : "disabled");
1345 bond_link_status_update(struct bond_slave *slave)
1347 struct bond *bond = slave->bond;
1350 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1351 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1352 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1353 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1354 slave->name, up ? "up" : "down");
1355 if (up == slave->enabled) {
1356 slave->delay_expires = LLONG_MAX;
1357 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1358 slave->name, up ? "disabled" : "enabled");
1360 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1361 : up ? bond->updelay : bond->downdelay);
1362 slave->delay_expires = time_msec() + delay;
1364 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1367 up ? "enabled" : "disabled",
1374 if (time_msec() >= slave->delay_expires) {
1375 bond_enable_slave(slave, up);
1380 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1382 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1386 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1388 struct flow hash_flow = *flow;
1389 hash_flow.vlan_tci = htons(vlan);
1391 /* The symmetric quality of this hash function is not required, but
1392 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1393 * purposes, so we use it out of convenience. */
1394 return flow_hash_symmetric_l4(&hash_flow, basis);
1398 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1400 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1402 return (bond->balance == BM_TCP
1403 ? bond_hash_tcp(flow, vlan, bond->basis)
1404 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1407 static struct bond_entry *
1408 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1411 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1414 static struct bond_slave *
1415 choose_output_slave(const struct bond *bond, const struct flow *flow,
1416 struct flow_wildcards *wc, uint16_t vlan)
1418 struct bond_entry *e;
1420 if (bond->lacp_status == LACP_CONFIGURED) {
1421 /* LACP has been configured on this bond but negotiations were
1422 * unsuccussful. Drop all traffic. */
1426 switch (bond->balance) {
1428 return bond->active_slave;
1431 if (bond->lacp_status != LACP_NEGOTIATED) {
1432 /* Must have LACP negotiations for TCP balanced bonds. */
1436 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1441 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1443 e = lookup_bond_entry(bond, flow, vlan);
1444 if (!e->slave || !e->slave->enabled) {
1445 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1446 struct bond_slave, hmap_node);
1447 if (!e->slave->enabled) {
1448 e->slave = bond->active_slave;
1458 static struct bond_slave *
1459 bond_choose_slave(const struct bond *bond)
1461 struct bond_slave *slave, *best;
1463 /* Find an enabled slave. */
1464 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1465 if (slave->enabled) {
1470 /* All interfaces are disabled. Find an interface that will be enabled
1471 * after its updelay expires. */
1473 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1474 if (slave->delay_expires != LLONG_MAX
1475 && slave->may_enable
1476 && (!best || slave->delay_expires < best->delay_expires)) {
1484 bond_choose_active_slave(struct bond *bond)
1486 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1487 struct bond_slave *old_active_slave = bond->active_slave;
1489 bond->active_slave = bond_choose_slave(bond);
1490 if (bond->active_slave) {
1491 if (bond->active_slave->enabled) {
1492 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1493 bond->name, bond->active_slave->name);
1495 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1496 "remaining %lld ms updelay (since no interface was "
1497 "enabled)", bond->name, bond->active_slave->name,
1498 bond->active_slave->delay_expires - time_msec());
1499 bond_enable_slave(bond->active_slave, true);
1502 bond->send_learning_packets = true;
1503 } else if (old_active_slave) {
1504 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1508 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1509 * bond interface. */
/* Walks every slave in 'bond->slaves', adds up the packet and byte counters of
 * the slaves whose netdev_get_stats() call returns 0, and then writes the
 * totals to the device opened under 'bond->name' with type "system".  Slaves
 * whose statistics call returns nonzero contribute nothing to the totals. */
1511 bond_update_fake_slave_stats(struct bond *bond)
1513 struct netdev_stats bond_stats;
1514 struct bond_slave *slave;
1515 struct netdev *bond_dev;
/* Start every counter at zero so that the loop below can accumulate. */
1517 memset(&bond_stats, 0, sizeof bond_stats);
1519 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1520 struct netdev_stats slave_stats;
1522 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1523 /* XXX: We swap the stats here because they are swapped back when
1524 * reported by the internal device. The reason for this is
1525 * internal devices normally represent packets going into the
1526 * system but when used as fake bond device they represent packets
1527 * leaving the system. We really should do this in the internal
1528 * device itself because changing it here reverses the counts from
1529 * the perspective of the switch. However, the internal device
1530 * doesn't know what type of device it represents so we have to do
1531 * it here for now. */
/* Each slave rx counter feeds the bond's tx counter and vice versa. */
1532 bond_stats.tx_packets += slave_stats.rx_packets;
1533 bond_stats.tx_bytes += slave_stats.rx_bytes;
1534 bond_stats.rx_packets += slave_stats.tx_packets;
1535 bond_stats.rx_bytes += slave_stats.tx_bytes;
/* The totals are written only if the open returns 0; the handle obtained here
 * is closed again right after the write. */
1539 if (!netdev_open(bond->name, "system", &bond_dev)) {
1540 netdev_set_stats(bond_dev, &bond_stats);
1541 netdev_close(bond_dev);