2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "dynamic-string.h"
36 #include "poll-loop.h"
43 VLOG_DEFINE_THIS_MODULE(bond);
45 /* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47 #define BOND_MASK 0xff
49 /* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
54 tag_type tag; /* Tag for entry<->facet association. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond. */
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
64 struct netdev *netdev; /* Network device, owned by the client. */
65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
69 long long delay_expires; /* Time after which 'enabled' may change. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
72 tag_type tag; /* Tag associated with this slave. */
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
/* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave;
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
95 bool bond_revalidate; /* True if flows need revalidation. */
96 uint32_t basis; /* Basis for flow hash function. */
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets;
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
113 static struct hmap all_bonds = HMAP_INITIALIZER(&all_bonds);
115 static void bond_entry_reset(struct bond *);
116 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_);
117 static void bond_enable_slave(struct bond_slave *, bool enable,
119 static void bond_link_status_update(struct bond_slave *, struct tag_set *);
120 static void bond_choose_active_slave(struct bond *, struct tag_set *);
121 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
122 uint16_t vlan, uint32_t basis);
123 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
125 static struct bond_entry *lookup_bond_entry(const struct bond *,
128 static tag_type bond_get_active_slave_tag(const struct bond *);
129 static struct bond_slave *choose_output_slave(const struct bond *,
131 struct flow_wildcards *,
132 uint16_t vlan, tag_type *tags);
133 static void bond_update_fake_slave_stats(struct bond *);
135 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
136 * stores the mode in '*balance' and returns true. Otherwise returns false
137 * without modifying '*balance'. */
139 bond_mode_from_string(enum bond_mode *balance, const char *s)
141 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
143 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
145 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
153 /* Returns a string representing 'balance'. */
155 bond_mode_to_string(enum bond_mode balance) {
158 return "balance-tcp";
160 return "balance-slb";
162 return "active-backup";
/* Creates and returns a new bond whose configuration is initially taken from
 * 's'.
 *
 * The caller should register each slave on the new bond by calling
 * bond_slave_register(). */
174 bond_create(const struct bond_settings *s)
178 bond = xzalloc(sizeof *bond);
179 hmap_init(&bond->slaves);
180 bond->no_slaves_tag = tag_create_random();
181 bond->next_fake_iface_update = LLONG_MAX;
183 bond_reconfigure(bond, s);
185 tag_set_init(&bond->unixctl_tags);
192 bond_destroy(struct bond *bond)
194 struct bond_slave *slave, *next_slave;
200 hmap_remove(&all_bonds, &bond->hmap_node);
202 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
203 hmap_remove(&bond->slaves, &slave->hmap_node);
204 /* Client owns 'slave->netdev'. */
208 hmap_destroy(&bond->slaves);
/* Updates 'bond''s overall configuration to 's'.
 *
 * The caller should register each slave on 'bond' by calling
 * bond_slave_register().  This is optional if none of the slaves'
 * configuration has changed.  In any case it can't hurt.
 *
 * Returns true if the configuration has changed in such a way that requires
 * flow revalidation. */
225 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
227 bool revalidate = false;
229 if (!bond->name || strcmp(bond->name, s->name)) {
231 hmap_remove(&all_bonds, &bond->hmap_node);
234 bond->name = xstrdup(s->name);
235 hmap_insert(&all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
238 bond->updelay = s->up_delay;
239 bond->downdelay = s->down_delay;
241 if (bond->rebalance_interval != s->rebalance_interval) {
242 bond->rebalance_interval = s->rebalance_interval;
246 if (bond->balance != s->balance) {
247 bond->balance = s->balance;
251 if (bond->basis != s->basis) {
252 bond->basis = s->basis;
257 if (bond->next_fake_iface_update == LLONG_MAX) {
258 bond->next_fake_iface_update = time_msec();
261 bond->next_fake_iface_update = LLONG_MAX;
264 if (bond->bond_revalidate) {
266 bond->bond_revalidate = false;
269 if (bond->balance == BM_AB || !bond->hash || revalidate) {
270 bond_entry_reset(bond);
277 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
279 if (slave->netdev != netdev) {
280 slave->netdev = netdev;
281 slave->change_seq = 0;
285 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
286 * arbitrary client-provided pointer that uniquely identifies a slave within a
287 * bond. If 'slave_' already exists within 'bond' then this function
288 * reconfigures the existing slave.
290 * 'netdev' must be the network device that 'slave_' represents. It is owned
291 * by the client, so the client must not close it before either unregistering
292 * 'slave_' or destroying 'bond'.
295 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
297 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
300 slave = xzalloc(sizeof *slave);
302 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
305 slave->delay_expires = LLONG_MAX;
306 slave->name = xstrdup(netdev_get_name(netdev));
307 bond->bond_revalidate = true;
309 slave->enabled = false;
310 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
313 bond_slave_set_netdev__(slave, netdev);
316 slave->name = xstrdup(netdev_get_name(netdev));
/* Replaces the network device used with 'slave_' by 'netdev'.
 *
 * Intended for callers that close and re-open the network device registered
 * with bond_slave_register() but do not need to change anything else.  Has no
 * effect if 'slave_' is not registered on 'bond'. */
void
bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
{
    struct bond_slave *found = bond_slave_lookup(bond, slave_);

    if (!found) {
        return;
    }
    bond_slave_set_netdev__(found, netdev);
}
333 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
334 * then this function has no effect.
336 * Unregistering a slave invalidates all flows. */
338 bond_slave_unregister(struct bond *bond, const void *slave_)
340 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
347 bond_enable_slave(slave, false, NULL);
349 del_active = bond->active_slave == slave;
351 struct bond_entry *e;
352 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
353 if (e->slave == slave) {
361 hmap_remove(&bond->slaves, &slave->hmap_node);
362 /* Client owns 'slave->netdev'. */
369 bond_choose_active_slave(bond, &tags);
370 bond->send_learning_packets = true;
374 /* Should be called on each slave in 'bond' before bond_run() to indicate
375 * whether or not 'slave_' may be enabled. This function is intended to allow
376 * other protocols to have some impact on bonding decisions. For example LACP
377 * or high level link monitoring protocols may decide that a given slave should
378 * not be able to send traffic. */
380 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
382 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
385 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
386 * allow tagged flows to be invalidated.
388 * The caller should check bond_should_send_learning_packets() afterward. */
390 bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
392 struct bond_slave *slave;
394 if (bond->lacp_status != lacp_status) {
395 bond->lacp_status = lacp_status;
396 bond->bond_revalidate = true;
399 /* Enable slaves based on link status and LACP feedback. */
400 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
401 bond_link_status_update(slave, tags);
402 slave->change_seq = netdev_change_seq(slave->netdev);
404 if (!bond->active_slave || !bond->active_slave->enabled) {
405 bond_choose_active_slave(bond, tags);
408 /* Update fake bond interface stats. */
409 if (time_msec() >= bond->next_fake_iface_update) {
410 bond_update_fake_slave_stats(bond);
411 bond->next_fake_iface_update = time_msec() + 1000;
414 if (bond->bond_revalidate) {
415 struct bond_slave *slave;
417 bond->bond_revalidate = false;
418 bond_entry_reset(bond);
419 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
420 tag_set_add(tags, slave->tag);
422 tag_set_add(tags, bond->no_slaves_tag);
425 /* Invalidate any tags required by */
426 tag_set_union(tags, &bond->unixctl_tags);
427 tag_set_init(&bond->unixctl_tags);
430 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
432 bond_wait(struct bond *bond)
434 struct bond_slave *slave;
436 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
437 if (slave->delay_expires != LLONG_MAX) {
438 poll_timer_wait_until(slave->delay_expires);
441 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
442 poll_immediate_wake();
446 if (bond->next_fake_iface_update != LLONG_MAX) {
447 poll_timer_wait_until(bond->next_fake_iface_update);
450 /* Ensure that any saved tags get revalidated right away. */
451 if (!tag_set_is_empty(&bond->unixctl_tags)) {
452 poll_immediate_wake();
455 /* We don't wait for bond->next_rebalance because rebalancing can only run
456 * at a flow account checkpoint. ofproto does checkpointing on its own
457 * schedule and bond_rebalance() gets called afterward, so we'd just be
458 * waking up for no purpose. */
461 /* MAC learning table interaction. */
464 may_send_learning_packets(const struct bond *bond)
466 return bond->lacp_status == LACP_DISABLED
467 && (bond->balance == BM_SLB || bond->balance == BM_AB)
468 && bond->active_slave;
471 /* Returns true if 'bond' needs the client to send out packets to assist with
472 * MAC learning on 'bond'. If this function returns true, then the client
473 * should iterate through its MAC learning table for the bridge on which 'bond'
474 * is located. For each MAC that has been learned on a port other than 'bond',
475 * it should call bond_compose_learning_packet().
477 * This function will only return true if 'bond' is in SLB or active-backup
478 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
481 * Calling this function resets the state that it checks. */
483 bond_should_send_learning_packets(struct bond *bond)
485 bool send = bond->send_learning_packets && may_send_learning_packets(bond);
486 bond->send_learning_packets = false;
490 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
492 * See bond_should_send_learning_packets() for description of usage. The
493 * caller should send the composed packet on the port associated with
494 * port_aux and takes ownership of the returned ofpbuf. */
496 bond_compose_learning_packet(struct bond *bond,
497 const uint8_t eth_src[ETH_ADDR_LEN],
498 uint16_t vlan, void **port_aux)
500 struct bond_slave *slave;
501 struct ofpbuf *packet;
505 ovs_assert(may_send_learning_packets(bond));
507 memset(&flow, 0, sizeof flow);
508 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
509 slave = choose_output_slave(bond, &flow, NULL, vlan, &tags);
511 packet = ofpbuf_new(0);
512 compose_rarp(packet, eth_src);
514 eth_push_vlan(packet, htons(vlan));
517 *port_aux = slave->aux;
521 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
522 * Ethernet destination address of 'eth_dst', should be admitted.
524 * The return value is one of the following:
526 * - BV_ACCEPT: Admit the packet.
528 * - BV_DROP: Drop the packet.
530 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
531 * Ethernet source address and VLAN. If there is none, or if the packet
532 * is on the learned port, then admit the packet. If a different port has
533 * been learned, however, drop the packet (and do not use it for MAC
537 bond_check_admissibility(struct bond *bond, const void *slave_,
538 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
540 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
542 /* LACP bonds have very loose admissibility restrictions because we can
543 * assume the remote switch is aware of the bond and will "do the right
544 * thing". However, as a precaution we drop packets on disabled slaves
545 * because no correctly implemented partner switch should be sending
548 * If LACP is configured, but LACP negotiations have been unsuccessful, we
549 * drop all incoming traffic. */
550 switch (bond->lacp_status) {
551 case LACP_NEGOTIATED: return slave->enabled ? BV_ACCEPT : BV_DROP;
552 case LACP_CONFIGURED: return BV_DROP;
553 case LACP_DISABLED: break;
556 /* Drop all multicast packets on inactive slaves. */
557 if (eth_addr_is_multicast(eth_dst)) {
558 *tags |= bond_get_active_slave_tag(bond);
559 if (bond->active_slave != bond_slave_lookup(bond, slave_)) {
564 switch (bond->balance) {
566 /* Drop all packets which arrive on backup slaves. This is similar to
567 * how Linux bonding handles active-backup bonds. */
568 *tags |= bond_get_active_slave_tag(bond);
569 if (bond->active_slave != slave) {
570 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
572 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
573 " slave (%s) destined for " ETH_ADDR_FMT,
574 slave->name, ETH_ADDR_ARGS(eth_dst));
580 /* TCP balanced bonds require successful LACP negotiated. Based on the
581 * above check, LACP is off on this bond. Therfore, we drop all
582 * incoming traffic. */
586 /* Drop all packets for which we have learned a different input port,
587 * because we probably sent the packet on one slave and got it back on
588 * the other. Gratuitous ARP packets are an exception to this rule:
589 * the host has moved to another switch. The exception to the
590 * exception is if we locked the learning table to avoid reflections on
592 return BV_DROP_IF_MOVED;
598 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
599 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
600 * NULL if the packet should be dropped because no slaves are enabled.
602 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
603 * should be a VID only (i.e. excluding the PCP bits). Second,
604 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
605 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
606 * packet belongs to (so for an access port it will be the access port's VLAN).
608 * Adds a tag to '*tags' that associates the flow with the returned slave.
610 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
611 * significant in the selection. At some point earlier, 'wc' should
612 * have been initialized (e.g., by flow_wildcards_init_catchall()).
615 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
616 struct flow_wildcards *wc, uint16_t vlan,
619 struct bond_slave *slave = choose_output_slave(bond, flow, wc, vlan, tags);
624 *tags |= bond->no_slaves_tag;
632 bond_is_balanced(const struct bond *bond)
634 return bond->rebalance_interval
635 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
638 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
640 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
643 if (bond_is_balanced(bond)) {
644 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
648 static struct bond_slave *
649 bond_slave_from_bal_node(struct list *bal)
651 return CONTAINER_OF(bal, struct bond_slave, bal_node);
655 log_bals(struct bond *bond, const struct list *bals)
657 if (VLOG_IS_DBG_ENABLED()) {
658 struct ds ds = DS_EMPTY_INITIALIZER;
659 const struct bond_slave *slave;
661 LIST_FOR_EACH (slave, bal_node, bals) {
663 ds_put_char(&ds, ',');
665 ds_put_format(&ds, " %s %"PRIu64"kB",
666 slave->name, slave->tx_bytes / 1024);
668 if (!slave->enabled) {
669 ds_put_cstr(&ds, " (disabled)");
671 if (!list_is_empty(&slave->entries)) {
672 struct bond_entry *e;
674 ds_put_cstr(&ds, " (");
675 LIST_FOR_EACH (e, list_node, &slave->entries) {
676 if (&e->list_node != list_front(&slave->entries)) {
677 ds_put_cstr(&ds, " + ");
679 ds_put_format(&ds, "h%td: %"PRIu64"kB",
680 e - bond->hash, e->tx_bytes / 1024);
682 ds_put_cstr(&ds, ")");
685 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
690 /* Shifts 'hash' from its current slave to 'to'. */
692 bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
695 struct bond_slave *from = hash->slave;
696 struct bond *bond = from->bond;
697 uint64_t delta = hash->tx_bytes;
699 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
700 "from %s to %s (now carrying %"PRIu64"kB and "
701 "%"PRIu64"kB load, respectively)",
702 bond->name, delta / 1024, hash - bond->hash,
703 from->name, to->name,
704 (from->tx_bytes - delta) / 1024,
705 (to->tx_bytes + delta) / 1024);
707 /* Shift load away from 'from' to 'to'. */
708 from->tx_bytes -= delta;
709 to->tx_bytes += delta;
711 /* Arrange for flows to be revalidated. */
712 tag_set_add(set, hash->tag);
714 hash->tag = tag_create_random();
717 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
718 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
719 * given that doing so must decrease the ratio of the load on the two slaves by
720 * at least 0.1. Returns NULL if there is no appropriate entry.
722 * The list of entries isn't sorted. I don't know of a reason to prefer to
723 * shift away small hashes or large hashes. */
724 static struct bond_entry *
725 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
727 struct bond_entry *e;
729 if (list_is_short(&from->entries)) {
730 /* 'from' carries no more than one MAC hash, so shifting load away from
731 * it would be pointless. */
735 LIST_FOR_EACH (e, list_node, &from->entries) {
736 double old_ratio, new_ratio;
739 if (to_tx_bytes == 0) {
740 /* Nothing on the new slave, move it. */
745 old_ratio = (double)from->tx_bytes / to_tx_bytes;
746 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
747 if (old_ratio - new_ratio > 0.1
748 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
749 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
750 and 'to' slave have the same load. Therefore, we only move an
751 entry if it decreases the load on 'from', and brings us closer
752 to equal traffic load. */
760 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
763 insert_bal(struct list *bals, struct bond_slave *slave)
765 struct bond_slave *pos;
767 LIST_FOR_EACH (pos, bal_node, bals) {
768 if (slave->tx_bytes > pos->tx_bytes) {
772 list_insert(&pos->bal_node, &slave->bal_node);
775 /* Removes 'slave' from its current list and then inserts it into 'bals' so
776 * that descending order of 'tx_bytes' is maintained. */
778 reinsert_bal(struct list *bals, struct bond_slave *slave)
780 list_remove(&slave->bal_node);
781 insert_bal(bals, slave);
784 /* If 'bond' needs rebalancing, does so.
786 * The caller should have called bond_account() for each active flow, to ensure
787 * that flow data is consistently accounted at this point. */
789 bond_rebalance(struct bond *bond, struct tag_set *tags)
791 struct bond_slave *slave;
792 struct bond_entry *e;
795 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
798 bond->next_rebalance = time_msec() + bond->rebalance_interval;
800 /* Add each bond_entry to its slave's 'entries' list.
801 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
802 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
804 list_init(&slave->entries);
806 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
807 if (e->slave && e->tx_bytes) {
808 e->slave->tx_bytes += e->tx_bytes;
809 list_push_back(&e->slave->entries, &e->list_node);
813 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
815 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
816 * with a proper list sort algorithm. */
818 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
819 if (slave->enabled) {
820 insert_bal(&bals, slave);
823 log_bals(bond, &bals);
825 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
826 while (!list_is_short(&bals)) {
827 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
828 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
831 overload = from->tx_bytes - to->tx_bytes;
832 if (overload < to->tx_bytes >> 5 || overload < 100000) {
833 /* The extra load on 'from' (and all less-loaded slaves), compared
834 * to that of 'to' (the least-loaded slave), is less than ~3%, or
835 * it is less than ~1Mbps. No point in rebalancing. */
839 /* 'from' is carrying significantly more load than 'to'. Pick a hash
840 * to move from 'from' to 'to'. */
841 e = choose_entry_to_migrate(from, to->tx_bytes);
843 bond_shift_load(e, to, tags);
845 /* Delete element from from->entries.
847 * We don't add the element to to->hashes. That would only allow
848 * 'e' to be migrated to another slave in this rebalancing run, and
849 * there is no point in doing that. */
850 list_remove(&e->list_node);
852 /* Re-sort 'bals'. */
853 reinsert_bal(&bals, from);
854 reinsert_bal(&bals, to);
856 /* Can't usefully migrate anything away from 'from'.
857 * Don't reconsider it. */
858 list_remove(&from->bal_node);
862 /* Implement exponentially weighted moving average. A weight of 1/2 causes
863 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
864 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
865 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
873 /* Bonding unixctl user interface functions. */
876 bond_find(const char *name)
880 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
882 if (!strcmp(bond->name, name)) {
889 static struct bond_slave *
890 bond_lookup_slave(struct bond *bond, const char *slave_name)
892 struct bond_slave *slave;
894 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
895 if (!strcmp(slave->name, slave_name)) {
903 bond_unixctl_list(struct unixctl_conn *conn,
904 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
905 void *aux OVS_UNUSED)
907 struct ds ds = DS_EMPTY_INITIALIZER;
908 const struct bond *bond;
910 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
912 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
913 const struct bond_slave *slave;
916 ds_put_format(&ds, "%s\t%s\t",
917 bond->name, bond_mode_to_string(bond->balance));
920 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
922 ds_put_cstr(&ds, ", ");
924 ds_put_cstr(&ds, slave->name);
926 ds_put_char(&ds, '\n');
928 unixctl_command_reply(conn, ds_cstr(&ds));
933 bond_print_details(struct ds *ds, const struct bond *bond)
935 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
936 const struct shash_node **sorted_slaves = NULL;
937 const struct bond_slave *slave;
940 ds_put_format(ds, "---- %s ----\n", bond->name);
941 ds_put_format(ds, "bond_mode: %s\n",
942 bond_mode_to_string(bond->balance));
944 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
946 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
947 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
949 if (bond_is_balanced(bond)) {
950 ds_put_format(ds, "next rebalance: %lld ms\n",
951 bond->next_rebalance - time_msec());
954 ds_put_cstr(ds, "lacp_status: ");
955 switch (bond->lacp_status) {
956 case LACP_NEGOTIATED:
957 ds_put_cstr(ds, "negotiated\n");
959 case LACP_CONFIGURED:
960 ds_put_cstr(ds, "configured\n");
963 ds_put_cstr(ds, "off\n");
966 ds_put_cstr(ds, "<unknown>\n");
970 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
971 shash_add(&slave_shash, slave->name, slave);
973 sorted_slaves = shash_sort(&slave_shash);
975 for (i = 0; i < shash_count(&slave_shash); i++) {
976 struct bond_entry *be;
978 slave = sorted_slaves[i]->data;
981 ds_put_format(ds, "\nslave %s: %s\n",
982 slave->name, slave->enabled ? "enabled" : "disabled");
983 if (slave == bond->active_slave) {
984 ds_put_cstr(ds, "\tactive slave\n");
986 if (slave->delay_expires != LLONG_MAX) {
987 ds_put_format(ds, "\t%s expires in %lld ms\n",
988 slave->enabled ? "downdelay" : "updelay",
989 slave->delay_expires - time_msec());
992 ds_put_format(ds, "\tmay_enable: %s\n",
993 slave->may_enable ? "true" : "false");
995 if (!bond_is_balanced(bond)) {
1000 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1001 int hash = be - bond->hash;
1003 if (be->slave != slave) {
1007 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1008 hash, be->tx_bytes / 1024);
1010 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1013 shash_destroy(&slave_shash);
1014 free(sorted_slaves);
1015 ds_put_cstr(ds, "\n");
1019 bond_unixctl_show(struct unixctl_conn *conn,
1020 int argc, const char *argv[],
1021 void *aux OVS_UNUSED)
1023 struct ds ds = DS_EMPTY_INITIALIZER;
1026 const struct bond *bond = bond_find(argv[1]);
1029 unixctl_command_reply_error(conn, "no such bond");
1032 bond_print_details(&ds, bond);
1034 const struct bond *bond;
1036 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
1037 bond_print_details(&ds, bond);
1041 unixctl_command_reply(conn, ds_cstr(&ds));
1046 bond_unixctl_migrate(struct unixctl_conn *conn,
1047 int argc OVS_UNUSED, const char *argv[],
1048 void *aux OVS_UNUSED)
1050 const char *bond_s = argv[1];
1051 const char *hash_s = argv[2];
1052 const char *slave_s = argv[3];
1054 struct bond_slave *slave;
1055 struct bond_entry *entry;
1058 bond = bond_find(bond_s);
1060 unixctl_command_reply_error(conn, "no such bond");
1064 if (bond->balance != BM_SLB) {
1065 unixctl_command_reply_error(conn, "not an SLB bond");
1069 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1070 hash = atoi(hash_s) & BOND_MASK;
1072 unixctl_command_reply_error(conn, "bad hash");
1076 slave = bond_lookup_slave(bond, slave_s);
1078 unixctl_command_reply_error(conn, "no such slave");
1082 if (!slave->enabled) {
1083 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1087 entry = &bond->hash[hash];
1088 tag_set_add(&bond->unixctl_tags, entry->tag);
1089 entry->slave = slave;
1090 entry->tag = tag_create_random();
1091 unixctl_command_reply(conn, "migrated");
1095 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1096 int argc OVS_UNUSED, const char *argv[],
1097 void *aux OVS_UNUSED)
1099 const char *bond_s = argv[1];
1100 const char *slave_s = argv[2];
1102 struct bond_slave *slave;
1104 bond = bond_find(bond_s);
1106 unixctl_command_reply_error(conn, "no such bond");
1110 slave = bond_lookup_slave(bond, slave_s);
1112 unixctl_command_reply_error(conn, "no such slave");
1116 if (!slave->enabled) {
1117 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1121 if (bond->active_slave != slave) {
1122 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1123 bond->active_slave = slave;
1124 bond->active_slave->tag = tag_create_random();
1125 VLOG_INFO("bond %s: active interface is now %s",
1126 bond->name, slave->name);
1127 bond->send_learning_packets = true;
1128 unixctl_command_reply(conn, "done");
1130 unixctl_command_reply(conn, "no change");
1135 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1137 const char *bond_s = argv[1];
1138 const char *slave_s = argv[2];
1140 struct bond_slave *slave;
1142 bond = bond_find(bond_s);
1144 unixctl_command_reply_error(conn, "no such bond");
1148 slave = bond_lookup_slave(bond, slave_s);
1150 unixctl_command_reply_error(conn, "no such slave");
1154 bond_enable_slave(slave, enable, &bond->unixctl_tags);
1155 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1159 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1160 int argc OVS_UNUSED, const char *argv[],
1161 void *aux OVS_UNUSED)
1163 enable_slave(conn, argv, true);
1167 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1168 int argc OVS_UNUSED, const char *argv[],
1169 void *aux OVS_UNUSED)
1171 enable_slave(conn, argv, false);
1175 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1176 void *aux OVS_UNUSED)
1178 const char *mac_s = argv[1];
1179 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1180 const char *basis_s = argc > 3 ? argv[3] : NULL;
1181 uint8_t mac[ETH_ADDR_LEN];
1188 if (sscanf(vlan_s, "%u", &vlan) != 1) {
1189 unixctl_command_reply_error(conn, "invalid vlan");
1197 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
1198 unixctl_command_reply_error(conn, "invalid basis");
1205 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1206 == ETH_ADDR_SCAN_COUNT) {
1207 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1209 hash_cstr = xasprintf("%u", hash);
1210 unixctl_command_reply(conn, hash_cstr);
1213 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the bond-related unixctl commands.  Each call supplies the
 * command name, its usage string, two integers that appear to be the minimum
 * and maximum number of arguments (they line up with the optional arguments
 * shown in each usage string, e.g. 1 and 3 for "mac [vlan] [basis]"), the
 * handler function, and a NULL aux pointer.
 *
 * NOTE(review): the header of the enclosing function and the final argument
 * line of the "bond/show" registration are elided in this listing. */
1220 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1221 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1223 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1224 bond_unixctl_migrate, NULL);
1225 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1226 bond_unixctl_set_active_slave, NULL);
1227 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1228 bond_unixctl_enable_slave, NULL);
1229 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1230 bond_unixctl_disable_slave, NULL);
1231 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1232 bond_unixctl_hash, NULL);
/* Resets 'bond''s hash-bucket table.
 *
 * For any balance mode other than BM_AB, the table of (BOND_MASK + 1)
 * entries is zero-filled and 'next_rebalance' is set to the current
 * time_msec() plus 'rebalance_interval'.
 *
 * NOTE(review): the condition guarding the xmalloc() call and whatever is
 * done in BM_AB mode are elided in this listing. */
1236 bond_entry_reset(struct bond *bond)
1238 if (bond->balance != BM_AB) {
/* Size in bytes of the whole bucket array. */
1239 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1242 bond->hash = xmalloc(hash_len);
/* Zeroing every entry clears each bucket's slave pointer, byte count, tag
 * and list node. */
1244 memset(bond->hash, 0, hash_len);
1246 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Searches 'bond' for the slave whose client-provided 'aux' handle is equal
 * to 'slave_'.  Only the hmap bucket selected by hash_pointer(slave_, 0) is
 * walked.
 *
 * NOTE(review): the return statements are elided in this listing;
 * presumably the matching slave is returned, or NULL when none matches. */
1253 static struct bond_slave *
1254 bond_slave_lookup(struct bond *bond, const void *slave_)
1256 struct bond_slave *slave;
1258 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
/* Pointer identity, not contents, decides the match. */
1260 if (slave->aux == slave_) {
/* Sets 'slave''s enabled state to 'enable'.
 *
 * Any pending delayed state change is cancelled first (delay_expires is set
 * to LLONG_MAX).  If the state actually changes, the change is logged; on
 * disable the slave's current tag is added to 'tags', and on enable the
 * slave is given a fresh random tag. */
1269 bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1271 slave->delay_expires = LLONG_MAX;
/* Nothing further happens when the slave is already in the requested
 * state. */
1272 if (enable != slave->enabled) {
1273 slave->enabled = enable;
1274 if (!slave->enabled) {
1275 VLOG_INFO("interface %s: disabled", slave->name);
/* NOTE(review): one line is elided just before this call in this listing;
 * check whether it guards against 'tags' being NULL. */
1277 tag_set_add(tags, slave->tag);
1280 VLOG_INFO("interface %s: enabled", slave->name);
1281 slave->tag = tag_create_random();
/* Re-evaluates whether 'slave' should be enabled and schedules or applies
 * the change.
 *
 * The desired state 'up' is true only when the netdev reports carrier and
 * the client's 'may_enable' flag is set.  delay_expires == LLONG_MAX means
 * that no state change is pending.  'tags' is only passed on to
 * bond_enable_slave(). */
1287 bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1289 struct bond *bond = slave->bond;
1292 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
/* True when the pending-change timer disagrees with the current situation:
 * either the link already matches 'enabled' but a change is still
 * scheduled, or the link differs from 'enabled' and nothing is scheduled. */
1293 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1294 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1295 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1296 slave->name, up ? "up" : "down");
/* The link went back to matching 'enabled': cancel the scheduled change. */
1297 if (up == slave->enabled) {
1298 slave->delay_expires = LLONG_MAX;
1299 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1300 slave->name, up ? "disabled" : "enabled");
/* Otherwise schedule the change.  When LACP is not disabled the delay is
 * zero; without LACP the bond's updelay or downdelay applies, depending on
 * the direction of the change. */
1302 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1303 : up ? bond->updelay : bond->downdelay);
1304 slave->delay_expires = time_msec() + delay;
1306 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1309 up ? "enabled" : "disabled",
/* Apply the desired state once the deadline has been reached. */
1316 if (time_msec() >= slave->delay_expires) {
1317 bond_enable_slave(slave, up, tags);
/* Hashes a source MAC address together with a VLAN and a basis value.  The
 * ETH_ADDR_LEN bytes of 'mac' are hashed with hash_bytes() (basis 0) and the
 * result is combined with 'vlan' and 'basis' by hash_3words().  The callers
 * visible in this file mask the result with BOND_MASK. */
1322 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1324 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
/* Hashes 'flow' with flow_hash_symmetric_l4() using 'basis'.  The hash is
 * computed on a local copy of the flow whose vlan_tci has been replaced by
 * 'vlan' (converted with htons()), so the caller's flow is not modified. */
1328 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1330 struct flow hash_flow = *flow;
1331 hash_flow.vlan_tci = htons(vlan);
1333 /* The symmetric quality of this hash function is not required, but
1334 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1335 * purposes, so we use it out of convenience. */
1336 return flow_hash_symmetric_l4(&hash_flow, basis);
/* Computes the bond hash of 'flow' on 'vlan' using 'bond''s basis.  BM_TCP
 * bonds hash the flow with bond_hash_tcp(); otherwise (BM_SLB) only the
 * flow's Ethernet source address is hashed with bond_hash_src().  Any other
 * balance mode trips the assertion. */
1340 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1342 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1344 return (bond->balance == BM_TCP
1345 ? bond_hash_tcp(flow, vlan, bond->basis)
1346 : bond_hash_src(flow->dl_src, vlan, bond->basis));
/* Returns a pointer to the bucket of 'bond->hash' that 'flow' maps to: the
 * bond hash of the flow reduced to an index with BOND_MASK.
 *
 * NOTE(review): the line declaring the last parameter ('vlan') is elided in
 * this listing. */
1349 static struct bond_entry *
1350 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1353 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
/* Chooses the slave that should carry 'flow' on 'bond'.
 *
 * Visible behaviour:
 *   - With lacp_status == LACP_CONFIGURED the existing comment states that
 *     all traffic is dropped.
 *   - One arm of the switch on 'bond->balance' yields bond->active_slave.
 *   - Another arm insists on lacp_status == LACP_NEGOTIATED and calls
 *     flow_mask_hash_fields() on 'wc' for the symmetric-L4 fields.
 *   - The hash-based path calls flow_mask_hash_fields() on 'wc' for the
 *     Ethernet source field and looks up the flow's bond_entry.  If that
 *     entry has no slave, or a disabled one, it is reassigned and given a
 *     new random tag.
 *
 * NOTE(review): the case labels, the return statements, any guards around
 * the uses of 'wc', and every use of 'tags' are elided in this listing, so
 * which arm belongs to which balance mode is not confirmed here. */
1356 static struct bond_slave *
1357 choose_output_slave(const struct bond *bond, const struct flow *flow,
1358 struct flow_wildcards *wc, uint16_t vlan, tag_type *tags)
1360 struct bond_entry *e;
1362 if (bond->lacp_status == LACP_CONFIGURED) {
1363 /* LACP has been configured on this bond but negotiations were
1364 * unsuccessful. Drop all traffic. */
1368 switch (bond->balance) {
1370 return bond->active_slave;
1373 if (bond->lacp_status != LACP_NEGOTIATED) {
1374 /* Must have LACP negotiations for TCP balanced bonds. */
1378 flow_mask_hash_fields(wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1383 flow_mask_hash_fields(wc, NX_HASH_FIELDS_ETH_SRC);
1385 e = lookup_bond_entry(bond, flow, vlan);
/* The bucket needs a (new) slave when it has none or its slave is
 * disabled. */
1386 if (!e->slave || !e->slave->enabled) {
/* First try a random member of the bond ... */
1387 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1388 struct bond_slave, hmap_node);
/* ... and fall back to the active slave when the random pick is disabled. */
1389 if (!e->slave->enabled) {
1390 e->slave = bond->active_slave;
1392 e->tag = tag_create_random();
/* Selects a candidate slave for 'bond' in two passes.  The first pass looks
 * for any slave that is currently enabled.  The second pass considers
 * slaves that have a pending delay (delay_expires != LLONG_MAX) and that the
 * client allows to be enabled ('may_enable'), preferring the one whose delay
 * expires soonest.
 *
 * NOTE(review): the initialisation of 'best', the assignments inside the
 * loops and the return statements are elided in this listing. */
1402 static struct bond_slave *
1403 bond_choose_slave(const struct bond *bond)
1405 struct bond_slave *slave, *best;
1407 /* Find an enabled slave. */
1408 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1409 if (slave->enabled) {
1414 /* All interfaces are disabled. Find an interface that will be enabled
1415 * after its updelay expires. */
1417 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1418 if (slave->delay_expires != LLONG_MAX
1419 && slave->may_enable
1420 && (!best || slave->delay_expires < best->delay_expires)) {
/* Re-selects 'bond''s active slave with bond_choose_slave().
 *
 * When a slave is chosen that is not yet enabled, it is enabled right away
 * through bond_enable_slave(), skipping what is left of its updelay.  If the
 * bond had no active slave before, bond->no_slaves_tag is added to 'tags'.
 * bond->send_learning_packets is set to true on the path where a slave was
 * chosen (the exact nesting is not visible in this listing).  When no slave
 * can be chosen but there used to be one, an "all interfaces disabled"
 * message is logged. */
1428 bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1430 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Remembered so that the transitions between "no active slave" and "some
 * active slave" can be detected below. */
1431 struct bond_slave *old_active_slave = bond->active_slave;
1433 bond->active_slave = bond_choose_slave(bond);
1434 if (bond->active_slave) {
1435 if (bond->active_slave->enabled) {
1436 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1437 bond->name, bond->active_slave->name);
1439 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1440 "remaining %lld ms updelay (since no interface was "
1441 "enabled)", bond->name, bond->active_slave->name,
1442 bond->active_slave->delay_expires - time_msec());
1443 bond_enable_slave(bond->active_slave, true, tags);
1446 if (!old_active_slave) {
1447 tag_set_add(tags, bond->no_slaves_tag);
1450 bond->send_learning_packets = true;
1451 } else if (old_active_slave) {
1452 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1456 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1457 * there is no active slave (that is, if bond->active_slave is NULL). */
1459 bond_get_active_slave_tag(const struct bond *bond)
1461 return (bond->active_slave
1462 ? bond->active_slave->tag
1463 : bond->no_slaves_tag);
1466 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1467 * bond interface. */
/* The totals start from an all-zero netdev_stats.  In the lines visible
 * here only the packet and byte counters are accumulated, with rx and tx
 * swapped, and slaves whose netdev_get_stats() call returns nonzero are left
 * out.  The totals are written only if a "system" netdev named after the
 * bond can be opened; that netdev is closed again afterwards.
 *
 * NOTE(review): the closing lines of this function are not visible in this
 * listing. */
1469 bond_update_fake_slave_stats(struct bond *bond)
1471 struct netdev_stats bond_stats;
1472 struct bond_slave *slave;
1473 struct netdev *bond_dev;
1475 memset(&bond_stats, 0, sizeof bond_stats);
1477 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1478 struct netdev_stats slave_stats;
1480 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1481 /* XXX: We swap the stats here because they are swapped back when
1482 * reported by the internal device. The reason for this is
1483 * internal devices normally represent packets going into the
1484 * system but when used as fake bond device they represent packets
1485 * leaving the system. We really should do this in the internal
1486 * device itself because changing it here reverses the counts from
1487 * the perspective of the switch. However, the internal device
1488 * doesn't know what type of device it represents so we have to do
1489 * it here for now. */
1490 bond_stats.tx_packets += slave_stats.rx_packets;
1491 bond_stats.tx_bytes += slave_stats.rx_bytes;
1492 bond_stats.rx_packets += slave_stats.tx_packets;
1493 bond_stats.rx_bytes += slave_stats.tx_bytes;
/* A zero return from netdev_open() is treated as success. */
1497 if (!netdev_open(bond->name, "system", &bond_dev)) {
1498 netdev_set_stats(bond_dev, &bond_stats);
1499 netdev_close(bond_dev);