2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "dynamic-string.h"
36 #include "poll-loop.h"
43 VLOG_DEFINE_THIS_MODULE(bond);
45 /* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47 #define BOND_MASK 0xff
49 /* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
54 tag_type tag; /* Tag for entry<->facet association. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond. */
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
64 struct netdev *netdev; /* Network device, owned by the client. */
65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
69 long long delay_expires; /* Time after which 'enabled' may change. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
72 tag_type tag; /* Tag associated with this slave. */
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
80 /* A bond, that is, a set of network devices grouped to improve performance or
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave;
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
95 bool bond_revalidate; /* True if flows need revalidation. */
96 uint32_t basis; /* Basis for flow hash function. */
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets;
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
115 static struct hmap all_bonds = HMAP_INITIALIZER(&all_bonds);
117 static void bond_entry_reset(struct bond *);
118 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_);
119 static void bond_enable_slave(struct bond_slave *, bool enable,
121 static void bond_link_status_update(struct bond_slave *, struct tag_set *);
122 static void bond_choose_active_slave(struct bond *, struct tag_set *);
123 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
124 uint16_t vlan, uint32_t basis);
125 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
127 static struct bond_entry *lookup_bond_entry(const struct bond *,
130 static tag_type bond_get_active_slave_tag(const struct bond *);
131 static struct bond_slave *choose_output_slave(const struct bond *,
133 struct flow_wildcards *,
134 uint16_t vlan, tag_type *tags);
135 static void bond_update_fake_slave_stats(struct bond *);
137 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
138 * stores the mode in '*balance' and returns true. Otherwise returns false
139 * without modifying '*balance'. */
141 bond_mode_from_string(enum bond_mode *balance, const char *s)
143 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
145 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
147 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
155 /* Returns a string representing 'balance'. */
157 bond_mode_to_string(enum bond_mode balance) {
160 return "balance-tcp";
162 return "balance-slb";
164 return "active-backup";
170 /* Creates and returns a new bond whose configuration is initially taken from
173 * The caller should register each slave on the new bond by calling
174 * bond_slave_register(). */
176 bond_create(const struct bond_settings *s)
180 bond = xzalloc(sizeof *bond);
181 hmap_init(&bond->slaves);
182 bond->no_slaves_tag = tag_create_random();
183 bond->next_fake_iface_update = LLONG_MAX;
186 bond_reconfigure(bond, s);
188 tag_set_init(&bond->unixctl_tags);
194 bond_ref(const struct bond *bond_)
196 struct bond *bond = CONST_CAST(struct bond *, bond_);
199 ovs_assert(bond->ref_cnt > 0);
207 bond_unref(struct bond *bond)
209 struct bond_slave *slave, *next_slave;
215 ovs_assert(bond->ref_cnt > 0);
216 if (--bond->ref_cnt) {
220 hmap_remove(&all_bonds, &bond->hmap_node);
222 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
223 hmap_remove(&bond->slaves, &slave->hmap_node);
224 /* Client owns 'slave->netdev'. */
228 hmap_destroy(&bond->slaves);
235 /* Updates 'bond''s overall configuration to 's'.
237 * The caller should register each slave on 'bond' by calling
238 * bond_slave_register(). This is optional if none of the slaves'
239 * configuration has changed. In any case it can't hurt.
241 * Returns true if the configuration has changed in such a way that requires
245 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
247 bool revalidate = false;
249 if (!bond->name || strcmp(bond->name, s->name)) {
251 hmap_remove(&all_bonds, &bond->hmap_node);
254 bond->name = xstrdup(s->name);
255 hmap_insert(&all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
258 bond->updelay = s->up_delay;
259 bond->downdelay = s->down_delay;
261 if (bond->rebalance_interval != s->rebalance_interval) {
262 bond->rebalance_interval = s->rebalance_interval;
266 if (bond->balance != s->balance) {
267 bond->balance = s->balance;
271 if (bond->basis != s->basis) {
272 bond->basis = s->basis;
277 if (bond->next_fake_iface_update == LLONG_MAX) {
278 bond->next_fake_iface_update = time_msec();
281 bond->next_fake_iface_update = LLONG_MAX;
284 if (bond->bond_revalidate) {
286 bond->bond_revalidate = false;
289 if (bond->balance == BM_AB || !bond->hash || revalidate) {
290 bond_entry_reset(bond);
297 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
299 if (slave->netdev != netdev) {
300 slave->netdev = netdev;
301 slave->change_seq = 0;
305 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
306 * arbitrary client-provided pointer that uniquely identifies a slave within a
307 * bond. If 'slave_' already exists within 'bond' then this function
308 * reconfigures the existing slave.
310 * 'netdev' must be the network device that 'slave_' represents. It is owned
311 * by the client, so the client must not close it before either unregistering
312 * 'slave_' or destroying 'bond'.
315 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
317 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
320 slave = xzalloc(sizeof *slave);
322 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
325 slave->delay_expires = LLONG_MAX;
326 slave->name = xstrdup(netdev_get_name(netdev));
327 bond->bond_revalidate = true;
329 slave->enabled = false;
330 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
333 bond_slave_set_netdev__(slave, netdev);
336 slave->name = xstrdup(netdev_get_name(netdev));
339 /* Updates the network device to be used with 'slave_' to 'netdev'.
341 * This is useful if the caller closes and re-opens the network device
342 * registered with bond_slave_register() but doesn't need to change anything
345 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
347 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
349 bond_slave_set_netdev__(slave, netdev);
353 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
354 * then this function has no effect.
356 * Unregistering a slave invalidates all flows. */
358 bond_slave_unregister(struct bond *bond, const void *slave_)
360 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
367 bond_enable_slave(slave, false, NULL);
/* Record whether the departing slave is currently the active one. */
369 del_active = bond->active_slave == slave;
371 struct bond_entry *e;
/* Walk all (BOND_MASK + 1) hash buckets looking for ones that are
 * assigned to 'slave'. */
372 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
373 if (e->slave == slave) {
381 hmap_remove(&bond->slaves, &slave->hmap_node);
382 /* Client owns 'slave->netdev'. */
389 bond_choose_active_slave(bond, &tags);
390 bond->send_learning_packets = true;
394 /* Should be called on each slave in 'bond' before bond_run() to indicate
395 * whether or not 'slave_' may be enabled. This function is intended to allow
396 * other protocols to have some impact on bonding decisions. For example LACP
397 * or high level link monitoring protocols may decide that a given slave should
398 * not be able to send traffic. */
400 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
402 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
405 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
406 * allow tagged flows to be invalidated.
408 * The caller should check bond_should_send_learning_packets() afterward. */
410 bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
412 struct bond_slave *slave;
414 if (bond->lacp_status != lacp_status) {
415 bond->lacp_status = lacp_status;
416 bond->bond_revalidate = true;
419 /* Enable slaves based on link status and LACP feedback. */
420 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
421 bond_link_status_update(slave, tags);
/* Remember the netdev's current sequence number; bond_wait() compares
 * against it to decide whether to wake up immediately. */
422 slave->change_seq = netdev_change_seq(slave->netdev);
424 if (!bond->active_slave || !bond->active_slave->enabled) {
425 bond_choose_active_slave(bond, tags);
428 /* Update fake bond interface stats. */
429 if (time_msec() >= bond->next_fake_iface_update) {
430 bond_update_fake_slave_stats(bond);
431 bond->next_fake_iface_update = time_msec() + 1000;
434 if (bond->bond_revalidate) {
435 struct bond_slave *slave;
437 bond->bond_revalidate = false;
438 bond_entry_reset(bond);
439 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
440 tag_set_add(tags, slave->tag);
442 tag_set_add(tags, bond->no_slaves_tag);
445 /* Invalidate any tags saved by unixctl commands since the last run. */
446 tag_set_union(tags, &bond->unixctl_tags);
447 tag_set_init(&bond->unixctl_tags);
450 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
452 bond_wait(struct bond *bond)
454 struct bond_slave *slave;
456 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
457 if (slave->delay_expires != LLONG_MAX) {
458 poll_timer_wait_until(slave->delay_expires);
461 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
462 poll_immediate_wake();
466 if (bond->next_fake_iface_update != LLONG_MAX) {
467 poll_timer_wait_until(bond->next_fake_iface_update);
470 /* Ensure that any saved tags get revalidated right away. */
471 if (!tag_set_is_empty(&bond->unixctl_tags)) {
472 poll_immediate_wake();
475 /* We don't wait for bond->next_rebalance because rebalancing can only run
476 * at a flow account checkpoint. ofproto does checkpointing on its own
477 * schedule and bond_rebalance() gets called afterward, so we'd just be
478 * waking up for no purpose. */
481 /* MAC learning table interaction. */
484 may_send_learning_packets(const struct bond *bond)
486 return bond->lacp_status == LACP_DISABLED
487 && (bond->balance == BM_SLB || bond->balance == BM_AB)
488 && bond->active_slave;
491 /* Returns true if 'bond' needs the client to send out packets to assist with
492 * MAC learning on 'bond'. If this function returns true, then the client
493 * should iterate through its MAC learning table for the bridge on which 'bond'
494 * is located. For each MAC that has been learned on a port other than 'bond',
495 * it should call bond_compose_learning_packet().
497 * This function will only return true if 'bond' is in SLB or active-backup
498 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
501 * Calling this function resets the state that it checks. */
503 bond_should_send_learning_packets(struct bond *bond)
505 bool send = bond->send_learning_packets && may_send_learning_packets(bond);
506 bond->send_learning_packets = false;
510 /* Composes a gratuitous learning packet for 'bond' from 'eth_src' on 'vlan'.
512 * See bond_should_send_learning_packets() for description of usage. The
513 * caller should send the composed packet on the port associated with
514 * port_aux and takes ownership of the returned ofpbuf. */
516 bond_compose_learning_packet(struct bond *bond,
517 const uint8_t eth_src[ETH_ADDR_LEN],
518 uint16_t vlan, void **port_aux)
520 struct bond_slave *slave;
521 struct ofpbuf *packet;
525 ovs_assert(may_send_learning_packets(bond));
527 memset(&flow, 0, sizeof flow);
528 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
529 slave = choose_output_slave(bond, &flow, NULL, vlan, &tags);
531 packet = ofpbuf_new(0);
532 compose_rarp(packet, eth_src);
534 eth_push_vlan(packet, htons(vlan));
537 *port_aux = slave->aux;
541 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
542 * Ethernet destination address of 'eth_dst', should be admitted.
544 * The return value is one of the following:
546 * - BV_ACCEPT: Admit the packet.
548 * - BV_DROP: Drop the packet.
550 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
551 * Ethernet source address and VLAN. If there is none, or if the packet
552 * is on the learned port, then admit the packet. If a different port has
553 * been learned, however, drop the packet (and do not use it for MAC
557 bond_check_admissibility(struct bond *bond, const void *slave_,
558 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
560 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
566 /* LACP bonds have very loose admissibility restrictions because we can
567 * assume the remote switch is aware of the bond and will "do the right
568 * thing". However, as a precaution we drop packets on disabled slaves
569 * because no correctly implemented partner switch should be sending
572 * If LACP is configured, but LACP negotiations have been unsuccessful, we
573 * drop all incoming traffic. */
574 switch (bond->lacp_status) {
575 case LACP_NEGOTIATED: return slave->enabled ? BV_ACCEPT : BV_DROP;
576 case LACP_CONFIGURED: return BV_DROP;
577 case LACP_DISABLED: break;
580 /* Drop all multicast packets on inactive slaves. */
581 if (eth_addr_is_multicast(eth_dst)) {
582 *tags |= bond_get_active_slave_tag(bond);
583 if (bond->active_slave != slave) {
588 switch (bond->balance) {
590 /* Drop all packets which arrive on backup slaves. This is similar to
591 * how Linux bonding handles active-backup bonds. */
592 *tags |= bond_get_active_slave_tag(bond);
593 if (bond->active_slave != slave) {
594 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
596 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
597 " slave (%s) destined for " ETH_ADDR_FMT,
598 slave->name, ETH_ADDR_ARGS(eth_dst));
604 /* TCP balanced bonds require successful LACP negotiation. Based on the
605 * above check, LACP is off on this bond. Therefore, we drop all
606 * incoming traffic. */
610 /* Drop all packets for which we have learned a different input port,
611 * because we probably sent the packet on one slave and got it back on
612 * the other. Gratuitous ARP packets are an exception to this rule:
613 * the host has moved to another switch. The exception to the
614 * exception is if we locked the learning table to avoid reflections on
616 return BV_DROP_IF_MOVED;
622 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
623 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
624 * NULL if the packet should be dropped because no slaves are enabled.
626 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
627 * should be a VID only (i.e. excluding the PCP bits). Second,
628 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
629 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
630 * packet belongs to (so for an access port it will be the access port's VLAN).
632 * Adds a tag to '*tags' that associates the flow with the returned slave.
634 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
635 * significant in the selection. At some point earlier, 'wc' should
636 * have been initialized (e.g., by flow_wildcards_init_catchall()).
639 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
640 struct flow_wildcards *wc, uint16_t vlan,
643 struct bond_slave *slave = choose_output_slave(bond, flow, wc, vlan, tags);
648 *tags |= bond->no_slaves_tag;
656 bond_is_balanced(const struct bond *bond)
658 return bond->rebalance_interval
659 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
662 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
664 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
667 if (bond_is_balanced(bond)) {
668 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
672 static struct bond_slave *
673 bond_slave_from_bal_node(struct list *bal)
675 return CONTAINER_OF(bal, struct bond_slave, bal_node);
679 log_bals(struct bond *bond, const struct list *bals)
681 if (VLOG_IS_DBG_ENABLED()) {
682 struct ds ds = DS_EMPTY_INITIALIZER;
683 const struct bond_slave *slave;
685 LIST_FOR_EACH (slave, bal_node, bals) {
687 ds_put_char(&ds, ',');
689 ds_put_format(&ds, " %s %"PRIu64"kB",
690 slave->name, slave->tx_bytes / 1024);
692 if (!slave->enabled) {
693 ds_put_cstr(&ds, " (disabled)");
695 if (!list_is_empty(&slave->entries)) {
696 struct bond_entry *e;
698 ds_put_cstr(&ds, " (");
699 LIST_FOR_EACH (e, list_node, &slave->entries) {
700 if (&e->list_node != list_front(&slave->entries)) {
701 ds_put_cstr(&ds, " + ");
703 ds_put_format(&ds, "h%td: %"PRIu64"kB",
704 e - bond->hash, e->tx_bytes / 1024);
706 ds_put_cstr(&ds, ")");
709 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
714 /* Shifts 'hash' from its current slave to 'to'. */
716 bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
719 struct bond_slave *from = hash->slave;
720 struct bond *bond = from->bond;
721 uint64_t delta = hash->tx_bytes;
723 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
724 "from %s to %s (now carrying %"PRIu64"kB and "
725 "%"PRIu64"kB load, respectively)",
726 bond->name, delta / 1024, hash - bond->hash,
727 from->name, to->name,
728 (from->tx_bytes - delta) / 1024,
729 (to->tx_bytes + delta) / 1024);
731 /* Shift load away from 'from' to 'to'. */
732 from->tx_bytes -= delta;
733 to->tx_bytes += delta;
735 /* Arrange for flows to be revalidated. */
736 tag_set_add(set, hash->tag);
738 hash->tag = tag_create_random();
741 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
742 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
743 * given that doing so must decrease the ratio of the load on the two slaves by
744 * at least 0.1. Returns NULL if there is no appropriate entry.
746 * The list of entries isn't sorted. I don't know of a reason to prefer to
747 * shift away small hashes or large hashes. */
748 static struct bond_entry *
749 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
751 struct bond_entry *e;
753 if (list_is_short(&from->entries)) {
754 /* 'from' carries no more than one MAC hash, so shifting load away from
755 * it would be pointless. */
759 LIST_FOR_EACH (e, list_node, &from->entries) {
760 double old_ratio, new_ratio;
763 if (to_tx_bytes == 0) {
764 /* Nothing on the new slave, move it. */
769 old_ratio = (double)from->tx_bytes / to_tx_bytes;
770 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
771 if (old_ratio - new_ratio > 0.1
772 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
773 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
774 and 'to' slave have the same load. Therefore, we only move an
775 entry if it decreases the load on 'from', and brings us closer
776 to equal traffic load. */
784 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
787 insert_bal(struct list *bals, struct bond_slave *slave)
789 struct bond_slave *pos;
791 LIST_FOR_EACH (pos, bal_node, bals) {
792 if (slave->tx_bytes > pos->tx_bytes) {
796 list_insert(&pos->bal_node, &slave->bal_node);
799 /* Removes 'slave' from its current list and then inserts it into 'bals' so
800 * that descending order of 'tx_bytes' is maintained. */
802 reinsert_bal(struct list *bals, struct bond_slave *slave)
804 list_remove(&slave->bal_node);
805 insert_bal(bals, slave);
808 /* If 'bond' needs rebalancing, does so.
810 * The caller should have called bond_account() for each active flow, to ensure
811 * that flow data is consistently accounted at this point. */
813 bond_rebalance(struct bond *bond, struct tag_set *tags)
815 struct bond_slave *slave;
816 struct bond_entry *e;
819 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
822 bond->next_rebalance = time_msec() + bond->rebalance_interval;
824 /* Add each bond_entry to its slave's 'entries' list.
825 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
826 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
828 list_init(&slave->entries);
830 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* Unassigned buckets and buckets with a zero byte count are left off the
 * per-slave lists, so they are never candidates for migration. */
831 if (e->slave && e->tx_bytes) {
832 e->slave->tx_bytes += e->tx_bytes;
833 list_push_back(&e->slave->entries, &e->list_node);
837 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
839 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
840 * with a proper list sort algorithm. */
842 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
843 if (slave->enabled) {
844 insert_bal(&bals, slave);
847 log_bals(bond, &bals);
849 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
850 while (!list_is_short(&bals)) {
851 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
852 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
855 overload = from->tx_bytes - to->tx_bytes;
856 if (overload < to->tx_bytes >> 5 || overload < 100000) {
857 /* The extra load on 'from' (and all less-loaded slaves), compared
858 * to that of 'to' (the least-loaded slave), is less than ~3%, or
859 * it is less than ~1Mbps. No point in rebalancing. */
863 /* 'from' is carrying significantly more load than 'to'. Pick a hash
864 * to move from 'from' to 'to'. */
865 e = choose_entry_to_migrate(from, to->tx_bytes);
867 bond_shift_load(e, to, tags);
869 /* Delete element from from->entries.
871 * We don't add the element to to->entries. That would only allow
872 * 'e' to be migrated to another slave in this rebalancing run, and
873 * there is no point in doing that. */
874 list_remove(&e->list_node);
876 /* Re-sort 'bals'. */
877 reinsert_bal(&bals, from);
878 reinsert_bal(&bals, to);
880 /* Can't usefully migrate anything away from 'from'.
881 * Don't reconsider it. */
882 list_remove(&from->bal_node);
886 /* Implement exponentially weighted moving average. A weight of 1/2 causes
887 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
888 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
889 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
897 /* Bonding unixctl user interface functions. */
900 bond_find(const char *name)
904 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
906 if (!strcmp(bond->name, name)) {
913 static struct bond_slave *
914 bond_lookup_slave(struct bond *bond, const char *slave_name)
916 struct bond_slave *slave;
918 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
919 if (!strcmp(slave->name, slave_name)) {
927 bond_unixctl_list(struct unixctl_conn *conn,
928 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
929 void *aux OVS_UNUSED)
931 struct ds ds = DS_EMPTY_INITIALIZER;
932 const struct bond *bond;
934 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
936 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
937 const struct bond_slave *slave;
940 ds_put_format(&ds, "%s\t%s\t",
941 bond->name, bond_mode_to_string(bond->balance));
944 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
946 ds_put_cstr(&ds, ", ");
948 ds_put_cstr(&ds, slave->name);
950 ds_put_char(&ds, '\n');
952 unixctl_command_reply(conn, ds_cstr(&ds));
957 bond_print_details(struct ds *ds, const struct bond *bond)
959 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
960 const struct shash_node **sorted_slaves = NULL;
961 const struct bond_slave *slave;
964 ds_put_format(ds, "---- %s ----\n", bond->name);
965 ds_put_format(ds, "bond_mode: %s\n",
966 bond_mode_to_string(bond->balance));
968 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
970 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
971 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
973 if (bond_is_balanced(bond)) {
974 ds_put_format(ds, "next rebalance: %lld ms\n",
975 bond->next_rebalance - time_msec());
978 ds_put_cstr(ds, "lacp_status: ");
979 switch (bond->lacp_status) {
980 case LACP_NEGOTIATED:
981 ds_put_cstr(ds, "negotiated\n");
983 case LACP_CONFIGURED:
984 ds_put_cstr(ds, "configured\n");
987 ds_put_cstr(ds, "off\n");
990 ds_put_cstr(ds, "<unknown>\n");
/* Index the slaves by name in a temporary shash so that they can be
 * printed from the array returned by shash_sort() (presumably ordered by
 * name -- confirm against shash_sort()). */
994 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
995 shash_add(&slave_shash, slave->name, slave);
997 sorted_slaves = shash_sort(&slave_shash);
999 for (i = 0; i < shash_count(&slave_shash); i++) {
1000 struct bond_entry *be;
1002 slave = sorted_slaves[i]->data;
1005 ds_put_format(ds, "\nslave %s: %s\n",
1006 slave->name, slave->enabled ? "enabled" : "disabled");
1007 if (slave == bond->active_slave) {
1008 ds_put_cstr(ds, "\tactive slave\n");
/* A pending delay is reported as a downdelay for an enabled slave and as
 * an updelay for a disabled one. */
1010 if (slave->delay_expires != LLONG_MAX) {
1011 ds_put_format(ds, "\t%s expires in %lld ms\n",
1012 slave->enabled ? "downdelay" : "updelay",
1013 slave->delay_expires - time_msec());
1016 ds_put_format(ds, "\tmay_enable: %s\n",
1017 slave->may_enable ? "true" : "false");
1019 if (!bond_is_balanced(bond)) {
/* Print the index and load of each hash bucket assigned to this slave. */
1024 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1025 int hash = be - bond->hash;
1027 if (be->slave != slave) {
1031 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1032 hash, be->tx_bytes / 1024);
1034 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1037 shash_destroy(&slave_shash);
1038 free(sorted_slaves);
1039 ds_put_cstr(ds, "\n");
1043 bond_unixctl_show(struct unixctl_conn *conn,
1044 int argc, const char *argv[],
1045 void *aux OVS_UNUSED)
1047 struct ds ds = DS_EMPTY_INITIALIZER;
1050 const struct bond *bond = bond_find(argv[1]);
1053 unixctl_command_reply_error(conn, "no such bond");
1056 bond_print_details(&ds, bond);
1058 const struct bond *bond;
1060 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
1061 bond_print_details(&ds, bond);
1065 unixctl_command_reply(conn, ds_cstr(&ds));
1070 bond_unixctl_migrate(struct unixctl_conn *conn,
1071 int argc OVS_UNUSED, const char *argv[],
1072 void *aux OVS_UNUSED)
1074 const char *bond_s = argv[1];
1075 const char *hash_s = argv[2];
1076 const char *slave_s = argv[3];
1078 struct bond_slave *slave;
1079 struct bond_entry *entry;
1082 bond = bond_find(bond_s);
1084 unixctl_command_reply_error(conn, "no such bond");
1088 if (bond->balance != BM_SLB) {
1089 unixctl_command_reply_error(conn, "not an SLB bond");
/* Accept only a string made up entirely of decimal digits, then fold the
 * parsed value into the bucket range with BOND_MASK. */
1093 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1094 hash = atoi(hash_s) & BOND_MASK;
1096 unixctl_command_reply_error(conn, "bad hash");
1100 slave = bond_lookup_slave(bond, slave_s);
1102 unixctl_command_reply_error(conn, "no such slave");
1106 if (!slave->enabled) {
1107 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
/* Save the bucket's old tag in 'unixctl_tags', which bond_run() merges
 * into the caller's tag set, then reassign the bucket to 'slave' under a
 * fresh tag. */
1111 entry = &bond->hash[hash];
1112 tag_set_add(&bond->unixctl_tags, entry->tag);
1113 entry->slave = slave;
1114 entry->tag = tag_create_random();
1115 unixctl_command_reply(conn, "migrated");
1119 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1120 int argc OVS_UNUSED, const char *argv[],
1121 void *aux OVS_UNUSED)
1123 const char *bond_s = argv[1];
1124 const char *slave_s = argv[2];
1126 struct bond_slave *slave;
1128 bond = bond_find(bond_s);
1130 unixctl_command_reply_error(conn, "no such bond");
1134 slave = bond_lookup_slave(bond, slave_s);
1136 unixctl_command_reply_error(conn, "no such slave");
1140 if (!slave->enabled) {
1141 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1145 if (bond->active_slave != slave) {
/* Save the current active-slave tag in 'unixctl_tags', which bond_run()
 * merges into the caller's tag set, then switch to 'slave' and give it a
 * fresh tag. */
1146 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1147 bond->active_slave = slave;
1148 bond->active_slave->tag = tag_create_random();
1149 VLOG_INFO("bond %s: active interface is now %s",
1150 bond->name, slave->name);
/* bond_should_send_learning_packets() reads and then clears this flag. */
1151 bond->send_learning_packets = true;
1152 unixctl_command_reply(conn, "done");
1154 unixctl_command_reply(conn, "no change");
1159 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1161 const char *bond_s = argv[1];
1162 const char *slave_s = argv[2];
1164 struct bond_slave *slave;
1166 bond = bond_find(bond_s);
1168 unixctl_command_reply_error(conn, "no such bond");
1172 slave = bond_lookup_slave(bond, slave_s);
1174 unixctl_command_reply_error(conn, "no such slave");
1178 bond_enable_slave(slave, enable, &bond->unixctl_tags);
1179 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1183 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1184 int argc OVS_UNUSED, const char *argv[],
1185 void *aux OVS_UNUSED)
1187 enable_slave(conn, argv, true);
1191 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1192 int argc OVS_UNUSED, const char *argv[],
1193 void *aux OVS_UNUSED)
1195 enable_slave(conn, argv, false);
1199 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1200 void *aux OVS_UNUSED)
1202 const char *mac_s = argv[1];
1203 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1204 const char *basis_s = argc > 3 ? argv[3] : NULL;
1205 uint8_t mac[ETH_ADDR_LEN];
1212 if (sscanf(vlan_s, "%u", &vlan) != 1) {
1213 unixctl_command_reply_error(conn, "invalid vlan");
1221 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
1222 unixctl_command_reply_error(conn, "invalid basis");
1229 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1230 == ETH_ADDR_SCAN_COUNT) {
1231 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1233 hash_cstr = xasprintf("%u", hash);
1234 unixctl_command_reply(conn, hash_cstr);
1237 unixctl_command_reply_error(conn, "invalid mac");
1244 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1245 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1247 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1248 bond_unixctl_migrate, NULL);
1249 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1250 bond_unixctl_set_active_slave, NULL);
1251 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1252 bond_unixctl_enable_slave, NULL);
1253 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1254 bond_unixctl_disable_slave, NULL);
1255 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1256 bond_unixctl_hash, NULL);
1260 bond_entry_reset(struct bond *bond)
1262 if (bond->balance != BM_AB) {
1263 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1266 bond->hash = xmalloc(hash_len);
1268 memset(bond->hash, 0, hash_len);
1270 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1277 static struct bond_slave *
1278 bond_slave_lookup(struct bond *bond, const void *slave_)
1280 struct bond_slave *slave;
1282 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1284 if (slave->aux == slave_) {
1293 bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1295 slave->delay_expires = LLONG_MAX;
1296 if (enable != slave->enabled) {
1297 slave->enabled = enable;
1298 if (!slave->enabled) {
1299 VLOG_INFO("interface %s: disabled", slave->name);
1301 tag_set_add(tags, slave->tag);
1304 VLOG_INFO("interface %s: enabled", slave->name);
1305 slave->tag = tag_create_random();
1311 bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1313 struct bond *bond = slave->bond;
1316 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1317 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1318 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1319 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1320 slave->name, up ? "up" : "down");
1321 if (up == slave->enabled) {
1322 slave->delay_expires = LLONG_MAX;
1323 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1324 slave->name, up ? "disabled" : "enabled");
1326 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1327 : up ? bond->updelay : bond->downdelay);
1328 slave->delay_expires = time_msec() + delay;
1330 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1333 up ? "enabled" : "disabled",
1340 if (time_msec() >= slave->delay_expires) {
1341 bond_enable_slave(slave, up, tags);
1346 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1348 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1352 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1354 struct flow hash_flow = *flow;
1355 hash_flow.vlan_tci = htons(vlan);
1357 /* The symmetric quality of this hash function is not required, but
1358 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1359 * purposes, so we use it out of convenience. */
1360 return flow_hash_symmetric_l4(&hash_flow, basis);
1364 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1366 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1368 return (bond->balance == BM_TCP
1369 ? bond_hash_tcp(flow, vlan, bond->basis)
1370 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1373 static struct bond_entry *
1374 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1377 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1380 static struct bond_slave *
1381 choose_output_slave(const struct bond *bond, const struct flow *flow,
1382 struct flow_wildcards *wc, uint16_t vlan, tag_type *tags)
1384 struct bond_entry *e;
1386 if (bond->lacp_status == LACP_CONFIGURED) {
1387 /* LACP has been configured on this bond but negotiations were
1388 * unsuccussful. Drop all traffic. */
1392 switch (bond->balance) {
1394 return bond->active_slave;
1397 if (bond->lacp_status != LACP_NEGOTIATED) {
1398 /* Must have LACP negotiations for TCP balanced bonds. */
1402 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1407 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1409 e = lookup_bond_entry(bond, flow, vlan);
1410 if (!e->slave || !e->slave->enabled) {
1411 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1412 struct bond_slave, hmap_node);
1413 if (!e->slave->enabled) {
1414 e->slave = bond->active_slave;
1416 e->tag = tag_create_random();
1426 static struct bond_slave *
1427 bond_choose_slave(const struct bond *bond)
1429 struct bond_slave *slave, *best;
1431 /* Find an enabled slave. */
1432 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1433 if (slave->enabled) {
1438 /* All interfaces are disabled. Find an interface that will be enabled
1439 * after its updelay expires. */
1441 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1442 if (slave->delay_expires != LLONG_MAX
1443 && slave->may_enable
1444 && (!best || slave->delay_expires < best->delay_expires)) {
1452 bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1454 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1455 struct bond_slave *old_active_slave = bond->active_slave;
1457 bond->active_slave = bond_choose_slave(bond);
1458 if (bond->active_slave) {
1459 if (bond->active_slave->enabled) {
1460 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1461 bond->name, bond->active_slave->name);
1463 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1464 "remaining %lld ms updelay (since no interface was "
1465 "enabled)", bond->name, bond->active_slave->name,
1466 bond->active_slave->delay_expires - time_msec());
1467 bond_enable_slave(bond->active_slave, true, tags);
1470 if (!old_active_slave) {
1471 tag_set_add(tags, bond->no_slaves_tag);
1474 bond->send_learning_packets = true;
1475 } else if (old_active_slave) {
1476 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1480 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1481 * there is no active slave. */
1483 bond_get_active_slave_tag(const struct bond *bond)
1485 return (bond->active_slave
1486 ? bond->active_slave->tag
1487 : bond->no_slaves_tag);
1490 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1491 * bond interface. */
1493 bond_update_fake_slave_stats(struct bond *bond)
1495 struct netdev_stats bond_stats;
1496 struct bond_slave *slave;
1497 struct netdev *bond_dev;
1499 memset(&bond_stats, 0, sizeof bond_stats);
1501 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1502 struct netdev_stats slave_stats;
1504 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1505 /* XXX: We swap the stats here because they are swapped back when
1506 * reported by the internal device. The reason for this is
1507 * internal devices normally represent packets going into the
1508 * system but when used as fake bond device they represent packets
1509 * leaving the system. We really should do this in the internal
1510 * device itself because changing it here reverses the counts from
1511 * the perspective of the switch. However, the internal device
1512 * doesn't know what type of device it represents so we have to do
1513 * it here for now. */
1514 bond_stats.tx_packets += slave_stats.rx_packets;
1515 bond_stats.tx_bytes += slave_stats.rx_bytes;
1516 bond_stats.rx_packets += slave_stats.tx_packets;
1517 bond_stats.rx_bytes += slave_stats.tx_bytes;
1521 if (!netdev_open(bond->name, "system", &bond_dev)) {
1522 netdev_set_stats(bond_dev, &bond_stats);
1523 netdev_close(bond_dev);