2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "dynamic-string.h"
36 #include "poll-loop.h"
43 VLOG_DEFINE_THIS_MODULE(bond);
45 /* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets, i.e. 256 with the value below. */
47 #define BOND_MASK 0xff
49 /* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these, reached through
 * its 'hash' member. */
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted; added to by
                       * bond_account(). */
54 tag_type tag; /* Tag for entry<->facet association; replaced by a fresh
                  * random tag when the entry is moved to another slave. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond. */
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave; the key that
               * bond_slave_lookup() matches against. */
64 struct netdev *netdev; /* Network device, owned by the client. */
65 unsigned int change_seq; /* Tracks changes in 'netdev': the value of
                             * netdev_change_seq() last seen by bond_run(). */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
69 long long delay_expires; /* Time after which 'enabled' may change;
                             * LLONG_MAX while no timer is armed. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
72 tag_type tag; /* Tag associated with this slave. */
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
80 /* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave; /* Currently active slave, or NULL. */
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
95 bool bond_revalidate; /* True if flows need revalidation. */
96 uint32_t basis; /* Basis for flow hash function. */
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets; /* Set when the client should send learning
                                 * packets; cleared again by
                                 * bond_should_send_learning_packets(). */
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
/* Every bond, keyed by hash_string() of its name (see bond_reconfigure() and
 * bond_find()). */
115 static struct hmap all_bonds = HMAP_INITIALIZER(&all_bonds);
/* Forward declarations for file-local helpers defined further down. */
117 static void bond_entry_reset(struct bond *);
118 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_);
119 static void bond_enable_slave(struct bond_slave *, bool enable,
121 static void bond_link_status_update(struct bond_slave *, struct tag_set *);
122 static void bond_choose_active_slave(struct bond *, struct tag_set *);
123 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
124 uint16_t vlan, uint32_t basis);
125 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
127 static struct bond_entry *lookup_bond_entry(const struct bond *,
130 static tag_type bond_get_active_slave_tag(const struct bond *);
131 static struct bond_slave *choose_output_slave(const struct bond *,
133 struct flow_wildcards *,
134 uint16_t vlan, tag_type *tags);
135 static void bond_update_fake_slave_stats(struct bond *);
137 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
138 * stores the mode in '*balance' and returns true. Otherwise returns false
139 * without modifying '*balance'.
 *
 * The candidate names come from bond_mode_to_string(), so parsing and
 * formatting share a single set of spellings. */
141 bond_mode_from_string(enum bond_mode *balance, const char *s)
143 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
145 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
147 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
155 /* Returns a string representing 'balance': one of "balance-tcp",
 * "balance-slb" or "active-backup" (presumably for BM_TCP, BM_SLB and BM_AB
 * respectively -- the selecting labels are not visible here). The strings
 * are literals, so the caller must not free them. */
157 bond_mode_to_string(enum bond_mode balance) {
160 return "balance-tcp";
162 return "balance-slb";
164 return "active-backup";
170 /* Creates and returns a new bond whose configuration is initially taken from
 * 's'.
 *
173 * The caller should register each slave on the new bond by calling
174 * bond_slave_register(). */
176 bond_create(const struct bond_settings *s)
/* xzalloc() zero-fills, so every member not set below starts as 0/NULL. */
180 bond = xzalloc(sizeof *bond);
181 hmap_init(&bond->slaves);
182 bond->no_slaves_tag = tag_create_random();
/* LLONG_MAX means fake-interface updates are disabled. */
183 bond->next_fake_iface_update = LLONG_MAX;
/* Apply the client's settings; this also indexes the bond in 'all_bonds'. */
186 bond_reconfigure(bond, s);
188 tag_set_init(&bond->unixctl_tags);
/* Reference-counting entry point for 'bond_'. Asserts that the bond is still
 * alive (ref_cnt > 0).
 *
 * NOTE(review): the const qualifier is cast away, presumably so the reference
 * count can be bumped through a const handle -- the increment itself is not
 * visible here; confirm. */
194 bond_ref(const struct bond *bond_)
196 struct bond *bond = CONST_CAST(struct bond *, bond_);
198 ovs_assert(bond->ref_cnt > 0);
/* Drops one reference to 'bond'. Asserts that the count is positive, then
 * decrements it; the teardown below is meant for the last reference
 * (presumably the nonzero case returns early -- that statement is not visible
 * here). Teardown removes the bond from 'all_bonds', detaches every slave
 * from the bond's 'slaves' map and destroys that map. */
205 bond_unref(struct bond *bond)
207 struct bond_slave *slave, *next_slave;
213 ovs_assert(bond->ref_cnt > 0);
214 if (--bond->ref_cnt) {
218 hmap_remove(&all_bonds, &bond->hmap_node);
/* The _SAFE iterator is required because the current node is removed inside
 * the loop. */
220 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
221 hmap_remove(&bond->slaves, &slave->hmap_node);
222 /* Client owns 'slave->netdev'. */
226 hmap_destroy(&bond->slaves);
233 /* Updates 'bond''s overall configuration to 's'.
235 * The caller should register each slave on 'bond' by calling
236 * bond_slave_register(). This is optional if none of the slaves'
237 * configuration has changed. In any case it can't hurt.
239 * Returns true if the configuration has changed in such a way that requires
 * flow revalidation. */
243 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
245 bool revalidate = false;
/* New bond or renamed bond: (re)index it in 'all_bonds' under the new name. */
247 if (!bond->name || strcmp(bond->name, s->name)) {
249 hmap_remove(&all_bonds, &bond->hmap_node);
252 bond->name = xstrdup(s->name);
253 hmap_insert(&all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
256 bond->updelay = s->up_delay;
257 bond->downdelay = s->down_delay;
259 if (bond->rebalance_interval != s->rebalance_interval) {
260 bond->rebalance_interval = s->rebalance_interval;
264 if (bond->balance != s->balance) {
265 bond->balance = s->balance;
269 if (bond->basis != s->basis) {
270 bond->basis = s->basis;
/* Fake-interface stats: start the update timer now if it was disabled
 * (LLONG_MAX), or disable it. The condition choosing between the two is not
 * visible here. */
275 if (bond->next_fake_iface_update == LLONG_MAX) {
276 bond->next_fake_iface_update = time_msec();
279 bond->next_fake_iface_update = LLONG_MAX;
/* Consume a revalidation request left pending on the bond. */
282 if (bond->bond_revalidate) {
284 bond->bond_revalidate = false;
/* Reset the hash table when the bond is active-backup, when no table exists
 * yet, or when something above requires revalidation. */
287 if (bond->balance == BM_AB || !bond->hash || revalidate) {
288 bond_entry_reset(bond);
/* Points 'slave' at 'netdev' if it is not already using it. 'change_seq' is
 * reset to 0 at the same time; presumably this makes the next comparison
 * against netdev_change_seq() in bond_wait() report a change -- confirm. */
295 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
297 if (slave->netdev != netdev) {
298 slave->netdev = netdev;
299 slave->change_seq = 0;
303 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
304 * arbitrary client-provided pointer that uniquely identifies a slave within a
305 * bond. If 'slave_' already exists within 'bond' then this function
306 * reconfigures the existing slave.
308 * 'netdev' must be the network device that 'slave_' represents. It is owned
309 * by the client, so the client must not close it before either unregistering
310 * 'slave_' or destroying 'bond'.
 */
313 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
315 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
/* Set-up for a slave that is new to this bond: allocate it zeroed, index it
 * by the client's pointer, and seed its initial state. */
318 slave = xzalloc(sizeof *slave);
320 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
323 slave->delay_expires = LLONG_MAX;
324 slave->name = xstrdup(netdev_get_name(netdev));
325 bond->bond_revalidate = true;
/* Start disabled, then enable at once if the device has carrier. No tag set
 * is available here, hence the NULL. */
327 slave->enabled = false;
328 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
331 bond_slave_set_netdev__(slave, netdev);
/* NOTE(review): 'name' is also assigned at original line 324; confirm the
 * earlier copy is released before this assignment (not visible here). */
334 slave->name = xstrdup(netdev_get_name(netdev));
337 /* Updates the network device to be used with 'slave_' to 'netdev'.
339 * This is useful if the caller closes and re-opens the network device
340 * registered with bond_slave_register() but doesn't need to change anything
 * else. */
343 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
345 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
347 bond_slave_set_netdev__(slave, netdev);
351 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
352 * then this function has no effect.
354 * Unregistering a slave invalidates all flows. */
356 bond_slave_unregister(struct bond *bond, const void *slave_)
358 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
/* Disable the slave first so it is no longer eligible for flows. */
365 bond_enable_slave(slave, false, NULL);
/* Remember whether the active slave is the one going away. */
367 del_active = bond->active_slave == slave;
369 struct bond_entry *e;
/* Walk all (BOND_MASK + 1) hash entries looking for ones assigned to 'slave'.
 * NOTE(review): bond_entry_reset() only allocates 'hash' for non-active-backup
 * bonds; confirm this loop is guarded against a NULL 'hash'. */
370 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
371 if (e->slave == slave) {
379 hmap_remove(&bond->slaves, &slave->hmap_node);
380 /* Client owns 'slave->netdev'. */
/* Pick a replacement active slave and ask the client to send learning
 * packets. */
387 bond_choose_active_slave(bond, &tags);
388 bond->send_learning_packets = true;
392 /* Should be called on each slave in 'bond' before bond_run() to indicate
393 * whether or not 'slave_' may be enabled. This function is intended to allow
394 * other protocols to have some impact on bonding decisions. For example LACP
395 * or high level link monitoring protocols may decide that a given slave should
396 * not be able to send traffic.
 *
 * NOTE(review): the lookup result is dereferenced without a check, so
 * 'slave_' must already be registered on 'bond'; bond_slave_unregister()'s
 * comment implies the lookup can fail. Confirm callers guarantee this. */
398 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
400 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
403 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
404 * allow tagged flows to be invalidated.
406 * The caller should check bond_should_send_learning_packets() afterward. */
408 bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
410 struct bond_slave *slave;
/* A change in LACP status forces a revalidation below. */
412 if (bond->lacp_status != lacp_status) {
413 bond->lacp_status = lacp_status;
414 bond->bond_revalidate = true;
417 /* Enable slaves based on link status and LACP feedback. */
418 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
419 bond_link_status_update(slave, tags);
/* Record the sequence number so bond_wait() can detect later changes. */
420 slave->change_seq = netdev_change_seq(slave->netdev);
422 if (!bond->active_slave || !bond->active_slave->enabled) {
423 bond_choose_active_slave(bond, tags);
426 /* Update fake bond interface stats. */
427 if (time_msec() >= bond->next_fake_iface_update) {
428 bond_update_fake_slave_stats(bond);
429 bond->next_fake_iface_update = time_msec() + 1000;
432 if (bond->bond_revalidate) {
/* NOTE(review): this declaration shadows the function-scope 'slave'. */
433 struct bond_slave *slave;
435 bond->bond_revalidate = false;
436 bond_entry_reset(bond);
/* Invalidate every slave's tag plus the "no slaves" tag. */
437 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
438 tag_set_add(tags, slave->tag);
440 tag_set_add(tags, bond->no_slaves_tag);
443 /* Invalidate any tags saved in 'unixctl_tags', then empty that set. */
444 tag_set_union(tags, &bond->unixctl_tags);
445 tag_set_init(&bond->unixctl_tags);
448 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
450 bond_wait(struct bond *bond)
452 struct bond_slave *slave;
454 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* A slave with a delay timer armed: wake when it expires. */
455 if (slave->delay_expires != LLONG_MAX) {
456 poll_timer_wait_until(slave->delay_expires);
/* The device changed since bond_run() last recorded its sequence number. */
459 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
460 poll_immediate_wake();
/* Wake for the next fake-interface update unless updates are disabled. */
464 if (bond->next_fake_iface_update != LLONG_MAX) {
465 poll_timer_wait_until(bond->next_fake_iface_update);
468 /* Ensure that any saved tags get revalidated right away. */
469 if (!tag_set_is_empty(&bond->unixctl_tags)) {
470 poll_immediate_wake();
473 /* We don't wait for bond->next_rebalance because rebalancing can only run
474 * at a flow account checkpoint. ofproto does checkpointing on its own
475 * schedule and bond_rebalance() gets called afterward, so we'd just be
476 * waking up for no purpose. */
479 /* MAC learning table interaction. */
/* Returns true if learning packets are permitted on 'bond': LACP must be
 * disabled, the mode must be SLB or active-backup, and the bond must have an
 * active slave. */
482 may_send_learning_packets(const struct bond *bond)
484 return bond->lacp_status == LACP_DISABLED
485 && (bond->balance == BM_SLB || bond->balance == BM_AB)
486 && bond->active_slave;
489 /* Returns true if 'bond' needs the client to send out packets to assist with
490 * MAC learning on 'bond'. If this function returns true, then the client
491 * should iterate through its MAC learning table for the bridge on which 'bond'
492 * is located. For each MAC that has been learned on a port other than 'bond',
493 * it should call bond_compose_learning_packet().
495 * This function will only return true if 'bond' is in SLB or active-backup
496 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
 * necessary.
 *
499 * Calling this function resets the state that it checks. */
501 bond_should_send_learning_packets(struct bond *bond)
/* Sample the request flag before clearing it on the next line. */
503 bool send = bond->send_learning_packets && may_send_learning_packets(bond);
504 bond->send_learning_packets = false;
508 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
510 * See bond_should_send_learning_packets() for description of usage. The
511 * caller should send the composed packet on the port associated with
512 * port_aux and takes ownership of the returned ofpbuf. */
514 bond_compose_learning_packet(struct bond *bond,
515 const uint8_t eth_src[ETH_ADDR_LEN],
516 uint16_t vlan, void **port_aux)
518 struct bond_slave *slave;
519 struct ofpbuf *packet;
523 ovs_assert(may_send_learning_packets(bond));
/* Build a flow that carries only the source MAC; every other field is zero.
 * The slave is then chosen from this flow and 'vlan'. */
525 memset(&flow, 0, sizeof flow);
526 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
527 slave = choose_output_slave(bond, &flow, NULL, vlan, &tags);
/* The learning packet is composed by compose_rarp() for 'eth_src'. */
529 packet = ofpbuf_new(0);
530 compose_rarp(packet, eth_src);
532 eth_push_vlan(packet, htons(vlan));
/* NOTE(review): 'slave' is dereferenced unchecked; confirm that
 * choose_output_slave() cannot return NULL once the assertion above holds. */
535 *port_aux = slave->aux;
539 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
540 * Ethernet destination address of 'eth_dst', should be admitted.
542 * The return value is one of the following:
544 * - BV_ACCEPT: Admit the packet.
546 * - BV_DROP: Drop the packet.
548 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
549 * Ethernet source address and VLAN. If there is none, or if the packet
550 * is on the learned port, then admit the packet. If a different port has
551 * been learned, however, drop the packet (and do not use it for MAC
 * learning).
 */
555 bond_check_admissibility(struct bond *bond, const void *slave_,
556 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
/* NOTE(review): 'slave' is dereferenced below without a NULL check; confirm
 * that 'slave_' is always registered on 'bond' when this is called. */
558 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
560 /* LACP bonds have very loose admissibility restrictions because we can
561 * assume the remote switch is aware of the bond and will "do the right
562 * thing". However, as a precaution we drop packets on disabled slaves
563 * because no correctly implemented partner switch should be sending
 * packets to them.
 *
566 * If LACP is configured, but LACP negotiations have been unsuccessful, we
567 * drop all incoming traffic. */
568 switch (bond->lacp_status) {
569 case LACP_NEGOTIATED: return slave->enabled ? BV_ACCEPT : BV_DROP;
570 case LACP_CONFIGURED: return BV_DROP;
571 case LACP_DISABLED: break;
574 /* Drop all multicast packets on inactive slaves. */
575 if (eth_addr_is_multicast(eth_dst)) {
576 *tags |= bond_get_active_slave_tag(bond);
/* NOTE(review): this repeats the lookup already stored in 'slave'. */
577 if (bond->active_slave != bond_slave_lookup(bond, slave_)) {
582 switch (bond->balance) {
584 /* Drop all packets which arrive on backup slaves. This is similar to
585 * how Linux bonding handles active-backup bonds. */
586 *tags |= bond_get_active_slave_tag(bond);
587 if (bond->active_slave != slave) {
588 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
590 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
591 " slave (%s) destined for " ETH_ADDR_FMT,
592 slave->name, ETH_ADDR_ARGS(eth_dst));
598 /* TCP balanced bonds require successful LACP negotiation. Based on the
599 * above check, LACP is off on this bond. Therefore, we drop all
600 * incoming traffic. */
604 /* Drop all packets for which we have learned a different input port,
605 * because we probably sent the packet on one slave and got it back on
606 * the other. Gratuitous ARP packets are an exception to this rule:
607 * the host has moved to another switch. The exception to the
608 * exception is if we locked the learning table to avoid reflections on
 * bond slaves. */
610 return BV_DROP_IF_MOVED;
616 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
617 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
618 * NULL if the packet should be dropped because no slaves are enabled.
620 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
621 * should be a VID only (i.e. excluding the PCP bits). Second,
622 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
623 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
624 * packet belongs to (so for an access port it will be the access port's VLAN).
626 * Adds a tag to '*tags' that associates the flow with the returned slave.
628 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
629 * significant in the selection. At some point earlier, 'wc' should
630 * have been initialized (e.g., by flow_wildcards_init_catchall()).
 */
633 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
634 struct flow_wildcards *wc, uint16_t vlan,
/* The selection itself is delegated to the file-local choose_output_slave(). */
637 struct bond_slave *slave = choose_output_slave(bond, flow, wc, vlan, tags);
/* Tag the flow with the bond's "no slaves" tag (presumably only on the path
 * where no slave was chosen -- the condition is not visible here). */
642 *tags |= bond->no_slaves_tag;
/* Returns true if 'bond' takes part in load rebalancing: its rebalance
 * interval must be nonzero and its mode must be SLB or TCP. */
650 bond_is_balanced(const struct bond *bond)
652 return bond->rebalance_interval
653 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
656 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'.
 *
 * The bytes are added to the hash entry that 'flow' and 'vlan' map to.
 * Bonds for which bond_is_balanced() is false are left untouched. */
658 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
661 if (bond_is_balanced(bond)) {
662 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
/* Converts 'bal', a pointer to the 'bal_node' member embedded in a
 * struct bond_slave, back into a pointer to that bond_slave. */
666 static struct bond_slave *
667 bond_slave_from_bal_node(struct list *bal)
669 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* If debug logging is enabled, logs one line for 'bond' that lists every
 * slave in 'bals' with its load in kB, a "(disabled)" marker where
 * applicable, and the hash entries (index and load) assigned to it. Does
 * no formatting work at all when debug logging is off. */
673 log_bals(struct bond *bond, const struct list *bals)
675 if (VLOG_IS_DBG_ENABLED()) {
676 struct ds ds = DS_EMPTY_INITIALIZER;
677 const struct bond_slave *slave;
679 LIST_FOR_EACH (slave, bal_node, bals) {
/* Separator between slaves. */
681 ds_put_char(&ds, ',');
683 ds_put_format(&ds, " %s %"PRIu64"kB",
684 slave->name, slave->tx_bytes / 1024);
686 if (!slave->enabled) {
687 ds_put_cstr(&ds, " (disabled)");
689 if (!list_is_empty(&slave->entries)) {
690 struct bond_entry *e;
692 ds_put_cstr(&ds, " (");
693 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* " + " goes between entries, i.e. before all but the first. */
694 if (&e->list_node != list_front(&slave->entries)) {
695 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index in the hash table. */
697 ds_put_format(&ds, "h%td: %"PRIu64"kB",
698 e - bond->hash, e->tx_bytes / 1024);
700 ds_put_cstr(&ds, ")");
703 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
708 /* Shifts 'hash' from its current slave to 'to'.
 *
 * The per-slave 'tx_bytes' totals are adjusted by the entry's load, the
 * entry's old tag is added to 'set' so that its flows get revalidated, and
 * the entry is given a fresh random tag. */
710 bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
713 struct bond_slave *from = hash->slave;
714 struct bond *bond = from->bond;
715 uint64_t delta = hash->tx_bytes;
/* The log message reports the loads as they will be after the shift. */
717 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
718 "from %s to %s (now carrying %"PRIu64"kB and "
719 "%"PRIu64"kB load, respectively)",
720 bond->name, delta / 1024, hash - bond->hash,
721 from->name, to->name,
722 (from->tx_bytes - delta) / 1024,
723 (to->tx_bytes + delta) / 1024);
725 /* Shift load away from 'from' to 'to'. */
726 from->tx_bytes -= delta;
727 to->tx_bytes += delta;
729 /* Arrange for flows to be revalidated. */
730 tag_set_add(set, hash->tag);
732 hash->tag = tag_create_random();
735 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
736 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
737 * given that doing so must decrease the ratio of the load on the two slaves by
738 * at least 0.1. Returns NULL if there is no appropriate entry.
740 * The list of entries isn't sorted. I don't know of a reason to prefer to
741 * shift away small hashes or large hashes. */
742 static struct bond_entry *
743 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
745 struct bond_entry *e;
747 if (list_is_short(&from->entries)) {
748 /* 'from' carries no more than one MAC hash, so shifting load away from
749 * it would be pointless. */
753 LIST_FOR_EACH (e, list_node, &from->entries) {
754 double old_ratio, new_ratio;
/* An idle target is handled first, which also keeps the division below from
 * ever seeing a zero divisor. */
757 if (to_tx_bytes == 0) {
758 /* Nothing on the new slave, move it. */
/* from:to load ratio before and after moving this entry's bytes ('delta'). */
763 old_ratio = (double)from->tx_bytes / to_tx_bytes;
764 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
765 if (old_ratio - new_ratio > 0.1
766 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
767 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
768 and 'to' slave have the same load. Therefore, we only move an
769 entry if it decreases the load on 'from', and brings us closer
770 to equal traffic load. */
778 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
 * maintained. */
781 insert_bal(struct list *bals, struct bond_slave *slave)
783 struct bond_slave *pos;
/* Look for the first element that carries less load than 'slave'. */
785 LIST_FOR_EACH (pos, bal_node, bals) {
786 if (slave->tx_bytes > pos->tx_bytes) {
/* Insert relative to the position found above. If the scan ran off the end,
 * 'pos' presumably refers to the list head, which would make this an append
 * -- confirm against the list API. */
790 list_insert(&pos->bal_node, &slave->bal_node);
793 /* Removes 'slave' from its current list and then inserts it into 'bals' so
794 * that descending order of 'tx_bytes' is maintained.
 *
 * Used after a slave's 'tx_bytes' has changed and its old position in the
 * list may no longer be correct. */
796 reinsert_bal(struct list *bals, struct bond_slave *slave)
798 list_remove(&slave->bal_node);
799 insert_bal(bals, slave);
802 /* If 'bond' needs rebalancing, does so.
804 * The caller should have called bond_account() for each active flow, to ensure
805 * that flow data is consistently accounted at this point.
 *
 * Tags of the hash entries that get moved are added to 'tags'. */
807 bond_rebalance(struct bond *bond, struct tag_set *tags)
809 struct bond_slave *slave;
810 struct bond_entry *e;
/* Nothing to do for unbalanced bonds or before the next scheduled run. */
813 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
816 bond->next_rebalance = time_msec() + bond->rebalance_interval;
818 /* Add each bond_entry to its slave's 'entries' list.
819 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
820 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
822 list_init(&slave->entries);
/* Only entries that are assigned and carry load take part. */
824 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
825 if (e->slave && e->tx_bytes) {
826 e->slave->tx_bytes += e->tx_bytes;
827 list_push_back(&e->slave->entries, &e->list_node);
831 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
833 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
834 * with a proper list sort algorithm. */
836 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
837 if (slave->enabled) {
838 insert_bal(&bals, slave);
841 log_bals(bond, &bals);
843 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
844 while (!list_is_short(&bals)) {
/* 'bals' is sorted in descending order: front is the heaviest slave, back the
 * lightest. */
845 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
846 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
849 overload = from->tx_bytes - to->tx_bytes;
850 if (overload < to->tx_bytes >> 5 || overload < 100000) {
851 /* The extra load on 'from' (and all less-loaded slaves), compared
852 * to that of 'to' (the least-loaded slave), is less than ~3%, or
853 * it is less than ~1Mbps. No point in rebalancing. */
857 /* 'from' is carrying significantly more load than 'to'. Pick a hash
858 * to move from 'from' to 'to'. */
859 e = choose_entry_to_migrate(from, to->tx_bytes);
861 bond_shift_load(e, to, tags);
863 /* Delete element from from->entries.
865 * We don't add the element to to->entries. That would only allow
866 * 'e' to be migrated to another slave in this rebalancing run, and
867 * there is no point in doing that. */
868 list_remove(&e->list_node);
870 /* Re-sort 'bals'. */
871 reinsert_bal(&bals, from);
872 reinsert_bal(&bals, to);
874 /* Can't usefully migrate anything away from 'from'.
875 * Don't reconsider it. */
876 list_remove(&from->bal_node);
880 /* Implement exponentially weighted moving average. A weight of 1/2 causes
881 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
882 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
883 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
891 /* Bonding unixctl user interface functions. */
/* Looks up the bond called 'name' in 'all_bonds'. Only the candidates whose
 * hash equals hash_string(name, 0) -- the hash bond_reconfigure() inserts
 * with -- are compared by name. */
894 bond_find(const char *name)
898 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
900 if (!strcmp(bond->name, name)) {
/* Searches 'bond' for the slave whose name equals 'slave_name'. The slaves
 * map is keyed by client pointer rather than by name, so every slave is
 * examined in turn. */
907 static struct bond_slave *
908 bond_lookup_slave(struct bond *bond, const char *slave_name)
910 struct bond_slave *slave;
912 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
913 if (!strcmp(slave->name, slave_name)) {
/* Handler for the "bond/list" unixctl command. Replies on 'conn' with a
 * tab-separated table: a header row, then one row per bond giving its name,
 * its balancing mode and a comma-separated list of its slaves' names. */
921 bond_unixctl_list(struct unixctl_conn *conn,
922 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
923 void *aux OVS_UNUSED)
925 struct ds ds = DS_EMPTY_INITIALIZER;
926 const struct bond *bond;
928 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
930 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
931 const struct bond_slave *slave;
934 ds_put_format(&ds, "%s\t%s\t",
935 bond->name, bond_mode_to_string(bond->balance));
938 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* Separator between slave names. */
940 ds_put_cstr(&ds, ", ");
942 ds_put_cstr(&ds, slave->name);
944 ds_put_char(&ds, '\n');
946 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds': the mode, hash
 * basis, up/down delays, time to the next rebalance (balanced bonds only)
 * and LACP status, followed by one section per slave. The slaves are
 * collected into a temporary shash keyed by name and visited in the order
 * returned by shash_sort(). */
951 bond_print_details(struct ds *ds, const struct bond *bond)
953 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
954 const struct shash_node **sorted_slaves = NULL;
955 const struct bond_slave *slave;
958 ds_put_format(ds, "---- %s ----\n", bond->name);
959 ds_put_format(ds, "bond_mode: %s\n",
960 bond_mode_to_string(bond->balance));
962 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
964 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
965 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
967 if (bond_is_balanced(bond)) {
/* Printed as a remaining time, not as an absolute timestamp. */
968 ds_put_format(ds, "next rebalance: %lld ms\n",
969 bond->next_rebalance - time_msec());
972 ds_put_cstr(ds, "lacp_status: ");
973 switch (bond->lacp_status) {
974 case LACP_NEGOTIATED:
975 ds_put_cstr(ds, "negotiated\n");
977 case LACP_CONFIGURED:
978 ds_put_cstr(ds, "configured\n");
981 ds_put_cstr(ds, "off\n");
984 ds_put_cstr(ds, "<unknown>\n");
/* Index the slaves by name so they can be sorted for output. */
988 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
989 shash_add(&slave_shash, slave->name, slave);
991 sorted_slaves = shash_sort(&slave_shash);
993 for (i = 0; i < shash_count(&slave_shash); i++) {
994 struct bond_entry *be;
996 slave = sorted_slaves[i]->data;
999 ds_put_format(ds, "\nslave %s: %s\n",
1000 slave->name, slave->enabled ? "enabled" : "disabled");
1001 if (slave == bond->active_slave) {
1002 ds_put_cstr(ds, "\tactive slave\n");
/* A pending delay is a downdelay for an enabled slave, an updelay otherwise. */
1004 if (slave->delay_expires != LLONG_MAX) {
1005 ds_put_format(ds, "\t%s expires in %lld ms\n",
1006 slave->enabled ? "downdelay" : "updelay",
1007 slave->delay_expires - time_msec());
1010 ds_put_format(ds, "\tmay_enable: %s\n",
1011 slave->may_enable ? "true" : "false");
/* Hash entries are reported for balanced bonds only. */
1013 if (!bond_is_balanced(bond)) {
1018 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1019 int hash = be - bond->hash;
/* Consider only the entries assigned to this slave. */
1021 if (be->slave != slave) {
1025 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1026 hash, be->tx_bytes / 1024);
1028 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* Release the temporary index and the sorted array. */
1031 shash_destroy(&slave_shash);
1032 free(sorted_slaves);
1033 ds_put_cstr(ds, "\n");
/* Handler for the "bond/show [port]" unixctl command. When a bond is named
 * in argv[1], replies with that bond's details or with the error
 * "no such bond"; otherwise replies with the details of every bond in
 * 'all_bonds'. */
1037 bond_unixctl_show(struct unixctl_conn *conn,
1038 int argc, const char *argv[],
1039 void *aux OVS_UNUSED)
1041 struct ds ds = DS_EMPTY_INITIALIZER;
1044 const struct bond *bond = bond_find(argv[1]);
1047 unixctl_command_reply_error(conn, "no such bond");
1050 bond_print_details(&ds, bond);
1052 const struct bond *bond;
1054 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
1055 bond_print_details(&ds, bond);
1059 unixctl_command_reply(conn, ds_cstr(&ds));
/* Handler for the "bond/migrate port hash slave" unixctl command. Reassigns
 * hash entry argv[2] of SLB bond argv[1] to slave argv[3]. The entry's old
 * tag is saved in 'unixctl_tags' so the next bond_run() revalidates the
 * affected flows, and the entry gets a fresh tag.
 *
 * Error replies: "no such bond", "not an SLB bond", "bad hash",
 * "no such slave", "cannot migrate to disabled slave". */
1064 bond_unixctl_migrate(struct unixctl_conn *conn,
1065 int argc OVS_UNUSED, const char *argv[],
1066 void *aux OVS_UNUSED)
1068 const char *bond_s = argv[1];
1069 const char *hash_s = argv[2];
1070 const char *slave_s = argv[3];
1072 struct bond_slave *slave;
1073 struct bond_entry *entry;
1076 bond = bond_find(bond_s);
1078 unixctl_command_reply_error(conn, "no such bond");
1082 if (bond->balance != BM_SLB) {
1083 unixctl_command_reply_error(conn, "not an SLB bond");
/* Accept decimal digits only; the value is then reduced to a bucket index by
 * masking with BOND_MASK.
 * NOTE(review): an empty string passes the strspn() test, and atoi() has
 * undefined behavior when the value is out of range; strtoul() with range
 * checking would be safer. */
1087 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1088 hash = atoi(hash_s) & BOND_MASK;
1090 unixctl_command_reply_error(conn, "bad hash");
1094 slave = bond_lookup_slave(bond, slave_s);
1096 unixctl_command_reply_error(conn, "no such slave");
1100 if (!slave->enabled) {
1101 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
/* Save the old tag before replacing it so existing flows get revalidated. */
1105 entry = &bond->hash[hash];
1106 tag_set_add(&bond->unixctl_tags, entry->tag);
1107 entry->slave = slave;
1108 entry->tag = tag_create_random();
1109 unixctl_command_reply(conn, "migrated");
/* Handler for the "bond/set-active-slave port slave" unixctl command. Makes
 * slave argv[2] the active slave of bond argv[1]. Replies "done" on a
 * change and "no change" if that slave is already active.
 *
 * Error replies: "no such bond", "no such slave",
 * "cannot make disabled slave active". */
1113 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1114 int argc OVS_UNUSED, const char *argv[],
1115 void *aux OVS_UNUSED)
1117 const char *bond_s = argv[1];
1118 const char *slave_s = argv[2];
1120 struct bond_slave *slave;
1122 bond = bond_find(bond_s);
1124 unixctl_command_reply_error(conn, "no such bond");
1128 slave = bond_lookup_slave(bond, slave_s);
1130 unixctl_command_reply_error(conn, "no such slave");
1134 if (!slave->enabled) {
1135 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1139 if (bond->active_slave != slave) {
/* Save the outgoing active slave's tag for revalidation at the next
 * bond_run(), then switch and give the new active slave a fresh tag. */
1140 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1141 bond->active_slave = slave;
1142 bond->active_slave->tag = tag_create_random();
1143 VLOG_INFO("bond %s: active interface is now %s",
1144 bond->name, slave->name);
/* Ask the client to send learning packets after the change. */
1145 bond->send_learning_packets = true;
1146 unixctl_command_reply(conn, "done");
1148 unixctl_command_reply(conn, "no change");
/* Shared body of the "bond/enable-slave" and "bond/disable-slave" commands.
 * Looks up bond argv[1] and slave argv[2], forces the slave into the state
 * given by 'enable', and replies "enabled" or "disabled". Tags to invalidate
 * are collected in the bond's 'unixctl_tags'.
 *
 * Error replies: "no such bond", "no such slave". */
1153 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1155 const char *bond_s = argv[1];
1156 const char *slave_s = argv[2];
1158 struct bond_slave *slave;
1160 bond = bond_find(bond_s);
1162 unixctl_command_reply_error(conn, "no such bond");
1166 slave = bond_lookup_slave(bond, slave_s);
1168 unixctl_command_reply_error(conn, "no such slave");
1172 bond_enable_slave(slave, enable, &bond->unixctl_tags);
1173 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
/* Handler for the "bond/enable-slave port slave" unixctl command; delegates
 * to enable_slave() with 'enable' set to true. */
1177 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1178 int argc OVS_UNUSED, const char *argv[],
1179 void *aux OVS_UNUSED)
1181 enable_slave(conn, argv, true);
/* Handler for the "bond/disable-slave port slave" unixctl command; delegates
 * to enable_slave() with 'enable' set to false. */
1185 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1186 int argc OVS_UNUSED, const char *argv[],
1187 void *aux OVS_UNUSED)
1189 enable_slave(conn, argv, false);
/* Handler for the "bond/hash mac [vlan] [basis]" unixctl command. Parses
 * the MAC in argv[1] and the optional VLAN and basis, and replies with the
 * bucket number that bond_hash_src() yields after masking with BOND_MASK.
 *
 * Error replies: "invalid vlan", "invalid basis", "invalid mac". */
1193 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1194 void *aux OVS_UNUSED)
1196 const char *mac_s = argv[1];
/* Optional arguments are NULL when absent. */
1197 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1198 const char *basis_s = argc > 3 ? argv[3] : NULL;
1199 uint8_t mac[ETH_ADDR_LEN];
1206 if (sscanf(vlan_s, "%u", &vlan) != 1) {
1207 unixctl_command_reply_error(conn, "invalid vlan");
/* NOTE(review): PRIu32 is a printf-family conversion macro; the scanf-family
 * counterpart from <inttypes.h> is SCNu32. The two expand to the same text on
 * common platforms, but that is not guaranteed. */
1215 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
1216 unixctl_command_reply_error(conn, "invalid basis");
1223 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1224 == ETH_ADDR_SCAN_COUNT) {
1225 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
/* xasprintf() allocates the reply string. */
1227 hash_cstr = xasprintf("%u", hash);
1228 unixctl_command_reply(conn, hash_cstr);
1231 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the bonding unixctl commands. Each call supplies the command
 * name, a usage string, two integers that appear to be the minimum and
 * maximum argument counts (they match the usage strings), the handler and
 * its auxiliary pointer. */
1238 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1239 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1241 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1242 bond_unixctl_migrate, NULL);
1243 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1244 bond_unixctl_set_active_slave, NULL);
1245 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1246 bond_unixctl_enable_slave, NULL);
1247 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1248 bond_unixctl_disable_slave, NULL);
1249 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1250 bond_unixctl_hash, NULL);
1254 bond_entry_reset(struct bond *bond)
1256 if (bond->balance != BM_AB) {
1257 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1260 bond->hash = xmalloc(hash_len);
1262 memset(bond->hash, 0, hash_len);
1264 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1271 static struct bond_slave *
1272 bond_slave_lookup(struct bond *bond, const void *slave_)
1274 struct bond_slave *slave;
1276 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1278 if (slave->aux == slave_) {
1287 bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1289 slave->delay_expires = LLONG_MAX;
1290 if (enable != slave->enabled) {
1291 slave->enabled = enable;
1292 if (!slave->enabled) {
1293 VLOG_INFO("interface %s: disabled", slave->name);
1295 tag_set_add(tags, slave->tag);
1298 VLOG_INFO("interface %s: enabled", slave->name);
1299 slave->tag = tag_create_random();
1305 bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1307 struct bond *bond = slave->bond;
1310 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1311 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1312 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1313 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1314 slave->name, up ? "up" : "down");
1315 if (up == slave->enabled) {
1316 slave->delay_expires = LLONG_MAX;
1317 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1318 slave->name, up ? "disabled" : "enabled");
1320 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1321 : up ? bond->updelay : bond->downdelay);
1322 slave->delay_expires = time_msec() + delay;
1324 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1327 up ? "enabled" : "disabled",
1334 if (time_msec() >= slave->delay_expires) {
1335 bond_enable_slave(slave, up, tags);
1340 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1342 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1346 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1348 struct flow hash_flow = *flow;
1349 hash_flow.vlan_tci = htons(vlan);
1351 /* The symmetric quality of this hash function is not required, but
1352 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1353 * purposes, so we use it out of convenience. */
1354 return flow_hash_symmetric_l4(&hash_flow, basis);
1358 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1360 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1362 return (bond->balance == BM_TCP
1363 ? bond_hash_tcp(flow, vlan, bond->basis)
1364 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1367 static struct bond_entry *
1368 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1371 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1374 static struct bond_slave *
1375 choose_output_slave(const struct bond *bond, const struct flow *flow,
1376 struct flow_wildcards *wc, uint16_t vlan, tag_type *tags)
1378 struct bond_entry *e;
1380 if (bond->lacp_status == LACP_CONFIGURED) {
1381 /* LACP has been configured on this bond but negotiations were
1382 * unsuccussful. Drop all traffic. */
1386 switch (bond->balance) {
1388 return bond->active_slave;
1391 if (bond->lacp_status != LACP_NEGOTIATED) {
1392 /* Must have LACP negotiations for TCP balanced bonds. */
1396 flow_mask_hash_fields(wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1401 flow_mask_hash_fields(wc, NX_HASH_FIELDS_ETH_SRC);
1403 e = lookup_bond_entry(bond, flow, vlan);
1404 if (!e->slave || !e->slave->enabled) {
1405 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1406 struct bond_slave, hmap_node);
1407 if (!e->slave->enabled) {
1408 e->slave = bond->active_slave;
1410 e->tag = tag_create_random();
1420 static struct bond_slave *
1421 bond_choose_slave(const struct bond *bond)
1423 struct bond_slave *slave, *best;
1425 /* Find an enabled slave. */
1426 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1427 if (slave->enabled) {
1432 /* All interfaces are disabled. Find an interface that will be enabled
1433 * after its updelay expires. */
1435 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1436 if (slave->delay_expires != LLONG_MAX
1437 && slave->may_enable
1438 && (!best || slave->delay_expires < best->delay_expires)) {
1446 bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1448 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1449 struct bond_slave *old_active_slave = bond->active_slave;
1451 bond->active_slave = bond_choose_slave(bond);
1452 if (bond->active_slave) {
1453 if (bond->active_slave->enabled) {
1454 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1455 bond->name, bond->active_slave->name);
1457 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1458 "remaining %lld ms updelay (since no interface was "
1459 "enabled)", bond->name, bond->active_slave->name,
1460 bond->active_slave->delay_expires - time_msec());
1461 bond_enable_slave(bond->active_slave, true, tags);
1464 if (!old_active_slave) {
1465 tag_set_add(tags, bond->no_slaves_tag);
1468 bond->send_learning_packets = true;
1469 } else if (old_active_slave) {
1470 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1474 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1475 * there is no active slave. */
1477 bond_get_active_slave_tag(const struct bond *bond)
1479 return (bond->active_slave
1480 ? bond->active_slave->tag
1481 : bond->no_slaves_tag);
1484 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1485 * bond interface. */
1487 bond_update_fake_slave_stats(struct bond *bond)
1489 struct netdev_stats bond_stats;
1490 struct bond_slave *slave;
1491 struct netdev *bond_dev;
1493 memset(&bond_stats, 0, sizeof bond_stats);
1495 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1496 struct netdev_stats slave_stats;
1498 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1499 /* XXX: We swap the stats here because they are swapped back when
1500 * reported by the internal device. The reason for this is
1501 * internal devices normally represent packets going into the
1502 * system but when used as fake bond device they represent packets
1503 * leaving the system. We really should do this in the internal
1504 * device itself because changing it here reverses the counts from
1505 * the perspective of the switch. However, the internal device
1506 * doesn't know what type of device it represents so we have to do
1507 * it here for now. */
1508 bond_stats.tx_packets += slave_stats.rx_packets;
1509 bond_stats.tx_bytes += slave_stats.rx_bytes;
1510 bond_stats.rx_packets += slave_stats.tx_packets;
1511 bond_stats.rx_bytes += slave_stats.tx_bytes;
1515 if (!netdev_open(bond->name, "system", &bond_dev)) {
1516 netdev_set_stats(bond_dev, &bond_stats);
1517 netdev_close(bond_dev);