2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "connectivity.h"
28 #include "dynamic-string.h"
37 #include "poll-loop.h"
44 VLOG_DEFINE_THIS_MODULE(bond);
46 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
47 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
48 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
50 /* Bit-mask for hashing a flow down to a bucket.
51 * There are (BOND_MASK + 1) buckets. */
52 #define BOND_MASK 0xff
54 /* A hash bucket for mapping a flow to a slave.
55 * "struct bond" has an array of (BOND_MASK + 1) of these. */
57 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
58 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
59 struct list list_node; /* In bond_slave's 'entries' list. */
62 /* A bond slave, that is, one of the links comprising a bond. */
64 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
65 struct list list_node; /* In struct bond's enabled_slaves list. */
66 struct bond *bond; /* The bond that contains this slave. */
67 void *aux; /* Client-provided handle for this slave. */
69 struct netdev *netdev; /* Network device, owned by the client. */
70 unsigned int change_seq; /* Tracks changes in 'netdev'. */
71 char *name; /* Name (a copy of netdev_get_name(netdev)). */
74 long long delay_expires; /* Time after which 'enabled' may change. */
75 bool enabled; /* May be chosen for flows? */
76 bool may_enable; /* Client considers this slave bondable. */
78 /* Rebalancing info. Used only by bond_rebalance(). */
79 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
80 struct list entries; /* 'struct bond_entry's assigned here. */
81 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
84 /* A bond, that is, a set of network devices grouped to improve performance or
87 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
88 char *name; /* Name provided by client. */
95 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
96 * (To prevent the bond_slave from disappearing they must also hold
98 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
99 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
102 enum bond_mode balance; /* Balancing mode, one of BM_*. */
103 struct bond_slave *active_slave;
104 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
105 enum lacp_status lacp_status; /* Status of LACP negotiations. */
106 bool bond_revalidate; /* True if flows need revalidation. */
107 uint32_t basis; /* Basis for flow hash function. */
109 /* SLB specific bonding info. */
110 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
111 int rebalance_interval; /* Interval between rebalances, in ms. */
112 long long int next_rebalance; /* Next rebalancing time. */
113 bool send_learning_packets;
115 /* Legacy compatibility. */
116 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
117 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
119 struct ovs_refcount ref_cnt;
122 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
123 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
124 OVS_REQ_RDLOCK(rwlock);
125 static void bond_enable_slave(struct bond_slave *, bool enable)
126 OVS_REQ_WRLOCK(rwlock);
127 static void bond_link_status_update(struct bond_slave *)
128 OVS_REQ_WRLOCK(rwlock);
/* Forward declaration.  Annotated as requiring 'rwlock' to be held for
 * writing by the caller. */
129 static void bond_choose_active_slave(struct bond *)
130 OVS_REQ_WRLOCK(rwlock);
131 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
132 uint16_t vlan, uint32_t basis);
133 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
135 static struct bond_entry *lookup_bond_entry(const struct bond *,
138 OVS_REQ_RDLOCK(rwlock);
139 static struct bond_slave *get_enabled_slave(struct bond *)
140 OVS_REQ_RDLOCK(rwlock);
141 static struct bond_slave *choose_output_slave(const struct bond *,
143 struct flow_wildcards *,
145 OVS_REQ_RDLOCK(rwlock);
146 static void bond_update_fake_slave_stats(struct bond *)
147 OVS_REQ_RDLOCK(rwlock);
149 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
150 * stores the mode in '*balance' and returns true. Otherwise returns false
151 * without modifying '*balance'. */
153 bond_mode_from_string(enum bond_mode *balance, const char *s)
155 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
157 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
159 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
167 /* Returns a string representing 'balance'. */
169 bond_mode_to_string(enum bond_mode balance) {
172 return "balance-tcp";
174 return "balance-slb";
176 return "active-backup";
182 /* Creates and returns a new bond whose configuration is initially taken from
185 * The caller should register each slave on the new bond by calling
186 * bond_slave_register(). */
188 bond_create(const struct bond_settings *s)
192 bond = xzalloc(sizeof *bond);
193 hmap_init(&bond->slaves);
194 list_init(&bond->enabled_slaves);
195 ovs_mutex_init(&bond->mutex);
196 bond->next_fake_iface_update = LLONG_MAX;
197 ovs_refcount_init(&bond->ref_cnt);
199 bond_reconfigure(bond, s);
204 bond_ref(const struct bond *bond_)
206 struct bond *bond = CONST_CAST(struct bond *, bond_);
209 ovs_refcount_ref(&bond->ref_cnt);
216 bond_unref(struct bond *bond)
218 struct bond_slave *slave, *next_slave;
220 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
224 ovs_rwlock_wrlock(&rwlock);
225 hmap_remove(all_bonds, &bond->hmap_node);
226 ovs_rwlock_unlock(&rwlock);
228 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
229 hmap_remove(&bond->slaves, &slave->hmap_node);
230 /* Client owns 'slave->netdev'. */
234 hmap_destroy(&bond->slaves);
236 ovs_mutex_destroy(&bond->mutex);
242 /* Updates 'bond''s overall configuration to 's'.
244 * The caller should register each slave on 'bond' by calling
245 * bond_slave_register(). This is optional if none of the slaves'
246 * configuration has changed. In any case it can't hurt.
248 * Returns true if the configuration has changed in such a way that requires
252 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
254 bool revalidate = false;
256 ovs_rwlock_wrlock(&rwlock);
257 if (!bond->name || strcmp(bond->name, s->name)) {
259 hmap_remove(all_bonds, &bond->hmap_node);
262 bond->name = xstrdup(s->name);
263 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
266 bond->updelay = s->up_delay;
267 bond->downdelay = s->down_delay;
269 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
270 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
274 if (bond->rebalance_interval != s->rebalance_interval) {
275 bond->rebalance_interval = s->rebalance_interval;
279 if (bond->balance != s->balance) {
280 bond->balance = s->balance;
284 if (bond->basis != s->basis) {
285 bond->basis = s->basis;
290 if (bond->next_fake_iface_update == LLONG_MAX) {
291 bond->next_fake_iface_update = time_msec();
294 bond->next_fake_iface_update = LLONG_MAX;
297 if (bond->bond_revalidate) {
299 bond->bond_revalidate = false;
302 if (bond->balance == BM_AB || !bond->hash || revalidate) {
303 bond_entry_reset(bond);
306 ovs_rwlock_unlock(&rwlock);
311 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
312 OVS_REQ_WRLOCK(rwlock)
314 if (slave->netdev != netdev) {
315 slave->netdev = netdev;
316 slave->change_seq = 0;
320 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
321 * arbitrary client-provided pointer that uniquely identifies a slave within a
322 * bond. If 'slave_' already exists within 'bond' then this function
323 * reconfigures the existing slave.
325 * 'netdev' must be the network device that 'slave_' represents. It is owned
326 * by the client, so the client must not close it before either unregistering
327 * 'slave_' or destroying 'bond'.
330 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
332 struct bond_slave *slave;
334 ovs_rwlock_wrlock(&rwlock);
/* Look for an existing slave keyed by the client handle 'slave_'. */
335 slave = bond_slave_lookup(bond, slave_);
337 slave = xzalloc(sizeof *slave);
339 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
342 slave->delay_expires = LLONG_MAX;
/* NOTE(review): 'slave->name' is assigned again after
 * bond_slave_set_netdev__() below; confirm that the copy made here is
 * released before that reassignment. */
343 slave->name = xstrdup(netdev_get_name(netdev));
344 bond->bond_revalidate = true;
346 slave->enabled = false;
/* Starting from "disabled", apply the state reported by
 * netdev_get_carrier() for 'netdev'. */
347 bond_enable_slave(slave, netdev_get_carrier(netdev));
350 bond_slave_set_netdev__(slave, netdev);
353 slave->name = xstrdup(netdev_get_name(netdev));
354 ovs_rwlock_unlock(&rwlock);
357 /* Updates the network device to be used with 'slave_' to 'netdev'.
359 * This is useful if the caller closes and re-opens the network device
360 * registered with bond_slave_register() but doesn't need to change anything
363 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
365 struct bond_slave *slave;
367 ovs_rwlock_wrlock(&rwlock);
368 slave = bond_slave_lookup(bond, slave_);
370 bond_slave_set_netdev__(slave, netdev);
372 ovs_rwlock_unlock(&rwlock);
375 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
376 * then this function has no effect.
378 * Unregistering a slave invalidates all flows. */
380 bond_slave_unregister(struct bond *bond, const void *slave_)
382 struct bond_slave *slave;
385 ovs_rwlock_wrlock(&rwlock);
386 slave = bond_slave_lookup(bond, slave_);
391 bond->bond_revalidate = true;
392 bond_enable_slave(slave, false);
394 del_active = bond->active_slave == slave;
396 struct bond_entry *e;
397 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
398 if (e->slave == slave) {
406 hmap_remove(&bond->slaves, &slave->hmap_node);
407 /* Client owns 'slave->netdev'. */
411 bond_choose_active_slave(bond);
412 bond->send_learning_packets = true;
415 ovs_rwlock_unlock(&rwlock);
418 /* Should be called on each slave in 'bond' before bond_run() to indicate
419 * whether or not 'slave_' may be enabled. This function is intended to allow
420 * other protocols to have some impact on bonding decisions. For example LACP
421 * or high level link monitoring protocols may decide that a given slave should
422 * not be able to send traffic. */
424 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
426 ovs_rwlock_wrlock(&rwlock);
/* NOTE(review): the result of bond_slave_lookup() is dereferenced without a
 * NULL check.  Presumably callers only pass a 'slave_' that is already
 * registered on 'bond' -- confirm, or guard against a failed lookup. */
427 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
428 ovs_rwlock_unlock(&rwlock);
431 /* Performs periodic maintenance on 'bond'.
433 * Returns true if the caller should revalidate its flows.
435 * The caller should check bond_should_send_learning_packets() afterward. */
437 bond_run(struct bond *bond, enum lacp_status lacp_status)
439 struct bond_slave *slave;
442 ovs_rwlock_wrlock(&rwlock);
/* A change in the reported LACP status is recorded and flags the bond for
 * revalidation. */
443 if (bond->lacp_status != lacp_status) {
444 bond->lacp_status = lacp_status;
445 bond->bond_revalidate = true;
448 /* Enable slaves based on link status and LACP feedback. */
449 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
450 bond_link_status_update(slave);
/* Remember the connectivity sequence number seen on this pass; bond_wait()
 * passes it to seq_wait(). */
451 slave->change_seq = seq_read(connectivity_seq_get());
453 if (!bond->active_slave || !bond->active_slave->enabled) {
454 bond_choose_active_slave(bond);
457 /* Update fake bond interface stats. */
458 if (time_msec() >= bond->next_fake_iface_update) {
459 bond_update_fake_slave_stats(bond);
/* Schedule the next stats update 1000 ms from now. */
460 bond->next_fake_iface_update = time_msec() + 1000;
463 revalidate = bond->bond_revalidate;
/* The pending request has been captured in 'revalidate'; clear it. */
464 bond->bond_revalidate = false;
465 ovs_rwlock_unlock(&rwlock);
470 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
472 bond_wait(struct bond *bond)
474 struct bond_slave *slave;
476 ovs_rwlock_rdlock(&rwlock);
477 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
478 if (slave->delay_expires != LLONG_MAX) {
479 poll_timer_wait_until(slave->delay_expires);
482 seq_wait(connectivity_seq_get(), slave->change_seq);
485 if (bond->next_fake_iface_update != LLONG_MAX) {
486 poll_timer_wait_until(bond->next_fake_iface_update);
489 if (bond->bond_revalidate) {
490 poll_immediate_wake();
492 ovs_rwlock_unlock(&rwlock);
494 /* We don't wait for bond->next_rebalance because rebalancing can only run
495 * at a flow account checkpoint. ofproto does checkpointing on its own
496 * schedule and bond_rebalance() gets called afterward, so we'd just be
497 * waking up for no purpose. */
500 /* MAC learning table interaction. */
503 may_send_learning_packets(const struct bond *bond)
505 return ((bond->lacp_status == LACP_DISABLED
506 && (bond->balance == BM_SLB || bond->balance == BM_AB))
507 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
508 && bond->active_slave;
511 /* Returns true if 'bond' needs the client to send out packets to assist with
512 * MAC learning on 'bond'. If this function returns true, then the client
513 * should iterate through its MAC learning table for the bridge on which 'bond'
514 * is located. For each MAC that has been learned on a port other than 'bond',
515 * it should call bond_compose_learning_packet().
517 * This function will only return true if 'bond' is in SLB or active-backup
518 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
521 * Calling this function resets the state that it checks. */
523 bond_should_send_learning_packets(struct bond *bond)
527 ovs_rwlock_wrlock(&rwlock);
/* Latch the answer first: the request flag is cleared unconditionally on the
 * next line, so a later call yields false until the flag is set again. */
528 send = bond->send_learning_packets && may_send_learning_packets(bond);
529 bond->send_learning_packets = false;
530 ovs_rwlock_unlock(&rwlock);
534 /* Composes a gratuitous learning packet for 'bond' from 'eth_src' on 'vlan'.
536 * See bond_should_send_learning_packets() for description of usage. The
537 * caller should send the composed packet on the port associated with
538 * port_aux and takes ownership of the returned ofpbuf. */
540 bond_compose_learning_packet(struct bond *bond,
541 const uint8_t eth_src[ETH_ADDR_LEN],
542 uint16_t vlan, void **port_aux)
544 struct bond_slave *slave;
545 struct ofpbuf *packet;
548 ovs_rwlock_rdlock(&rwlock);
549 ovs_assert(may_send_learning_packets(bond));
550 memset(&flow, 0, sizeof flow);
551 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
552 slave = choose_output_slave(bond, &flow, NULL, vlan);
554 packet = ofpbuf_new(0);
555 compose_rarp(packet, eth_src);
557 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
560 *port_aux = slave->aux;
561 ovs_rwlock_unlock(&rwlock);
565 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
566 * Ethernet destination address of 'eth_dst', should be admitted.
568 * The return value is one of the following:
570 * - BV_ACCEPT: Admit the packet.
572 * - BV_DROP: Drop the packet.
574 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
575 * Ethernet source address and VLAN. If there is none, or if the packet
576 * is on the learned port, then admit the packet. If a different port has
577 * been learned, however, drop the packet (and do not use it for MAC
581 bond_check_admissibility(struct bond *bond, const void *slave_,
582 const uint8_t eth_dst[ETH_ADDR_LEN])
584 enum bond_verdict verdict = BV_DROP;
585 struct bond_slave *slave;
587 ovs_rwlock_rdlock(&rwlock);
588 slave = bond_slave_lookup(bond, slave_);
593 /* LACP bonds have very loose admissibility restrictions because we can
594 * assume the remote switch is aware of the bond and will "do the right
595 * thing". However, as a precaution we drop packets on disabled slaves
596 * because no correctly implemented partner switch should be sending
599 * If LACP is configured, but LACP negotiations have been unsuccessful, we
600 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
601 switch (bond->lacp_status) {
602 case LACP_NEGOTIATED:
603 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
605 case LACP_CONFIGURED:
606 if (!bond->lacp_fallback_ab) {
613 /* Drop all multicast packets on inactive slaves. */
614 if (eth_addr_is_multicast(eth_dst)) {
615 if (bond->active_slave != slave) {
620 switch (bond->balance) {
622 /* TCP balanced bonds require successful LACP negotiations. Based on the
623 * above check, LACP is off or lacp_fallback_ab is true on this bond.
624 * If lacp_fallback_ab is true, fall through to the BM_AB case; otherwise
625 * drop all incoming traffic. */
626 if (!bond->lacp_fallback_ab) {
631 /* Drop all packets which arrive on backup slaves. This is similar to
632 * how Linux bonding handles active-backup bonds. */
633 if (bond->active_slave != slave) {
634 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
636 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
637 " slave (%s) destined for " ETH_ADDR_FMT,
638 slave->name, ETH_ADDR_ARGS(eth_dst));
645 /* Drop all packets for which we have learned a different input port,
646 * because we probably sent the packet on one slave and got it back on
647 * the other. Gratuitous ARP packets are an exception to this rule:
648 * the host has moved to another switch. The exception to the
649 * exception is if we locked the learning table to avoid reflections on
651 verdict = BV_DROP_IF_MOVED;
657 ovs_rwlock_unlock(&rwlock);
662 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
663 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
664 * NULL if the packet should be dropped because no slaves are enabled.
666 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
667 * should be a VID only (i.e. excluding the PCP bits). Second,
668 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
669 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
670 * packet belongs to (so for an access port it will be the access port's VLAN).
672 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
673 * significant in the selection. At some point earlier, 'wc' should
674 * have been initialized (e.g., by flow_wildcards_init_catchall()).
677 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
678 struct flow_wildcards *wc, uint16_t vlan)
680 struct bond_slave *slave;
683 ovs_rwlock_rdlock(&rwlock);
684 slave = choose_output_slave(bond, flow, wc, vlan);
685 aux = slave ? slave->aux : NULL;
686 ovs_rwlock_unlock(&rwlock);
694 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
696 return bond->rebalance_interval
697 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
700 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
702 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
705 ovs_rwlock_wrlock(&rwlock);
706 if (bond_is_balanced(bond)) {
707 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
709 ovs_rwlock_unlock(&rwlock);
712 static struct bond_slave *
713 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
715 return CONTAINER_OF(bal, struct bond_slave, bal_node);
719 log_bals(struct bond *bond, const struct list *bals)
721 if (VLOG_IS_DBG_ENABLED()) {
722 struct ds ds = DS_EMPTY_INITIALIZER;
723 const struct bond_slave *slave;
725 LIST_FOR_EACH (slave, bal_node, bals) {
727 ds_put_char(&ds, ',');
729 ds_put_format(&ds, " %s %"PRIu64"kB",
730 slave->name, slave->tx_bytes / 1024);
732 if (!slave->enabled) {
733 ds_put_cstr(&ds, " (disabled)");
735 if (!list_is_empty(&slave->entries)) {
736 struct bond_entry *e;
738 ds_put_cstr(&ds, " (");
739 LIST_FOR_EACH (e, list_node, &slave->entries) {
740 if (&e->list_node != list_front(&slave->entries)) {
741 ds_put_cstr(&ds, " + ");
743 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
744 e - bond->hash, e->tx_bytes / 1024);
746 ds_put_cstr(&ds, ")");
749 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
754 /* Shifts 'hash' from its current slave to 'to'. */
756 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
758 struct bond_slave *from = hash->slave;
/* 'hash' must currently be assigned to a slave: 'from' is dereferenced
 * unconditionally here. */
759 struct bond *bond = from->bond;
/* 'delta' is the byte count recorded on the bucket being moved. */
760 uint64_t delta = hash->tx_bytes;
762 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
763 "from %s to %s (now carrying %"PRIu64"kB and "
764 "%"PRIu64"kB load, respectively)",
765 bond->name, delta / 1024, hash - bond->hash,
766 from->name, to->name,
767 (from->tx_bytes - delta) / 1024,
768 (to->tx_bytes + delta) / 1024);
770 /* Shift load away from 'from' to 'to'. */
771 from->tx_bytes -= delta;
772 to->tx_bytes += delta;
774 /* Arrange for flows to be revalidated. */
776 bond->bond_revalidate = true;
779 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
780 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
781 * given that doing so must decrease the ratio of the load on the two slaves by
782 * at least 0.1. Returns NULL if there is no appropriate entry.
784 * The list of entries isn't sorted. I don't know of a reason to prefer to
785 * shift away small hashes or large hashes. */
786 static struct bond_entry *
787 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
789 struct bond_entry *e;
791 if (list_is_short(&from->entries)) {
792 /* 'from' carries no more than one MAC hash, so shifting load away from
793 * it would be pointless. */
797 LIST_FOR_EACH (e, list_node, &from->entries) {
798 double old_ratio, new_ratio;
801 if (to_tx_bytes == 0) {
802 /* Nothing on the new slave, move it. */
807 old_ratio = (double)from->tx_bytes / to_tx_bytes;
808 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
809 if (old_ratio - new_ratio > 0.1
810 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
811 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
812 and 'to' slave have the same load. Therefore, we only move an
813 entry if it decreases the load on 'from', and brings us closer
814 to equal traffic load. */
822 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
825 insert_bal(struct list *bals, struct bond_slave *slave)
827 struct bond_slave *pos;
829 LIST_FOR_EACH (pos, bal_node, bals) {
830 if (slave->tx_bytes > pos->tx_bytes) {
834 list_insert(&pos->bal_node, &slave->bal_node);
837 /* Removes 'slave' from its current list and then inserts it into 'bals' so
838 * that descending order of 'tx_bytes' is maintained. */
840 reinsert_bal(struct list *bals, struct bond_slave *slave)
842 list_remove(&slave->bal_node);
843 insert_bal(bals, slave);
846 /* If 'bond' needs rebalancing, does so.
848 * The caller should have called bond_account() for each active flow, to ensure
849 * that flow data is consistently accounted at this point. */
851 bond_rebalance(struct bond *bond)
853 struct bond_slave *slave;
854 struct bond_entry *e;
857 ovs_rwlock_wrlock(&rwlock);
858 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
859 ovs_rwlock_unlock(&rwlock);
862 bond->next_rebalance = time_msec() + bond->rebalance_interval;
864 /* Add each bond_entry to its slave's 'entries' list.
865 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
866 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
868 list_init(&slave->entries);
870 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
871 if (e->slave && e->tx_bytes) {
872 e->slave->tx_bytes += e->tx_bytes;
873 list_push_back(&e->slave->entries, &e->list_node);
877 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
879 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
880 * with a proper list sort algorithm. */
882 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
883 if (slave->enabled) {
884 insert_bal(&bals, slave);
887 log_bals(bond, &bals);
889 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
890 while (!list_is_short(&bals)) {
891 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
892 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
895 overload = from->tx_bytes - to->tx_bytes;
896 if (overload < to->tx_bytes >> 5 || overload < 100000) {
897 /* The extra load on 'from' (and all less-loaded slaves), compared
898 * to that of 'to' (the least-loaded slave), is less than ~3%, or
899 * it is less than ~1Mbps. No point in rebalancing. */
903 /* 'from' is carrying significantly more load than 'to'. Pick a hash
904 * to move from 'from' to 'to'. */
905 e = choose_entry_to_migrate(from, to->tx_bytes);
907 bond_shift_load(e, to);
909 /* Delete element from from->entries.
911 * We don't add the element to to->entries. That would only allow
912 * 'e' to be migrated to another slave in this rebalancing run, and
913 * there is no point in doing that. */
914 list_remove(&e->list_node);
916 /* Re-sort 'bals'. */
917 reinsert_bal(&bals, from);
918 reinsert_bal(&bals, to);
920 /* Can't usefully migrate anything away from 'from'.
921 * Don't reconsider it. */
922 list_remove(&from->bal_node);
926 /* Implement exponentially weighted moving average. A weight of 1/2 causes
927 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
928 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
929 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
935 ovs_rwlock_unlock(&rwlock);
938 /* Bonding unixctl user interface functions. */
941 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
945 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
947 if (!strcmp(bond->name, name)) {
954 static struct bond_slave *
955 bond_lookup_slave(struct bond *bond, const char *slave_name)
957 struct bond_slave *slave;
959 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
960 if (!strcmp(slave->name, slave_name)) {
968 bond_unixctl_list(struct unixctl_conn *conn,
969 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
970 void *aux OVS_UNUSED)
972 struct ds ds = DS_EMPTY_INITIALIZER;
973 const struct bond *bond;
975 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
977 ovs_rwlock_rdlock(&rwlock);
978 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
979 const struct bond_slave *slave;
982 ds_put_format(&ds, "%s\t%s\t",
983 bond->name, bond_mode_to_string(bond->balance));
986 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
988 ds_put_cstr(&ds, ", ");
990 ds_put_cstr(&ds, slave->name);
992 ds_put_char(&ds, '\n');
994 ovs_rwlock_unlock(&rwlock);
995 unixctl_command_reply(conn, ds_cstr(&ds));
1000 bond_print_details(struct ds *ds, const struct bond *bond)
1001 OVS_REQ_RDLOCK(rwlock)
1003 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1004 const struct shash_node **sorted_slaves = NULL;
1005 const struct bond_slave *slave;
1008 ds_put_format(ds, "---- %s ----\n", bond->name);
1009 ds_put_format(ds, "bond_mode: %s\n",
1010 bond_mode_to_string(bond->balance));
1012 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1014 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1015 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1017 if (bond_is_balanced(bond)) {
1018 ds_put_format(ds, "next rebalance: %lld ms\n",
1019 bond->next_rebalance - time_msec());
1022 ds_put_cstr(ds, "lacp_status: ");
1023 switch (bond->lacp_status) {
1024 case LACP_NEGOTIATED:
1025 ds_put_cstr(ds, "negotiated\n");
1027 case LACP_CONFIGURED:
1028 ds_put_cstr(ds, "configured\n");
1031 ds_put_cstr(ds, "off\n");
1034 ds_put_cstr(ds, "<unknown>\n");
1038 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1039 shash_add(&slave_shash, slave->name, slave);
1041 sorted_slaves = shash_sort(&slave_shash);
1043 for (i = 0; i < shash_count(&slave_shash); i++) {
1044 struct bond_entry *be;
1046 slave = sorted_slaves[i]->data;
1049 ds_put_format(ds, "\nslave %s: %s\n",
1050 slave->name, slave->enabled ? "enabled" : "disabled");
1051 if (slave == bond->active_slave) {
1052 ds_put_cstr(ds, "\tactive slave\n");
1054 if (slave->delay_expires != LLONG_MAX) {
1055 ds_put_format(ds, "\t%s expires in %lld ms\n",
1056 slave->enabled ? "downdelay" : "updelay",
1057 slave->delay_expires - time_msec());
1060 ds_put_format(ds, "\tmay_enable: %s\n",
1061 slave->may_enable ? "true" : "false");
1063 if (!bond_is_balanced(bond)) {
1068 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1069 int hash = be - bond->hash;
1071 if (be->slave != slave) {
1075 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1076 hash, be->tx_bytes / 1024);
1078 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1081 shash_destroy(&slave_shash);
1082 free(sorted_slaves);
1083 ds_put_cstr(ds, "\n");
1087 bond_unixctl_show(struct unixctl_conn *conn,
1088 int argc, const char *argv[],
1089 void *aux OVS_UNUSED)
1091 struct ds ds = DS_EMPTY_INITIALIZER;
1093 ovs_rwlock_rdlock(&rwlock);
1095 const struct bond *bond = bond_find(argv[1]);
1098 unixctl_command_reply_error(conn, "no such bond");
1101 bond_print_details(&ds, bond);
1103 const struct bond *bond;
1105 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1106 bond_print_details(&ds, bond);
1110 unixctl_command_reply(conn, ds_cstr(&ds));
1114 ovs_rwlock_unlock(&rwlock);
1118 bond_unixctl_migrate(struct unixctl_conn *conn,
1119 int argc OVS_UNUSED, const char *argv[],
1120 void *aux OVS_UNUSED)
1122 const char *bond_s = argv[1];
1123 const char *hash_s = argv[2];
1124 const char *slave_s = argv[3];
1126 struct bond_slave *slave;
1127 struct bond_entry *entry;
1130 ovs_rwlock_wrlock(&rwlock);
1131 bond = bond_find(bond_s);
1133 unixctl_command_reply_error(conn, "no such bond");
1137 if (bond->balance != BM_SLB) {
1138 unixctl_command_reply_error(conn, "not an SLB bond");
1142 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
/* 'hash_s' consists only of decimal digits here; masking with BOND_MASK
 * reduces the parsed value to a valid bucket index.
 * NOTE(review): an empty 'hash_s' also passes the check above and parses
 * as bucket 0, and atoi() does not report overflow for very long digit
 * strings -- confirm whether either case should be rejected. */
1143 hash = atoi(hash_s) & BOND_MASK;
1145 unixctl_command_reply_error(conn, "bad hash");
1149 slave = bond_lookup_slave(bond, slave_s);
1151 unixctl_command_reply_error(conn, "no such slave");
1155 if (!slave->enabled) {
1156 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1160 entry = &bond->hash[hash];
/* Reassign the bucket to 'slave' and flag the bond for revalidation. */
1161 bond->bond_revalidate = true;
1162 entry->slave = slave;
1163 unixctl_command_reply(conn, "migrated");
1166 ovs_rwlock_unlock(&rwlock);
1170 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1171 int argc OVS_UNUSED, const char *argv[],
1172 void *aux OVS_UNUSED)
1174 const char *bond_s = argv[1];
1175 const char *slave_s = argv[2];
1177 struct bond_slave *slave;
1179 ovs_rwlock_wrlock(&rwlock);
1180 bond = bond_find(bond_s);
1182 unixctl_command_reply_error(conn, "no such bond");
1186 slave = bond_lookup_slave(bond, slave_s);
1188 unixctl_command_reply_error(conn, "no such slave");
1192 if (!slave->enabled) {
1193 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1197 if (bond->active_slave != slave) {
1198 bond->bond_revalidate = true;
1199 bond->active_slave = slave;
1200 VLOG_INFO("bond %s: active interface is now %s",
1201 bond->name, slave->name);
1202 bond->send_learning_packets = true;
1203 unixctl_command_reply(conn, "done");
1205 unixctl_command_reply(conn, "no change");
1208 ovs_rwlock_unlock(&rwlock);
1212 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1214 const char *bond_s = argv[1];
1215 const char *slave_s = argv[2];
1217 struct bond_slave *slave;
1219 ovs_rwlock_wrlock(&rwlock);
1220 bond = bond_find(bond_s);
1222 unixctl_command_reply_error(conn, "no such bond");
1226 slave = bond_lookup_slave(bond, slave_s);
1228 unixctl_command_reply_error(conn, "no such slave");
1232 bond_enable_slave(slave, enable);
1233 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1236 ovs_rwlock_unlock(&rwlock);
1240 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1241 int argc OVS_UNUSED, const char *argv[],
1242 void *aux OVS_UNUSED)
1244 enable_slave(conn, argv, true);
1248 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1249 int argc OVS_UNUSED, const char *argv[],
1250 void *aux OVS_UNUSED)
1252 enable_slave(conn, argv, false);
1256 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1257 void *aux OVS_UNUSED)
1259 const char *mac_s = argv[1];
1260 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1261 const char *basis_s = argc > 3 ? argv[3] : NULL;
1262 uint8_t mac[ETH_ADDR_LEN];
1269 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1270 unixctl_command_reply_error(conn, "invalid vlan");
1278 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1279 unixctl_command_reply_error(conn, "invalid basis");
1286 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1287 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1289 hash_cstr = xasprintf("%u", hash);
1290 unixctl_command_reply(conn, hash_cstr);
1293 unixctl_command_reply_error(conn, "invalid mac");
1300 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1301 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1303 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1304 bond_unixctl_migrate, NULL);
1305 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1306 bond_unixctl_set_active_slave, NULL);
1307 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1308 bond_unixctl_enable_slave, NULL);
1309 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1310 bond_unixctl_disable_slave, NULL);
1311 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1312 bond_unixctl_hash, NULL);
1316 bond_entry_reset(struct bond *bond)
1318 if (bond->balance != BM_AB) {
1319 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1322 bond->hash = xmalloc(hash_len);
1324 memset(bond->hash, 0, hash_len);
1326 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1333 static struct bond_slave *
1334 bond_slave_lookup(struct bond *bond, const void *slave_)
1336 struct bond_slave *slave;
1338 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1340 if (slave->aux == slave_) {
1349 bond_enable_slave(struct bond_slave *slave, bool enable)
1351 slave->delay_expires = LLONG_MAX;
1352 if (enable != slave->enabled) {
1353 slave->bond->bond_revalidate = true;
1354 slave->enabled = enable;
1356 ovs_mutex_lock(&slave->bond->mutex);
1358 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1360 list_remove(&slave->list_node);
1362 ovs_mutex_unlock(&slave->bond->mutex);
1364 VLOG_INFO("interface %s: %s", slave->name,
1365 slave->enabled ? "enabled" : "disabled");
1370 bond_link_status_update(struct bond_slave *slave)
1372 struct bond *bond = slave->bond;
1375 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1376 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1377 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1378 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1379 slave->name, up ? "up" : "down");
1380 if (up == slave->enabled) {
1381 slave->delay_expires = LLONG_MAX;
1382 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1383 slave->name, up ? "disabled" : "enabled");
1385 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1386 : up ? bond->updelay : bond->downdelay);
1387 slave->delay_expires = time_msec() + delay;
1389 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1392 up ? "enabled" : "disabled",
1399 if (time_msec() >= slave->delay_expires) {
1400 bond_enable_slave(slave, up);
1405 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1407 return hash_mac(mac, vlan, basis);
1411 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1413 struct flow hash_flow = *flow;
1414 hash_flow.vlan_tci = htons(vlan);
1416 /* The symmetric quality of this hash function is not required, but
1417 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1418 * purposes, so we use it out of convenience. */
1419 return flow_hash_symmetric_l4(&hash_flow, basis);
1423 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1425 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1427 return (bond->balance == BM_TCP
1428 ? bond_hash_tcp(flow, vlan, bond->basis)
1429 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1432 static struct bond_entry *
1433 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1436 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1439 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1440 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1442 static struct bond_slave *
1443 get_enabled_slave(struct bond *bond)
1447 ovs_mutex_lock(&bond->mutex);
1448 if (list_is_empty(&bond->enabled_slaves)) {
1449 ovs_mutex_unlock(&bond->mutex);
1453 node = list_pop_front(&bond->enabled_slaves);
1454 list_push_back(&bond->enabled_slaves, node);
1455 ovs_mutex_unlock(&bond->mutex);
1457 return CONTAINER_OF(node, struct bond_slave, list_node);
1460 static struct bond_slave *
1461 choose_output_slave(const struct bond *bond, const struct flow *flow,
1462 struct flow_wildcards *wc, uint16_t vlan)
1464 struct bond_entry *e;
1467 balance = bond->balance;
1468 if (bond->lacp_status == LACP_CONFIGURED) {
1469 /* LACP has been configured on this bond but negotiations were
1470 * unsuccussful. If lacp_fallback_ab is enabled use active-
1471 * backup mode else drop all traffic. */
1472 if (!bond->lacp_fallback_ab) {
1480 return bond->active_slave;
1483 if (bond->lacp_status != LACP_NEGOTIATED) {
1484 /* Must have LACP negotiations for TCP balanced bonds. */
1488 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1493 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1495 e = lookup_bond_entry(bond, flow, vlan);
1496 if (!e->slave || !e->slave->enabled) {
1497 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1506 static struct bond_slave *
1507 bond_choose_slave(const struct bond *bond)
1509 struct bond_slave *slave, *best;
1511 /* Find an enabled slave. */
1512 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1513 if (slave->enabled) {
1518 /* All interfaces are disabled. Find an interface that will be enabled
1519 * after its updelay expires. */
1521 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1522 if (slave->delay_expires != LLONG_MAX
1523 && slave->may_enable
1524 && (!best || slave->delay_expires < best->delay_expires)) {
1532 bond_choose_active_slave(struct bond *bond)
1534 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1535 struct bond_slave *old_active_slave = bond->active_slave;
1537 bond->active_slave = bond_choose_slave(bond);
1538 if (bond->active_slave) {
1539 if (bond->active_slave->enabled) {
1540 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1541 bond->name, bond->active_slave->name);
1543 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1544 "remaining %lld ms updelay (since no interface was "
1545 "enabled)", bond->name, bond->active_slave->name,
1546 bond->active_slave->delay_expires - time_msec());
1547 bond_enable_slave(bond->active_slave, true);
1550 bond->send_learning_packets = true;
1551 } else if (old_active_slave) {
1552 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1556 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1557 * bond interface. */
1559 bond_update_fake_slave_stats(struct bond *bond)
1561 struct netdev_stats bond_stats;
1562 struct bond_slave *slave;
1563 struct netdev *bond_dev;
1565 memset(&bond_stats, 0, sizeof bond_stats);
1567 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1568 struct netdev_stats slave_stats;
1570 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1571 /* XXX: We swap the stats here because they are swapped back when
1572 * reported by the internal device. The reason for this is
1573 * internal devices normally represent packets going into the
1574 * system but when used as fake bond device they represent packets
1575 * leaving the system. We really should do this in the internal
1576 * device itself because changing it here reverses the counts from
1577 * the perspective of the switch. However, the internal device
1578 * doesn't know what type of device it represents so we have to do
1579 * it here for now. */
1580 bond_stats.tx_packets += slave_stats.rx_packets;
1581 bond_stats.tx_bytes += slave_stats.rx_bytes;
1582 bond_stats.rx_packets += slave_stats.tx_packets;
1583 bond_stats.rx_bytes += slave_stats.tx_bytes;
1587 if (!netdev_open(bond->name, "system", &bond_dev)) {
1588 netdev_set_stats(bond_dev, &bond_stats);
1589 netdev_close(bond_dev);