2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "connectivity.h"
28 #include "dynamic-string.h"
37 #include "poll-loop.h"
/* NOTE(review): presumably declares the logging module named "bond" that the
 * VLOG_* calls in this file log to -- confirm against the vlog header. */
44 VLOG_DEFINE_THIS_MODULE(bond);
46 /* Bit-mask for hashing a flow down to a bucket.
47 * There are (BOND_MASK + 1) buckets.
 *
 * With the value 0xff that is 256 buckets.  Hash values are reduced to a
 * bucket index with "& BOND_MASK". */
48 #define BOND_MASK 0xff
50 /* A hash bucket for mapping a flow to a slave.
51 * "struct bond" has an array of (BOND_MASK + 1) of these.
 *
 * bond_account() adds transmitted bytes to an entry's 'tx_bytes', and
 * bond_rebalance() links entries that have both a slave and a nonzero
 * 'tx_bytes' onto that slave's 'entries' list through 'list_node'. */
53 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
54 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond.
 *
 * A slave is found by its client handle 'aux' (see bond_slave_lookup()) or,
 * for the unixctl commands, by 'name' (see bond_lookup_slave()). */
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
64 struct netdev *netdev; /* Network device, owned by the client. */
/* 'change_seq' is reset to 0 when 'netdev' is replaced and is refreshed from
 * the connectivity sequence number in bond_run(). */
65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
/* LLONG_MAX in 'delay_expires' means that no up/down delay is pending. */
69 long long delay_expires; /* Time after which 'enabled' may change. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
73 /* Rebalancing info. Used only by bond_rebalance(). */
74 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
75 struct list entries; /* 'struct bond_entry's assigned here. */
76 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
79 /* A bond, that is, a set of network devices grouped to improve performance or
82 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
83 char *name; /* Name provided by client. */
89 enum bond_mode balance; /* Balancing mode, one of BM_*. */
90 struct bond_slave *active_slave; /* Active slave, or NULL if there is none. */
91 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
92 enum lacp_status lacp_status; /* Status of LACP negotiations. */
93 bool bond_revalidate; /* True if flows need revalidation. */
94 uint32_t basis; /* Basis for flow hash function. */
96 /* SLB specific bonding info. */
97 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
98 int rebalance_interval; /* Interval between rebalances, in ms. */
99 long long int next_rebalance; /* Next rebalancing time. */
/* Set when the active slave is replaced (see bond_slave_unregister() and
 * bond_unixctl_set_active_slave()); read and cleared by
 * bond_should_send_learning_packets(). */
100 bool send_learning_packets;
102 /* Legacy compatibility. */
103 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
104 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
/* Lock taken, for reading or for writing, by the functions in this file that
 * examine or modify bond state. */
109 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
110 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
/* All bonds, inserted under hash_string() of the bond's name (see
 * bond_reconfigure() and bond_find()).  Declared as guarded by 'rwlock'. */
111 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
/* Forward declarations of helpers defined later in this file.
 *
 * NOTE(review): the OVS_REQ_RDLOCK/OVS_REQ_WRLOCK annotations presumably tell
 * the compiler's thread-safety analysis that the caller must already hold
 * 'rwlock' in the named mode -- confirm against the project's compiler
 * header. */
113 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
114 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
115 OVS_REQ_RDLOCK(rwlock);
116 static void bond_enable_slave(struct bond_slave *, bool enable)
117 OVS_REQ_WRLOCK(rwlock);
118 static void bond_link_status_update(struct bond_slave *)
119 OVS_REQ_WRLOCK(rwlock);
/* Forward declaration.  Called in this file when the bond has no active slave
 * or its active slave is disabled (see bond_run()), and after the active slave
 * is unregistered.  Annotated as requiring 'rwlock' held for writing.
 *
 * The declaration ends with a single ';': a second one forms an empty
 * file-scope declaration, which ISO C does not permit. */
120 static void bond_choose_active_slave(struct bond *)
121 OVS_REQ_WRLOCK(rwlock);
/* Forward declarations of the hashing and slave-selection helpers.
 * bond_hash_src() is what the "bond/hash" unixctl command evaluates.
 *
 * NOTE(review): several of these declarations are only partly visible in this
 * view; check the full parameter lists against the definitions. */
122 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
123 uint16_t vlan, uint32_t basis);
124 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
126 static struct bond_entry *lookup_bond_entry(const struct bond *,
129 OVS_REQ_RDLOCK(rwlock);
130 static struct bond_slave *choose_output_slave(const struct bond *,
132 struct flow_wildcards *,
134 OVS_REQ_RDLOCK(rwlock);
135 static void bond_update_fake_slave_stats(struct bond *)
136 OVS_REQ_RDLOCK(rwlock);
138 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
139 * stores the mode in '*balance' and returns true. Otherwise returns false
140 * without modifying '*balance'.
 *
 * 's' is matched exactly, with strcmp(), against the names that
 * bond_mode_to_string() produces for BM_TCP, BM_SLB, and BM_AB, in that
 * order, so the accepted spellings come from that single function. */
142 bond_mode_from_string(enum bond_mode *balance, const char *s)
144 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
146 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
148 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
156 /* Returns a string representing 'balance'.
 *
 * The visible results are "balance-tcp", "balance-slb", and "active-backup".
 * They are string literals, so the caller must not modify or free them.
 * bond_mode_from_string() parses the same names. */
158 bond_mode_to_string(enum bond_mode balance) {
161 return "balance-tcp";
163 return "balance-slb";
165 return "active-backup";
171 /* Creates and returns a new bond whose configuration is initially taken from
 * 's' (it is applied with bond_reconfigure()).
 *
174 * The caller should register each slave on the new bond by calling
175 * bond_slave_register().
 *
 * The bond is allocated zeroed, with an empty slave map, fake-interface stats
 * updates disabled (LLONG_MAX), and a reference count of 1. */
177 bond_create(const struct bond_settings *s)
181 bond = xzalloc(sizeof *bond);
182 hmap_init(&bond->slaves);
183 bond->next_fake_iface_update = LLONG_MAX;
184 atomic_init(&bond->ref_cnt, 1);
186 bond_reconfigure(bond, s);
/* Takes a new reference to 'bond_' by atomically adding 1 to its 'ref_cnt'.
 * The 'const' qualifier is cast away only so that the counter can be updated.
 * Asserts that the previous count ('orig') was positive. */
191 bond_ref(const struct bond *bond_)
193 struct bond *bond = CONST_CAST(struct bond *, bond_);
197 atomic_add(&bond->ref_cnt, 1, &orig);
198 ovs_assert(orig > 0);
/* Drops a reference to 'bond' by atomically subtracting 1 from 'ref_cnt',
 * asserting that the previous count was positive.
 *
 * The teardown code removes the bond from 'all_bonds' under the write lock,
 * removes every slave from 'bond->slaves' (the slaves' netdevs belong to the
 * client and are not closed here), destroys the slave map, and destroys the
 * counter.
 *
 * NOTE(review): presumably the teardown runs only when the last reference is
 * dropped; the guarding condition is not visible here -- confirm. */
205 bond_unref(struct bond *bond)
207 struct bond_slave *slave, *next_slave;
214 atomic_sub(&bond->ref_cnt, 1, &orig);
215 ovs_assert(orig > 0);
220 ovs_rwlock_wrlock(&rwlock);
221 hmap_remove(all_bonds, &bond->hmap_node);
222 ovs_rwlock_unlock(&rwlock);
/* The _SAFE iterator is used because each slave is removed while iterating. */
224 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
225 hmap_remove(&bond->slaves, &slave->hmap_node);
226 /* Client owns 'slave->netdev'. */
230 hmap_destroy(&bond->slaves);
234 atomic_destroy(&bond->ref_cnt);
238 /* Updates 'bond''s overall configuration to 's'.
240 * The caller should register each slave on 'bond' by calling
241 * bond_slave_register(). This is optional if none of the slaves'
242 * configuration has changed. In any case it can't hurt.
244 * Returns true if the configuration has changed in such a way that requires
/* Applies the settings in 's' to 'bond' with 'rwlock' held for writing.
 *
 * A bond whose name is unset or differs from 's->name' is (re)inserted into
 * 'all_bonds' under the hash of the new name.  The up/down delays are copied
 * unconditionally; the LACP fallback flag, rebalance interval, balancing mode,
 * and hash basis are copied only when they differ from the current values. */
248 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
250 bool revalidate = false;
252 ovs_rwlock_wrlock(&rwlock);
253 if (!bond->name || strcmp(bond->name, s->name)) {
255 hmap_remove(all_bonds, &bond->hmap_node);
258 bond->name = xstrdup(s->name);
259 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
262 bond->updelay = s->up_delay;
263 bond->downdelay = s->down_delay;
265 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
266 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
270 if (bond->rebalance_interval != s->rebalance_interval) {
271 bond->rebalance_interval = s->rebalance_interval;
275 if (bond->balance != s->balance) {
276 bond->balance = s->balance;
280 if (bond->basis != s->basis) {
281 bond->basis = s->basis;
/* LLONG_MAX in 'next_fake_iface_update' means that fake-interface stats
 * updates are disabled; an update is scheduled for "now" only if none was
 * scheduled before. */
286 if (bond->next_fake_iface_update == LLONG_MAX) {
287 bond->next_fake_iface_update = time_msec();
290 bond->next_fake_iface_update = LLONG_MAX;
/* A revalidation request left pending on the bond is consumed here. */
293 if (bond->bond_revalidate) {
295 bond->bond_revalidate = false;
/* Reset the hash table when the bond is in active-backup mode, when the table
 * has not been allocated yet, or when 'revalidate' is set. */
298 if (bond->balance == BM_AB || !bond->hash || revalidate) {
299 bond_entry_reset(bond);
302 ovs_rwlock_unlock(&rwlock);
/* Points 'slave' at 'netdev' if it is not already using it, and resets the
 * slave's 'change_seq' to 0 when it does.  Annotated as requiring 'rwlock'
 * held for writing. */
307 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
308 OVS_REQ_WRLOCK(rwlock)
310 if (slave->netdev != netdev) {
311 slave->netdev = netdev;
312 slave->change_seq = 0;
316 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
317 * arbitrary client-provided pointer that uniquely identifies a slave within a
318 * bond. If 'slave_' already exists within 'bond' then this function
319 * reconfigures the existing slave.
321 * 'netdev' must be the network device that 'slave_' represents. It is owned
322 * by the client, so the client must not close it before either unregistering
323 * 'slave_' or destroying 'bond'.
/* Looks up 'slave_' with 'rwlock' held for writing.  A newly allocated slave
 * is inserted into 'bond->slaves' under hash_pointer(slave_, 0), has no
 * pending delay (LLONG_MAX), starts disabled, and is then enabled or not
 * according to the current carrier state of 'netdev'.  Adding a slave flags
 * the bond for revalidation.  In every case the slave's netdev and name are
 * refreshed from 'netdev'.
 *
 * NOTE(review): 'slave->name' is assigned from xstrdup() at two visible
 * places; confirm that the first copy is freed before the second assignment
 * so that a new slave does not leak its first name. */
326 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
328 struct bond_slave *slave;
330 ovs_rwlock_wrlock(&rwlock);
331 slave = bond_slave_lookup(bond, slave_);
333 slave = xzalloc(sizeof *slave);
335 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
338 slave->delay_expires = LLONG_MAX;
339 slave->name = xstrdup(netdev_get_name(netdev));
340 bond->bond_revalidate = true;
342 slave->enabled = false;
343 bond_enable_slave(slave, netdev_get_carrier(netdev));
346 bond_slave_set_netdev__(slave, netdev);
349 slave->name = xstrdup(netdev_get_name(netdev));
350 ovs_rwlock_unlock(&rwlock);
353 /* Updates the network device to be used with 'slave_' to 'netdev'.
355 * This is useful if the caller closes and re-opens the network device
356 * registered with bond_slave_register() but doesn't need to change anything
/* Locked wrapper: looks up 'slave_' on 'bond' with 'rwlock' held for writing
 * and hands the slave to bond_slave_set_netdev__().
 *
 * NOTE(review): presumably the update is skipped when the lookup finds no
 * slave; the check is not visible here -- confirm. */
359 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
361 struct bond_slave *slave;
363 ovs_rwlock_wrlock(&rwlock);
364 slave = bond_slave_lookup(bond, slave_);
366 bond_slave_set_netdev__(slave, netdev);
368 ovs_rwlock_unlock(&rwlock);
371 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
372 * then this function has no effect.
 *
374 * Unregistering a slave invalidates all flows.
 *
 * With 'rwlock' held for writing, the slave is disabled, every hash entry is
 * checked for a reference to it, and it is removed from 'bond->slaves'.
 * 'del_active' records whether the removed slave was the active one.
 *
 * NOTE(review): presumably 'del_active' is what triggers choosing a new
 * active slave and requesting learning packets at the end; the condition is
 * not visible here -- confirm. */
376 bond_slave_unregister(struct bond *bond, const void *slave_)
378 struct bond_slave *slave;
381 ovs_rwlock_wrlock(&rwlock);
382 slave = bond_slave_lookup(bond, slave_);
387 bond->bond_revalidate = true;
388 bond_enable_slave(slave, false);
390 del_active = bond->active_slave == slave;
392 struct bond_entry *e;
/* Visit all (BOND_MASK + 1) hash entries, looking for ones assigned to the
 * slave that is going away. */
393 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
394 if (e->slave == slave) {
402 hmap_remove(&bond->slaves, &slave->hmap_node);
403 /* Client owns 'slave->netdev'. */
407 bond_choose_active_slave(bond);
408 bond->send_learning_packets = true;
411 ovs_rwlock_unlock(&rwlock);
414 /* Should be called on each slave in 'bond' before bond_run() to indicate
415 * whether or not 'slave_' may be enabled. This function is intended to allow
416 * other protocols to have some impact on bonding decisions. For example, LACP
417 * or high-level link monitoring protocols may decide that a given slave should
418 * not be able to send traffic.
 *
 * The flag is stored in the slave's 'may_enable' member with 'rwlock' held
 * for writing.
 *
 * NOTE(review): the result of bond_slave_lookup() is dereferenced without a
 * check, so 'slave_' presumably must already be registered on 'bond' --
 * confirm that the lookup cannot fail for any caller. */
420 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
422 ovs_rwlock_wrlock(&rwlock);
423 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
424 ovs_rwlock_unlock(&rwlock);
427 /* Performs periodic maintenance on 'bond'.
 *
429 * Returns true if the caller should revalidate its flows.
 *
431 * The caller should check bond_should_send_learning_packets() afterward.
 *
 * With 'rwlock' held for writing, this records a changed 'lacp_status'
 * (flagging revalidation), refreshes every slave's link status and
 * 'change_seq', chooses a new active slave if there is none or the current
 * one is disabled, and updates the fake interface stats once
 * 'next_fake_iface_update' has been reached.  The pending 'bond_revalidate'
 * flag is copied into 'revalidate' and then cleared. */
433 bond_run(struct bond *bond, enum lacp_status lacp_status)
435 struct bond_slave *slave;
438 ovs_rwlock_wrlock(&rwlock);
439 if (bond->lacp_status != lacp_status) {
440 bond->lacp_status = lacp_status;
441 bond->bond_revalidate = true;
444 /* Enable slaves based on link status and LACP feedback. */
445 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
446 bond_link_status_update(slave);
447 slave->change_seq = seq_read(connectivity_seq_get());
449 if (!bond->active_slave || !bond->active_slave->enabled) {
450 bond_choose_active_slave(bond);
453 /* Update fake bond interface stats. */
454 if (time_msec() >= bond->next_fake_iface_update) {
455 bond_update_fake_slave_stats(bond);
/* Schedule the next stats update 1000 ms from now. */
456 bond->next_fake_iface_update = time_msec() + 1000;
459 revalidate = bond->bond_revalidate;
460 bond->bond_revalidate = false;
461 ovs_rwlock_unlock(&rwlock);
466 /* Causes poll_block() to wake up when 'bond' needs something to be done.
 *
 * With 'rwlock' held for reading, this registers a timer for each slave that
 * has a pending up/down delay, waits on the connectivity sequence number
 * against each slave's 'change_seq', registers a timer for the next
 * fake-interface stats update if one is scheduled, and requests an immediate
 * wakeup if a revalidation is pending. */
468 bond_wait(struct bond *bond)
470 struct bond_slave *slave;
472 ovs_rwlock_rdlock(&rwlock);
473 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* LLONG_MAX means that no delay is pending for this slave. */
474 if (slave->delay_expires != LLONG_MAX) {
475 poll_timer_wait_until(slave->delay_expires);
478 seq_wait(connectivity_seq_get(), slave->change_seq);
481 if (bond->next_fake_iface_update != LLONG_MAX) {
482 poll_timer_wait_until(bond->next_fake_iface_update);
485 if (bond->bond_revalidate) {
486 poll_immediate_wake();
488 ovs_rwlock_unlock(&rwlock);
490 /* We don't wait for bond->next_rebalance because rebalancing can only run
491 * at a flow account checkpoint. ofproto does checkpointing on its own
492 * schedule and bond_rebalance() gets called afterward, so we'd just be
493 * waking up for no purpose. */
496 /* MAC learning table interaction. */
/* Evaluates whether learning packets may be sent on 'bond'.  The result is
 * true only if the bond has an active slave and either:
 *
 *   - LACP is disabled and the bond is in SLB or active-backup mode, or
 *
 *   - 'lacp_fallback_ab' is set and the LACP status is LACP_CONFIGURED.
 *
 * The caller is expected to hold 'rwlock'; both visible callers do. */
499 may_send_learning_packets(const struct bond *bond)
501 return ((bond->lacp_status == LACP_DISABLED
502 && (bond->balance == BM_SLB || bond->balance == BM_AB))
503 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
504 && bond->active_slave;
507 /* Returns true if 'bond' needs the client to send out packets to assist with
508 * MAC learning on 'bond'. If this function returns true, then the client
509 * should iterate through its MAC learning table for the bridge on which 'bond'
510 * is located. For each MAC that has been learned on a port other than 'bond',
511 * it should call bond_compose_learning_packet().
 *
513 * This function will only return true if 'bond' is in SLB or active-backup
514 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
 * needed.  More precisely, the result is the bond's 'send_learning_packets'
 * flag ANDed with may_send_learning_packets(), which also accepts a bond
 * whose LACP status is "configured" when 'lacp_fallback_ab' is set.
 *
517 * Calling this function resets the state that it checks: the
 * 'send_learning_packets' flag is cleared on every call, with 'rwlock' held
 * for writing. */
519 bond_should_send_learning_packets(struct bond *bond)
523 ovs_rwlock_wrlock(&rwlock);
524 send = bond->send_learning_packets && may_send_learning_packets(bond);
525 bond->send_learning_packets = false;
526 ovs_rwlock_unlock(&rwlock);
530 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
 *
532 * See bond_should_send_learning_packets() for description of usage. The
533 * caller should send the composed packet on the port associated with
534 * '*port_aux' and takes ownership of the returned ofpbuf.
 *
 * With 'rwlock' held for reading, this asserts may_send_learning_packets(),
 * builds a zeroed flow whose 'dl_src' is 'eth_src', asks
 * choose_output_slave() for the slave that such a flow would use, composes a
 * RARP packet from 'eth_src' into a new ofpbuf, and stores the chosen slave's
 * client handle in '*port_aux'.
 *
 * NOTE(review): presumably the VLAN tag is pushed only when 'vlan' is
 * nonzero; the condition is not visible here.  Also, 'slave' is dereferenced
 * without a visible NULL check although bond_choose_output_slave() treats a
 * NULL result from choose_output_slave() as possible -- confirm. */
536 bond_compose_learning_packet(struct bond *bond,
537 const uint8_t eth_src[ETH_ADDR_LEN],
538 uint16_t vlan, void **port_aux)
540 struct bond_slave *slave;
541 struct ofpbuf *packet;
544 ovs_rwlock_rdlock(&rwlock);
545 ovs_assert(may_send_learning_packets(bond));
546 memset(&flow, 0, sizeof flow);
547 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
548 slave = choose_output_slave(bond, &flow, NULL, vlan);
550 packet = ofpbuf_new(0);
551 compose_rarp(packet, eth_src);
553 eth_push_vlan(packet, htons(vlan));
556 *port_aux = slave->aux;
557 ovs_rwlock_unlock(&rwlock);
561 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
562 * Ethernet destination address of 'eth_dst', should be admitted.
564 * The return value is one of the following:
566 * - BV_ACCEPT: Admit the packet.
568 * - BV_DROP: Drop the packet.
570 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
571 * Ethernet source address and VLAN. If there is none, or if the packet
572 * is on the learned port, then admit the packet. If a different port has
573 * been learned, however, drop the packet (and do not use it for MAC
/* The decision is made with 'rwlock' held for reading, and 'verdict' starts
 * out as BV_DROP.  The checks run in this order: the LACP status, multicast
 * destinations on non-active slaves, and finally the balancing mode. */
577 bond_check_admissibility(struct bond *bond, const void *slave_,
578 const uint8_t eth_dst[ETH_ADDR_LEN])
580 enum bond_verdict verdict = BV_DROP;
581 struct bond_slave *slave;
583 ovs_rwlock_rdlock(&rwlock);
584 slave = bond_slave_lookup(bond, slave_);
589 /* LACP bonds have very loose admissibility restrictions because we can
590 * assume the remote switch is aware of the bond and will "do the right
591 * thing". However, as a precaution we drop packets on disabled slaves
592 * because no correctly implemented partner switch should be sending
595 * If LACP is configured, but LACP negotiations have been unsuccessful, we
596 * drop all incoming traffic unless lacp_fallback_ab is enabled. */
597 switch (bond->lacp_status) {
598 case LACP_NEGOTIATED:
599 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
601 case LACP_CONFIGURED:
602 if (!bond->lacp_fallback_ab) {
609 /* Drop all multicast packets on inactive slaves. */
610 if (eth_addr_is_multicast(eth_dst)) {
611 if (bond->active_slave != slave) {
616 switch (bond->balance) {
618 /* TCP balanced bonds require successful LACP negotiations. Based on the
619 * above check, LACP is off or lacp_fallback_ab is true on this bond.
620 * If lacp_fallback_ab is true, fall through to the BM_AB case; otherwise,
621 * drop all incoming traffic. */
622 if (!bond->lacp_fallback_ab) {
627 /* Drop all packets which arrive on backup slaves. This is similar to
628 * how Linux bonding handles active-backup bonds. */
629 if (bond->active_slave != slave) {
/* The debug message below is rate-limited through 'rl'. */
630 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
632 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
633 " slave (%s) destined for " ETH_ADDR_FMT,
634 slave->name, ETH_ADDR_ARGS(eth_dst));
641 /* Drop all packets for which we have learned a different input port,
642 * because we probably sent the packet on one slave and got it back on
643 * the other. Gratuitous ARP packets are an exception to this rule:
644 * the host has moved to another switch. The exception to the
645 * exception is if we locked the learning table to avoid reflections on
647 verdict = BV_DROP_IF_MOVED;
653 ovs_rwlock_unlock(&rwlock);
658 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
659 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
660 * NULL if the packet should be dropped because no slaves are enabled.
662 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
663 * should be a VID only (i.e. excluding the PCP bits). Second,
664 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
665 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
666 * packet belongs to (so for an access port it will be the access port's VLAN).
668 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
669 * significant in the selection. At some point earlier, 'wc' should
670 * have been initialized (e.g., by flow_wildcards_init_catchall()).
/* Locked wrapper: calls choose_output_slave() with 'rwlock' held for reading
 * and maps the result to the chosen slave's client handle 'aux', or to NULL
 * when no slave was chosen. */
673 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
674 struct flow_wildcards *wc, uint16_t vlan)
676 struct bond_slave *slave;
679 ovs_rwlock_rdlock(&rwlock);
680 slave = choose_output_slave(bond, flow, wc, vlan);
681 aux = slave ? slave->aux : NULL;
682 ovs_rwlock_unlock(&rwlock);
/* Returns true if 'bond' has a nonzero rebalance interval and is in SLB or
 * TCP balancing mode.  Annotated as requiring 'rwlock' held at least for
 * reading. */
690 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
692 return bond->rebalance_interval
693 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
696 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'.
 *
 * Only balanced bonds (see bond_is_balanced()) keep byte counts.  For them,
 * 'n_bytes' is added to the 'tx_bytes' of the hash entry that
 * lookup_bond_entry() returns for 'flow' and 'vlan'.  The update is made with
 * 'rwlock' held for writing. */
698 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
701 ovs_rwlock_wrlock(&rwlock);
702 if (bond_is_balanced(bond)) {
703 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
705 ovs_rwlock_unlock(&rwlock);
/* Returns the bond_slave that contains 'bal' as its 'bal_node' member. */
708 static struct bond_slave *
709 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
711 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* If debug logging is enabled, logs a single line for 'bond' that lists each
 * slave on 'bals' with its load in kB, marks disabled slaves, and, for a
 * slave whose 'entries' list is not empty, lists each entry's hash index and
 * load in kB.  Does nothing otherwise, so that the string is built only when
 * it will be logged. */
715 log_bals(struct bond *bond, const struct list *bals)
717 if (VLOG_IS_DBG_ENABLED()) {
718 struct ds ds = DS_EMPTY_INITIALIZER;
719 const struct bond_slave *slave;
721 LIST_FOR_EACH (slave, bal_node, bals) {
723 ds_put_char(&ds, ',');
725 ds_put_format(&ds, " %s %"PRIu64"kB",
726 slave->name, slave->tx_bytes / 1024);
728 if (!slave->enabled) {
729 ds_put_cstr(&ds, " (disabled)");
731 if (!list_is_empty(&slave->entries)) {
732 struct bond_entry *e;
734 ds_put_cstr(&ds, " (");
735 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Separate every entry but the first from its predecessor. */
736 if (&e->list_node != list_front(&slave->entries)) {
737 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index in the bond's hash array. */
739 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
740 e - bond->hash, e->tx_bytes / 1024);
742 ds_put_cstr(&ds, ")");
745 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
750 /* Shifts 'hash' from its current slave to 'to'.
 *
 * Logs the move at INFO level (with the loads that both slaves will carry
 * afterward), moves the entry's 'tx_bytes' from the old slave's total to
 * 'to''s total, and flags the bond for revalidation.  'hash' must currently
 * be assigned to a slave, because 'hash->slave' is dereferenced. */
752 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
754 struct bond_slave *from = hash->slave;
755 struct bond *bond = from->bond;
756 uint64_t delta = hash->tx_bytes;
758 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
759 "from %s to %s (now carrying %"PRIu64"kB and "
760 "%"PRIu64"kB load, respectively)",
761 bond->name, delta / 1024, hash - bond->hash,
762 from->name, to->name,
763 (from->tx_bytes - delta) / 1024,
764 (to->tx_bytes + delta) / 1024);
766 /* Shift load away from 'from' to 'to'. */
767 from->tx_bytes -= delta;
768 to->tx_bytes += delta;
770 /* Arrange for flows to be revalidated. */
772 bond->bond_revalidate = true;
775 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
776 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
777 * given that doing so must decrease the ratio of the load on the two slaves by
778 * at least 0.1. Returns NULL if there is no appropriate entry.
 *
780 * The list of entries isn't sorted. I don't know of a reason to prefer to
781 * shift away small hashes or large hashes.
 *
 * NOTE(review): 'delta' is presumably the candidate entry's 'tx_bytes'; its
 * declaration and assignment are not visible here -- confirm. */
782 static struct bond_entry *
783 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
785 struct bond_entry *e;
787 if (list_is_short(&from->entries)) {
788 /* 'from' carries no more than one MAC hash, so shifting load away from
789 * it would be pointless. */
793 LIST_FOR_EACH (e, list_node, &from->entries) {
794 double old_ratio, new_ratio;
/* Handling a zero 'to_tx_bytes' first also keeps it out of the division
 * below. */
797 if (to_tx_bytes == 0) {
798 /* Nothing on the new slave, move it. */
803 old_ratio = (double)from->tx_bytes / to_tx_bytes;
804 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
805 if (old_ratio - new_ratio > 0.1
806 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
807 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
808 and 'to' slave have the same load. Therefore, we only move an
809 entry if it decreases the load on 'from', and brings us closer
810 to equal traffic load. */
818 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
/* Walks 'bals' looking for the first element whose 'tx_bytes' is smaller than
 * 'slave''s, then links 'slave' into the list at 'pos'.
 *
 * NOTE(review): presumably the loop stops at that element and list_insert()
 * places 'slave' immediately before 'pos'; the loop exit is not visible here
 * -- confirm. */
821 insert_bal(struct list *bals, struct bond_slave *slave)
823 struct bond_slave *pos;
825 LIST_FOR_EACH (pos, bal_node, bals) {
826 if (slave->tx_bytes > pos->tx_bytes) {
830 list_insert(&pos->bal_node, &slave->bal_node);
833 /* Removes 'slave' from its current list and then inserts it into 'bals' so
834 * that descending order of 'tx_bytes' is maintained.
 *
 * bond_rebalance() uses this to re-sort a slave after its 'tx_bytes' has
 * changed.  'slave' must currently be on a list, because its 'bal_node' is
 * unlinked unconditionally. */
836 reinsert_bal(struct list *bals, struct bond_slave *slave)
838 list_remove(&slave->bal_node);
839 insert_bal(bals, slave);
842 /* If 'bond' needs rebalancing, does so.
 *
844 * The caller should have called bond_account() for each active flow, to ensure
845 * that flow data is consistently accounted at this point.
 *
 * Nothing is done unless the bond is balanced (see bond_is_balanced()) and
 * 'next_rebalance' has been reached.  The whole pass runs with 'rwlock' held
 * for writing. */
847 bond_rebalance(struct bond *bond)
849 struct bond_slave *slave;
850 struct bond_entry *e;
853 ovs_rwlock_wrlock(&rwlock);
854 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
855 ovs_rwlock_unlock(&rwlock);
858 bond->next_rebalance = time_msec() + bond->rebalance_interval;
860 /* Add each bond_entry to its slave's 'entries' list.
861 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
862 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
864 list_init(&slave->entries);
866 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* Entries with no slave or with no recorded traffic are left out. */
867 if (e->slave && e->tx_bytes) {
868 e->slave->tx_bytes += e->tx_bytes;
869 list_push_back(&e->slave->entries, &e->list_node);
873 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
 *
875 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
876 * with a proper list sort algorithm. */
878 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
879 if (slave->enabled) {
880 insert_bal(&bals, slave);
883 log_bals(bond, &bals);
885 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
886 while (!list_is_short(&bals)) {
887 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
888 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
891 overload = from->tx_bytes - to->tx_bytes;
892 if (overload < to->tx_bytes >> 5 || overload < 100000) {
893 /* The extra load on 'from' (and all less-loaded slaves), compared
894 * to that of 'to' (the least-loaded slave), is less than ~3%, or
895 * it is less than ~1Mbps. No point in rebalancing.
 *
 * In code terms: less than 1/32 of 'to''s load, or fewer than
 * 100000 bytes. */
899 /* 'from' is carrying significantly more load than 'to'. Pick a hash
900 * to move from 'from' to 'to'. */
901 e = choose_entry_to_migrate(from, to->tx_bytes);
903 bond_shift_load(e, to);
905 /* Delete element from from->entries.
 *
907 * We don't add the element to to->entries. That would only allow
908 * 'e' to be migrated to another slave in this rebalancing run, and
909 * there is no point in doing that. */
910 list_remove(&e->list_node);
912 /* Re-sort 'bals'. */
913 reinsert_bal(&bals, from);
914 reinsert_bal(&bals, to);
916 /* Can't usefully migrate anything away from 'from'.
917 * Don't reconsider it. */
918 list_remove(&from->bal_node);
922 /* Implement exponentially weighted moving average. A weight of 1/2 causes
923 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
924 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
925 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
931 ovs_rwlock_unlock(&rwlock);
934 /* Bonding unixctl user interface functions. */
/* Searches 'all_bonds' for a bond named 'name': only bonds whose stored hash
 * equals hash_string(name, 0) are visited, and their names are compared with
 * strcmp().  Annotated as requiring 'rwlock' held at least for reading.
 *
 * NOTE(review): presumably returns the matching bond, or NULL if there is
 * none; the return statements are not visible here -- confirm. */
937 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
941 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
943 if (!strcmp(bond->name, name)) {
/* Searches 'bond' for a slave whose 'name' equals 'slave_name', comparing
 * with strcmp().  This is a linear scan over all of the bond's slaves, unlike
 * bond_slave_lookup(), which looks a slave up by its client handle.
 *
 * NOTE(review): presumably returns the matching slave, or NULL if there is
 * none; the return statements are not visible here -- confirm. */
950 static struct bond_slave *
951 bond_lookup_slave(struct bond *bond, const char *slave_name)
953 struct bond_slave *slave;
955 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
956 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler (registered as "bond/list").  Replies with a tab-separated
 * table that has a "bond", "type", "slaves" header line followed by one line
 * per bond in 'all_bonds' giving its name, its balancing mode as a string,
 * and its slaves' names separated by ", ".  The table is built with 'rwlock'
 * held for reading and sent after the lock is released. */
964 bond_unixctl_list(struct unixctl_conn *conn,
965 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
966 void *aux OVS_UNUSED)
968 struct ds ds = DS_EMPTY_INITIALIZER;
969 const struct bond *bond;
971 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
973 ovs_rwlock_rdlock(&rwlock);
974 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
975 const struct bond_slave *slave;
978 ds_put_format(&ds, "%s\t%s\t",
979 bond->name, bond_mode_to_string(bond->balance));
982 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
984 ds_put_cstr(&ds, ", ");
986 ds_put_cstr(&ds, slave->name);
988 ds_put_char(&ds, '\n');
990 ovs_rwlock_unlock(&rwlock);
991 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds': a header with the
 * bond's name, its mode, hash basis, up/down delays, time until the next
 * rebalance (balanced bonds only), and LACP status, followed by one section
 * per slave.  Slaves are listed in the order produced by shash_sort() on a
 * temporary map keyed by slave name.  For balanced bonds, each slave section
 * also lists the hash entries assigned to that slave with their loads in kB.
 *
 * Annotated as requiring 'rwlock' held at least for reading. */
996 bond_print_details(struct ds *ds, const struct bond *bond)
997 OVS_REQ_RDLOCK(rwlock)
999 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1000 const struct shash_node **sorted_slaves = NULL;
1001 const struct bond_slave *slave;
1004 ds_put_format(ds, "---- %s ----\n", bond->name);
1005 ds_put_format(ds, "bond_mode: %s\n",
1006 bond_mode_to_string(bond->balance));
1008 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1010 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1011 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1013 if (bond_is_balanced(bond)) {
/* Printed as the time remaining, relative to now. */
1014 ds_put_format(ds, "next rebalance: %lld ms\n",
1015 bond->next_rebalance - time_msec());
1018 ds_put_cstr(ds, "lacp_status: ");
1019 switch (bond->lacp_status) {
1020 case LACP_NEGOTIATED:
1021 ds_put_cstr(ds, "negotiated\n");
1023 case LACP_CONFIGURED:
1024 ds_put_cstr(ds, "configured\n");
1027 ds_put_cstr(ds, "off\n");
1030 ds_put_cstr(ds, "<unknown>\n");
/* Collect the slaves by name so that they can be printed in sorted order. */
1034 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1035 shash_add(&slave_shash, slave->name, slave);
1037 sorted_slaves = shash_sort(&slave_shash);
1039 for (i = 0; i < shash_count(&slave_shash); i++) {
1040 struct bond_entry *be;
1042 slave = sorted_slaves[i]->data;
1045 ds_put_format(ds, "\nslave %s: %s\n",
1046 slave->name, slave->enabled ? "enabled" : "disabled");
1047 if (slave == bond->active_slave) {
1048 ds_put_cstr(ds, "\tactive slave\n");
/* A pending delay is a downdelay for an enabled slave and an updelay for a
 * disabled one. */
1050 if (slave->delay_expires != LLONG_MAX) {
1051 ds_put_format(ds, "\t%s expires in %lld ms\n",
1052 slave->enabled ? "downdelay" : "updelay",
1053 slave->delay_expires - time_msec());
1056 ds_put_format(ds, "\tmay_enable: %s\n",
1057 slave->may_enable ? "true" : "false");
1059 if (!bond_is_balanced(bond)) {
1064 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1065 int hash = be - bond->hash;
1067 if (be->slave != slave) {
1071 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1072 hash, be->tx_bytes / 1024);
1074 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1077 shash_destroy(&slave_shash);
1078 free(sorted_slaves);
1079 ds_put_cstr(ds, "\n");
/* unixctl handler (registered as "bond/show" with an optional port
 * argument).  With 'rwlock' held for reading, either prints the details of
 * the bond named by argv[1], replying with the error "no such bond" if
 * bond_find() does not find it, or prints the details of every bond in
 * 'all_bonds'.
 *
 * NOTE(review): presumably the branch is selected by whether an argument was
 * given (argc > 1); the condition is not visible here -- confirm. */
1083 bond_unixctl_show(struct unixctl_conn *conn,
1084 int argc, const char *argv[],
1085 void *aux OVS_UNUSED)
1087 struct ds ds = DS_EMPTY_INITIALIZER;
1089 ovs_rwlock_rdlock(&rwlock);
1091 const struct bond *bond = bond_find(argv[1]);
1094 unixctl_command_reply_error(conn, "no such bond");
1097 bond_print_details(&ds, bond);
1099 const struct bond *bond;
1101 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1102 bond_print_details(&ds, bond);
1106 unixctl_command_reply(conn, ds_cstr(&ds));
1110 ovs_rwlock_unlock(&rwlock);
/* unixctl handler (registered as "bond/migrate port hash slave").  With
 * 'rwlock' held for writing, reassigns hash entry argv[2] of the SLB bond
 * argv[1] to the slave named argv[3] and flags the bond for revalidation.
 *
 * Error replies: "no such bond", "not an SLB bond", "bad hash" (the hash
 * argument contains a character other than a decimal digit), "no such slave",
 * and "cannot migrate to disabled slave".
 *
 * The parsed hash is masked with BOND_MASK, so values above the mask wrap
 * around rather than being rejected.
 *
 * NOTE(review): atoi() gives no overflow indication, so a very long digit
 * string has an unspecified result before masking -- consider strtol(). */
1114 bond_unixctl_migrate(struct unixctl_conn *conn,
1115 int argc OVS_UNUSED, const char *argv[],
1116 void *aux OVS_UNUSED)
1118 const char *bond_s = argv[1];
1119 const char *hash_s = argv[2];
1120 const char *slave_s = argv[3];
1122 struct bond_slave *slave;
1123 struct bond_entry *entry;
1126 ovs_rwlock_wrlock(&rwlock);
1127 bond = bond_find(bond_s);
1129 unixctl_command_reply_error(conn, "no such bond");
1133 if (bond->balance != BM_SLB) {
1134 unixctl_command_reply_error(conn, "not an SLB bond");
/* Accept the hash only if it consists entirely of decimal digits. */
1138 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1139 hash = atoi(hash_s) & BOND_MASK;
1141 unixctl_command_reply_error(conn, "bad hash");
1145 slave = bond_lookup_slave(bond, slave_s);
1147 unixctl_command_reply_error(conn, "no such slave");
1151 if (!slave->enabled) {
1152 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1156 entry = &bond->hash[hash];
1157 bond->bond_revalidate = true;
1158 entry->slave = slave;
1159 unixctl_command_reply(conn, "migrated");
1162 ovs_rwlock_unlock(&rwlock);
/* unixctl handler (registered as "bond/set-active-slave port slave").  With
 * 'rwlock' held for writing, makes the slave named argv[2] the active slave
 * of the bond named argv[1].
 *
 * Error replies: "no such bond", "no such slave", and "cannot make disabled
 * slave active".  When the slave is already active the reply is "no change".
 * Otherwise the bond is flagged for revalidation, the change is logged,
 * learning packets are requested, and the reply is "done". */
1166 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1167 int argc OVS_UNUSED, const char *argv[],
1168 void *aux OVS_UNUSED)
1170 const char *bond_s = argv[1];
1171 const char *slave_s = argv[2];
1173 struct bond_slave *slave;
1175 ovs_rwlock_wrlock(&rwlock);
1176 bond = bond_find(bond_s);
1178 unixctl_command_reply_error(conn, "no such bond");
1182 slave = bond_lookup_slave(bond, slave_s);
1184 unixctl_command_reply_error(conn, "no such slave");
1188 if (!slave->enabled) {
1189 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1193 if (bond->active_slave != slave) {
1194 bond->bond_revalidate = true;
1195 bond->active_slave = slave;
1196 VLOG_INFO("bond %s: active interface is now %s",
1197 bond->name, slave->name);
1198 bond->send_learning_packets = true;
1199 unixctl_command_reply(conn, "done");
1201 unixctl_command_reply(conn, "no change");
1204 ovs_rwlock_unlock(&rwlock);
/* Shared implementation of the "bond/enable-slave" and "bond/disable-slave"
 * handlers.  With 'rwlock' held for writing, finds the bond named argv[1] and
 * its slave named argv[2], calls bond_enable_slave() with 'enable', and
 * replies "enabled" or "disabled".  Error replies: "no such bond" and "no
 * such slave". */
1208 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1210 const char *bond_s = argv[1];
1211 const char *slave_s = argv[2];
1213 struct bond_slave *slave;
1215 ovs_rwlock_wrlock(&rwlock);
1216 bond = bond_find(bond_s);
1218 unixctl_command_reply_error(conn, "no such bond");
1222 slave = bond_lookup_slave(bond, slave_s);
1224 unixctl_command_reply_error(conn, "no such slave");
1228 bond_enable_slave(slave, enable);
1229 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1232 ovs_rwlock_unlock(&rwlock);
/* unixctl handler (registered as "bond/enable-slave port slave"): forwards to
 * enable_slave() with 'enable' set to true. */
1236 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1237 int argc OVS_UNUSED, const char *argv[],
1238 void *aux OVS_UNUSED)
1240 enable_slave(conn, argv, true);
/* unixctl handler (registered as "bond/disable-slave port slave"): forwards
 * to enable_slave() with 'enable' set to false. */
1244 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1245 int argc OVS_UNUSED, const char *argv[],
1246 void *aux OVS_UNUSED)
1248 enable_slave(conn, argv, false);
/* unixctl handler (registered as "bond/hash mac [vlan] [basis]").  Parses the
 * MAC address in argv[1] and the optional VLAN (argv[2]) and hash basis
 * (argv[3]), and replies with bond_hash_src() of them, masked with BOND_MASK
 * and printed in decimal.  Error replies: "invalid vlan", "invalid basis",
 * and "invalid mac".
 *
 * This handler does not look at any bond, and no locking is visible in it.
 *
 * NOTE(review): presumably the vlan and basis are parsed only when the
 * corresponding argument is present ('vlan_s' and 'basis_s' are NULL
 * otherwise) and default to some value when absent; neither the guards nor
 * the defaults are visible here -- confirm. */
1252 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1253 void *aux OVS_UNUSED)
1255 const char *mac_s = argv[1];
1256 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1257 const char *basis_s = argc > 3 ? argv[3] : NULL;
1258 uint8_t mac[ETH_ADDR_LEN];
1265 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1266 unixctl_command_reply_error(conn, "invalid vlan");
1274 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1275 unixctl_command_reply_error(conn, "invalid basis");
1282 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1283 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1285 hash_cstr = xasprintf("%u", hash);
1286 unixctl_command_reply(conn, hash_cstr);
1289 unixctl_command_reply_error(conn, "invalid mac");
1296 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1297 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1299 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1300 bond_unixctl_migrate, NULL);
1301 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1302 bond_unixctl_set_active_slave, NULL);
1303 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1304 bond_unixctl_enable_slave, NULL);
1305 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1306 bond_unixctl_disable_slave, NULL);
1307 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1308 bond_unixctl_hash, NULL);
1312 bond_entry_reset(struct bond *bond)
1314 if (bond->balance != BM_AB) {
1315 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1318 bond->hash = xmalloc(hash_len);
1320 memset(bond->hash, 0, hash_len);
1322 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1329 static struct bond_slave *
1330 bond_slave_lookup(struct bond *bond, const void *slave_)
1332 struct bond_slave *slave;
1334 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1336 if (slave->aux == slave_) {
1345 bond_enable_slave(struct bond_slave *slave, bool enable)
1347 slave->delay_expires = LLONG_MAX;
1348 if (enable != slave->enabled) {
1349 slave->bond->bond_revalidate = true;
1350 slave->enabled = enable;
1351 VLOG_INFO("interface %s: %s", slave->name,
1352 slave->enabled ? "enabled" : "disabled");
1357 bond_link_status_update(struct bond_slave *slave)
1359 struct bond *bond = slave->bond;
1362 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1363 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1364 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1365 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1366 slave->name, up ? "up" : "down");
1367 if (up == slave->enabled) {
1368 slave->delay_expires = LLONG_MAX;
1369 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1370 slave->name, up ? "disabled" : "enabled");
1372 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1373 : up ? bond->updelay : bond->downdelay);
1374 slave->delay_expires = time_msec() + delay;
1376 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1379 up ? "enabled" : "disabled",
1386 if (time_msec() >= slave->delay_expires) {
1387 bond_enable_slave(slave, up);
1392 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1394 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1398 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1400 struct flow hash_flow = *flow;
1401 hash_flow.vlan_tci = htons(vlan);
1403 /* The symmetric quality of this hash function is not required, but
1404 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1405 * purposes, so we use it out of convenience. */
1406 return flow_hash_symmetric_l4(&hash_flow, basis);
1410 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1412 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1414 return (bond->balance == BM_TCP
1415 ? bond_hash_tcp(flow, vlan, bond->basis)
1416 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1419 static struct bond_entry *
1420 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1423 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1426 static struct bond_slave *
1427 choose_output_slave(const struct bond *bond, const struct flow *flow,
1428 struct flow_wildcards *wc, uint16_t vlan)
1430 struct bond_entry *e;
1433 balance = bond->balance;
1434 if (bond->lacp_status == LACP_CONFIGURED) {
1435 /* LACP has been configured on this bond but negotiations were
1436 * unsuccussful. If lacp_fallback_ab is enabled use active-
1437 * backup mode else drop all traffic. */
1438 if (!bond->lacp_fallback_ab) {
1446 return bond->active_slave;
1449 if (bond->lacp_status != LACP_NEGOTIATED) {
1450 /* Must have LACP negotiations for TCP balanced bonds. */
1454 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1459 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1461 e = lookup_bond_entry(bond, flow, vlan);
1462 if (!e->slave || !e->slave->enabled) {
1463 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1464 struct bond_slave, hmap_node);
1465 if (!e->slave->enabled) {
1466 e->slave = bond->active_slave;
1476 static struct bond_slave *
1477 bond_choose_slave(const struct bond *bond)
1479 struct bond_slave *slave, *best;
1481 /* Find an enabled slave. */
1482 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1483 if (slave->enabled) {
1488 /* All interfaces are disabled. Find an interface that will be enabled
1489 * after its updelay expires. */
1491 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1492 if (slave->delay_expires != LLONG_MAX
1493 && slave->may_enable
1494 && (!best || slave->delay_expires < best->delay_expires)) {
1502 bond_choose_active_slave(struct bond *bond)
1504 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1505 struct bond_slave *old_active_slave = bond->active_slave;
1507 bond->active_slave = bond_choose_slave(bond);
1508 if (bond->active_slave) {
1509 if (bond->active_slave->enabled) {
1510 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1511 bond->name, bond->active_slave->name);
1513 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1514 "remaining %lld ms updelay (since no interface was "
1515 "enabled)", bond->name, bond->active_slave->name,
1516 bond->active_slave->delay_expires - time_msec());
1517 bond_enable_slave(bond->active_slave, true);
1520 bond->send_learning_packets = true;
1521 } else if (old_active_slave) {
1522 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1526 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1527 * bond interface. */
1529 bond_update_fake_slave_stats(struct bond *bond)
1531 struct netdev_stats bond_stats;
1532 struct bond_slave *slave;
1533 struct netdev *bond_dev;
1535 memset(&bond_stats, 0, sizeof bond_stats);
1537 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1538 struct netdev_stats slave_stats;
1540 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1541 /* XXX: We swap the stats here because they are swapped back when
1542 * reported by the internal device. The reason for this is
1543 * internal devices normally represent packets going into the
1544 * system but when used as fake bond device they represent packets
1545 * leaving the system. We really should do this in the internal
1546 * device itself because changing it here reverses the counts from
1547 * the perspective of the switch. However, the internal device
1548 * doesn't know what type of device it represents so we have to do
1549 * it here for now. */
1550 bond_stats.tx_packets += slave_stats.rx_packets;
1551 bond_stats.tx_bytes += slave_stats.rx_bytes;
1552 bond_stats.rx_packets += slave_stats.tx_packets;
1553 bond_stats.rx_bytes += slave_stats.tx_bytes;
1557 if (!netdev_open(bond->name, "system", &bond_dev)) {
1558 netdev_set_stats(bond_dev, &bond_stats);
1559 netdev_close(bond_dev);