2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "connectivity.h"
28 #include "dynamic-string.h"
37 #include "poll-loop.h"
44 VLOG_DEFINE_THIS_MODULE(bond);
46 /* Bit-mask for hashing a flow down to a bucket.
47 * There are (BOND_MASK + 1) buckets. */
48 #define BOND_MASK 0xff
50 /* A hash bucket for mapping a flow to a slave.
51 * "struct bond" has an array of (BOND_MASK + 1) of these. */
53 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
54 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond. */
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
64 struct netdev *netdev; /* Network device, owned by the client. */
65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
69 long long delay_expires; /* Time after which 'enabled' may change. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
73 /* Rebalancing info. Used only by bond_rebalance(). */
74 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
75 struct list entries; /* 'struct bond_entry's assigned here. */
76 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
79 /* A bond, that is, a set of network devices grouped to improve performance or robustness. */
82 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
83 char *name; /* Name provided by client. */
89 enum bond_mode balance; /* Balancing mode, one of BM_*. */
90 struct bond_slave *active_slave;
91 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
92 enum lacp_status lacp_status; /* Status of LACP negotiations. */
93 bool bond_revalidate; /* True if flows need revalidation. */
94 uint32_t basis; /* Basis for flow hash function. */
96 /* SLB specific bonding info. */
97 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
98 int rebalance_interval; /* Interval between rebalances, in ms. */
99 long long int next_rebalance; /* Next rebalancing time. */
100 bool send_learning_packets;
102 /* Legacy compatibility. */
103 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
104 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
109 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
110 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
111 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
113 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
114 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
115 OVS_REQ_RDLOCK(rwlock);
116 static void bond_enable_slave(struct bond_slave *, bool enable)
117 OVS_REQ_WRLOCK(rwlock);
118 static void bond_link_status_update(struct bond_slave *)
119 OVS_REQ_WRLOCK(rwlock);
120 static void bond_choose_active_slave(struct bond *)
121 OVS_REQ_WRLOCK(rwlock);;
122 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
123 uint16_t vlan, uint32_t basis);
124 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
126 static struct bond_entry *lookup_bond_entry(const struct bond *,
129 OVS_REQ_RDLOCK(rwlock);
130 static struct bond_slave *choose_output_slave(const struct bond *,
132 struct flow_wildcards *,
134 OVS_REQ_RDLOCK(rwlock);
135 static void bond_update_fake_slave_stats(struct bond *)
136 OVS_REQ_RDLOCK(rwlock);
138 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
139 * stores the mode in '*balance' and returns true. Otherwise returns false
140 * without modifying '*balance'. */
142 bond_mode_from_string(enum bond_mode *balance, const char *s)
144 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
146 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
148 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
156 /* Returns a string representing 'balance'. */
158 bond_mode_to_string(enum bond_mode balance) {
161 return "balance-tcp";
163 return "balance-slb";
165 return "active-backup";
171 /* Creates and returns a new bond whose configuration is initially taken from 's'.
174 * The caller should register each slave on the new bond by calling
175 * bond_slave_register(). */
177 bond_create(const struct bond_settings *s)
181 bond = xzalloc(sizeof *bond);
182 hmap_init(&bond->slaves);
183 bond->next_fake_iface_update = LLONG_MAX;
184 atomic_init(&bond->ref_cnt, 1);
186 bond_reconfigure(bond, s);
191 bond_ref(const struct bond *bond_)
193 struct bond *bond = CONST_CAST(struct bond *, bond_);
197 atomic_add(&bond->ref_cnt, 1, &orig);
198 ovs_assert(orig > 0);
205 bond_unref(struct bond *bond)
207 struct bond_slave *slave, *next_slave;
214 atomic_sub(&bond->ref_cnt, 1, &orig);
215 ovs_assert(orig > 0);
220 ovs_rwlock_wrlock(&rwlock);
221 hmap_remove(all_bonds, &bond->hmap_node);
222 ovs_rwlock_unlock(&rwlock);
224 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
225 hmap_remove(&bond->slaves, &slave->hmap_node);
226 /* Client owns 'slave->netdev'. */
230 hmap_destroy(&bond->slaves);
237 /* Updates 'bond''s overall configuration to 's'.
239 * The caller should register each slave on 'bond' by calling
240 * bond_slave_register(). This is optional if none of the slaves'
241 * configuration has changed. In any case it can't hurt.
243 * Returns true if the configuration has changed in such a way that requires flow revalidation. */
247 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
249 bool revalidate = false;
251 ovs_rwlock_wrlock(&rwlock);
252 if (!bond->name || strcmp(bond->name, s->name)) {
254 hmap_remove(all_bonds, &bond->hmap_node);
257 bond->name = xstrdup(s->name);
258 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
261 bond->updelay = s->up_delay;
262 bond->downdelay = s->down_delay;
264 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
265 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
269 if (bond->rebalance_interval != s->rebalance_interval) {
270 bond->rebalance_interval = s->rebalance_interval;
274 if (bond->balance != s->balance) {
275 bond->balance = s->balance;
279 if (bond->basis != s->basis) {
280 bond->basis = s->basis;
285 if (bond->next_fake_iface_update == LLONG_MAX) {
286 bond->next_fake_iface_update = time_msec();
289 bond->next_fake_iface_update = LLONG_MAX;
292 if (bond->bond_revalidate) {
294 bond->bond_revalidate = false;
297 if (bond->balance == BM_AB || !bond->hash || revalidate) {
298 bond_entry_reset(bond);
301 ovs_rwlock_unlock(&rwlock);
306 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
307 OVS_REQ_WRLOCK(rwlock)
309 if (slave->netdev != netdev) {
310 slave->netdev = netdev;
311 slave->change_seq = 0;
315 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
316 * arbitrary client-provided pointer that uniquely identifies a slave within a
317 * bond. If 'slave_' already exists within 'bond' then this function
318 * reconfigures the existing slave.
320 * 'netdev' must be the network device that 'slave_' represents. It is owned
321 * by the client, so the client must not close it before either unregistering
322 * 'slave_' or destroying 'bond'. */
325 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
327 struct bond_slave *slave;
329 ovs_rwlock_wrlock(&rwlock);
330 slave = bond_slave_lookup(bond, slave_);
332 slave = xzalloc(sizeof *slave);
334 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
337 slave->delay_expires = LLONG_MAX;
338 slave->name = xstrdup(netdev_get_name(netdev));
339 bond->bond_revalidate = true;
341 slave->enabled = false;
342 bond_enable_slave(slave, netdev_get_carrier(netdev));
345 bond_slave_set_netdev__(slave, netdev);
348 slave->name = xstrdup(netdev_get_name(netdev));
349 ovs_rwlock_unlock(&rwlock);
352 /* Updates the network device to be used with 'slave_' to 'netdev'.
354 * This is useful if the caller closes and re-opens the network device
355 * registered with bond_slave_register() but doesn't need to change anything else. */
358 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
360 struct bond_slave *slave;
362 ovs_rwlock_wrlock(&rwlock);
363 slave = bond_slave_lookup(bond, slave_);
365 bond_slave_set_netdev__(slave, netdev);
367 ovs_rwlock_unlock(&rwlock);
370 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
371 * then this function has no effect.
373 * Unregistering a slave invalidates all flows. */
375 bond_slave_unregister(struct bond *bond, const void *slave_)
377 struct bond_slave *slave;
380 ovs_rwlock_wrlock(&rwlock);
381 slave = bond_slave_lookup(bond, slave_);
386 bond->bond_revalidate = true;
387 bond_enable_slave(slave, false);
389 del_active = bond->active_slave == slave;
391 struct bond_entry *e;
392 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
393 if (e->slave == slave) {
401 hmap_remove(&bond->slaves, &slave->hmap_node);
402 /* Client owns 'slave->netdev'. */
406 bond_choose_active_slave(bond);
407 bond->send_learning_packets = true;
410 ovs_rwlock_unlock(&rwlock);
413 /* Should be called on each slave in 'bond' before bond_run() to indicate
414 * whether or not 'slave_' may be enabled. This function is intended to allow
415 * other protocols to have some impact on bonding decisions. For example LACP
416 * or high level link monitoring protocols may decide that a given slave should
417 * not be able to send traffic. */
419 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
421 ovs_rwlock_wrlock(&rwlock);
422 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
423 ovs_rwlock_unlock(&rwlock);
426 /* Performs periodic maintenance on 'bond'.
428 * Returns true if the caller should revalidate its flows.
430 * The caller should check bond_should_send_learning_packets() afterward. */
432 bond_run(struct bond *bond, enum lacp_status lacp_status)
434 struct bond_slave *slave;
437 ovs_rwlock_wrlock(&rwlock);
438 if (bond->lacp_status != lacp_status) {
439 bond->lacp_status = lacp_status;
440 bond->bond_revalidate = true;
443 /* Enable slaves based on link status and LACP feedback. */
444 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
445 bond_link_status_update(slave);
446 slave->change_seq = seq_read(connectivity_seq_get());
448 if (!bond->active_slave || !bond->active_slave->enabled) {
449 bond_choose_active_slave(bond);
452 /* Update fake bond interface stats. */
453 if (time_msec() >= bond->next_fake_iface_update) {
454 bond_update_fake_slave_stats(bond);
455 bond->next_fake_iface_update = time_msec() + 1000;
458 revalidate = bond->bond_revalidate;
459 bond->bond_revalidate = false;
460 ovs_rwlock_unlock(&rwlock);
465 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
467 bond_wait(struct bond *bond)
469 struct bond_slave *slave;
471 ovs_rwlock_rdlock(&rwlock);
472 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
473 if (slave->delay_expires != LLONG_MAX) {
474 poll_timer_wait_until(slave->delay_expires);
477 seq_wait(connectivity_seq_get(), slave->change_seq);
480 if (bond->next_fake_iface_update != LLONG_MAX) {
481 poll_timer_wait_until(bond->next_fake_iface_update);
484 if (bond->bond_revalidate) {
485 poll_immediate_wake();
487 ovs_rwlock_unlock(&rwlock);
489 /* We don't wait for bond->next_rebalance because rebalancing can only run
490 * at a flow account checkpoint. ofproto does checkpointing on its own
491 * schedule and bond_rebalance() gets called afterward, so we'd just be
492 * waking up for no purpose. */
495 /* MAC learning table interaction. */
498 may_send_learning_packets(const struct bond *bond)
500 return ((bond->lacp_status == LACP_DISABLED
501 && (bond->balance == BM_SLB || bond->balance == BM_AB))
502 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
503 && bond->active_slave;
506 /* Returns true if 'bond' needs the client to send out packets to assist with
507 * MAC learning on 'bond'. If this function returns true, then the client
508 * should iterate through its MAC learning table for the bridge on which 'bond'
509 * is located. For each MAC that has been learned on a port other than 'bond',
510 * it should call bond_compose_learning_packet().
512 * This function will only return true if 'bond' is in SLB or active-backup
513 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
516 * Calling this function resets the state that it checks. */
518 bond_should_send_learning_packets(struct bond *bond)
522 ovs_rwlock_wrlock(&rwlock);
523 send = bond->send_learning_packets && may_send_learning_packets(bond);
524 bond->send_learning_packets = false;
525 ovs_rwlock_unlock(&rwlock);
529 /* Composes a gratuitous learning packet for 'bond' from 'eth_src' on 'vlan'.
531 * See bond_should_send_learning_packets() for description of usage. The
532 * caller should send the composed packet on the port associated with
533 * port_aux and takes ownership of the returned ofpbuf. */
535 bond_compose_learning_packet(struct bond *bond,
536 const uint8_t eth_src[ETH_ADDR_LEN],
537 uint16_t vlan, void **port_aux)
539 struct bond_slave *slave;
540 struct ofpbuf *packet;
543 ovs_rwlock_rdlock(&rwlock);
544 ovs_assert(may_send_learning_packets(bond));
545 memset(&flow, 0, sizeof flow);
546 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
547 slave = choose_output_slave(bond, &flow, NULL, vlan);
549 packet = ofpbuf_new(0);
550 compose_rarp(packet, eth_src);
552 eth_push_vlan(packet, htons(vlan));
555 *port_aux = slave->aux;
556 ovs_rwlock_unlock(&rwlock);
560 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
561 * Ethernet destination address of 'eth_dst', should be admitted.
563 * The return value is one of the following:
565 * - BV_ACCEPT: Admit the packet.
567 * - BV_DROP: Drop the packet.
569 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
570 * Ethernet source address and VLAN. If there is none, or if the packet
571 * is on the learned port, then admit the packet. If a different port has
572 * been learned, however, drop the packet (and do not use it for MAC learning). */
576 bond_check_admissibility(struct bond *bond, const void *slave_,
577 const uint8_t eth_dst[ETH_ADDR_LEN])
579 enum bond_verdict verdict = BV_DROP;
580 struct bond_slave *slave;
582 ovs_rwlock_rdlock(&rwlock);
583 slave = bond_slave_lookup(bond, slave_);
588 /* LACP bonds have very loose admissibility restrictions because we can
589 * assume the remote switch is aware of the bond and will "do the right
590 * thing". However, as a precaution we drop packets on disabled slaves
591 * because no correctly implemented partner switch should be sending packets to them.
594 * If LACP is configured, but LACP negotiations have been unsuccessful, we
595 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
596 switch (bond->lacp_status) {
597 case LACP_NEGOTIATED:
598 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
600 case LACP_CONFIGURED:
601 if (!bond->lacp_fallback_ab) {
608 /* Drop all multicast packets on inactive slaves. */
609 if (eth_addr_is_multicast(eth_dst)) {
610 if (bond->active_slave != slave) {
615 switch (bond->balance) {
617 /* TCP balanced bonds require successful LACP negotiations. Based on the
618 * above check, LACP is off or lacp_fallback_ab is true on this bond.
619 * If lacp_fallback_ab is true fall through to BM_AB case else, we
620 * drop all incoming traffic. */
621 if (!bond->lacp_fallback_ab) {
626 /* Drop all packets which arrive on backup slaves. This is similar to
627 * how Linux bonding handles active-backup bonds. */
628 if (bond->active_slave != slave) {
629 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
631 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
632 " slave (%s) destined for " ETH_ADDR_FMT,
633 slave->name, ETH_ADDR_ARGS(eth_dst));
640 /* Drop all packets for which we have learned a different input port,
641 * because we probably sent the packet on one slave and got it back on
642 * the other. Gratuitous ARP packets are an exception to this rule:
643 * the host has moved to another switch. The exception to the
644 * exception is if we locked the learning table to avoid reflections on bond slaves. */
646 verdict = BV_DROP_IF_MOVED;
652 ovs_rwlock_unlock(&rwlock);
657 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
658 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
659 * NULL if the packet should be dropped because no slaves are enabled.
661 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
662 * should be a VID only (i.e. excluding the PCP bits). Second,
663 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
664 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
665 * packet belongs to (so for an access port it will be the access port's VLAN).
667 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
668 * significant in the selection. At some point earlier, 'wc' should
669 * have been initialized (e.g., by flow_wildcards_init_catchall()).
672 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
673 struct flow_wildcards *wc, uint16_t vlan)
675 struct bond_slave *slave;
678 ovs_rwlock_rdlock(&rwlock);
679 slave = choose_output_slave(bond, flow, wc, vlan);
680 aux = slave ? slave->aux : NULL;
681 ovs_rwlock_unlock(&rwlock);
689 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
691 return bond->rebalance_interval
692 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
695 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
697 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
700 ovs_rwlock_wrlock(&rwlock);
701 if (bond_is_balanced(bond)) {
702 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
704 ovs_rwlock_unlock(&rwlock);
707 static struct bond_slave *
708 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
710 return CONTAINER_OF(bal, struct bond_slave, bal_node);
714 log_bals(struct bond *bond, const struct list *bals)
716 if (VLOG_IS_DBG_ENABLED()) {
717 struct ds ds = DS_EMPTY_INITIALIZER;
718 const struct bond_slave *slave;
720 LIST_FOR_EACH (slave, bal_node, bals) {
722 ds_put_char(&ds, ',');
724 ds_put_format(&ds, " %s %"PRIu64"kB",
725 slave->name, slave->tx_bytes / 1024);
727 if (!slave->enabled) {
728 ds_put_cstr(&ds, " (disabled)");
730 if (!list_is_empty(&slave->entries)) {
731 struct bond_entry *e;
733 ds_put_cstr(&ds, " (");
734 LIST_FOR_EACH (e, list_node, &slave->entries) {
735 if (&e->list_node != list_front(&slave->entries)) {
736 ds_put_cstr(&ds, " + ");
738 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
739 e - bond->hash, e->tx_bytes / 1024);
741 ds_put_cstr(&ds, ")");
744 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
749 /* Shifts 'hash' from its current slave to 'to'. */
751 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
753 struct bond_slave *from = hash->slave;
754 struct bond *bond = from->bond;
755 uint64_t delta = hash->tx_bytes;
757 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
758 "from %s to %s (now carrying %"PRIu64"kB and "
759 "%"PRIu64"kB load, respectively)",
760 bond->name, delta / 1024, hash - bond->hash,
761 from->name, to->name,
762 (from->tx_bytes - delta) / 1024,
763 (to->tx_bytes + delta) / 1024);
765 /* Shift load away from 'from' to 'to'. */
766 from->tx_bytes -= delta;
767 to->tx_bytes += delta;
769 /* Arrange for flows to be revalidated. */
771 bond->bond_revalidate = true;
774 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
775 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
776 * given that doing so must decrease the ratio of the load on the two slaves by
777 * at least 0.1. Returns NULL if there is no appropriate entry.
779 * The list of entries isn't sorted. I don't know of a reason to prefer to
780 * shift away small hashes or large hashes. */
781 static struct bond_entry *
782 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
784 struct bond_entry *e;
786 if (list_is_short(&from->entries)) {
787 /* 'from' carries no more than one MAC hash, so shifting load away from
788 * it would be pointless. */
792 LIST_FOR_EACH (e, list_node, &from->entries) {
793 double old_ratio, new_ratio;
796 if (to_tx_bytes == 0) {
797 /* Nothing on the new slave, move it. */
802 old_ratio = (double)from->tx_bytes / to_tx_bytes;
803 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
804 if (old_ratio - new_ratio > 0.1
805 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
806 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
807 and 'to' slave have the same load. Therefore, we only move an
808 entry if it decreases the load on 'from', and brings us closer
809 to equal traffic load. */
817 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
820 insert_bal(struct list *bals, struct bond_slave *slave)
822 struct bond_slave *pos;
824 LIST_FOR_EACH (pos, bal_node, bals) {
825 if (slave->tx_bytes > pos->tx_bytes) {
829 list_insert(&pos->bal_node, &slave->bal_node);
832 /* Removes 'slave' from its current list and then inserts it into 'bals' so
833 * that descending order of 'tx_bytes' is maintained. */
835 reinsert_bal(struct list *bals, struct bond_slave *slave)
837 list_remove(&slave->bal_node);
838 insert_bal(bals, slave);
841 /* If 'bond' needs rebalancing, does so.
843 * The caller should have called bond_account() for each active flow, to ensure
844 * that flow data is consistently accounted at this point. */
846 bond_rebalance(struct bond *bond)
848 struct bond_slave *slave;
849 struct bond_entry *e;
852 ovs_rwlock_wrlock(&rwlock);
853 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
854 ovs_rwlock_unlock(&rwlock);
857 bond->next_rebalance = time_msec() + bond->rebalance_interval;
859 /* Add each bond_entry to its slave's 'entries' list.
860 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
861 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
863 list_init(&slave->entries);
865 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
866 if (e->slave && e->tx_bytes) {
867 e->slave->tx_bytes += e->tx_bytes;
868 list_push_back(&e->slave->entries, &e->list_node);
872 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
874 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
875 * with a proper list sort algorithm. */
877 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
878 if (slave->enabled) {
879 insert_bal(&bals, slave);
882 log_bals(bond, &bals);
884 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
885 while (!list_is_short(&bals)) {
886 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
887 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
890 overload = from->tx_bytes - to->tx_bytes;
891 if (overload < to->tx_bytes >> 5 || overload < 100000) {
892 /* The extra load on 'from' (and all less-loaded slaves), compared
893 * to that of 'to' (the least-loaded slave), is less than ~3%, or
894 * it is less than ~1Mbps. No point in rebalancing. */
898 /* 'from' is carrying significantly more load than 'to'. Pick a hash
899 * to move from 'from' to 'to'. */
900 e = choose_entry_to_migrate(from, to->tx_bytes);
902 bond_shift_load(e, to);
904 /* Delete element from from->entries.
906 * We don't add the element to to->hashes. That would only allow
907 * 'e' to be migrated to another slave in this rebalancing run, and
908 * there is no point in doing that. */
909 list_remove(&e->list_node);
911 /* Re-sort 'bals'. */
912 reinsert_bal(&bals, from);
913 reinsert_bal(&bals, to);
915 /* Can't usefully migrate anything away from 'from'.
916 * Don't reconsider it. */
917 list_remove(&from->bal_node);
921 /* Implement exponentially weighted moving average. A weight of 1/2 causes
922 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
923 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
924 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
930 ovs_rwlock_unlock(&rwlock);
933 /* Bonding unixctl user interface functions. */
936 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
940 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
942 if (!strcmp(bond->name, name)) {
949 static struct bond_slave *
950 bond_lookup_slave(struct bond *bond, const char *slave_name)
952 struct bond_slave *slave;
954 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
955 if (!strcmp(slave->name, slave_name)) {
963 bond_unixctl_list(struct unixctl_conn *conn,
964 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
965 void *aux OVS_UNUSED)
967 struct ds ds = DS_EMPTY_INITIALIZER;
968 const struct bond *bond;
970 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
972 ovs_rwlock_rdlock(&rwlock);
973 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
974 const struct bond_slave *slave;
977 ds_put_format(&ds, "%s\t%s\t",
978 bond->name, bond_mode_to_string(bond->balance));
981 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
983 ds_put_cstr(&ds, ", ");
985 ds_put_cstr(&ds, slave->name);
987 ds_put_char(&ds, '\n');
989 ovs_rwlock_unlock(&rwlock);
990 unixctl_command_reply(conn, ds_cstr(&ds));
995 bond_print_details(struct ds *ds, const struct bond *bond)
996 OVS_REQ_RDLOCK(rwlock)
998 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
999 const struct shash_node **sorted_slaves = NULL;
1000 const struct bond_slave *slave;
1003 ds_put_format(ds, "---- %s ----\n", bond->name);
1004 ds_put_format(ds, "bond_mode: %s\n",
1005 bond_mode_to_string(bond->balance));
1007 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1009 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1010 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1012 if (bond_is_balanced(bond)) {
1013 ds_put_format(ds, "next rebalance: %lld ms\n",
1014 bond->next_rebalance - time_msec());
1017 ds_put_cstr(ds, "lacp_status: ");
1018 switch (bond->lacp_status) {
1019 case LACP_NEGOTIATED:
1020 ds_put_cstr(ds, "negotiated\n");
1022 case LACP_CONFIGURED:
1023 ds_put_cstr(ds, "configured\n");
1026 ds_put_cstr(ds, "off\n");
1029 ds_put_cstr(ds, "<unknown>\n");
1033 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1034 shash_add(&slave_shash, slave->name, slave);
1036 sorted_slaves = shash_sort(&slave_shash);
1038 for (i = 0; i < shash_count(&slave_shash); i++) {
1039 struct bond_entry *be;
1041 slave = sorted_slaves[i]->data;
1044 ds_put_format(ds, "\nslave %s: %s\n",
1045 slave->name, slave->enabled ? "enabled" : "disabled");
1046 if (slave == bond->active_slave) {
1047 ds_put_cstr(ds, "\tactive slave\n");
1049 if (slave->delay_expires != LLONG_MAX) {
1050 ds_put_format(ds, "\t%s expires in %lld ms\n",
1051 slave->enabled ? "downdelay" : "updelay",
1052 slave->delay_expires - time_msec());
1055 ds_put_format(ds, "\tmay_enable: %s\n",
1056 slave->may_enable ? "true" : "false");
1058 if (!bond_is_balanced(bond)) {
1063 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1064 int hash = be - bond->hash;
1066 if (be->slave != slave) {
1070 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1071 hash, be->tx_bytes / 1024);
1073 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1076 shash_destroy(&slave_shash);
1077 free(sorted_slaves);
1078 ds_put_cstr(ds, "\n");
1082 bond_unixctl_show(struct unixctl_conn *conn,
1083 int argc, const char *argv[],
1084 void *aux OVS_UNUSED)
1086 struct ds ds = DS_EMPTY_INITIALIZER;
1088 ovs_rwlock_rdlock(&rwlock);
1090 const struct bond *bond = bond_find(argv[1]);
1093 unixctl_command_reply_error(conn, "no such bond");
1096 bond_print_details(&ds, bond);
1098 const struct bond *bond;
1100 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1101 bond_print_details(&ds, bond);
1105 unixctl_command_reply(conn, ds_cstr(&ds));
1109 ovs_rwlock_unlock(&rwlock);
1113 bond_unixctl_migrate(struct unixctl_conn *conn,
1114 int argc OVS_UNUSED, const char *argv[],
1115 void *aux OVS_UNUSED)
1117 const char *bond_s = argv[1];
1118 const char *hash_s = argv[2];
1119 const char *slave_s = argv[3];
1121 struct bond_slave *slave;
1122 struct bond_entry *entry;
1125 ovs_rwlock_wrlock(&rwlock);
1126 bond = bond_find(bond_s);
1128 unixctl_command_reply_error(conn, "no such bond");
1132 if (bond->balance != BM_SLB) {
1133 unixctl_command_reply_error(conn, "not an SLB bond");
1137 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1138 hash = atoi(hash_s) & BOND_MASK;
1140 unixctl_command_reply_error(conn, "bad hash");
1144 slave = bond_lookup_slave(bond, slave_s);
1146 unixctl_command_reply_error(conn, "no such slave");
1150 if (!slave->enabled) {
1151 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1155 entry = &bond->hash[hash];
1156 bond->bond_revalidate = true;
1157 entry->slave = slave;
1158 unixctl_command_reply(conn, "migrated");
1161 ovs_rwlock_unlock(&rwlock);
1165 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1166 int argc OVS_UNUSED, const char *argv[],
1167 void *aux OVS_UNUSED)
1169 const char *bond_s = argv[1];
1170 const char *slave_s = argv[2];
1172 struct bond_slave *slave;
1174 ovs_rwlock_wrlock(&rwlock);
1175 bond = bond_find(bond_s);
1177 unixctl_command_reply_error(conn, "no such bond");
1181 slave = bond_lookup_slave(bond, slave_s);
1183 unixctl_command_reply_error(conn, "no such slave");
1187 if (!slave->enabled) {
1188 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1192 if (bond->active_slave != slave) {
1193 bond->bond_revalidate = true;
1194 bond->active_slave = slave;
1195 VLOG_INFO("bond %s: active interface is now %s",
1196 bond->name, slave->name);
1197 bond->send_learning_packets = true;
1198 unixctl_command_reply(conn, "done");
1200 unixctl_command_reply(conn, "no change");
1203 ovs_rwlock_unlock(&rwlock);
1207 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1209 const char *bond_s = argv[1];
1210 const char *slave_s = argv[2];
1212 struct bond_slave *slave;
1214 ovs_rwlock_wrlock(&rwlock);
1215 bond = bond_find(bond_s);
1217 unixctl_command_reply_error(conn, "no such bond");
1221 slave = bond_lookup_slave(bond, slave_s);
1223 unixctl_command_reply_error(conn, "no such slave");
1227 bond_enable_slave(slave, enable);
1228 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1231 ovs_rwlock_unlock(&rwlock);
1235 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1236 int argc OVS_UNUSED, const char *argv[],
1237 void *aux OVS_UNUSED)
1239 enable_slave(conn, argv, true);
1243 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1244 int argc OVS_UNUSED, const char *argv[],
1245 void *aux OVS_UNUSED)
1247 enable_slave(conn, argv, false);
1251 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1252 void *aux OVS_UNUSED)
1254 const char *mac_s = argv[1];
1255 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1256 const char *basis_s = argc > 3 ? argv[3] : NULL;
1257 uint8_t mac[ETH_ADDR_LEN];
1264 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1265 unixctl_command_reply_error(conn, "invalid vlan");
1273 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1274 unixctl_command_reply_error(conn, "invalid basis");
1281 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1282 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1284 hash_cstr = xasprintf("%u", hash);
1285 unixctl_command_reply(conn, hash_cstr);
1288 unixctl_command_reply_error(conn, "invalid mac");
1295 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1296 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1298 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1299 bond_unixctl_migrate, NULL);
1300 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1301 bond_unixctl_set_active_slave, NULL);
1302 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1303 bond_unixctl_enable_slave, NULL);
1304 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1305 bond_unixctl_disable_slave, NULL);
1306 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1307 bond_unixctl_hash, NULL);
1311 bond_entry_reset(struct bond *bond)
1313 if (bond->balance != BM_AB) {
1314 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1317 bond->hash = xmalloc(hash_len);
1319 memset(bond->hash, 0, hash_len);
1321 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1328 static struct bond_slave *
1329 bond_slave_lookup(struct bond *bond, const void *slave_)
1331 struct bond_slave *slave;
1333 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1335 if (slave->aux == slave_) {
1344 bond_enable_slave(struct bond_slave *slave, bool enable)
1346 slave->delay_expires = LLONG_MAX;
1347 if (enable != slave->enabled) {
1348 slave->bond->bond_revalidate = true;
1349 slave->enabled = enable;
1350 VLOG_INFO("interface %s: %s", slave->name,
1351 slave->enabled ? "enabled" : "disabled");
1356 bond_link_status_update(struct bond_slave *slave)
1358 struct bond *bond = slave->bond;
1361 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1362 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1363 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1364 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1365 slave->name, up ? "up" : "down");
1366 if (up == slave->enabled) {
1367 slave->delay_expires = LLONG_MAX;
1368 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1369 slave->name, up ? "disabled" : "enabled");
1371 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1372 : up ? bond->updelay : bond->downdelay);
1373 slave->delay_expires = time_msec() + delay;
1375 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1378 up ? "enabled" : "disabled",
1385 if (time_msec() >= slave->delay_expires) {
1386 bond_enable_slave(slave, up);
1391 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1393 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1397 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1399 struct flow hash_flow = *flow;
1400 hash_flow.vlan_tci = htons(vlan);
1402 /* The symmetric quality of this hash function is not required, but
1403 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1404 * purposes, so we use it out of convenience. */
1405 return flow_hash_symmetric_l4(&hash_flow, basis);
1409 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1411 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1413 return (bond->balance == BM_TCP
1414 ? bond_hash_tcp(flow, vlan, bond->basis)
1415 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1418 static struct bond_entry *
1419 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1422 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1425 static struct bond_slave *
1426 choose_output_slave(const struct bond *bond, const struct flow *flow,
1427 struct flow_wildcards *wc, uint16_t vlan)
1429 struct bond_entry *e;
1432 balance = bond->balance;
1433 if (bond->lacp_status == LACP_CONFIGURED) {
1434 /* LACP has been configured on this bond but negotiations were
1435 * unsuccussful. If lacp_fallback_ab is enabled use active-
1436 * backup mode else drop all traffic. */
1437 if (!bond->lacp_fallback_ab) {
1445 return bond->active_slave;
1448 if (bond->lacp_status != LACP_NEGOTIATED) {
1449 /* Must have LACP negotiations for TCP balanced bonds. */
1453 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1458 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1460 e = lookup_bond_entry(bond, flow, vlan);
1461 if (!e->slave || !e->slave->enabled) {
1462 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1463 struct bond_slave, hmap_node);
1464 if (!e->slave->enabled) {
1465 e->slave = bond->active_slave;
1475 static struct bond_slave *
1476 bond_choose_slave(const struct bond *bond)
1478 struct bond_slave *slave, *best;
1480 /* Find an enabled slave. */
1481 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1482 if (slave->enabled) {
1487 /* All interfaces are disabled. Find an interface that will be enabled
1488 * after its updelay expires. */
1490 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1491 if (slave->delay_expires != LLONG_MAX
1492 && slave->may_enable
1493 && (!best || slave->delay_expires < best->delay_expires)) {
1501 bond_choose_active_slave(struct bond *bond)
1503 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1504 struct bond_slave *old_active_slave = bond->active_slave;
1506 bond->active_slave = bond_choose_slave(bond);
1507 if (bond->active_slave) {
1508 if (bond->active_slave->enabled) {
1509 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1510 bond->name, bond->active_slave->name);
1512 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1513 "remaining %lld ms updelay (since no interface was "
1514 "enabled)", bond->name, bond->active_slave->name,
1515 bond->active_slave->delay_expires - time_msec());
1516 bond_enable_slave(bond->active_slave, true);
1519 bond->send_learning_packets = true;
1520 } else if (old_active_slave) {
1521 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1525 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1526 * bond interface. */
1528 bond_update_fake_slave_stats(struct bond *bond)
1530 struct netdev_stats bond_stats;
1531 struct bond_slave *slave;
1532 struct netdev *bond_dev;
1534 memset(&bond_stats, 0, sizeof bond_stats);
1536 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1537 struct netdev_stats slave_stats;
1539 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1540 /* XXX: We swap the stats here because they are swapped back when
1541 * reported by the internal device. The reason for this is
1542 * internal devices normally represent packets going into the
1543 * system but when used as fake bond device they represent packets
1544 * leaving the system. We really should do this in the internal
1545 * device itself because changing it here reverses the counts from
1546 * the perspective of the switch. However, the internal device
1547 * doesn't know what type of device it represents so we have to do
1548 * it here for now. */
1549 bond_stats.tx_packets += slave_stats.rx_packets;
1550 bond_stats.tx_bytes += slave_stats.rx_bytes;
1551 bond_stats.rx_packets += slave_stats.tx_packets;
1552 bond_stats.rx_bytes += slave_stats.tx_bytes;
1556 if (!netdev_open(bond->name, "system", &bond_dev)) {
1557 netdev_set_stats(bond_dev, &bond_stats);
1558 netdev_close(bond_dev);