2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "connectivity.h"
28 #include "dynamic-string.h"
37 #include "poll-loop.h"
44 VLOG_DEFINE_THIS_MODULE(bond);
46 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
47 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
48 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
50 /* Bit-mask for hashing a flow down to a bucket.
51 * There are (BOND_MASK + 1) buckets. */
52 #define BOND_MASK 0xff
54 /* A hash bucket for mapping a flow to a slave.
55 * "struct bond" has an array of (BOND_MASK + 1) of these. */
57 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
58 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
59 struct list list_node; /* In bond_slave's 'entries' list. */
62 /* A bond slave, that is, one of the links comprising a bond. */
64 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
65 struct list list_node; /* In struct bond's enabled_slaves list. */
66 struct bond *bond; /* The bond that contains this slave. */
67 void *aux; /* Client-provided handle for this slave. */
69 struct netdev *netdev; /* Network device, owned by the client. */
70 unsigned int change_seq; /* Tracks changes in 'netdev'. */
71 char *name; /* Name (a copy of netdev_get_name(netdev)). */
74 long long delay_expires; /* Time after which 'enabled' may change. */
75 bool enabled; /* May be chosen for flows? */
76 bool may_enable; /* Client considers this slave bondable. */
78 /* Rebalancing info. Used only by bond_rebalance(). */
79 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
80 struct list entries; /* 'struct bond_entry's assigned here. */
81 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
84 /* A bond, that is, a set of network devices grouped to improve performance or
87 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
88 char *name; /* Name provided by client. */
95 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
96 * (To prevent the bond_slave from disappearing they must also hold
98 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
99 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
102 enum bond_mode balance; /* Balancing mode, one of BM_*. */
103 struct bond_slave *active_slave;
104 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
105 enum lacp_status lacp_status; /* Status of LACP negotiations. */
106 bool bond_revalidate; /* True if flows need revalidation. */
107 uint32_t basis; /* Basis for flow hash function. */
109 /* SLB specific bonding info. */
110 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
111 int rebalance_interval; /* Interval between rebalances, in ms. */
112 long long int next_rebalance; /* Next rebalancing time. */
113 bool send_learning_packets;
115 /* Legacy compatibility. */
116 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
117 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
119 struct ovs_refcount ref_cnt;
122 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
123 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
124 OVS_REQ_RDLOCK(rwlock);
125 static void bond_enable_slave(struct bond_slave *, bool enable)
126 OVS_REQ_WRLOCK(rwlock);
127 static void bond_link_status_update(struct bond_slave *)
128 OVS_REQ_WRLOCK(rwlock);
129 static void bond_choose_active_slave(struct bond *)
130 OVS_REQ_WRLOCK(rwlock);;
131 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
132 uint16_t vlan, uint32_t basis);
133 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
135 static struct bond_entry *lookup_bond_entry(const struct bond *,
138 OVS_REQ_RDLOCK(rwlock);
139 static struct bond_slave *get_enabled_slave(struct bond *)
140 OVS_REQ_RDLOCK(rwlock);
141 static struct bond_slave *choose_output_slave(const struct bond *,
143 struct flow_wildcards *,
145 OVS_REQ_RDLOCK(rwlock);
146 static void bond_update_fake_slave_stats(struct bond *)
147 OVS_REQ_RDLOCK(rwlock);
149 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
150 * stores the mode in '*balance' and returns true. Otherwise returns false
151 * without modifying '*balance'. */
153 bond_mode_from_string(enum bond_mode *balance, const char *s)
155 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
157 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
159 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
167 /* Returns a string representing 'balance'. */
169 bond_mode_to_string(enum bond_mode balance) {
172 return "balance-tcp";
174 return "balance-slb";
176 return "active-backup";
182 /* Creates and returns a new bond whose configuration is initially taken from
185 * The caller should register each slave on the new bond by calling
186 * bond_slave_register(). */
188 bond_create(const struct bond_settings *s)
192 bond = xzalloc(sizeof *bond);
193 hmap_init(&bond->slaves);
194 list_init(&bond->enabled_slaves);
195 ovs_mutex_init(&bond->mutex);
196 bond->next_fake_iface_update = LLONG_MAX;
197 ovs_refcount_init(&bond->ref_cnt);
199 bond_reconfigure(bond, s);
204 bond_ref(const struct bond *bond_)
206 struct bond *bond = CONST_CAST(struct bond *, bond_);
209 ovs_refcount_ref(&bond->ref_cnt);
216 bond_unref(struct bond *bond)
218 struct bond_slave *slave, *next_slave;
220 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
224 ovs_rwlock_wrlock(&rwlock);
225 hmap_remove(all_bonds, &bond->hmap_node);
226 ovs_rwlock_unlock(&rwlock);
228 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
229 hmap_remove(&bond->slaves, &slave->hmap_node);
230 /* Client owns 'slave->netdev'. */
234 hmap_destroy(&bond->slaves);
236 ovs_mutex_destroy(&bond->mutex);
239 ovs_refcount_destroy(&bond->ref_cnt);
243 /* Updates 'bond''s overall configuration to 's'.
245 * The caller should register each slave on 'bond' by calling
246 * bond_slave_register(). This is optional if none of the slaves'
247 * configuration has changed. In any case it can't hurt.
249 * Returns true if the configuration has changed in such a way that requires
253 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
255 bool revalidate = false;
257 ovs_rwlock_wrlock(&rwlock);
258 if (!bond->name || strcmp(bond->name, s->name)) {
260 hmap_remove(all_bonds, &bond->hmap_node);
263 bond->name = xstrdup(s->name);
264 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
267 bond->updelay = s->up_delay;
268 bond->downdelay = s->down_delay;
270 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
271 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
275 if (bond->rebalance_interval != s->rebalance_interval) {
276 bond->rebalance_interval = s->rebalance_interval;
280 if (bond->balance != s->balance) {
281 bond->balance = s->balance;
285 if (bond->basis != s->basis) {
286 bond->basis = s->basis;
291 if (bond->next_fake_iface_update == LLONG_MAX) {
292 bond->next_fake_iface_update = time_msec();
295 bond->next_fake_iface_update = LLONG_MAX;
298 if (bond->bond_revalidate) {
300 bond->bond_revalidate = false;
303 if (bond->balance == BM_AB || !bond->hash || revalidate) {
304 bond_entry_reset(bond);
307 ovs_rwlock_unlock(&rwlock);
312 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
313 OVS_REQ_WRLOCK(rwlock)
315 if (slave->netdev != netdev) {
316 slave->netdev = netdev;
317 slave->change_seq = 0;
321 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
322 * arbitrary client-provided pointer that uniquely identifies a slave within a
323 * bond. If 'slave_' already exists within 'bond' then this function
324 * reconfigures the existing slave.
326 * 'netdev' must be the network device that 'slave_' represents. It is owned
327 * by the client, so the client must not close it before either unregistering
328 * 'slave_' or destroying 'bond'.
331 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
333 struct bond_slave *slave;
335 ovs_rwlock_wrlock(&rwlock);
336 slave = bond_slave_lookup(bond, slave_);
338 slave = xzalloc(sizeof *slave);
340 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
343 slave->delay_expires = LLONG_MAX;
344 slave->name = xstrdup(netdev_get_name(netdev));
345 bond->bond_revalidate = true;
347 slave->enabled = false;
348 bond_enable_slave(slave, netdev_get_carrier(netdev));
351 bond_slave_set_netdev__(slave, netdev);
354 slave->name = xstrdup(netdev_get_name(netdev));
355 ovs_rwlock_unlock(&rwlock);
358 /* Updates the network device to be used with 'slave_' to 'netdev'.
360 * This is useful if the caller closes and re-opens the network device
361 * registered with bond_slave_register() but doesn't need to change anything
364 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
366 struct bond_slave *slave;
368 ovs_rwlock_wrlock(&rwlock);
369 slave = bond_slave_lookup(bond, slave_);
371 bond_slave_set_netdev__(slave, netdev);
373 ovs_rwlock_unlock(&rwlock);
376 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
377 * then this function has no effect.
379 * Unregistering a slave invalidates all flows. */
381 bond_slave_unregister(struct bond *bond, const void *slave_)
383 struct bond_slave *slave;
386 ovs_rwlock_wrlock(&rwlock);
387 slave = bond_slave_lookup(bond, slave_);
392 bond->bond_revalidate = true;
393 bond_enable_slave(slave, false);
395 del_active = bond->active_slave == slave;
397 struct bond_entry *e;
398 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
399 if (e->slave == slave) {
407 hmap_remove(&bond->slaves, &slave->hmap_node);
408 /* Client owns 'slave->netdev'. */
412 bond_choose_active_slave(bond);
413 bond->send_learning_packets = true;
416 ovs_rwlock_unlock(&rwlock);
419 /* Should be called on each slave in 'bond' before bond_run() to indicate
420 * whether or not 'slave_' may be enabled. This function is intended to allow
421 * other protocols to have some impact on bonding decisions. For example LACP
422 * or high level link monitoring protocols may decide that a given slave should
423 * not be able to send traffic. */
425 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
427 ovs_rwlock_wrlock(&rwlock);
428 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
429 ovs_rwlock_unlock(&rwlock);
432 /* Performs periodic maintenance on 'bond'.
434 * Returns true if the caller should revalidate its flows.
436 * The caller should check bond_should_send_learning_packets() afterward. */
438 bond_run(struct bond *bond, enum lacp_status lacp_status)
440 struct bond_slave *slave;
443 ovs_rwlock_wrlock(&rwlock);
444 if (bond->lacp_status != lacp_status) {
445 bond->lacp_status = lacp_status;
446 bond->bond_revalidate = true;
449 /* Enable slaves based on link status and LACP feedback. */
450 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
451 bond_link_status_update(slave);
452 slave->change_seq = seq_read(connectivity_seq_get());
454 if (!bond->active_slave || !bond->active_slave->enabled) {
455 bond_choose_active_slave(bond);
458 /* Update fake bond interface stats. */
459 if (time_msec() >= bond->next_fake_iface_update) {
460 bond_update_fake_slave_stats(bond);
461 bond->next_fake_iface_update = time_msec() + 1000;
464 revalidate = bond->bond_revalidate;
465 bond->bond_revalidate = false;
466 ovs_rwlock_unlock(&rwlock);
471 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
473 bond_wait(struct bond *bond)
475 struct bond_slave *slave;
477 ovs_rwlock_rdlock(&rwlock);
478 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
479 if (slave->delay_expires != LLONG_MAX) {
480 poll_timer_wait_until(slave->delay_expires);
483 seq_wait(connectivity_seq_get(), slave->change_seq);
486 if (bond->next_fake_iface_update != LLONG_MAX) {
487 poll_timer_wait_until(bond->next_fake_iface_update);
490 if (bond->bond_revalidate) {
491 poll_immediate_wake();
493 ovs_rwlock_unlock(&rwlock);
495 /* We don't wait for bond->next_rebalance because rebalancing can only run
496 * at a flow account checkpoint. ofproto does checkpointing on its own
497 * schedule and bond_rebalance() gets called afterward, so we'd just be
498 * waking up for no purpose. */
501 /* MAC learning table interaction. */
504 may_send_learning_packets(const struct bond *bond)
506 return ((bond->lacp_status == LACP_DISABLED
507 && (bond->balance == BM_SLB || bond->balance == BM_AB))
508 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
509 && bond->active_slave;
512 /* Returns true if 'bond' needs the client to send out packets to assist with
513 * MAC learning on 'bond'. If this function returns true, then the client
514 * should iterate through its MAC learning table for the bridge on which 'bond'
515 * is located. For each MAC that has been learned on a port other than 'bond',
516 * it should call bond_compose_learning_packet().
518 * This function will only return true if 'bond' is in SLB or active-backup
519 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
522 * Calling this function resets the state that it checks. */
524 bond_should_send_learning_packets(struct bond *bond)
528 ovs_rwlock_wrlock(&rwlock);
529 send = bond->send_learning_packets && may_send_learning_packets(bond);
530 bond->send_learning_packets = false;
531 ovs_rwlock_unlock(&rwlock);
535 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
537 * See bond_should_send_learning_packets() for description of usage. The
538 * caller should send the composed packet on the port associated with
539 * port_aux and takes ownership of the returned ofpbuf. */
541 bond_compose_learning_packet(struct bond *bond,
542 const uint8_t eth_src[ETH_ADDR_LEN],
543 uint16_t vlan, void **port_aux)
545 struct bond_slave *slave;
546 struct ofpbuf *packet;
549 ovs_rwlock_rdlock(&rwlock);
550 ovs_assert(may_send_learning_packets(bond));
551 memset(&flow, 0, sizeof flow);
552 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
553 slave = choose_output_slave(bond, &flow, NULL, vlan);
555 packet = ofpbuf_new(0);
556 compose_rarp(packet, eth_src);
558 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
561 *port_aux = slave->aux;
562 ovs_rwlock_unlock(&rwlock);
566 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
567 * Ethernet destination address of 'eth_dst', should be admitted.
569 * The return value is one of the following:
571 * - BV_ACCEPT: Admit the packet.
573 * - BV_DROP: Drop the packet.
575 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
576 * Ethernet source address and VLAN. If there is none, or if the packet
577 * is on the learned port, then admit the packet. If a different port has
578 * been learned, however, drop the packet (and do not use it for MAC
582 bond_check_admissibility(struct bond *bond, const void *slave_,
583 const uint8_t eth_dst[ETH_ADDR_LEN])
585 enum bond_verdict verdict = BV_DROP;
586 struct bond_slave *slave;
588 ovs_rwlock_rdlock(&rwlock);
589 slave = bond_slave_lookup(bond, slave_);
594 /* LACP bonds have very loose admissibility restrictions because we can
595 * assume the remote switch is aware of the bond and will "do the right
596 * thing". However, as a precaution we drop packets on disabled slaves
597 * because no correctly implemented partner switch should be sending
600 * If LACP is configured, but LACP negotiations have been unsuccessful, we
601 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
602 switch (bond->lacp_status) {
603 case LACP_NEGOTIATED:
604 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
606 case LACP_CONFIGURED:
607 if (!bond->lacp_fallback_ab) {
614 /* Drop all multicast packets on inactive slaves. */
615 if (eth_addr_is_multicast(eth_dst)) {
616 if (bond->active_slave != slave) {
621 switch (bond->balance) {
623 /* TCP balanced bonds require successful LACP negotiations. Based on the
624 * above check, LACP is off or lacp_fallback_ab is true on this bond.
625 * If lacp_fallback_ab is true, fall through to the BM_AB case; otherwise
626 * drop all incoming traffic. */
627 if (!bond->lacp_fallback_ab) {
632 /* Drop all packets which arrive on backup slaves. This is similar to
633 * how Linux bonding handles active-backup bonds. */
634 if (bond->active_slave != slave) {
635 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
637 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
638 " slave (%s) destined for " ETH_ADDR_FMT,
639 slave->name, ETH_ADDR_ARGS(eth_dst));
646 /* Drop all packets for which we have learned a different input port,
647 * because we probably sent the packet on one slave and got it back on
648 * the other. Gratuitous ARP packets are an exception to this rule:
649 * the host has moved to another switch. The exception to the
650 * exception is if we locked the learning table to avoid reflections on
652 verdict = BV_DROP_IF_MOVED;
658 ovs_rwlock_unlock(&rwlock);
663 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
664 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
665 * NULL if the packet should be dropped because no slaves are enabled.
667 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
668 * should be a VID only (i.e. excluding the PCP bits). Second,
669 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
670 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
671 * packet belongs to (so for an access port it will be the access port's VLAN).
673 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
674 * significant in the selection. At some point earlier, 'wc' should
675 * have been initialized (e.g., by flow_wildcards_init_catchall()).
678 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
679 struct flow_wildcards *wc, uint16_t vlan)
681 struct bond_slave *slave;
684 ovs_rwlock_rdlock(&rwlock);
685 slave = choose_output_slave(bond, flow, wc, vlan);
686 aux = slave ? slave->aux : NULL;
687 ovs_rwlock_unlock(&rwlock);
695 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
697 return bond->rebalance_interval
698 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
701 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
703 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
706 ovs_rwlock_wrlock(&rwlock);
707 if (bond_is_balanced(bond)) {
708 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
710 ovs_rwlock_unlock(&rwlock);
713 static struct bond_slave *
714 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
716 return CONTAINER_OF(bal, struct bond_slave, bal_node);
720 log_bals(struct bond *bond, const struct list *bals)
722 if (VLOG_IS_DBG_ENABLED()) {
723 struct ds ds = DS_EMPTY_INITIALIZER;
724 const struct bond_slave *slave;
726 LIST_FOR_EACH (slave, bal_node, bals) {
728 ds_put_char(&ds, ',');
730 ds_put_format(&ds, " %s %"PRIu64"kB",
731 slave->name, slave->tx_bytes / 1024);
733 if (!slave->enabled) {
734 ds_put_cstr(&ds, " (disabled)");
736 if (!list_is_empty(&slave->entries)) {
737 struct bond_entry *e;
739 ds_put_cstr(&ds, " (");
740 LIST_FOR_EACH (e, list_node, &slave->entries) {
741 if (&e->list_node != list_front(&slave->entries)) {
742 ds_put_cstr(&ds, " + ");
744 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
745 e - bond->hash, e->tx_bytes / 1024);
747 ds_put_cstr(&ds, ")");
750 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
755 /* Shifts 'hash' from its current slave to 'to'. */
757 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
759 struct bond_slave *from = hash->slave;
760 struct bond *bond = from->bond;
761 uint64_t delta = hash->tx_bytes;
763 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
764 "from %s to %s (now carrying %"PRIu64"kB and "
765 "%"PRIu64"kB load, respectively)",
766 bond->name, delta / 1024, hash - bond->hash,
767 from->name, to->name,
768 (from->tx_bytes - delta) / 1024,
769 (to->tx_bytes + delta) / 1024);
771 /* Shift load away from 'from' to 'to'. */
772 from->tx_bytes -= delta;
773 to->tx_bytes += delta;
775 /* Arrange for flows to be revalidated. */
777 bond->bond_revalidate = true;
780 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
781 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
782 * given that doing so must decrease the ratio of the load on the two slaves by
783 * at least 0.1. Returns NULL if there is no appropriate entry.
785 * The list of entries isn't sorted. I don't know of a reason to prefer to
786 * shift away small hashes or large hashes. */
787 static struct bond_entry *
788 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
790 struct bond_entry *e;
792 if (list_is_short(&from->entries)) {
793 /* 'from' carries no more than one MAC hash, so shifting load away from
794 * it would be pointless. */
798 LIST_FOR_EACH (e, list_node, &from->entries) {
799 double old_ratio, new_ratio;
802 if (to_tx_bytes == 0) {
803 /* Nothing on the new slave, move it. */
808 old_ratio = (double)from->tx_bytes / to_tx_bytes;
809 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
810 if (old_ratio - new_ratio > 0.1
811 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
812 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
813 and 'to' slave have the same load. Therefore, we only move an
814 entry if it decreases the load on 'from', and brings us closer
815 to equal traffic load. */
823 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
826 insert_bal(struct list *bals, struct bond_slave *slave)
828 struct bond_slave *pos;
830 LIST_FOR_EACH (pos, bal_node, bals) {
831 if (slave->tx_bytes > pos->tx_bytes) {
835 list_insert(&pos->bal_node, &slave->bal_node);
838 /* Removes 'slave' from its current list and then inserts it into 'bals' so
839 * that descending order of 'tx_bytes' is maintained. */
841 reinsert_bal(struct list *bals, struct bond_slave *slave)
843 list_remove(&slave->bal_node);
844 insert_bal(bals, slave);
847 /* If 'bond' needs rebalancing, does so.
849 * The caller should have called bond_account() for each active flow, to ensure
850 * that flow data is consistently accounted at this point. */
852 bond_rebalance(struct bond *bond)
854 struct bond_slave *slave;
855 struct bond_entry *e;
858 ovs_rwlock_wrlock(&rwlock);
859 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
860 ovs_rwlock_unlock(&rwlock);
863 bond->next_rebalance = time_msec() + bond->rebalance_interval;
865 /* Add each bond_entry to its slave's 'entries' list.
866 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
867 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
869 list_init(&slave->entries);
871 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
872 if (e->slave && e->tx_bytes) {
873 e->slave->tx_bytes += e->tx_bytes;
874 list_push_back(&e->slave->entries, &e->list_node);
878 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
880 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
881 * with a proper list sort algorithm. */
883 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
884 if (slave->enabled) {
885 insert_bal(&bals, slave);
888 log_bals(bond, &bals);
890 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
891 while (!list_is_short(&bals)) {
892 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
893 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
896 overload = from->tx_bytes - to->tx_bytes;
897 if (overload < to->tx_bytes >> 5 || overload < 100000) {
898 /* The extra load on 'from' (and all less-loaded slaves), compared
899 * to that of 'to' (the least-loaded slave), is less than ~3%, or
900 * it is less than ~1Mbps. No point in rebalancing. */
904 /* 'from' is carrying significantly more load than 'to'. Pick a hash
905 * to move from 'from' to 'to'. */
906 e = choose_entry_to_migrate(from, to->tx_bytes);
908 bond_shift_load(e, to);
910 /* Delete element from from->entries.
912 * We don't add the element to to->entries. That would only allow
913 * 'e' to be migrated to another slave in this rebalancing run, and
914 * there is no point in doing that. */
915 list_remove(&e->list_node);
917 /* Re-sort 'bals'. */
918 reinsert_bal(&bals, from);
919 reinsert_bal(&bals, to);
921 /* Can't usefully migrate anything away from 'from'.
922 * Don't reconsider it. */
923 list_remove(&from->bal_node);
927 /* Implement exponentially weighted moving average. A weight of 1/2 causes
928 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
929 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
930 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
936 ovs_rwlock_unlock(&rwlock);
939 /* Bonding unixctl user interface functions. */
942 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
946 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
948 if (!strcmp(bond->name, name)) {
955 static struct bond_slave *
956 bond_lookup_slave(struct bond *bond, const char *slave_name)
958 struct bond_slave *slave;
960 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
961 if (!strcmp(slave->name, slave_name)) {
969 bond_unixctl_list(struct unixctl_conn *conn,
970 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
971 void *aux OVS_UNUSED)
973 struct ds ds = DS_EMPTY_INITIALIZER;
974 const struct bond *bond;
976 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
978 ovs_rwlock_rdlock(&rwlock);
979 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
980 const struct bond_slave *slave;
983 ds_put_format(&ds, "%s\t%s\t",
984 bond->name, bond_mode_to_string(bond->balance));
987 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
989 ds_put_cstr(&ds, ", ");
991 ds_put_cstr(&ds, slave->name);
993 ds_put_char(&ds, '\n');
995 ovs_rwlock_unlock(&rwlock);
996 unixctl_command_reply(conn, ds_cstr(&ds));
1001 bond_print_details(struct ds *ds, const struct bond *bond)
1002 OVS_REQ_RDLOCK(rwlock)
1004 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1005 const struct shash_node **sorted_slaves = NULL;
1006 const struct bond_slave *slave;
1009 ds_put_format(ds, "---- %s ----\n", bond->name);
1010 ds_put_format(ds, "bond_mode: %s\n",
1011 bond_mode_to_string(bond->balance));
1013 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1015 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1016 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1018 if (bond_is_balanced(bond)) {
1019 ds_put_format(ds, "next rebalance: %lld ms\n",
1020 bond->next_rebalance - time_msec());
1023 ds_put_cstr(ds, "lacp_status: ");
1024 switch (bond->lacp_status) {
1025 case LACP_NEGOTIATED:
1026 ds_put_cstr(ds, "negotiated\n");
1028 case LACP_CONFIGURED:
1029 ds_put_cstr(ds, "configured\n");
1032 ds_put_cstr(ds, "off\n");
1035 ds_put_cstr(ds, "<unknown>\n");
1039 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1040 shash_add(&slave_shash, slave->name, slave);
1042 sorted_slaves = shash_sort(&slave_shash);
1044 for (i = 0; i < shash_count(&slave_shash); i++) {
1045 struct bond_entry *be;
1047 slave = sorted_slaves[i]->data;
1050 ds_put_format(ds, "\nslave %s: %s\n",
1051 slave->name, slave->enabled ? "enabled" : "disabled");
1052 if (slave == bond->active_slave) {
1053 ds_put_cstr(ds, "\tactive slave\n");
1055 if (slave->delay_expires != LLONG_MAX) {
1056 ds_put_format(ds, "\t%s expires in %lld ms\n",
1057 slave->enabled ? "downdelay" : "updelay",
1058 slave->delay_expires - time_msec());
1061 ds_put_format(ds, "\tmay_enable: %s\n",
1062 slave->may_enable ? "true" : "false");
1064 if (!bond_is_balanced(bond)) {
1069 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1070 int hash = be - bond->hash;
1072 if (be->slave != slave) {
1076 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1077 hash, be->tx_bytes / 1024);
1079 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1082 shash_destroy(&slave_shash);
1083 free(sorted_slaves);
1084 ds_put_cstr(ds, "\n");
1088 bond_unixctl_show(struct unixctl_conn *conn,
1089 int argc, const char *argv[],
1090 void *aux OVS_UNUSED)
1092 struct ds ds = DS_EMPTY_INITIALIZER;
1094 ovs_rwlock_rdlock(&rwlock);
1096 const struct bond *bond = bond_find(argv[1]);
1099 unixctl_command_reply_error(conn, "no such bond");
1102 bond_print_details(&ds, bond);
1104 const struct bond *bond;
1106 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1107 bond_print_details(&ds, bond);
1111 unixctl_command_reply(conn, ds_cstr(&ds));
1115 ovs_rwlock_unlock(&rwlock);
1119 bond_unixctl_migrate(struct unixctl_conn *conn,
1120 int argc OVS_UNUSED, const char *argv[],
1121 void *aux OVS_UNUSED)
1123 const char *bond_s = argv[1];
1124 const char *hash_s = argv[2];
1125 const char *slave_s = argv[3];
1127 struct bond_slave *slave;
1128 struct bond_entry *entry;
1131 ovs_rwlock_wrlock(&rwlock);
1132 bond = bond_find(bond_s);
1134 unixctl_command_reply_error(conn, "no such bond");
1138 if (bond->balance != BM_SLB) {
1139 unixctl_command_reply_error(conn, "not an SLB bond");
1143 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1144 hash = atoi(hash_s) & BOND_MASK;
1146 unixctl_command_reply_error(conn, "bad hash");
1150 slave = bond_lookup_slave(bond, slave_s);
1152 unixctl_command_reply_error(conn, "no such slave");
1156 if (!slave->enabled) {
1157 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1161 entry = &bond->hash[hash];
1162 bond->bond_revalidate = true;
1163 entry->slave = slave;
1164 unixctl_command_reply(conn, "migrated");
1167 ovs_rwlock_unlock(&rwlock);
1171 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1172 int argc OVS_UNUSED, const char *argv[],
1173 void *aux OVS_UNUSED)
1175 const char *bond_s = argv[1];
1176 const char *slave_s = argv[2];
1178 struct bond_slave *slave;
1180 ovs_rwlock_wrlock(&rwlock);
1181 bond = bond_find(bond_s);
1183 unixctl_command_reply_error(conn, "no such bond");
1187 slave = bond_lookup_slave(bond, slave_s);
1189 unixctl_command_reply_error(conn, "no such slave");
1193 if (!slave->enabled) {
1194 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1198 if (bond->active_slave != slave) {
1199 bond->bond_revalidate = true;
1200 bond->active_slave = slave;
1201 VLOG_INFO("bond %s: active interface is now %s",
1202 bond->name, slave->name);
1203 bond->send_learning_packets = true;
1204 unixctl_command_reply(conn, "done");
1206 unixctl_command_reply(conn, "no change");
1209 ovs_rwlock_unlock(&rwlock);
1213 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1215 const char *bond_s = argv[1];
1216 const char *slave_s = argv[2];
1218 struct bond_slave *slave;
1220 ovs_rwlock_wrlock(&rwlock);
1221 bond = bond_find(bond_s);
1223 unixctl_command_reply_error(conn, "no such bond");
1227 slave = bond_lookup_slave(bond, slave_s);
1229 unixctl_command_reply_error(conn, "no such slave");
1233 bond_enable_slave(slave, enable);
1234 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1237 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered for "bond/enable-slave" ("port slave"):
 * forwards 'conn' and 'argv' to enable_slave() with 'enable' set to true.
 * 'argc' and 'aux' are unused. */
1241 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1242 int argc OVS_UNUSED, const char *argv[],
1243 void *aux OVS_UNUSED)
1245 enable_slave(conn, argv, true);
/* unixctl handler registered for "bond/disable-slave" ("port slave"):
 * forwards 'conn' and 'argv' to enable_slave() with 'enable' set to false.
 * 'argc' and 'aux' are unused. */
1249 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1250 int argc OVS_UNUSED, const char *argv[],
1251 void *aux OVS_UNUSED)
1253 enable_slave(conn, argv, false);
/* unixctl handler registered for "bond/hash" ("mac [vlan] [basis]").
 *
 * Parses the Ethernet address in argv[1] and, when the arguments are
 * present, a VLAN from argv[2] and a hash basis from argv[3].  On success it
 * replies with bond_hash_src(mac, vlan, basis) & BOND_MASK formatted as an
 * unsigned decimal.  A value that fails to parse produces the error reply
 * "invalid vlan", "invalid basis" or "invalid mac" respectively.
 *
 * NOTE(review): the declarations of 'hash', 'hash_cstr', 'vlan' and 'basis',
 * the values used when the optional arguments are absent, and the release of
 * 'hash_cstr' are not visible in this listing -- confirm them against the
 * complete file. */
1257 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1258 void *aux OVS_UNUSED)
1260 const char *mac_s = argv[1];
/* The optional arguments are NULL when 'argc' shows they were not given. */
1261 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1262 const char *basis_s = argc > 3 ? argv[3] : NULL;
1263 uint8_t mac[ETH_ADDR_LEN];
1270 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1271 unixctl_command_reply_error(conn, "invalid vlan");
1279 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1280 unixctl_command_reply_error(conn, "invalid basis");
/* The MAC is parsed last; only then is the hash computed and masked to a
 * bucket index. */
1287 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1288 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
/* The reply text is built with xasprintf(); presumably it must be released
 * after the reply -- TODO confirm. */
1290 hash_cstr = xasprintf("%u", hash);
1291 unixctl_command_reply(conn, hash_cstr);
1294 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the bond-related unixctl commands and their handlers.  Each call
 * passes the command name, a usage string, two integers and the handler.
 * The two integers appear to be the minimum and maximum number of arguments
 * (for example "[port]" is registered with 0 and 1, "port hash slave" with 3
 * and 3) -- confirm against the unixctl header.
 *
 * NOTE(review): the header of the enclosing function and the final argument
 * of the "bond/show" registration are not visible in this listing. */
1301 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1302 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1304 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1305 bond_unixctl_migrate, NULL);
1306 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1307 bond_unixctl_set_active_slave, NULL);
1308 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1309 bond_unixctl_enable_slave, NULL);
1310 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1311 bond_unixctl_disable_slave, NULL);
1312 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1313 bond_unixctl_hash, NULL);
/* Resets the hash-bucket table of 'bond'.
 *
 * For a bond whose balance mode is not BM_AB, 'bond->hash' is filled with
 * zeros over (BOND_MASK + 1) entries and 'next_rebalance' is set to
 * time_msec() plus the bond's 'rebalance_interval'.
 *
 * NOTE(review): the condition under which the table is allocated and the
 * handling of BM_AB bonds are not visible in this listing -- confirm them
 * against the complete file. */
1317 bond_entry_reset(struct bond *bond)
1319 if (bond->balance != BM_AB) {
/* Size in bytes of the whole table: one entry per possible masked hash. */
1320 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1323 bond->hash = xmalloc(hash_len);
/* Zeroing leaves every entry with a null 'slave' and a zero 'tx_bytes'. */
1325 memset(bond->hash, 0, hash_len);
1327 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Looks for the slave of 'bond' whose client-provided handle 'aux' equals
 * 'slave_'.  Only the hash bucket selected by hash_pointer(slave_, 0) is
 * walked, and candidates are compared by pointer equality on 'aux'.
 *
 * NOTE(review): the hmap being walked and the return statements are not
 * visible in this listing; presumably the match is returned, or NULL when
 * there is none -- TODO confirm. */
1334 static struct bond_slave *
1335 bond_slave_lookup(struct bond *bond, const void *slave_)
1337 struct bond_slave *slave;
1339 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1341 if (slave->aux == slave_) {
/* Puts 'slave' into the enabled state given by 'enable'.
 *
 * 'delay_expires' is always reset to LLONG_MAX, the value that
 * bond_link_status_update() compares against when deciding whether a delayed
 * change is outstanding.  When the state actually changes, the bond's
 * 'bond_revalidate' flag is set, 'enabled' is updated, the bond's
 * 'enabled_slaves' list is modified while holding the bond's mutex, and the
 * new state is logged.
 *
 * NOTE(review): the test choosing between the list insertion and the list
 * removal is not visible in this listing; presumably the slave is inserted
 * when enabling and removed when disabling -- TODO confirm. */
1350 bond_enable_slave(struct bond_slave *slave, bool enable)
1352 slave->delay_expires = LLONG_MAX;
1353 if (enable != slave->enabled) {
1354 slave->bond->bond_revalidate = true;
1355 slave->enabled = enable;
/* The enabled_slaves list is only modified with the bond's mutex held. */
1357 ovs_mutex_lock(&slave->bond->mutex);
1359 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1361 list_remove(&slave->list_node);
1363 ovs_mutex_unlock(&slave->bond->mutex);
1365 VLOG_INFO("interface %s: %s", slave->name,
1366 slave->enabled ? "enabled" : "disabled");
/* Re-evaluates whether 'slave' should be enabled, from its carrier state and
 * its 'may_enable' flag, and maintains the time in 'delay_expires' at which a
 * change of the enabled state is applied.  LLONG_MAX in 'delay_expires' is
 * the value used when no change is outstanding.
 *
 * NOTE(review): the declaration of 'up', the "else" between the two inner
 * cases, any guard around the "will be ..." log message and part of that
 * message are not visible in this listing -- confirm them against the
 * complete file. */
1371 bond_link_status_update(struct bond_slave *slave)
1373 struct bond *bond = slave->bond;
/* The link only counts as up when there is carrier AND 'may_enable' is
 * set. */
1376 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
/* Act only when the outstanding-change state disagrees with the link state:
 * either the link matches 'enabled' again while a change is outstanding, or
 * the link differs from 'enabled' and no change is outstanding yet. */
1377 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1378 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1379 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1380 slave->name, up ? "up" : "down");
/* The link is back in the state matching 'enabled': drop the outstanding
 * change. */
1381 if (up == slave->enabled) {
1382 slave->delay_expires = LLONG_MAX;
1383 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1384 slave->name, up ? "disabled" : "enabled");
/* Otherwise a change is scheduled.  The delay is 0 unless 'lacp_status' is
 * LACP_DISABLED, in which case 'updelay' or 'downdelay' applies depending on
 * the direction of the change. */
1386 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1387 : up ? bond->updelay : bond->downdelay);
1388 slave->delay_expires = time_msec() + delay;
1390 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1393 up ? "enabled" : "disabled",
/* Apply the change once its time has been reached; a delay of 0 is reached
 * immediately. */
1400 if (time_msec() >= slave->delay_expires) {
1401 bond_enable_slave(slave, up);
/* Returns a hash of a source MAC address, VLAN and basis: the ETH_ADDR_LEN
 * bytes of 'mac' are hashed with hash_bytes() (basis 0) and the result is
 * combined with 'vlan' and 'basis' by hash_3words().  Callers in this file
 * mask the result with BOND_MASK to obtain a bucket index. */
1406 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1408 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
/* Returns the hash used for 'flow' by TCP-balanced bonds: a copy of 'flow'
 * has its 'vlan_tci' replaced by htons(vlan) and is then passed, together
 * with 'basis', to flow_hash_symmetric_l4().  'flow' itself is not
 * modified. */
1412 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
/* Work on a local copy so that the caller's flow keeps its own vlan_tci. */
1414 struct flow hash_flow = *flow;
1415 hash_flow.vlan_tci = htons(vlan);
1417 /* The symmetric quality of this hash function is not required, but
1418 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1419 * purposes, so we use it out of convenience. */
1420 return flow_hash_symmetric_l4(&hash_flow, basis);
/* Returns the hash of 'flow' on 'vlan' according to the balance mode of
 * 'bond', using the bond's 'basis': bond_hash_tcp() for BM_TCP and
 * bond_hash_src() on the flow's 'dl_src' otherwise.  Asserts that the
 * balance mode is BM_TCP or BM_SLB, so it must not be called for other
 * modes. */
1424 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1426 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1428 return (bond->balance == BM_TCP
1429 ? bond_hash_tcp(flow, vlan, bond->basis)
1430 : bond_hash_src(flow->dl_src, vlan, bond->basis));
/* Returns the entry of 'bond->hash' that 'flow' maps to: the index is
 * bond_hash() of the flow masked with BOND_MASK.
 *
 * NOTE(review): the remainder of the parameter list ('vlan' is used below)
 * is not visible in this listing. */
1433 static struct bond_entry *
1434 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1437 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
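1440 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1441 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
 * no slave can be selected (presumably NULL is returned; the end of this
 * comment was missing -- TODO confirm against the complete file). */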
/* The bond's mutex is held while 'enabled_slaves' is inspected and rotated.
 *
 * NOTE(review): the declaration of 'node' and the statement that follows the
 * unlock on the empty-list path are not visible in this listing. */
1443 static struct bond_slave *
1444 get_enabled_slave(struct bond *bond)
1448 ovs_mutex_lock(&bond->mutex);
/* Nothing to choose from: release the mutex before leaving. */
1449 if (list_is_empty(&bond->enabled_slaves)) {
1450 ovs_mutex_unlock(&bond->mutex);
/* Rotate the list: the front node is moved to the back, so that successive
 * calls walk through the enabled slaves in turn. */
1454 node = list_pop_front(&bond->enabled_slaves);
1455 list_push_back(&bond->enabled_slaves, node);
1456 ovs_mutex_unlock(&bond->mutex);
/* 'node' is the 'list_node' member embedded in the chosen bond_slave. */
1458 return CONTAINER_OF(node, struct bond_slave, list_node);
/* Chooses the slave of 'bond' to use for 'flow' on 'vlan'.
 *
 * Depending on the effective balance mode, the result is either the bond's
 * 'active_slave' or the slave recorded in the hash entry that the flow maps
 * to.  An entry that has no slave, or whose slave is not enabled, is
 * assigned a slave obtained from get_enabled_slave().  On the hashing paths
 * flow_mask_hash_fields() is called with 'wc' for the fields the hash
 * depends on (symmetric L4 fields, Ethernet source).
 *
 * NOTE(review): the declaration of 'balance', the switch on the balance mode
 * with its case labels, any guards around the flow_mask_hash_fields() calls
 * and the return statements other than the one shown are not visible in this
 * listing -- confirm them against the complete file. */
1461 static struct bond_slave *
1462 choose_output_slave(const struct bond *bond, const struct flow *flow,
1463 struct flow_wildcards *wc, uint16_t vlan)
1465 struct bond_entry *e;
1468 balance = bond->balance;
1469 if (bond->lacp_status == LACP_CONFIGURED) {
1470 /* LACP has been configured on this bond but negotiations were
1471 * unsuccessful. If lacp_fallback_ab is enabled use active-
1472 * backup mode else drop all traffic. */
1473 if (!bond->lacp_fallback_ab) {
1481 return bond->active_slave;
1484 if (bond->lacp_status != LACP_NEGOTIATED) {
1485 /* Must have LACP negotiations for TCP balanced bonds. */
1489 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1494 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1496 e = lookup_bond_entry(bond, flow, vlan);
/* (Re)assign the entry when it has no slave or its slave is not enabled.
 * The const qualifier is cast away because get_enabled_slave() takes a
 * non-const bond. */
1497 if (!e->slave || !e->slave->enabled) {
1498 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
/* Chooses a slave of 'bond' in two passes over the bond's slaves.  The first
 * pass looks for a slave that is enabled.  The second pass considers slaves
 * whose 'delay_expires' is not LLONG_MAX and whose 'may_enable' is set, and
 * prefers the one with the earliest 'delay_expires'.
 *
 * NOTE(review): the return statements and the initialization and update of
 * 'best' are not visible in this listing; presumably the first enabled slave
 * is returned, otherwise 'best' (possibly NULL) -- TODO confirm. */
1507 static struct bond_slave *
1508 bond_choose_slave(const struct bond *bond)
1510 struct bond_slave *slave, *best;
1512 /* Find an enabled slave. */
1513 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1514 if (slave->enabled) {
1519 /* All interfaces are disabled. Find an interface that will be enabled
1520 * after its updelay expires. */
1522 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1523 if (slave->delay_expires != LLONG_MAX
1524 && slave->may_enable
1525 && (!best || slave->delay_expires < best->delay_expires)) {
/* Replaces the active slave of 'bond' with the result of bond_choose_slave()
 * and logs the outcome (rate-limited).
 *
 * When a slave was chosen that is not enabled yet, it is enabled at once
 * with bond_enable_slave() and the log message reports how many milliseconds
 * of its delay were skipped.  Whenever a slave was chosen,
 * 'send_learning_packets' is set.  When no slave could be chosen but there
 * was an active slave before, "all interfaces disabled" is logged.
 *
 * NOTE(review): the "else" between the two log messages and the braces are
 * not visible in this listing. */
1533 bond_choose_active_slave(struct bond *bond)
1535 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Remembered only to tell "still no active slave" apart from "just lost the
 * active slave" further down. */
1536 struct bond_slave *old_active_slave = bond->active_slave;
1538 bond->active_slave = bond_choose_slave(bond);
1539 if (bond->active_slave) {
1540 if (bond->active_slave->enabled) {
1541 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1542 bond->name, bond->active_slave->name);
1544 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1545 "remaining %lld ms updelay (since no interface was "
1546 "enabled)", bond->name, bond->active_slave->name,
1547 bond->active_slave->delay_expires - time_msec());
1548 bond_enable_slave(bond->active_slave, true);
1551 bond->send_learning_packets = true;
1552 } else if (old_active_slave) {
1553 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1557 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1558 * bond interface. */
/* The packet and byte counters of every slave whose statistics can be read
 * are added up with receive and transmit swapped, and the totals are then
 * written to the "system" netdev opened under the bond's name.  Slaves whose
 * statistics cannot be read, and a failure to open the bond's netdev, are
 * skipped without any report.
 *
 * NOTE(review): the lines closing the loop and the function are not visible
 * in this listing. */
1560 bond_update_fake_slave_stats(struct bond *bond)
1562 struct netdev_stats bond_stats;
1563 struct bond_slave *slave;
1564 struct netdev *bond_dev;
/* Start from all-zero totals; only the four counters below are summed. */
1566 memset(&bond_stats, 0, sizeof bond_stats);
1568 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1569 struct netdev_stats slave_stats;
/* A zero return from netdev_get_stats() is treated as success. */
1571 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1572 /* XXX: We swap the stats here because they are swapped back when
1573 * reported by the internal device. The reason for this is
1574 * internal devices normally represent packets going into the
1575 * system but when used as fake bond device they represent packets
1576 * leaving the system. We really should do this in the internal
1577 * device itself because changing it here reverses the counts from
1578 * the perspective of the switch. However, the internal device
1579 * doesn't know what type of device it represents so we have to do
1580 * it here for now. */
1581 bond_stats.tx_packets += slave_stats.rx_packets;
1582 bond_stats.tx_bytes += slave_stats.rx_bytes;
1583 bond_stats.rx_packets += slave_stats.tx_packets;
1584 bond_stats.rx_bytes += slave_stats.tx_bytes;
/* A zero return from netdev_open() is treated as success; the device is
 * closed again as soon as the totals have been written. */
1588 if (!netdev_open(bond->name, "system", &bond_dev)) {
1589 netdev_set_stats(bond_dev, &bond_stats);
1590 netdev_close(bond_dev);