2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "dynamic-string.h"
36 #include "poll-loop.h"
43 VLOG_DEFINE_THIS_MODULE(bond);
45 /* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47 #define BOND_MASK 0xff
49 /* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
54 tag_type tag; /* Tag for entry<->facet association. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond. */
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
64 struct netdev *netdev; /* Network device, owned by the client. */
65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
69 long long delay_expires; /* Time after which 'enabled' may change. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
72 tag_type tag; /* Tag associated with this slave. */
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
80 /* A bond, that is, a set of network devices grouped to improve performance or
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave;
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
95 bool bond_revalidate; /* True if flows need revalidation. */
96 uint32_t basis; /* Basis for flow hash function. */
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets;
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
115 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
116 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
117 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
119 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
120 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
121 OVS_REQ_RDLOCK(rwlock);
122 static void bond_enable_slave(struct bond_slave *, bool enable,
123 struct tag_set *) OVS_REQ_WRLOCK(rwlock);
124 static void bond_link_status_update(struct bond_slave *, struct tag_set *)
125 OVS_REQ_WRLOCK(rwlock);
/* Forward declaration.  Annotated as requiring 'rwlock' held for writing. */
126 static void bond_choose_active_slave(struct bond *, struct tag_set *)
127 OVS_REQ_WRLOCK(rwlock);
128 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
129 uint16_t vlan, uint32_t basis);
130 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
132 static struct bond_entry *lookup_bond_entry(const struct bond *,
135 OVS_REQ_RDLOCK(rwlock);
136 static tag_type bond_get_active_slave_tag(const struct bond *)
137 OVS_REQ_RDLOCK(rwlock);
138 static struct bond_slave *choose_output_slave(const struct bond *,
140 struct flow_wildcards *,
141 uint16_t vlan, tag_type *tags)
142 OVS_REQ_RDLOCK(rwlock);
143 static void bond_update_fake_slave_stats(struct bond *)
144 OVS_REQ_RDLOCK(rwlock);
146 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
147 * stores the mode in '*balance' and returns true. Otherwise returns false
148 * without modifying '*balance'. */
150 bond_mode_from_string(enum bond_mode *balance, const char *s)
152 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
154 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
156 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
164 /* Returns a string representing 'balance'. */
166 bond_mode_to_string(enum bond_mode balance) {
169 return "balance-tcp";
171 return "balance-slb";
173 return "active-backup";
179 /* Creates and returns a new bond whose configuration is initially taken from
182 * The caller should register each slave on the new bond by calling
183 * bond_slave_register(). */
185 bond_create(const struct bond_settings *s)
189 bond = xzalloc(sizeof *bond);
190 hmap_init(&bond->slaves);
191 bond->no_slaves_tag = tag_create_random();
192 bond->next_fake_iface_update = LLONG_MAX;
193 atomic_init(&bond->ref_cnt, 1);
195 bond_reconfigure(bond, s);
197 tag_set_init(&bond->unixctl_tags);
/* Takes an additional reference to 'bond_'.  The const qualifier is cast away
 * so that the reference count can be updated; the count is incremented
 * atomically and the value reported in 'orig' is asserted to be positive. */
203 bond_ref(const struct bond *bond_)
205 struct bond *bond = CONST_CAST(struct bond *, bond_);
209 atomic_add(&bond->ref_cnt, 1, &orig);
210 ovs_assert(orig > 0);
/* Releases a reference to 'bond': atomically decrements 'ref_cnt' and asserts
 * that the value reported in 'orig' is positive.
 *
 * The teardown steps unlink 'bond' from 'all_bonds' while holding 'rwlock' for
 * writing, then detach every slave from 'bond->slaves' and destroy that map.
 *
 * NOTE(review): the check that decides when teardown runs is not visible in
 * this excerpt -- presumably only when the last reference is dropped;
 * confirm. */
217 bond_unref(struct bond *bond)
219 struct bond_slave *slave, *next_slave;
226 atomic_sub(&bond->ref_cnt, 1, &orig);
227 ovs_assert(orig > 0);
/* Only the removal from the global map is done under the lock. */
232 ovs_rwlock_wrlock(&rwlock);
233 hmap_remove(all_bonds, &bond->hmap_node);
234 ovs_rwlock_unlock(&rwlock);
/* The _SAFE iterator is used because each slave is unlinked while iterating. */
236 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
237 hmap_remove(&bond->slaves, &slave->hmap_node);
238 /* Client owns 'slave->netdev'. */
242 hmap_destroy(&bond->slaves);
249 /* Updates 'bond''s overall configuration to 's'.
251 * The caller should register each slave on 'bond' by calling
252 * bond_slave_register(). This is optional if none of the slaves'
253 * configuration has changed. In any case it can't hurt.
255 * Returns true if the configuration has changed in such a way that requires
259 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
261 bool revalidate = false;
263 ovs_rwlock_wrlock(&rwlock);
264 if (!bond->name || strcmp(bond->name, s->name)) {
266 hmap_remove(all_bonds, &bond->hmap_node);
269 bond->name = xstrdup(s->name);
270 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
273 bond->updelay = s->up_delay;
274 bond->downdelay = s->down_delay;
276 if (bond->rebalance_interval != s->rebalance_interval) {
277 bond->rebalance_interval = s->rebalance_interval;
281 if (bond->balance != s->balance) {
282 bond->balance = s->balance;
286 if (bond->basis != s->basis) {
287 bond->basis = s->basis;
292 if (bond->next_fake_iface_update == LLONG_MAX) {
293 bond->next_fake_iface_update = time_msec();
296 bond->next_fake_iface_update = LLONG_MAX;
299 if (bond->bond_revalidate) {
301 bond->bond_revalidate = false;
304 if (bond->balance == BM_AB || !bond->hash || revalidate) {
305 bond_entry_reset(bond);
308 ovs_rwlock_unlock(&rwlock);
/* Points 'slave' at 'netdev'.  If the device actually differs from the one
 * already recorded, 'change_seq' is also reset to 0; otherwise nothing is
 * modified. */
313 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
314 OVS_REQ_WRLOCK(rwlock)
316 if (slave->netdev != netdev) {
317 slave->netdev = netdev;
318 slave->change_seq = 0;
322 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
323 * arbitrary client-provided pointer that uniquely identifies a slave within a
324 * bond. If 'slave_' already exists within 'bond' then this function
325 * reconfigures the existing slave.
327 * 'netdev' must be the network device that 'slave_' represents. It is owned
328 * by the client, so the client must not close it before either unregistering
329 * 'slave_' or destroying 'bond'.
332 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
334 struct bond_slave *slave;
336 ovs_rwlock_wrlock(&rwlock);
337 slave = bond_slave_lookup(bond, slave_);
339 slave = xzalloc(sizeof *slave);
341 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
344 slave->delay_expires = LLONG_MAX;
345 slave->name = xstrdup(netdev_get_name(netdev));
346 bond->bond_revalidate = true;
348 slave->enabled = false;
349 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
352 bond_slave_set_netdev__(slave, netdev);
355 slave->name = xstrdup(netdev_get_name(netdev));
356 ovs_rwlock_unlock(&rwlock);
359 /* Updates the network device to be used with 'slave_' to 'netdev'.
361 * This is useful if the caller closes and re-opens the network device
362 * registered with bond_slave_register() but doesn't need to change anything
365 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
367 struct bond_slave *slave;
369 ovs_rwlock_wrlock(&rwlock);
370 slave = bond_slave_lookup(bond, slave_);
372 bond_slave_set_netdev__(slave, netdev);
374 ovs_rwlock_unlock(&rwlock);
377 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
378 * then this function has no effect.
380 * Unregistering a slave invalidates all flows. */
382 bond_slave_unregister(struct bond *bond, const void *slave_)
384 struct bond_slave *slave;
387 ovs_rwlock_wrlock(&rwlock);
388 slave = bond_slave_lookup(bond, slave_);
393 bond_enable_slave(slave, false, NULL);
395 del_active = bond->active_slave == slave;
397 struct bond_entry *e;
398 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
399 if (e->slave == slave) {
407 hmap_remove(&bond->slaves, &slave->hmap_node);
408 /* Client owns 'slave->netdev'. */
415 bond_choose_active_slave(bond, &tags);
416 bond->send_learning_packets = true;
419 ovs_rwlock_unlock(&rwlock);
422 /* Should be called on each slave in 'bond' before bond_run() to indicate
423 * whether or not 'slave_' may be enabled. This function is intended to allow
424 * other protocols to have some impact on bonding decisions. For example LACP
425 * or high level link monitoring protocols may decide that a given slave should
426 * not be able to send traffic. */
428 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
430 ovs_rwlock_wrlock(&rwlock);
/* NOTE(review): the lookup result is dereferenced without a NULL check, so
 * 'slave_' presumably must already be registered on 'bond' -- confirm with
 * callers. */
431 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
432 ovs_rwlock_unlock(&rwlock);
435 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
436 * allow tagged flows to be invalidated.
438 * The caller should check bond_should_send_learning_packets() afterward. */
440 bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
442 struct bond_slave *slave;
444 ovs_rwlock_wrlock(&rwlock);
445 if (bond->lacp_status != lacp_status) {
446 bond->lacp_status = lacp_status;
447 bond->bond_revalidate = true;
450 /* Enable slaves based on link status and LACP feedback. */
451 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
452 bond_link_status_update(slave, tags);
453 slave->change_seq = netdev_change_seq(slave->netdev);
455 if (!bond->active_slave || !bond->active_slave->enabled) {
456 bond_choose_active_slave(bond, tags);
459 /* Update fake bond interface stats. */
460 if (time_msec() >= bond->next_fake_iface_update) {
461 bond_update_fake_slave_stats(bond);
462 bond->next_fake_iface_update = time_msec() + 1000;
465 if (bond->bond_revalidate) {
466 struct bond_slave *slave;
468 bond->bond_revalidate = false;
469 bond_entry_reset(bond);
470 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
471 tag_set_add(tags, slave->tag);
473 tag_set_add(tags, bond->no_slaves_tag);
476 /* Invalidate any tags saved in 'unixctl_tags' since the previous run. */
477 tag_set_union(tags, &bond->unixctl_tags);
478 tag_set_init(&bond->unixctl_tags);
479 ovs_rwlock_unlock(&rwlock);
482 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
484 bond_wait(struct bond *bond)
486 struct bond_slave *slave;
488 ovs_rwlock_rdlock(&rwlock);
489 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
490 if (slave->delay_expires != LLONG_MAX) {
491 poll_timer_wait_until(slave->delay_expires);
494 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
495 poll_immediate_wake();
499 if (bond->next_fake_iface_update != LLONG_MAX) {
500 poll_timer_wait_until(bond->next_fake_iface_update);
503 /* Ensure that any saved tags get revalidated right away. */
504 if (!tag_set_is_empty(&bond->unixctl_tags)) {
505 poll_immediate_wake();
507 ovs_rwlock_unlock(&rwlock);
509 /* We don't wait for bond->next_rebalance because rebalancing can only run
510 * at a flow account checkpoint. ofproto does checkpointing on its own
511 * schedule and bond_rebalance() gets called afterward, so we'd just be
512 * waking up for no purpose. */
515 /* MAC learning table interaction. */
/* Returns true only if LACP is disabled on 'bond', 'bond' balances in SLB or
 * active-backup mode, and 'bond' currently has an active slave. */
518 may_send_learning_packets(const struct bond *bond)
520 return bond->lacp_status == LACP_DISABLED
521 && (bond->balance == BM_SLB || bond->balance == BM_AB)
522 && bond->active_slave;
525 /* Returns true if 'bond' needs the client to send out packets to assist with
526 * MAC learning on 'bond'. If this function returns true, then the client
527 * should iterate through its MAC learning table for the bridge on which 'bond'
528 * is located. For each MAC that has been learned on a port other than 'bond',
529 * it should call bond_compose_learning_packet().
531 * This function will only return true if 'bond' is in SLB or active-backup
532 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
535 * Calling this function resets the state that it checks. */
537 bond_should_send_learning_packets(struct bond *bond)
541 ovs_rwlock_wrlock(&rwlock);
542 send = bond->send_learning_packets && may_send_learning_packets(bond);
543 bond->send_learning_packets = false;
544 ovs_rwlock_unlock(&rwlock);
548 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
550 * See bond_should_send_learning_packets() for description of usage. The
551 * caller should send the composed packet on the port associated with
552 * port_aux and takes ownership of the returned ofpbuf. */
554 bond_compose_learning_packet(struct bond *bond,
555 const uint8_t eth_src[ETH_ADDR_LEN],
556 uint16_t vlan, void **port_aux)
558 struct bond_slave *slave;
559 struct ofpbuf *packet;
563 ovs_rwlock_rdlock(&rwlock);
564 ovs_assert(may_send_learning_packets(bond));
565 memset(&flow, 0, sizeof flow);
566 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
567 slave = choose_output_slave(bond, &flow, NULL, vlan, &tags);
569 packet = ofpbuf_new(0);
570 compose_rarp(packet, eth_src);
572 eth_push_vlan(packet, htons(vlan));
575 *port_aux = slave->aux;
576 ovs_rwlock_unlock(&rwlock);
580 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
581 * Ethernet destination address of 'eth_dst', should be admitted.
583 * The return value is one of the following:
585 * - BV_ACCEPT: Admit the packet.
587 * - BV_DROP: Drop the packet.
589 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
590 * Ethernet source address and VLAN. If there is none, or if the packet
591 * is on the learned port, then admit the packet. If a different port has
592 * been learned, however, drop the packet (and do not use it for MAC
596 bond_check_admissibility(struct bond *bond, const void *slave_,
597 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
599 enum bond_verdict verdict = BV_DROP;
600 struct bond_slave *slave;
602 ovs_rwlock_rdlock(&rwlock);
603 slave = bond_slave_lookup(bond, slave_);
608 /* LACP bonds have very loose admissibility restrictions because we can
609 * assume the remote switch is aware of the bond and will "do the right
610 * thing". However, as a precaution we drop packets on disabled slaves
611 * because no correctly implemented partner switch should be sending
614 * If LACP is configured, but LACP negotiations have been unsuccessful, we
615 * drop all incoming traffic. */
616 switch (bond->lacp_status) {
617 case LACP_NEGOTIATED:
618 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
620 case LACP_CONFIGURED:
626 /* Drop all multicast packets on inactive slaves. */
627 if (eth_addr_is_multicast(eth_dst)) {
628 *tags |= bond_get_active_slave_tag(bond);
629 if (bond->active_slave != slave) {
634 switch (bond->balance) {
636 /* Drop all packets which arrive on backup slaves. This is similar to
637 * how Linux bonding handles active-backup bonds. */
638 *tags |= bond_get_active_slave_tag(bond);
639 if (bond->active_slave != slave) {
640 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
642 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
643 " slave (%s) destined for " ETH_ADDR_FMT,
644 slave->name, ETH_ADDR_ARGS(eth_dst));
651 /* TCP balanced bonds require successful LACP negotiation. Based on the
652 * above check, LACP is off on this bond. Therefore, we drop all
653 * incoming traffic. */
657 /* Drop all packets for which we have learned a different input port,
658 * because we probably sent the packet on one slave and got it back on
659 * the other. Gratuitous ARP packets are an exception to this rule:
660 * the host has moved to another switch. The exception to the
661 * exception is if we locked the learning table to avoid reflections on
663 verdict = BV_DROP_IF_MOVED;
669 ovs_rwlock_unlock(&rwlock);
674 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
675 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
676 * NULL if the packet should be dropped because no slaves are enabled.
678 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
679 * should be a VID only (i.e. excluding the PCP bits). Second,
680 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
681 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
682 * packet belongs to (so for an access port it will be the access port's VLAN).
684 * Adds a tag to '*tags' that associates the flow with the returned slave.
686 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
687 * significant in the selection. At some point earlier, 'wc' should
688 * have been initialized (e.g., by flow_wildcards_init_catchall()).
691 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
692 struct flow_wildcards *wc, uint16_t vlan,
695 struct bond_slave *slave;
698 ovs_rwlock_rdlock(&rwlock);
699 slave = choose_output_slave(bond, flow, wc, vlan, tags);
704 *tags |= bond->no_slaves_tag;
706 ovs_rwlock_unlock(&rwlock);
/* Returns true if 'bond' has a nonzero rebalance interval and its balancing
 * mode is SLB or TCP. */
713 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
715 return bond->rebalance_interval
716 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
719 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
721 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
724 ovs_rwlock_wrlock(&rwlock);
725 if (bond_is_balanced(bond)) {
726 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
728 ovs_rwlock_unlock(&rwlock);
/* Given 'bal', a pointer to the 'bal_node' member embedded in a struct
 * bond_slave, returns the enclosing bond_slave. */
731 static struct bond_slave *
732 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
734 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* If debug logging is enabled, logs one line for 'bond' that lists each slave
 * in 'bals' with its load in kB, a " (disabled)" marker for disabled slaves,
 * and, in parentheses, the load of each entry currently on the slave's
 * 'entries' list. */
738 log_bals(struct bond *bond, const struct list *bals)
740 if (VLOG_IS_DBG_ENABLED()) {
741 struct ds ds = DS_EMPTY_INITIALIZER;
742 const struct bond_slave *slave;
744 LIST_FOR_EACH (slave, bal_node, bals) {
746 ds_put_char(&ds, ',');
748 ds_put_format(&ds, " %s %"PRIu64"kB",
749 slave->name, slave->tx_bytes / 1024);
751 if (!slave->enabled) {
752 ds_put_cstr(&ds, " (disabled)");
754 if (!list_is_empty(&slave->entries)) {
755 struct bond_entry *e;
757 ds_put_cstr(&ds, " (");
758 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Emit the " + " separator before every entry except the first. */
759 if (&e->list_node != list_front(&slave->entries)) {
760 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index within the bond's hash array. */
762 ds_put_format(&ds, "h%td: %"PRIu64"kB",
763 e - bond->hash, e->tx_bytes / 1024);
765 ds_put_cstr(&ds, ")");
768 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
773 /* Shifts 'hash' from its current slave to 'to'. */
775 bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
778 struct bond_slave *from = hash->slave;
779 struct bond *bond = from->bond;
780 uint64_t delta = hash->tx_bytes;
782 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
783 "from %s to %s (now carrying %"PRIu64"kB and "
784 "%"PRIu64"kB load, respectively)",
785 bond->name, delta / 1024, hash - bond->hash,
786 from->name, to->name,
787 (from->tx_bytes - delta) / 1024,
788 (to->tx_bytes + delta) / 1024);
790 /* Shift load away from 'from' to 'to'. */
791 from->tx_bytes -= delta;
792 to->tx_bytes += delta;
794 /* Arrange for flows to be revalidated. */
795 tag_set_add(set, hash->tag);
797 hash->tag = tag_create_random();
800 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
801 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
802 * given that doing so must decrease the ratio of the load on the two slaves by
803 * at least 0.1. Returns NULL if there is no appropriate entry.
805 * The list of entries isn't sorted. I don't know of a reason to prefer to
806 * shift away small hashes or large hashes. */
807 static struct bond_entry *
808 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
810 struct bond_entry *e;
812 if (list_is_short(&from->entries)) {
813 /* 'from' carries no more than one MAC hash, so shifting load away from
814 * it would be pointless. */
818 LIST_FOR_EACH (e, list_node, &from->entries) {
819 double old_ratio, new_ratio;
822 if (to_tx_bytes == 0) {
823 /* Nothing on the new slave, move it. */
828 old_ratio = (double)from->tx_bytes / to_tx_bytes;
829 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
830 if (old_ratio - new_ratio > 0.1
831 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
832 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
833 and 'to' slave have the same load. Therefore, we only move an
834 entry if it decreases the load on 'from', and brings us closer
835 to equal traffic load. */
843 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
846 insert_bal(struct list *bals, struct bond_slave *slave)
848 struct bond_slave *pos;
850 LIST_FOR_EACH (pos, bal_node, bals) {
851 if (slave->tx_bytes > pos->tx_bytes) {
855 list_insert(&pos->bal_node, &slave->bal_node);
858 /* Removes 'slave' from its current list and then inserts it into 'bals' so
859 * that descending order of 'tx_bytes' is maintained. */
861 reinsert_bal(struct list *bals, struct bond_slave *slave)
863 list_remove(&slave->bal_node);
864 insert_bal(bals, slave);
867 /* If 'bond' needs rebalancing, does so.
869 * The caller should have called bond_account() for each active flow, to ensure
870 * that flow data is consistently accounted at this point. */
872 bond_rebalance(struct bond *bond, struct tag_set *tags)
874 struct bond_slave *slave;
875 struct bond_entry *e;
878 ovs_rwlock_wrlock(&rwlock);
879 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
880 ovs_rwlock_unlock(&rwlock);
883 bond->next_rebalance = time_msec() + bond->rebalance_interval;
885 /* Add each bond_entry to its slave's 'entries' list.
886 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
887 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
889 list_init(&slave->entries);
891 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
892 if (e->slave && e->tx_bytes) {
893 e->slave->tx_bytes += e->tx_bytes;
894 list_push_back(&e->slave->entries, &e->list_node);
898 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
900 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
901 * with a proper list sort algorithm. */
903 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
904 if (slave->enabled) {
905 insert_bal(&bals, slave);
908 log_bals(bond, &bals);
910 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
911 while (!list_is_short(&bals)) {
912 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
913 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
916 overload = from->tx_bytes - to->tx_bytes;
917 if (overload < to->tx_bytes >> 5 || overload < 100000) {
918 /* The extra load on 'from' (and all less-loaded slaves), compared
919 * to that of 'to' (the least-loaded slave), is less than ~3%, or
920 * it is less than ~1Mbps. No point in rebalancing. */
924 /* 'from' is carrying significantly more load than 'to'. Pick a hash
925 * to move from 'from' to 'to'. */
926 e = choose_entry_to_migrate(from, to->tx_bytes);
928 bond_shift_load(e, to, tags);
930 /* Delete element from from->entries.
932 * We don't add the element to to->entries. That would only allow
933 * 'e' to be migrated to another slave in this rebalancing run, and
934 * there is no point in doing that. */
935 list_remove(&e->list_node);
937 /* Re-sort 'bals'. */
938 reinsert_bal(&bals, from);
939 reinsert_bal(&bals, to);
941 /* Can't usefully migrate anything away from 'from'.
942 * Don't reconsider it. */
943 list_remove(&from->bal_node);
947 /* Implement exponentially weighted moving average. A weight of 1/2 causes
948 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
949 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
950 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
956 ovs_rwlock_unlock(&rwlock);
959 /* Bonding unixctl user interface functions. */
/* Searches for a bond named 'name'.  Only candidates whose hash equals
 * hash_string(name, 0) -- the same hash used when a bond is inserted by
 * bond_reconfigure() -- are examined, and each is confirmed with strcmp(). */
962 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
966 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
968 if (!strcmp(bond->name, name)) {
/* Walks every slave of 'bond' looking for one whose name equals 'slave_name'.
 * This is a linear scan: 'bond->slaves' is hashed on the client-provided
 * handle (see bond_slave_register()), not on the slave's name. */
975 static struct bond_slave *
976 bond_lookup_slave(struct bond *bond, const char *slave_name)
978 struct bond_slave *slave;
980 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
981 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler registered as "bond/list".  Replies with a tab-separated
 * table whose header is "bond", "type", "slaves" and which is filled in from
 * every bond in 'all_bonds': the bond's name, its balancing mode string, and
 * its slave names joined by ", ".  The table is built while holding 'rwlock'
 * for reading; the reply is sent after the lock is released. */
989 bond_unixctl_list(struct unixctl_conn *conn,
990 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
991 void *aux OVS_UNUSED)
993 struct ds ds = DS_EMPTY_INITIALIZER;
994 const struct bond *bond;
996 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
998 ovs_rwlock_rdlock(&rwlock);
999 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1000 const struct bond_slave *slave;
1003 ds_put_format(&ds, "%s\t%s\t",
1004 bond->name, bond_mode_to_string(bond->balance));
1007 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1009 ds_put_cstr(&ds, ", ");
1011 ds_put_cstr(&ds, slave->name);
1013 ds_put_char(&ds, '\n');
1015 ovs_rwlock_unlock(&rwlock);
1016 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds': its name, balancing
 * mode, hash basis, up and down delays, time until the next rebalance (only
 * when bond_is_balanced() is true), and LACP status, followed by one section
 * per slave in the order produced by shash_sort() over the slave names.
 *
 * Each slave section reports enabled/disabled state, whether it is the active
 * slave, any pending updelay/downdelay, 'may_enable', and per-hash load in
 * kB. */
1021 bond_print_details(struct ds *ds, const struct bond *bond)
1022 OVS_REQ_RDLOCK(rwlock)
1024 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1025 const struct shash_node **sorted_slaves = NULL;
1026 const struct bond_slave *slave;
1029 ds_put_format(ds, "---- %s ----\n", bond->name);
1030 ds_put_format(ds, "bond_mode: %s\n",
1031 bond_mode_to_string(bond->balance));
1033 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1035 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1036 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1038 if (bond_is_balanced(bond)) {
1039 ds_put_format(ds, "next rebalance: %lld ms\n",
1040 bond->next_rebalance - time_msec());
1043 ds_put_cstr(ds, "lacp_status: ");
1044 switch (bond->lacp_status) {
1045 case LACP_NEGOTIATED:
1046 ds_put_cstr(ds, "negotiated\n");
1048 case LACP_CONFIGURED:
1049 ds_put_cstr(ds, "configured\n");
1052 ds_put_cstr(ds, "off\n");
1055 ds_put_cstr(ds, "<unknown>\n");
/* Index the slaves by name so that they can be emitted via shash_sort(). */
1059 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1060 shash_add(&slave_shash, slave->name, slave);
1062 sorted_slaves = shash_sort(&slave_shash);
1064 for (i = 0; i < shash_count(&slave_shash); i++) {
1065 struct bond_entry *be;
1067 slave = sorted_slaves[i]->data;
1070 ds_put_format(ds, "\nslave %s: %s\n",
1071 slave->name, slave->enabled ? "enabled" : "disabled");
1072 if (slave == bond->active_slave) {
1073 ds_put_cstr(ds, "\tactive slave\n");
/* LLONG_MAX in 'delay_expires' means that no delay is pending. */
1075 if (slave->delay_expires != LLONG_MAX) {
1076 ds_put_format(ds, "\t%s expires in %lld ms\n",
1077 slave->enabled ? "downdelay" : "updelay",
1078 slave->delay_expires - time_msec());
1081 ds_put_format(ds, "\tmay_enable: %s\n",
1082 slave->may_enable ? "true" : "false");
1084 if (!bond_is_balanced(bond)) {
1089 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1090 int hash = be - bond->hash;
1092 if (be->slave != slave) {
1096 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1097 hash, be->tx_bytes / 1024);
1099 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1102 shash_destroy(&slave_shash);
1103 free(sorted_slaves);
1104 ds_put_cstr(ds, "\n");
/* unixctl handler registered as "bond/show" with an optional port argument.
 * One branch looks up the bond named argv[1] and either prints its details or
 * replies "no such bond"; the other prints the details of every bond in
 * 'all_bonds'.  'rwlock' is held for reading throughout. */
1108 bond_unixctl_show(struct unixctl_conn *conn,
1109 int argc, const char *argv[],
1110 void *aux OVS_UNUSED)
1112 struct ds ds = DS_EMPTY_INITIALIZER;
1114 ovs_rwlock_rdlock(&rwlock);
1116 const struct bond *bond = bond_find(argv[1]);
1119 unixctl_command_reply_error(conn, "no such bond");
1122 bond_print_details(&ds, bond);
1124 const struct bond *bond;
1126 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1127 bond_print_details(&ds, bond);
1131 unixctl_command_reply(conn, ds_cstr(&ds));
1135 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/migrate" (arguments: port, hash, slave).
 * Reassigns one hash bucket of an SLB bond to the named slave.
 *
 * Error replies: "no such bond", "not an SLB bond", "bad hash",
 * "no such slave", and "cannot migrate to disabled slave".  On success the
 * bucket's previous tag is saved in 'unixctl_tags' (the tag set that is handed
 * to the next bond_run()), the bucket is pointed at the new slave and given a
 * fresh random tag, and the reply is "migrated". */
1139 bond_unixctl_migrate(struct unixctl_conn *conn,
1140 int argc OVS_UNUSED, const char *argv[],
1141 void *aux OVS_UNUSED)
1143 const char *bond_s = argv[1];
1144 const char *hash_s = argv[2];
1145 const char *slave_s = argv[3];
1147 struct bond_slave *slave;
1148 struct bond_entry *entry;
1151 ovs_rwlock_wrlock(&rwlock);
1152 bond = bond_find(bond_s);
1154 unixctl_command_reply_error(conn, "no such bond");
1158 if (bond->balance != BM_SLB) {
1159 unixctl_command_reply_error(conn, "not an SLB bond");
/* Accept only strings made up of decimal digits.  The parsed value is masked
 * into the bucket range rather than range-checked.
 * NOTE(review): an empty string also satisfies this test and is then treated
 * as hash 0 -- confirm whether that is intended. */
1163 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1164 hash = atoi(hash_s) & BOND_MASK;
1166 unixctl_command_reply_error(conn, "bad hash");
1170 slave = bond_lookup_slave(bond, slave_s);
1172 unixctl_command_reply_error(conn, "no such slave");
1176 if (!slave->enabled) {
1177 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1181 entry = &bond->hash[hash];
/* Save the old tag before it is replaced below. */
1182 tag_set_add(&bond->unixctl_tags, entry->tag);
1183 entry->slave = slave;
1184 entry->tag = tag_create_random();
1185 unixctl_command_reply(conn, "migrated");
1188 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/set-active-slave" (arguments: port,
 * slave).  Makes the named slave the active slave of the named bond.
 *
 * Error replies: "no such bond", "no such slave", and
 * "cannot make disabled slave active".  If the slave is not already active,
 * the switch is logged, learning packets are requested, and the reply is
 * "done"; the other reply is "no change". */
1192 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1193 int argc OVS_UNUSED, const char *argv[],
1194 void *aux OVS_UNUSED)
1196 const char *bond_s = argv[1];
1197 const char *slave_s = argv[2];
1199 struct bond_slave *slave;
1201 ovs_rwlock_wrlock(&rwlock);
1202 bond = bond_find(bond_s);
1204 unixctl_command_reply_error(conn, "no such bond");
1208 slave = bond_lookup_slave(bond, slave_s);
1210 unixctl_command_reply_error(conn, "no such slave");
1214 if (!slave->enabled) {
1215 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1219 if (bond->active_slave != slave) {
/* Save the tag reported by bond_get_active_slave_tag() before 'active_slave'
 * is replaced, then give the new active slave a fresh random tag. */
1220 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1221 bond->active_slave = slave;
1222 bond->active_slave->tag = tag_create_random();
1223 VLOG_INFO("bond %s: active interface is now %s",
1224 bond->name, slave->name);
1225 bond->send_learning_packets = true;
1226 unixctl_command_reply(conn, "done");
1228 unixctl_command_reply(conn, "no change");
1231 ovs_rwlock_unlock(&rwlock);
/* Shared implementation of the enable-slave and disable-slave unixctl
 * handlers.  Looks up the bond named argv[1] and the slave named argv[2],
 * applies 'enable' through bond_enable_slave() with 'unixctl_tags' as the tag
 * set, and replies "enabled" or "disabled".  Replies "no such bond" or
 * "no such slave" when a lookup fails.  'rwlock' is held for writing
 * throughout. */
1235 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1237 const char *bond_s = argv[1];
1238 const char *slave_s = argv[2];
1240 struct bond_slave *slave;
1242 ovs_rwlock_wrlock(&rwlock);
1243 bond = bond_find(bond_s);
1245 unixctl_command_reply_error(conn, "no such bond");
1249 slave = bond_lookup_slave(bond, slave_s);
1251 unixctl_command_reply_error(conn, "no such slave");
1255 bond_enable_slave(slave, enable, &bond->unixctl_tags);
1256 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1259 ovs_rwlock_unlock(&rwlock);
1263 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1264 int argc OVS_UNUSED, const char *argv[],
1265 void *aux OVS_UNUSED)
1267 enable_slave(conn, argv, true);
1271 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1272 int argc OVS_UNUSED, const char *argv[],
1273 void *aux OVS_UNUSED)
1275 enable_slave(conn, argv, false);
1279 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1280 void *aux OVS_UNUSED)
1282 const char *mac_s = argv[1];
1283 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1284 const char *basis_s = argc > 3 ? argv[3] : NULL;
1285 uint8_t mac[ETH_ADDR_LEN];
1292 if (sscanf(vlan_s, "%u", &vlan) != 1) {
1293 unixctl_command_reply_error(conn, "invalid vlan");
1301 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
1302 unixctl_command_reply_error(conn, "invalid basis");
1309 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1310 == ETH_ADDR_SCAN_COUNT) {
1311 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1313 hash_cstr = xasprintf("%u", hash);
1314 unixctl_command_reply(conn, hash_cstr);
1317 unixctl_command_reply_error(conn, "invalid mac");
1324 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1325 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1327 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1328 bond_unixctl_migrate, NULL);
1329 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1330 bond_unixctl_set_active_slave, NULL);
1331 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1332 bond_unixctl_enable_slave, NULL);
1333 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1334 bond_unixctl_disable_slave, NULL);
1335 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1336 bond_unixctl_hash, NULL);
1340 bond_entry_reset(struct bond *bond)
1342 if (bond->balance != BM_AB) {
1343 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1346 bond->hash = xmalloc(hash_len);
1348 memset(bond->hash, 0, hash_len);
1350 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1357 static struct bond_slave *
1358 bond_slave_lookup(struct bond *bond, const void *slave_)
1360 struct bond_slave *slave;
1362 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1364 if (slave->aux == slave_) {
1373 bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1375 slave->delay_expires = LLONG_MAX;
1376 if (enable != slave->enabled) {
1377 slave->enabled = enable;
1378 if (!slave->enabled) {
1379 VLOG_INFO("interface %s: disabled", slave->name);
1381 tag_set_add(tags, slave->tag);
1384 VLOG_INFO("interface %s: enabled", slave->name);
1385 slave->tag = tag_create_random();
1391 bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1393 struct bond *bond = slave->bond;
1396 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1397 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1398 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1399 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1400 slave->name, up ? "up" : "down");
1401 if (up == slave->enabled) {
1402 slave->delay_expires = LLONG_MAX;
1403 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1404 slave->name, up ? "disabled" : "enabled");
1406 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1407 : up ? bond->updelay : bond->downdelay);
1408 slave->delay_expires = time_msec() + delay;
1410 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1413 up ? "enabled" : "disabled",
1420 if (time_msec() >= slave->delay_expires) {
1421 bond_enable_slave(slave, up, tags);
1426 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1428 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1432 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1434 struct flow hash_flow = *flow;
1435 hash_flow.vlan_tci = htons(vlan);
1437 /* The symmetric quality of this hash function is not required, but
1438 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1439 * purposes, so we use it out of convenience. */
1440 return flow_hash_symmetric_l4(&hash_flow, basis);
1444 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1446 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1448 return (bond->balance == BM_TCP
1449 ? bond_hash_tcp(flow, vlan, bond->basis)
1450 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1453 static struct bond_entry *
1454 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1457 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1460 static struct bond_slave *
1461 choose_output_slave(const struct bond *bond, const struct flow *flow,
1462 struct flow_wildcards *wc, uint16_t vlan, tag_type *tags)
1464 struct bond_entry *e;
1466 if (bond->lacp_status == LACP_CONFIGURED) {
1467 /* LACP has been configured on this bond but negotiations were
1468 * unsuccussful. Drop all traffic. */
1472 switch (bond->balance) {
1474 return bond->active_slave;
1477 if (bond->lacp_status != LACP_NEGOTIATED) {
1478 /* Must have LACP negotiations for TCP balanced bonds. */
1482 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1487 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1489 e = lookup_bond_entry(bond, flow, vlan);
1490 if (!e->slave || !e->slave->enabled) {
1491 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1492 struct bond_slave, hmap_node);
1493 if (!e->slave->enabled) {
1494 e->slave = bond->active_slave;
1496 e->tag = tag_create_random();
1506 static struct bond_slave *
1507 bond_choose_slave(const struct bond *bond)
1509 struct bond_slave *slave, *best;
1511 /* Find an enabled slave. */
1512 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1513 if (slave->enabled) {
1518 /* All interfaces are disabled. Find an interface that will be enabled
1519 * after its updelay expires. */
1521 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1522 if (slave->delay_expires != LLONG_MAX
1523 && slave->may_enable
1524 && (!best || slave->delay_expires < best->delay_expires)) {
1532 bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1534 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1535 struct bond_slave *old_active_slave = bond->active_slave;
1537 bond->active_slave = bond_choose_slave(bond);
1538 if (bond->active_slave) {
1539 if (bond->active_slave->enabled) {
1540 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1541 bond->name, bond->active_slave->name);
1543 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1544 "remaining %lld ms updelay (since no interface was "
1545 "enabled)", bond->name, bond->active_slave->name,
1546 bond->active_slave->delay_expires - time_msec());
1547 bond_enable_slave(bond->active_slave, true, tags);
1550 if (!old_active_slave) {
1551 tag_set_add(tags, bond->no_slaves_tag);
1554 bond->send_learning_packets = true;
1555 } else if (old_active_slave) {
1556 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1560 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1561 * there is no active slave. */
1563 bond_get_active_slave_tag(const struct bond *bond)
1565 return (bond->active_slave
1566 ? bond->active_slave->tag
1567 : bond->no_slaves_tag);
1570 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1571 * bond interface. */
1573 bond_update_fake_slave_stats(struct bond *bond)
1575 struct netdev_stats bond_stats;
1576 struct bond_slave *slave;
1577 struct netdev *bond_dev;
1579 memset(&bond_stats, 0, sizeof bond_stats);
1581 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1582 struct netdev_stats slave_stats;
1584 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1585 /* XXX: We swap the stats here because they are swapped back when
1586 * reported by the internal device. The reason for this is
1587 * internal devices normally represent packets going into the
1588 * system but when used as fake bond device they represent packets
1589 * leaving the system. We really should do this in the internal
1590 * device itself because changing it here reverses the counts from
1591 * the perspective of the switch. However, the internal device
1592 * doesn't know what type of device it represents so we have to do
1593 * it here for now. */
1594 bond_stats.tx_packets += slave_stats.rx_packets;
1595 bond_stats.tx_bytes += slave_stats.rx_bytes;
1596 bond_stats.rx_packets += slave_stats.tx_packets;
1597 bond_stats.rx_bytes += slave_stats.tx_bytes;
1601 if (!netdev_open(bond->name, "system", &bond_dev)) {
1602 netdev_set_stats(bond_dev, &bond_stats);
1603 netdev_close(bond_dev);