2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "dynamic-string.h"
36 #include "poll-loop.h"
43 VLOG_DEFINE_THIS_MODULE(bond);
45 /* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47 #define BOND_MASK 0xff
49 /* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
54 tag_type tag; /* Tag for entry<->facet association. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond. */
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
64 struct netdev *netdev; /* Network device, owned by the client. */
65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
69 long long delay_expires; /* Time after which 'enabled' may change. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
72 tag_type tag; /* Tag associated with this slave. */
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
80 /* A bond, that is, a set of network devices grouped to improve performance or
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave; /* Currently active slave, NULL if none. */
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
95 bool bond_revalidate; /* True if flows need revalidation. */
96 uint32_t basis; /* Basis for flow hash function. */
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets; /* Set when learning packets are wanted; cleared
                                * by bond_should_send_learning_packets(). */
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
113 static struct hmap all_bonds = HMAP_INITIALIZER(&all_bonds);
115 static void bond_entry_reset(struct bond *);
116 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_);
117 static void bond_enable_slave(struct bond_slave *, bool enable,
119 static void bond_link_status_update(struct bond_slave *, struct tag_set *);
120 static void bond_choose_active_slave(struct bond *, struct tag_set *);
121 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
122 uint16_t vlan, uint32_t basis);
123 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
125 static struct bond_entry *lookup_bond_entry(const struct bond *,
128 static tag_type bond_get_active_slave_tag(const struct bond *);
129 static struct bond_slave *choose_output_slave(const struct bond *,
131 uint16_t vlan, tag_type *tags);
132 static void bond_update_fake_slave_stats(struct bond *);
134 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
135 * stores the mode in '*balance' and returns true. Otherwise returns false
136 * without modifying '*balance'. */
138 bond_mode_from_string(enum bond_mode *balance, const char *s)
140 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
142 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
144 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
152 /* Returns a string representing 'balance'. */
154 bond_mode_to_string(enum bond_mode balance) {
157 return "balance-tcp";
159 return "balance-slb";
161 return "active-backup";
167 /* Creates and returns a new bond whose configuration is initially taken from
 * 's' (it is applied by calling bond_reconfigure() on the fresh bond).
170 * The caller should register each slave on the new bond by calling
171 * bond_slave_register(). */
173 bond_create(const struct bond_settings *s)
/* xzalloc() yields zeroed storage, so fields not set below start out as 0. */
177 bond = xzalloc(sizeof *bond);
178 hmap_init(&bond->slaves);
179 bond->no_slaves_tag = tag_create_random();
/* LLONG_MAX means fake-interface stats updates are disabled (see the field
 * comment in struct bond). */
180 bond->next_fake_iface_update = LLONG_MAX;
182 bond_reconfigure(bond, s);
184 tag_set_init(&bond->unixctl_tags);
/* Tears down 'bond': unlinks it from 'all_bonds', removes every slave from
 * 'bond->slaves', and destroys that hmap.
 *
 * NOTE(review): the statements that release the slaves and the bond itself
 * are not visible in this view -- confirm against the full file. */
191 bond_destroy(struct bond *bond)
193 struct bond_slave *slave, *next_slave;
199 hmap_remove(&all_bonds, &bond->hmap_node);
/* The _SAFE iterator is needed because each slave is removed while
 * iterating. */
201 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
202 hmap_remove(&bond->slaves, &slave->hmap_node);
203 /* Client owns 'slave->netdev'. */
207 hmap_destroy(&bond->slaves);
214 /* Updates 'bond''s overall configuration to 's'.
216 * The caller should register each slave on 'bond' by calling
217 * bond_slave_register(). This is optional if none of the slaves'
218 * configuration has changed. In any case it can't hurt.
220 * Returns true if the configuration has changed in such a way that requires
/* Applies settings 's' to 'bond', tracking in 'revalidate' whether something
 * changed.
 *
 * NOTE(review): several lines of this function (including the ones that set
 * 'revalidate' and the return statement) are not visible in this view. */
224 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
226 bool revalidate = false;
/* A new or renamed bond must be (re)inserted into 'all_bonds', which is keyed
 * by a hash of the name. */
228 if (!bond->name || strcmp(bond->name, s->name)) {
230 hmap_remove(&all_bonds, &bond->hmap_node);
233 bond->name = xstrdup(s->name);
234 hmap_insert(&all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
237 bond->updelay = s->up_delay;
238 bond->downdelay = s->down_delay;
240 if (bond->rebalance_interval != s->rebalance_interval) {
241 bond->rebalance_interval = s->rebalance_interval;
245 if (bond->balance != s->balance) {
246 bond->balance = s->balance;
250 if (bond->basis != s->basis) {
251 bond->basis = s->basis;
/* Fake-interface stats updates: LLONG_MAX means "disabled"; enabling schedules
 * the first update for right now. */
256 if (bond->next_fake_iface_update == LLONG_MAX) {
257 bond->next_fake_iface_update = time_msec();
260 bond->next_fake_iface_update = LLONG_MAX;
263 if (bond->bond_revalidate) {
265 bond->bond_revalidate = false;
/* Rebuild (or, for active-backup, discard) the hash table whenever it is
 * missing or the configuration change invalidated it. */
268 if (bond->balance == BM_AB || !bond->hash || revalidate) {
269 bond_entry_reset(bond);
276 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
278 if (slave->netdev != netdev) {
279 slave->netdev = netdev;
280 slave->change_seq = 0;
284 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
285 * arbitrary client-provided pointer that uniquely identifies a slave within a
286 * bond. If 'slave_' already exists within 'bond' then this function
287 * reconfigures the existing slave.
289 * 'netdev' must be the network device that 'slave_' represents. It is owned
290 * by the client, so the client must not close it before either unregistering
291 * 'slave_' or destroying 'bond'.
/* Registers 'slave_' (an opaque client handle) with 'bond', or refreshes an
 * already-registered slave.  'netdev' is the device that 'slave_' represents.
 *
 * NOTE(review): some lines of this function are not visible in this view. */
294 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
296 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
299 slave = xzalloc(sizeof *slave);
/* Slaves are keyed by the client handle's pointer value. */
301 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
304 slave->delay_expires = LLONG_MAX;
305 slave->name = xstrdup(netdev_get_name(netdev));
306 bond->bond_revalidate = true;
/* Start out disabled, then enable right away if the device has carrier.  No
 * tag set is passed (NULL). */
308 slave->enabled = false;
309 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
312 bond_slave_set_netdev__(slave, netdev);
/* The name is re-copied from 'netdev' on every call. */
315 slave->name = xstrdup(netdev_get_name(netdev));
/* Replaces the network device associated with 'slave_' by 'netdev'.
 *
 * Intended for a caller that closed and re-opened the device it originally
 * passed to bond_slave_register() and has nothing else to update.  Does
 * nothing if 'slave_' is not registered on 'bond'. */
void
bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
{
    struct bond_slave *member = bond_slave_lookup(bond, slave_);

    if (member == NULL) {
        return;
    }
    bond_slave_set_netdev__(member, netdev);
}
332 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
333 * then this function has no effect.
335 * Unregistering a slave invalidates all flows.
 *
 * NOTE(review): some lines of this function are not visible in this view. */
337 bond_slave_unregister(struct bond *bond, const void *slave_)
339 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
/* Disable first, without collecting tags (NULL tag set). */
346 bond_enable_slave(slave, false, NULL);
/* Remember whether the slave being removed is the active one, so that a
 * replacement can be chosen below. */
348 del_active = bond->active_slave == slave;
350 struct bond_entry *e;
/* Visit every hash bucket that currently points at the departing slave. */
351 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
352 if (e->slave == slave) {
360 hmap_remove(&bond->slaves, &slave->hmap_node);
361 /* Client owns 'slave->netdev'. */
368 bond_choose_active_slave(bond, &tags);
369 bond->send_learning_packets = true;
373 /* Should be called on each slave in 'bond' before bond_run() to indicate
374 * whether or not 'slave_' may be enabled. This function is intended to allow
375 * other protocols to have some impact on bonding decisions. For example LACP
376 * or high level link monitoring protocols may decide that a given slave should
377 * not be able to send traffic. */
379 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
381 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
384 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
385 * allow tagged flows to be invalidated.
387 * The caller should check bond_should_send_learning_packets() afterward. */
389 bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
391 struct bond_slave *slave;
393 if (bond->lacp_status != lacp_status) {
394 bond->lacp_status = lacp_status;
395 bond->bond_revalidate = true;
398 /* Enable slaves based on link status and LACP feedback. */
399 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
400 bond_link_status_update(slave, tags);
401 slave->change_seq = netdev_change_seq(slave->netdev);
403 if (!bond->active_slave || !bond->active_slave->enabled) {
404 bond_choose_active_slave(bond, tags);
407 /* Update fake bond interface stats. */
408 if (time_msec() >= bond->next_fake_iface_update) {
409 bond_update_fake_slave_stats(bond);
410 bond->next_fake_iface_update = time_msec() + 1000;
413 if (bond->bond_revalidate) {
414 struct bond_slave *slave;
416 bond->bond_revalidate = false;
417 bond_entry_reset(bond);
418 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
419 tag_set_add(tags, slave->tag);
421 tag_set_add(tags, bond->no_slaves_tag);
424 /* Invalidate any tags required by */
425 tag_set_union(tags, &bond->unixctl_tags);
426 tag_set_init(&bond->unixctl_tags);
429 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
431 bond_wait(struct bond *bond)
433 struct bond_slave *slave;
435 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
436 if (slave->delay_expires != LLONG_MAX) {
437 poll_timer_wait_until(slave->delay_expires);
440 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
441 poll_immediate_wake();
445 if (bond->next_fake_iface_update != LLONG_MAX) {
446 poll_timer_wait_until(bond->next_fake_iface_update);
449 /* Ensure that any saved tags get revalidated right away. */
450 if (!tag_set_is_empty(&bond->unixctl_tags)) {
451 poll_immediate_wake();
454 /* We don't wait for bond->next_rebalance because rebalancing can only run
455 * at a flow account checkpoint. ofproto does checkpointing on its own
456 * schedule and bond_rebalance() gets called afterward, so we'd just be
457 * waking up for no purpose. */
460 /* MAC learning table interaction. */
463 may_send_learning_packets(const struct bond *bond)
465 return bond->lacp_status == LACP_DISABLED
466 && (bond->balance == BM_SLB || bond->balance == BM_AB)
467 && bond->active_slave;
470 /* Returns true if 'bond' needs the client to send out packets to assist with
471 * MAC learning on 'bond'. If this function returns true, then the client
472 * should iterate through its MAC learning table for the bridge on which 'bond'
473 * is located. For each MAC that has been learned on a port other than 'bond',
474 * it should call bond_compose_learning_packet().
476 * This function will only return true if 'bond' is in SLB or active-backup
477 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
480 * Calling this function resets the state that it checks. */
482 bond_should_send_learning_packets(struct bond *bond)
484 bool send = bond->send_learning_packets && may_send_learning_packets(bond);
485 bond->send_learning_packets = false;
489 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
491 * See bond_should_send_learning_packets() for description of usage. The
492 * caller should send the composed packet on the port associated with
493 * port_aux and takes ownership of the returned ofpbuf.
 *
 * NOTE(review): despite "Sends", the visible code only composes the packet
 * and reports the output slave through '*port_aux'; some lines are not
 * visible in this view. */
495 bond_compose_learning_packet(struct bond *bond,
496 const uint8_t eth_src[ETH_ADDR_LEN],
497 uint16_t vlan, void **port_aux)
499 struct bond_slave *slave;
500 struct ofpbuf *packet;
504 ovs_assert(may_send_learning_packets(bond));
/* Build a flow that has only the source MAC set, and pick the slave that
 * such a flow would be output on. */
506 memset(&flow, 0, sizeof flow);
507 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
508 slave = choose_output_slave(bond, &flow, vlan, &tags);
/* The learning packet itself is a RARP from 'eth_src'. */
510 packet = ofpbuf_new(0);
511 compose_rarp(packet, eth_src);
513 eth_push_vlan(packet, htons(vlan));
516 *port_aux = slave->aux;
520 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
521 * Ethernet destination address of 'eth_dst', should be admitted.
523 * The return value is one of the following:
525 * - BV_ACCEPT: Admit the packet.
527 * - BV_DROP: Drop the packet.
529 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
530 * Ethernet source address and VLAN. If there is none, or if the packet
531 * is on the learned port, then admit the packet. If a different port has
532 * been learned, however, drop the packet (and do not use it for MAC
536 bond_check_admissibility(struct bond *bond, const void *slave_,
537 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
539 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
541 /* LACP bonds have very loose admissibility restrictions because we can
542 * assume the remote switch is aware of the bond and will "do the right
543 * thing". However, as a precaution we drop packets on disabled slaves
544 * because no correctly implemented partner switch should be sending
547 * If LACP is configured, but LACP negotiations have been unsuccessful, we
548 * drop all incoming traffic. */
549 switch (bond->lacp_status) {
550 case LACP_NEGOTIATED: return slave->enabled ? BV_ACCEPT : BV_DROP;
551 case LACP_CONFIGURED: return BV_DROP;
552 case LACP_DISABLED: break;
555 /* Drop all multicast packets on inactive slaves. */
556 if (eth_addr_is_multicast(eth_dst)) {
557 *tags |= bond_get_active_slave_tag(bond);
558 if (bond->active_slave != bond_slave_lookup(bond, slave_)) {
563 switch (bond->balance) {
565 /* Drop all packets which arrive on backup slaves. This is similar to
566 * how Linux bonding handles active-backup bonds. */
567 *tags |= bond_get_active_slave_tag(bond);
568 if (bond->active_slave != slave) {
569 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
571 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
572 " slave (%s) destined for " ETH_ADDR_FMT,
573 slave->name, ETH_ADDR_ARGS(eth_dst));
579 /* TCP balanced bonds require successful LACP negotiated. Based on the
580 * above check, LACP is off on this bond. Therfore, we drop all
581 * incoming traffic. */
585 /* Drop all packets for which we have learned a different input port,
586 * because we probably sent the packet on one slave and got it back on
587 * the other. Gratuitous ARP packets are an exception to this rule:
588 * the host has moved to another switch. The exception to the
589 * exception is if we locked the learning table to avoid reflections on
591 return BV_DROP_IF_MOVED;
597 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
598 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
599 * NULL if the packet should be dropped because no slaves are enabled.
601 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
602 * should be a VID only (i.e. excluding the PCP bits). Second,
603 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
604 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
605 * packet belongs to (so for an access port it will be the access port's VLAN).
607 * Adds a tag to '*tags' that associates the flow with the returned slave.
610 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
611 uint16_t vlan, tag_type *tags)
613 struct bond_slave *slave = choose_output_slave(bond, flow, vlan, tags);
618 *tags |= bond->no_slaves_tag;
626 bond_is_balanced(const struct bond *bond)
628 return bond->rebalance_interval
629 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
632 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
634 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
637 if (bond_is_balanced(bond)) {
638 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
642 static struct bond_slave *
643 bond_slave_from_bal_node(struct list *bal)
645 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Logs, at debug level, one line that lists each slave in 'bals' with its
 * load in kB, whether it is disabled, and its hash entries with their loads.
 * Does nothing unless debug logging is enabled.
 *
 * NOTE(review): some lines of this function are not visible in this view. */
649 log_bals(struct bond *bond, const struct list *bals)
651 if (VLOG_IS_DBG_ENABLED()) {
652 struct ds ds = DS_EMPTY_INITIALIZER;
653 const struct bond_slave *slave;
655 LIST_FOR_EACH (slave, bal_node, bals) {
657 ds_put_char(&ds, ',');
659 ds_put_format(&ds, " %s %"PRIu64"kB",
660 slave->name, slave->tx_bytes / 1024);
662 if (!slave->enabled) {
663 ds_put_cstr(&ds, " (disabled)");
665 if (!list_is_empty(&slave->entries)) {
666 struct bond_entry *e;
668 ds_put_cstr(&ds, " (");
669 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Separate every entry but the first with " + ". */
670 if (&e->list_node != list_front(&slave->entries)) {
671 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index in the bond's hash array. */
673 ds_put_format(&ds, "h%td: %"PRIu64"kB",
674 e - bond->hash, e->tx_bytes / 1024);
676 ds_put_cstr(&ds, ")");
679 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
684 /* Shifts 'hash' from its current slave to 'to'.
 *
 * Moves the entry's byte count from the old slave's total to the new one's,
 * adds the entry's old tag to 'set', and gives the entry a fresh random tag.
 *
 * NOTE(review): the line that reassigns 'hash->slave' is not visible in this
 * view. */
686 bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
689 struct bond_slave *from = hash->slave;
690 struct bond *bond = from->bond;
691 uint64_t delta = hash->tx_bytes;
/* The loads reported below are the values after the shift. */
693 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
694 "from %s to %s (now carrying %"PRIu64"kB and "
695 "%"PRIu64"kB load, respectively)",
696 bond->name, delta / 1024, hash - bond->hash,
697 from->name, to->name,
698 (from->tx_bytes - delta) / 1024,
699 (to->tx_bytes + delta) / 1024);
701 /* Shift load away from 'from' to 'to'. */
702 from->tx_bytes -= delta;
703 to->tx_bytes += delta;
705 /* Arrange for flows to be revalidated. */
706 tag_set_add(set, hash->tag);
708 hash->tag = tag_create_random();
711 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
712 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
713 * given that doing so must decrease the ratio of the load on the two slaves by
714 * at least 0.1. Returns NULL if there is no appropriate entry.
716 * The list of entries isn't sorted. I don't know of a reason to prefer to
717 * shift away small hashes or large hashes.
 *
 * NOTE(review): some lines of this function (including the definition of
 * 'delta' and the return statements) are not visible in this view. */
718 static struct bond_entry *
719 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
721 struct bond_entry *e;
723 if (list_is_short(&from->entries)) {
724 /* 'from' carries no more than one MAC hash, so shifting load away from
725 * it would be pointless. */
729 LIST_FOR_EACH (e, list_node, &from->entries) {
730 double old_ratio, new_ratio;
/* Handled separately because the ratio computed below would otherwise divide
 * by zero. */
733 if (to_tx_bytes == 0) {
734 /* Nothing on the new slave, move it. */
/* Ratio of the 'from' load to the 'to' load, before and after moving 'e'. */
739 old_ratio = (double)from->tx_bytes / to_tx_bytes;
740 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
741 if (old_ratio - new_ratio > 0.1
742 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
743 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
744 and 'to' slave have the same load. Therefore, we only move an
745 entry if it decreases the load on 'from', and brings us closer
746 to equal traffic load. */
754 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
757 insert_bal(struct list *bals, struct bond_slave *slave)
759 struct bond_slave *pos;
761 LIST_FOR_EACH (pos, bal_node, bals) {
762 if (slave->tx_bytes > pos->tx_bytes) {
766 list_insert(&pos->bal_node, &slave->bal_node);
769 /* Removes 'slave' from its current list and then inserts it into 'bals' so
770 * that descending order of 'tx_bytes' is maintained. */
772 reinsert_bal(struct list *bals, struct bond_slave *slave)
774 list_remove(&slave->bal_node);
775 insert_bal(bals, slave);
778 /* If 'bond' needs rebalancing, does so.
780 * The caller should have called bond_account() for each active flow, to ensure
781 * that flow data is consistently accounted at this point.
 *
 * Outline: (1) rebuild each slave's list of hash entries and total load,
 * (2) sort the enabled slaves by load, (3) repeatedly move an entry from the
 * most-loaded slave to the least-loaded one, and (4) age the per-entry byte
 * counts.  Tags of moved entries are added to 'tags'.
 *
 * NOTE(review): some lines of this function are not visible in this view. */
783 bond_rebalance(struct bond *bond, struct tag_set *tags)
785 struct bond_slave *slave;
786 struct bond_entry *e;
/* Nothing to do for an unbalanced bond or before the next scheduled time. */
789 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
792 bond->next_rebalance = time_msec() + bond->rebalance_interval;
794 /* Add each bond_entry to its slave's 'entries' list.
795 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
796 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
798 list_init(&slave->entries);
/* Entries that are unassigned or carried no traffic are left out. */
800 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
801 if (e->slave && e->tx_bytes) {
802 e->slave->tx_bytes += e->tx_bytes;
803 list_push_back(&e->slave->entries, &e->list_node);
807 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
809 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
810 * with a proper list sort algorithm. */
812 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
813 if (slave->enabled) {
814 insert_bal(&bals, slave);
817 log_bals(bond, &bals);
819 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
820 while (!list_is_short(&bals)) {
821 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
822 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
/* 'to->tx_bytes >> 5' is 1/32 of the load on 'to'. */
825 overload = from->tx_bytes - to->tx_bytes;
826 if (overload < to->tx_bytes >> 5 || overload < 100000) {
827 /* The extra load on 'from' (and all less-loaded slaves), compared
828 * to that of 'to' (the least-loaded slave), is less than ~3%, or
829 * it is less than ~1Mbps. No point in rebalancing. */
833 /* 'from' is carrying significantly more load than 'to'. Pick a hash
834 * to move from 'from' to 'to'. */
835 e = choose_entry_to_migrate(from, to->tx_bytes);
837 bond_shift_load(e, to, tags);
839 /* Delete element from from->entries.
841 * We don't add the element to to->hashes. That would only allow
842 * 'e' to be migrated to another slave in this rebalancing run, and
843 * there is no point in doing that. */
844 list_remove(&e->list_node);
846 /* Re-sort 'bals'. */
847 reinsert_bal(&bals, from);
848 reinsert_bal(&bals, to);
850 /* Can't usefully migrate anything away from 'from'.
851 * Don't reconsider it. */
852 list_remove(&from->bal_node);
856 /* Implement exponentially weighted moving average. A weight of 1/2 causes
857 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
858 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
859 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
867 /* Bonding unixctl user interface functions. */
870 bond_find(const char *name)
874 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
876 if (!strcmp(bond->name, name)) {
883 static struct bond_slave *
884 bond_lookup_slave(struct bond *bond, const char *slave_name)
886 struct bond_slave *slave;
888 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
889 if (!strcmp(slave->name, slave_name)) {
897 bond_unixctl_list(struct unixctl_conn *conn,
898 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
899 void *aux OVS_UNUSED)
901 struct ds ds = DS_EMPTY_INITIALIZER;
902 const struct bond *bond;
904 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
906 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
907 const struct bond_slave *slave;
910 ds_put_format(&ds, "%s\t%s\t",
911 bond->name, bond_mode_to_string(bond->balance));
914 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
916 ds_put_cstr(&ds, ", ");
918 ds_put_cstr(&ds, slave->name);
920 ds_put_char(&ds, '\n');
922 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends to 'ds' a human-readable description of 'bond': its mode, hash
 * basis, up/down delays, rebalance timing (balanced bonds only), LACP status,
 * and then each slave in sorted name order with its state and, for balanced
 * bonds, the hash entries assigned to it.
 *
 * NOTE(review): some lines of this function are not visible in this view. */
927 bond_print_details(struct ds *ds, const struct bond *bond)
929 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
930 const struct shash_node **sorted_slaves = NULL;
931 const struct bond_slave *slave;
934 ds_put_format(ds, "---- %s ----\n", bond->name);
935 ds_put_format(ds, "bond_mode: %s\n",
936 bond_mode_to_string(bond->balance));
938 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
940 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
941 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
943 if (bond_is_balanced(bond)) {
/* Printed relative to the current time. */
944 ds_put_format(ds, "next rebalance: %lld ms\n",
945 bond->next_rebalance - time_msec());
948 ds_put_cstr(ds, "lacp_status: ");
949 switch (bond->lacp_status) {
950 case LACP_NEGOTIATED:
951 ds_put_cstr(ds, "negotiated\n");
953 case LACP_CONFIGURED:
954 ds_put_cstr(ds, "configured\n");
957 ds_put_cstr(ds, "off\n");
960 ds_put_cstr(ds, "<unknown>\n");
/* Collect the slaves in an shash keyed by name so that they can be printed in
 * sorted order. */
964 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
965 shash_add(&slave_shash, slave->name, slave);
967 sorted_slaves = shash_sort(&slave_shash);
969 for (i = 0; i < shash_count(&slave_shash); i++) {
970 struct bond_entry *be;
972 slave = sorted_slaves[i]->data;
975 ds_put_format(ds, "\nslave %s: %s\n",
976 slave->name, slave->enabled ? "enabled" : "disabled");
977 if (slave == bond->active_slave) {
978 ds_put_cstr(ds, "\tactive slave\n");
/* A pending delay is a downdelay if the slave is currently enabled, otherwise
 * an updelay. */
980 if (slave->delay_expires != LLONG_MAX) {
981 ds_put_format(ds, "\t%s expires in %lld ms\n",
982 slave->enabled ? "downdelay" : "updelay",
983 slave->delay_expires - time_msec());
986 ds_put_format(ds, "\tmay_enable: %s\n",
987 slave->may_enable ? "true" : "false");
989 if (!bond_is_balanced(bond)) {
/* Scan all hash buckets for the ones assigned to this slave. */
994 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
995 int hash = be - bond->hash;
997 if (be->slave != slave) {
1001 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1002 hash, be->tx_bytes / 1024);
1004 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1007 shash_destroy(&slave_shash);
1008 free(sorted_slaves);
1009 ds_put_cstr(ds, "\n");
1013 bond_unixctl_show(struct unixctl_conn *conn,
1014 int argc, const char *argv[],
1015 void *aux OVS_UNUSED)
1017 struct ds ds = DS_EMPTY_INITIALIZER;
1020 const struct bond *bond = bond_find(argv[1]);
1023 unixctl_command_reply_error(conn, "no such bond");
1026 bond_print_details(&ds, bond);
1028 const struct bond *bond;
1030 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
1031 bond_print_details(&ds, bond);
1035 unixctl_command_reply(conn, ds_cstr(&ds));
1040 bond_unixctl_migrate(struct unixctl_conn *conn,
1041 int argc OVS_UNUSED, const char *argv[],
1042 void *aux OVS_UNUSED)
1044 const char *bond_s = argv[1];
1045 const char *hash_s = argv[2];
1046 const char *slave_s = argv[3];
1048 struct bond_slave *slave;
1049 struct bond_entry *entry;
1052 bond = bond_find(bond_s);
1054 unixctl_command_reply_error(conn, "no such bond");
1058 if (bond->balance != BM_SLB) {
1059 unixctl_command_reply_error(conn, "not an SLB bond");
1063 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1064 hash = atoi(hash_s) & BOND_MASK;
1066 unixctl_command_reply_error(conn, "bad hash");
1070 slave = bond_lookup_slave(bond, slave_s);
1072 unixctl_command_reply_error(conn, "no such slave");
1076 if (!slave->enabled) {
1077 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1081 entry = &bond->hash[hash];
1082 tag_set_add(&bond->unixctl_tags, entry->tag);
1083 entry->slave = slave;
1084 entry->tag = tag_create_random();
1085 unixctl_command_reply(conn, "migrated");
1089 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1090 int argc OVS_UNUSED, const char *argv[],
1091 void *aux OVS_UNUSED)
1093 const char *bond_s = argv[1];
1094 const char *slave_s = argv[2];
1096 struct bond_slave *slave;
1098 bond = bond_find(bond_s);
1100 unixctl_command_reply_error(conn, "no such bond");
1104 slave = bond_lookup_slave(bond, slave_s);
1106 unixctl_command_reply_error(conn, "no such slave");
1110 if (!slave->enabled) {
1111 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1115 if (bond->active_slave != slave) {
1116 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1117 bond->active_slave = slave;
1118 bond->active_slave->tag = tag_create_random();
1119 VLOG_INFO("bond %s: active interface is now %s",
1120 bond->name, slave->name);
1121 bond->send_learning_packets = true;
1122 unixctl_command_reply(conn, "done");
1124 unixctl_command_reply(conn, "no change");
1129 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1131 const char *bond_s = argv[1];
1132 const char *slave_s = argv[2];
1134 struct bond_slave *slave;
1136 bond = bond_find(bond_s);
1138 unixctl_command_reply_error(conn, "no such bond");
1142 slave = bond_lookup_slave(bond, slave_s);
1144 unixctl_command_reply_error(conn, "no such slave");
1148 bond_enable_slave(slave, enable, &bond->unixctl_tags);
1149 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1153 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1154 int argc OVS_UNUSED, const char *argv[],
1155 void *aux OVS_UNUSED)
1157 enable_slave(conn, argv, true);
1161 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1162 int argc OVS_UNUSED, const char *argv[],
1163 void *aux OVS_UNUSED)
1165 enable_slave(conn, argv, false);
1169 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1170 void *aux OVS_UNUSED)
1172 const char *mac_s = argv[1];
1173 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1174 const char *basis_s = argc > 3 ? argv[3] : NULL;
1175 uint8_t mac[ETH_ADDR_LEN];
1182 if (sscanf(vlan_s, "%u", &vlan) != 1) {
1183 unixctl_command_reply_error(conn, "invalid vlan");
1191 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
1192 unixctl_command_reply_error(conn, "invalid basis");
1199 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1200 == ETH_ADDR_SCAN_COUNT) {
1201 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1203 hash_cstr = xasprintf("%u", hash);
1204 unixctl_command_reply(conn, hash_cstr);
1207 unixctl_command_reply_error(conn, "invalid mac");
1214 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1215 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1217 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1218 bond_unixctl_migrate, NULL);
1219 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1220 bond_unixctl_set_active_slave, NULL);
1221 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1222 bond_unixctl_enable_slave, NULL);
1223 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1224 bond_unixctl_disable_slave, NULL);
1225 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1226 bond_unixctl_hash, NULL);
/* Resets the hash table of 'bond'.
 *
 * For every balance mode other than BM_AB, the (BOND_MASK + 1)-entry table is
 * zero-filled and the next rebalance is scheduled one rebalance_interval
 * after the current time_msec() value.
 *
 * NOTE(review): the condition guarding xmalloc() and the BM_AB branch are not
 * visible in this excerpt. */
1230 bond_entry_reset(struct bond *bond)
1232 if (bond->balance != BM_AB) {
/* Size in bytes of the whole bucket array. */
1233 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1236 bond->hash = xmalloc(hash_len);
/* Zeroing leaves every bucket with a NULL slave and a zero byte count. */
1238 memset(bond->hash, 0, hash_len);
1240 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Looks up the slave of 'bond' whose client handle ('aux') equals 'slave_'.
 * Only the hmap bucket selected by hash_pointer(slave_, 0) is scanned.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match is returned, or NULL when there is none. */
1247 static struct bond_slave *
1248 bond_slave_lookup(struct bond *bond, const void *slave_)
1250 struct bond_slave *slave;
1252 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
/* Several slaves may share a bucket, so compare the handle itself. */
1254 if (slave->aux == slave_) {
/* Sets the enabled state of 'slave' to 'enable' and cancels any pending
 * enable/disable delay.
 *
 * When the state actually changes, the transition is logged.  On the
 * disabling path the current tag of 'slave' is added to 'tags'; on the
 * enabling path 'slave' receives a fresh random tag.
 *
 * NOTE(review): any guard on 'tags' being NULL is not visible in this
 * excerpt. */
1263 bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
/* LLONG_MAX is the value used to mean that no state change is pending. */
1265 slave->delay_expires = LLONG_MAX;
1266 if (enable != slave->enabled) {
1267 slave->enabled = enable;
1268 if (!slave->enabled) {
1269 VLOG_INFO("interface %s: disabled", slave->name);
1271 tag_set_add(tags, slave->tag);
1274 VLOG_INFO("interface %s: enabled", slave->name);
1275 slave->tag = tag_create_random();
/* Re-evaluates whether 'slave' should be enabled, based on its carrier state
 * and its may_enable flag, and starts or cancels the pending enable/disable
 * delay accordingly.  Once the delay has expired, bond_enable_slave() applies
 * the new state and is handed 'tags'. */
1281 bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1283 struct bond *bond = slave->bond;
/* A slave counts as up only when it has carrier and the client allows it. */
1286 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
/* True when the pending-delay bookkeeping disagrees with the link state:
 * either a change is needed but no delay is running, or a delay is running
 * for a change that is no longer needed. */
1287 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1288 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1289 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1290 slave->name, up ? "up" : "down");
/* The link returned to the state that matches 'enabled': cancel the
 * pending change. */
1291 if (up == slave->enabled) {
1292 slave->delay_expires = LLONG_MAX;
1293 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1294 slave->name, up ? "disabled" : "enabled");
/* No delay is applied unless lacp_status is LACP_DISABLED; in that case
 * updelay or downdelay is used, depending on the direction of the change. */
1296 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1297 : up ? bond->updelay : bond->downdelay);
1298 slave->delay_expires = time_msec() + delay;
1300 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1303 up ? "enabled" : "disabled",
/* Apply the change as soon as the delay has run out.  With no change pending
 * delay_expires is LLONG_MAX, so this test is false. */
1310 if (time_msec() >= slave->delay_expires) {
1311 bond_enable_slave(slave, up, tags);
/* Hashes a source MAC address together with 'vlan' and 'basis'.  The
 * ETH_ADDR_LEN bytes of 'mac' are first hashed with hash_bytes() using basis
 * 0, and that result is combined with 'vlan' and 'basis' by hash_3words().
 * The result is not masked here; callers apply BOND_MASK. */
1316 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1318 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
/* Hashes 'flow' for balance-tcp bonds.  Works on a local copy of 'flow'
 * whose vlan_tci is replaced by 'vlan' in network byte order, so the caller's
 * flow is left untouched, and returns flow_hash_symmetric_l4() of that copy
 * with 'basis'. */
1322 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1324 struct flow hash_flow = *flow;
1325 hash_flow.vlan_tci = htons(vlan);
1327 /* The symmetric quality of this hash function is not required, but
1328 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1329 * purposes, so we use it out of convenience. */
1330 return flow_hash_symmetric_l4(&hash_flow, basis);
/* Returns the unmasked hash of 'flow' on 'vlan' for 'bond', using the basis
 * stored in 'bond'.  BM_TCP bonds hash the whole flow with bond_hash_tcp();
 * BM_SLB bonds hash only the source MAC with bond_hash_src().  Any other
 * balance mode trips the assertion. */
1334 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1336 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1338 return (bond->balance == BM_TCP
1339 ? bond_hash_tcp(flow, vlan, bond->basis)
1340 : bond_hash_src(flow->dl_src, vlan, bond->basis));
/* Returns the hash bucket of 'bond' that 'flow' maps to: bond_hash() of the
 * flow and 'vlan', masked with BOND_MASK, indexes the bond->hash array.
 * bond->hash is dereferenced without a NULL check here. */
1343 static struct bond_entry *
1344 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1347 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
/* Picks the slave that should carry 'flow' on 'vlan' for 'bond'.
 *
 * One arm of the switch on bond->balance returns the active slave directly.
 * In the hashed path the bond_entry for the flow is looked up; when that
 * entry has no slave, or its slave is disabled, the entry is reassigned to a
 * random member of bond->slaves (falling back to the active slave if the
 * random pick is itself disabled) and is given a fresh random tag.
 *
 * NOTE(review): the case labels, the return statements and the use of 'tags'
 * are not visible in this excerpt. */
1350 static struct bond_slave *
1351 choose_output_slave(const struct bond *bond, const struct flow *flow,
1352 uint16_t vlan, tag_type *tags)
1354 struct bond_entry *e;
1356 if (bond->lacp_status == LACP_CONFIGURED) {
1357 /* LACP has been configured on this bond but negotiations were
1358 * unsuccessful. Drop all traffic. */
1362 switch (bond->balance) {
1364 return bond->active_slave;
1367 if (bond->lacp_status != LACP_NEGOTIATED) {
1368 /* Must have LACP negotiations for TCP balanced bonds. */
1373 e = lookup_bond_entry(bond, flow, vlan);
/* Rebind the bucket when it is unassigned or points at a disabled slave. */
1374 if (!e->slave || !e->slave->enabled) {
1375 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1376 struct bond_slave, hmap_node);
/* The random pick may be disabled too; fall back to the active slave. */
1377 if (!e->slave->enabled) {
1378 e->slave = bond->active_slave;
1380 e->tag = tag_create_random();
/* Selects a candidate active slave for 'bond' in two passes over
 * bond->slaves: first any slave that is enabled; failing that, among slaves
 * that have a pending delay and are marked may_enable, the one whose
 * delay_expires is earliest.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably NULL is returned when neither pass finds a slave. */
1390 static struct bond_slave *
1391 bond_choose_slave(const struct bond *bond)
1393 struct bond_slave *slave, *best;
1395 /* Find an enabled slave. */
1396 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1397 if (slave->enabled) {
1402 /* All interfaces are disabled. Find an interface that will be enabled
1403 * after its updelay expires. */
1405 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* delay_expires other than LLONG_MAX means a state change is pending. */
1406 if (slave->delay_expires != LLONG_MAX
1407 && slave->may_enable
1408 && (!best || slave->delay_expires < best->delay_expires)) {
/* Re-selects the active slave of 'bond' using bond_choose_slave().
 *
 * If a slave is chosen that is not yet enabled, it is enabled at once through
 * bond_enable_slave(), skipping the rest of its updelay.  When there was no
 * active slave before, no_slaves_tag is added to 'tags'.  Choosing a slave
 * also sets send_learning_packets.  If no slave can be chosen but one was
 * active before, that is logged as all interfaces being disabled. */
1416 bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1418 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Remembered so the old and new situations can be compared below. */
1419 struct bond_slave *old_active_slave = bond->active_slave;
1421 bond->active_slave = bond_choose_slave(bond);
1422 if (bond->active_slave) {
1423 if (bond->active_slave->enabled) {
1424 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1425 bond->name, bond->active_slave->name);
1427 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1428 "remaining %lld ms updelay (since no interface was "
1429 "enabled)", bond->name, bond->active_slave->name,
1430 bond->active_slave->delay_expires - time_msec());
1431 bond_enable_slave(bond->active_slave, true, tags);
/* Going from no active slave to having one: add no_slaves_tag to 'tags'. */
1434 if (!old_active_slave) {
1435 tag_set_add(tags, bond->no_slaves_tag);
1438 bond->send_learning_packets = true;
1439 } else if (old_active_slave) {
1440 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1444 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1445 * there is no active slave.  This is a read-only query: 'bond' is taken
 * through a const pointer and nothing is modified. */
1447 bond_get_active_slave_tag(const struct bond *bond)
1449 return (bond->active_slave
1450 ? bond->active_slave->tag
1451 : bond->no_slaves_tag);
1454 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1455 * bond interface.
 *
 * Packet and byte counters are summed over every slave whose
 * netdev_get_stats() call returns 0, with the rx and tx directions swapped.
 * The totals are then written to the "system" netdev named after the bond, if
 * that device can be opened.  Slaves whose stats cannot be read are skipped,
 * and the return value of netdev_set_stats() is not checked. */
1457 bond_update_fake_slave_stats(struct bond *bond)
1459 struct netdev_stats bond_stats;
1460 struct bond_slave *slave;
1461 struct netdev *bond_dev;
/* Start every counter from zero. */
1463 memset(&bond_stats, 0, sizeof bond_stats);
1465 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1466 struct netdev_stats slave_stats;
/* Only slaves whose stats call returned 0 contribute to the totals. */
1468 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1469 /* XXX: We swap the stats here because they are swapped back when
1470 * reported by the internal device. The reason for this is
1471 * internal devices normally represent packets going into the
1472 * system but when used as fake bond device they represent packets
1473 * leaving the system. We really should do this in the internal
1474 * device itself because changing it here reverses the counts from
1475 * the perspective of the switch. However, the internal device
1476 * doesn't know what type of device it represents so we have to do
1477 * it here for now. */
1478 bond_stats.tx_packets += slave_stats.rx_packets;
1479 bond_stats.tx_bytes += slave_stats.rx_bytes;
1480 bond_stats.rx_packets += slave_stats.tx_packets;
1481 bond_stats.rx_bytes += slave_stats.tx_bytes;
/* The device is opened only to set the stats and is closed right after. */
1485 if (!netdev_open(bond->name, "system", &bond_dev)) {
1486 netdev_set_stats(bond_dev, &bond_stats);
1487 netdev_close(bond_dev);