/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "poll-loop.h"
/* Logging module for this file. */
50 VLOG_DEFINE_THIS_MODULE(bond);
/* Global rwlock protecting the set of all bonds and per-bond mutable state. */
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
/* All bonds, hashed by bond name; every access requires 'rwlock'. */
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
/* Number of hash buckets per bond: mask + 1 = 256. */
58 #define BOND_BUCKETS (BOND_MASK + 1)
59 #define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
/* NOTE(review): this listing appears to elide the 'struct bond_entry {'
 * opener, the 'struct rule *pr_rule;' member, and the closing '};' —
 * confirm against the original file.  Code lines below are verbatim. */
61 /* A hash bucket for mapping a flow to a slave.
62 * "struct bond" has an array of BOND_BUCKETS of these. */
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
67 struct list list_node; /* In bond_slave's 'entries' list. */
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
/* NOTE(review): the 'struct bond_slave {' opener and closing '};' appear to
 * be elided from this listing.  Member lines below are verbatim. */
78 /* A bond slave, that is, one of the links comprising a bond. */
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
81 struct list list_node; /* In struct bond's enabled_slaves list. */
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
85 struct netdev *netdev; /* Network device, owned by the client. */
86 unsigned int change_seq; /* Tracks changes in 'netdev'. */
87 ofp_port_t ofp_port; /* Open flow port number */
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
/* Link-status tracking: 'enabled' only changes after 'delay_expires'. */
91 long long delay_expires; /* Time after which 'enabled' may change. */
92 bool enabled; /* May be chosen for flows? */
93 bool may_enable; /* Client considers this slave bondable. */
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct list entries; /* 'struct bond_entry's assigned here. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
/* NOTE(review): the 'struct bond {' opener, the 'slaves' hmap member, and the
 * closing '};' appear to be elided from this listing.  Lines below verbatim. */
101 /* A bond, that is, a set of network devices grouped to improve performance or
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
/* 'mutex' serializes 'enabled_slaves'; it must be acquired after 'rwlock'. */
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
121 struct bond_slave *active_slave;
122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
124 bool bond_revalidate; /* True if flows need revalidation. */
125 uint32_t basis; /* Basis for flow hash function. */
127 /* SLB specific bonding info. */
128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
131 bool send_learning_packets;
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
135 /* Legacy compatibility. */
136 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
137 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
/* Reference count; the bond is destroyed when it drops to zero. */
139 struct ovs_refcount ref_cnt;
/* NOTE(review): the enum's name/opener, the 'struct match match;' member and
 * both closing '};' lines appear to be elided from this listing. */
142 /* What to do with an bond_recirc_rule. */
144 ADD, /* Add the rule to ofproto's flow table. */
145 DEL, /* Delete the rule from the ofproto's flow table. */
148 /* A rule to add to or delete from ofproto's internal flow table. */
149 struct bond_pr_rule_op {
150 struct hmap_node hmap_node;
/* Output port the recirculated flow should be forwarded to. */
152 ofp_port_t out_ofport;
/* Points into the owning bond_entry's 'pr_rule' slot. */
154 struct rule **pr_rule;
/* Forward declarations of file-local helpers, with their required lock
 * annotations.  NOTE(review): some parameter lines appear elided here. */
157 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
158 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
159 OVS_REQ_RDLOCK(rwlock);
160 static void bond_enable_slave(struct bond_slave *, bool enable)
161 OVS_REQ_WRLOCK(rwlock);
162 static void bond_link_status_update(struct bond_slave *)
163 OVS_REQ_WRLOCK(rwlock);
164 static void bond_choose_active_slave(struct bond *)
165 OVS_REQ_WRLOCK(rwlock);
166 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
167 uint16_t vlan, uint32_t basis);
168 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
170 static struct bond_entry *lookup_bond_entry(const struct bond *,
173 OVS_REQ_RDLOCK(rwlock);
174 static struct bond_slave *get_enabled_slave(struct bond *)
175 OVS_REQ_RDLOCK(rwlock);
176 static struct bond_slave *choose_output_slave(const struct bond *,
178 struct flow_wildcards *,
180 OVS_REQ_RDLOCK(rwlock);
181 static void bond_update_fake_slave_stats(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
184 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
185 * stores the mode in '*balance' and returns true. Otherwise returns false
186 * without modifying '*balance'. */
188 bond_mode_from_string(enum bond_mode *balance, const char *s)
190 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
192 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
194 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
202 /* Returns a string representing 'balance'. */
204 bond_mode_to_string(enum bond_mode balance) {
207 return "balance-tcp";
209 return "balance-slb";
211 return "active-backup";
/* NOTE(review): the return-type line ('struct bond *'), the local
 * 'struct bond *bond;' declaration, the 'return bond;' and closing brace
 * appear to be elided from this listing.  Code below verbatim. */
217 /* Creates and returns a new bond whose configuration is initially taken from
220 * The caller should register each slave on the new bond by calling
221 * bond_slave_register(). */
223 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
/* xzalloc zero-fills, so all flags/pointers start cleared. */
227 bond = xzalloc(sizeof *bond);
228 bond->ofproto = ofproto;
229 hmap_init(&bond->slaves);
230 list_init(&bond->enabled_slaves);
231 ovs_mutex_init(&bond->mutex);
232 bond->next_fake_iface_update = LLONG_MAX;
233 ovs_refcount_init(&bond->ref_cnt);
236 hmap_init(&bond->pr_rule_ops);
/* Apply the caller-supplied settings before handing the bond back. */
238 bond_reconfigure(bond, s);
/* Takes a reference on 'bond_'.  NOTE(review): the return-type line, the
 * NULL check around the increment, the return and braces appear elided. */
243 bond_ref(const struct bond *bond_)
/* Cast away const: refcounting mutates the counter but not logical state. */
245 struct bond *bond = CONST_CAST(struct bond *, bond_);
248 ovs_refcount_ref(&bond->ref_cnt);
/* Releases a reference on 'bond'; tears the bond down when the last
 * reference drops.  NOTE(review): the return-type line, several closing
 * braces, free() calls and the trailing free of 'bond' appear elided. */
255 bond_unref(struct bond *bond)
257 struct bond_slave *slave, *next_slave;
258 struct bond_pr_rule_op *pr_op, *next_op;
/* Not the last reference (or NULL bond): nothing to destroy. */
260 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
/* Unlink from the global table before touching per-bond state. */
264 ovs_rwlock_wrlock(&rwlock);
265 hmap_remove(all_bonds, &bond->hmap_node);
266 ovs_rwlock_unlock(&rwlock);
268 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
269 hmap_remove(&bond->slaves, &slave->hmap_node);
270 /* Client owns 'slave->netdev'. */
274 hmap_destroy(&bond->slaves);
276 ovs_mutex_destroy(&bond->mutex);
280 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
281 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
284 hmap_destroy(&bond->pr_rule_ops);
/* Return the recirculation ID to the pool, if one was allocated. */
286 if (bond->recirc_id) {
287 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Queues an ADD operation for a post-recirculation rule matching 'match'
 * that outputs to 'out_ofport', reusing an existing op when the match is
 * already present.  NOTE(review): the 'static void' line, the op-kind
 * assignments and the early return/braces appear elided. */
294 add_pr_rule(struct bond *bond, const struct match *match,
295 ofp_port_t out_ofport, struct rule **rule)
297 uint32_t hash = match_hash(match, 0);
298 struct bond_pr_rule_op *pr_op;
/* If an op with the same match already exists, just refresh its target. */
300 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
301 if (match_equal(&pr_op->match, match)) {
303 pr_op->out_ofport = out_ofport;
304 pr_op->pr_rule = rule;
/* Otherwise create a new pending op keyed by the match hash. */
309 pr_op = xmalloc(sizeof *pr_op);
310 pr_op->match = *match;
312 pr_op->out_ofport = out_ofport;
313 pr_op->pr_rule = rule;
314 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Synchronizes ofproto's internal flow table with the bond's desired set of
 * post-recirculation rules: marks every existing op, re-adds the ops for the
 * current bucket->slave mapping, then applies ADDs and DELs.
 * NOTE(review): the 'static void' line, op-kind bookkeeping, several braces
 * and free() of deleted ops appear elided from this listing. */
318 update_recirc_rules(struct bond *bond)
321 struct bond_pr_rule_op *pr_op, *next_op;
322 uint64_t ofpacts_stub[128 / 8];
323 struct ofpbuf ofpacts;
/* Stack-backed action buffer; avoids heap allocation in the common case. */
326 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
328 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
/* Only install per-bucket rules when recirculation is in use. */
332 if (bond->hash && bond->recirc_id) {
333 for (i = 0; i < BOND_BUCKETS; i++) {
334 struct bond_slave *slave = bond->hash[i].slave;
337 match_init_catchall(&match);
338 match_set_recirc_id(&match, bond->recirc_id);
339 /* recirc_id -> metadata to speed up look ups. */
340 match_set_metadata(&match, htonll(bond->recirc_id));
341 match_set_dp_hash_masked(&match, i, BOND_MASK);
343 add_pr_rule(bond, &match, slave->ofp_port,
344 &bond->hash[i].pr_rule);
/* Apply the accumulated operations. */
349 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
353 ofpbuf_clear(&ofpacts);
354 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
355 error = ofproto_dpif_add_internal_flow(bond->ofproto,
357 RECIRC_RULE_PRIORITY,
358 &ofpacts, pr_op->pr_rule);
360 char *err_s = match_to_string(&pr_op->match,
361 RECIRC_RULE_PRIORITY);
363 VLOG_ERR("failed to add post recirculation flow %s", err_s);
369 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
371 RECIRC_RULE_PRIORITY);
373 char *err_s = match_to_string(&pr_op->match,
374 RECIRC_RULE_PRIORITY);
376 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
/* Deleted ops are removed and their rule pointer cleared. */
380 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
381 *pr_op->pr_rule = NULL;
387 ofpbuf_uninit(&ofpacts);
/* NOTE(review): the 'bool' return-type line, 'revalidate = true;' bodies of
 * the change checks, 'return revalidate;' and several braces appear elided
 * from this listing.  Code lines below are verbatim. */
391 /* Updates 'bond''s overall configuration to 's'.
393 * The caller should register each slave on 'bond' by calling
394 * bond_slave_register(). This is optional if none of the slaves'
395 * configuration has changed. In any case it can't hurt.
397 * Returns true if the configuration has changed in such a way that requires
401 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
403 bool revalidate = false;
405 ovs_rwlock_wrlock(&rwlock);
/* A rename moves the bond to a new hash slot in 'all_bonds'. */
406 if (!bond->name || strcmp(bond->name, s->name)) {
408 hmap_remove(all_bonds, &bond->hmap_node);
411 bond->name = xstrdup(s->name);
412 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
415 bond->updelay = s->up_delay;
416 bond->downdelay = s->down_delay;
/* Each of the following changes requires flow revalidation. */
418 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
419 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
423 if (bond->rebalance_interval != s->rebalance_interval) {
424 bond->rebalance_interval = s->rebalance_interval;
428 if (bond->balance != s->balance) {
429 bond->balance = s->balance;
433 if (bond->basis != s->basis) {
434 bond->basis = s->basis;
439 if (bond->next_fake_iface_update == LLONG_MAX) {
440 bond->next_fake_iface_update = time_msec();
443 bond->next_fake_iface_update = LLONG_MAX;
446 if (bond->bond_revalidate) {
448 bond->bond_revalidate = false;
/* Recirculation IDs are only needed for hash-based (non-AB) modes. */
451 if (bond->balance != BM_AB) {
452 if (!bond->recirc_id) {
453 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
455 } else if (bond->recirc_id) {
456 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
460 if (bond->balance == BM_AB || !bond->hash || revalidate) {
461 bond_entry_reset(bond);
464 ovs_rwlock_unlock(&rwlock);
/* Replaces 'slave''s netdev and resets its change sequence so the next
 * bond_run() re-reads link state.  NOTE(review): the 'static void' line and
 * closing braces appear elided from this listing. */
469 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
470 OVS_REQ_WRLOCK(rwlock)
472 if (slave->netdev != netdev) {
473 slave->netdev = netdev;
/* Zero forces a change notification on the next connectivity check. */
474 slave->change_seq = 0;
/* NOTE(review): the 'void' return-type line, the 'if (!slave)' creation
 * branch braces, 'slave->bond/aux' assignments and the free of the old name
 * appear elided from this listing.  Code below verbatim. */
478 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
479 * arbitrary client-provided pointer that uniquely identifies a slave within a
480 * bond. If 'slave_' already exists within 'bond' then this function
481 * reconfigures the existing slave.
483 * 'netdev' must be the network device that 'slave_' represents. It is owned
484 * by the client, so the client must not close it before either unregistering
485 * 'slave_' or destroying 'bond'.
488 bond_slave_register(struct bond *bond, void *slave_,
489 ofp_port_t ofport, struct netdev *netdev)
491 struct bond_slave *slave;
493 ovs_rwlock_wrlock(&rwlock);
494 slave = bond_slave_lookup(bond, slave_);
/* First registration: create the slave and key it by the client pointer. */
496 slave = xzalloc(sizeof *slave);
498 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
501 slave->ofp_port = ofport;
502 slave->delay_expires = LLONG_MAX;
503 slave->name = xstrdup(netdev_get_name(netdev));
504 bond->bond_revalidate = true;
/* Start disabled; enable immediately if the carrier is already up. */
506 slave->enabled = false;
507 bond_enable_slave(slave, netdev_get_carrier(netdev));
510 bond_slave_set_netdev__(slave, netdev);
/* Refresh the cached name in case the netdev was renamed. */
513 slave->name = xstrdup(netdev_get_name(netdev));
514 ovs_rwlock_unlock(&rwlock);
/* NOTE(review): the 'void' return-type line, the NULL check around the
 * helper call and closing braces appear elided from this listing. */
517 /* Updates the network device to be used with 'slave_' to 'netdev'.
519 * This is useful if the caller closes and re-opens the network device
520 * registered with bond_slave_register() but doesn't need to change anything
523 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
525 struct bond_slave *slave;
527 ovs_rwlock_wrlock(&rwlock);
528 slave = bond_slave_lookup(bond, slave_);
530 bond_slave_set_netdev__(slave, netdev);
532 ovs_rwlock_unlock(&rwlock);
/* NOTE(review): the 'void' return-type line, the 'del_active' declaration,
 * the early-return when lookup fails, 'e->slave = NULL;', free(slave->name)/
 * free(slave) and several braces appear elided from this listing. */
535 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
536 * then this function has no effect.
538 * Unregistering a slave invalidates all flows. */
540 bond_slave_unregister(struct bond *bond, const void *slave_)
542 struct bond_slave *slave;
545 ovs_rwlock_wrlock(&rwlock);
546 slave = bond_slave_lookup(bond, slave_);
551 bond->bond_revalidate = true;
552 bond_enable_slave(slave, false);
/* Remember whether the departing slave was active so we can re-elect. */
554 del_active = bond->active_slave == slave;
556 struct bond_entry *e;
/* Detach this slave from every hash bucket that pointed at it. */
557 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
558 if (e->slave == slave) {
566 hmap_remove(&bond->slaves, &slave->hmap_node);
567 /* Client owns 'slave->netdev'. */
/* If the active slave was removed, pick a new one and re-learn MACs. */
571 bond_choose_active_slave(bond);
572 bond->send_learning_packets = true;
575 ovs_rwlock_unlock(&rwlock);
578 /* Should be called on each slave in 'bond' before bond_run() to indicate
579 * whether or not 'slave_' may be enabled. This function is intended to allow
580 * other protocols to have some impact on bonding decisions. For example LACP
581 * or high level link monitoring protocols may decide that a given slave should
582 * not be able to send traffic. */
584 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
586 ovs_rwlock_wrlock(&rwlock);
587 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
588 ovs_rwlock_unlock(&rwlock);
/* NOTE(review): the 'bool' return-type line, the local 'revalidate'
 * declaration, 'return revalidate;' and several closing braces appear
 * elided from this listing.  Code below verbatim. */
591 /* Performs periodic maintenance on 'bond'.
593 * Returns true if the caller should revalidate its flows.
595 * The caller should check bond_should_send_learning_packets() afterward. */
597 bond_run(struct bond *bond, enum lacp_status lacp_status)
599 struct bond_slave *slave;
602 ovs_rwlock_wrlock(&rwlock);
/* A change in LACP status invalidates all flows. */
603 if (bond->lacp_status != lacp_status) {
604 bond->lacp_status = lacp_status;
605 bond->bond_revalidate = true;
608 /* Enable slaves based on link status and LACP feedback. */
609 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
610 bond_link_status_update(slave);
611 slave->change_seq = seq_read(connectivity_seq_get());
613 if (!bond->active_slave || !bond->active_slave->enabled) {
614 bond_choose_active_slave(bond);
617 /* Update fake bond interface stats. */
618 if (time_msec() >= bond->next_fake_iface_update) {
619 bond_update_fake_slave_stats(bond);
620 bond->next_fake_iface_update = time_msec() + 1000;
/* Report and clear the revalidation flag atomically under the lock. */
623 revalidate = bond->bond_revalidate;
624 bond->bond_revalidate = false;
625 ovs_rwlock_unlock(&rwlock);
/* NOTE(review): the 'void' return-type line and several closing braces
 * appear elided from this listing.  Code below verbatim. */
630 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
632 bond_wait(struct bond *bond)
634 struct bond_slave *slave;
636 ovs_rwlock_rdlock(&rwlock);
637 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* Wake when an up/down delay expires... */
638 if (slave->delay_expires != LLONG_MAX) {
639 poll_timer_wait_until(slave->delay_expires);
/* ...or when the slave's netdev connectivity changes. */
642 seq_wait(connectivity_seq_get(), slave->change_seq);
645 if (bond->next_fake_iface_update != LLONG_MAX) {
646 poll_timer_wait_until(bond->next_fake_iface_update);
/* Pending revalidation should be handled as soon as possible. */
649 if (bond->bond_revalidate) {
650 poll_immediate_wake();
652 ovs_rwlock_unlock(&rwlock);
654 /* We don't wait for bond->next_rebalance because rebalancing can only run
655 * at a flow account checkpoint. ofproto does checkpointing on its own
656 * schedule and bond_rebalance() gets called afterward, so we'd just be
657 * waking up for no purpose. */
660 /* MAC learning table interaction. */
663 may_send_learning_packets(const struct bond *bond)
665 return ((bond->lacp_status == LACP_DISABLED
666 && (bond->balance == BM_SLB || bond->balance == BM_AB))
667 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
668 && bond->active_slave;
671 /* Returns true if 'bond' needs the client to send out packets to assist with
672 * MAC learning on 'bond'. If this function returns true, then the client
673 * should iterate through its MAC learning table for the bridge on which 'bond'
674 * is located. For each MAC that has been learned on a port other than 'bond',
675 * it should call bond_compose_learning_packet().
677 * This function will only return true if 'bond' is in SLB or active-backup
678 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
681 * Calling this function resets the state that it checks. */
683 bond_should_send_learning_packets(struct bond *bond)
687 ovs_rwlock_wrlock(&rwlock);
688 send = bond->send_learning_packets && may_send_learning_packets(bond);
689 bond->send_learning_packets = false;
690 ovs_rwlock_unlock(&rwlock);
/* NOTE(review): the 'struct ofpbuf *' return-type line, the 'struct flow
 * flow;' declaration, the VLAN-nonzero check around eth_push_vlan,
 * 'return packet;' and several braces appear elided from this listing. */
694 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
696 * See bond_should_send_learning_packets() for description of usage. The
697 * caller should send the composed packet on the port associated with
698 * port_aux and takes ownership of the returned ofpbuf. */
700 bond_compose_learning_packet(struct bond *bond,
701 const uint8_t eth_src[ETH_ADDR_LEN],
702 uint16_t vlan, void **port_aux)
704 struct bond_slave *slave;
705 struct ofpbuf *packet;
708 ovs_rwlock_rdlock(&rwlock);
/* Caller contract: only invoked when learning packets are allowed. */
709 ovs_assert(may_send_learning_packets(bond));
/* Build a minimal flow keyed only on the source MAC to pick the slave. */
710 memset(&flow, 0, sizeof flow);
711 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
712 slave = choose_output_slave(bond, &flow, NULL, vlan);
/* RARP from 'eth_src' lets peers (re)learn the MAC's location. */
714 packet = ofpbuf_new(0);
715 compose_rarp(packet, eth_src);
717 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
720 *port_aux = slave->aux;
721 ovs_rwlock_unlock(&rwlock);
/* NOTE(review): the 'enum bond_verdict' return-type line, several 'goto out'
 * statements, case labels (BM_TCP/BM_AB/BM_SLB), 'break's, the 'out:' label
 * and 'return verdict;' appear elided from this listing.  Code verbatim. */
725 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
726 * Ethernet destination address of 'eth_dst', should be admitted.
728 * The return value is one of the following:
730 * - BV_ACCEPT: Admit the packet.
732 * - BV_DROP: Drop the packet.
734 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
735 * Ethernet source address and VLAN. If there is none, or if the packet
736 * is on the learned port, then admit the packet. If a different port has
737 * been learned, however, drop the packet (and do not use it for MAC
741 bond_check_admissibility(struct bond *bond, const void *slave_,
742 const uint8_t eth_dst[ETH_ADDR_LEN])
/* Default to dropping; each case below may upgrade the verdict. */
744 enum bond_verdict verdict = BV_DROP;
745 struct bond_slave *slave;
747 ovs_rwlock_rdlock(&rwlock);
748 slave = bond_slave_lookup(bond, slave_);
753 /* LACP bonds have very loose admissibility restrictions because we can
754 * assume the remote switch is aware of the bond and will "do the right
755 * thing". However, as a precaution we drop packets on disabled slaves
756 * because no correctly implemented partner switch should be sending
759 * If LACP is configured, but LACP negotiations have been unsuccessful, we
760 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
761 switch (bond->lacp_status) {
762 case LACP_NEGOTIATED:
763 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
765 case LACP_CONFIGURED:
766 if (!bond->lacp_fallback_ab) {
773 /* Drop all multicast packets on inactive slaves. */
774 if (eth_addr_is_multicast(eth_dst)) {
775 if (bond->active_slave != slave) {
780 switch (bond->balance) {
782 /* TCP balanced bonds require successful LACP negotiations. Based on the
783 * above check, LACP is off or lacp_fallback_ab is true on this bond.
784 * If lacp_fallback_ab is true fall through to BM_AB case else, we
785 * drop all incoming traffic. */
786 if (!bond->lacp_fallback_ab) {
791 /* Drop all packets which arrive on backup slaves. This is similar to
792 * how Linux bonding handles active-backup bonds. */
793 if (bond->active_slave != slave) {
794 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
796 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
797 " slave (%s) destined for " ETH_ADDR_FMT,
798 slave->name, ETH_ADDR_ARGS(eth_dst));
805 /* Drop all packets for which we have learned a different input port,
806 * because we probably sent the packet on one slave and got it back on
807 * the other. Gratuitous ARP packets are an exception to this rule:
808 * the host has moved to another switch. The exception to the
809 * exception is if we locked the learning table to avoid reflections on
811 verdict = BV_DROP_IF_MOVED;
817 ovs_rwlock_unlock(&rwlock);
/* NOTE(review): the 'void *' return-type line, the 'void *aux;' declaration,
 * 'return aux;' and closing brace appear elided from this listing. */
822 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
823 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
824 * NULL if the packet should be dropped because no slaves are enabled.
826 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
827 * should be a VID only (i.e. excluding the PCP bits). Second,
828 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
829 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
830 * packet belongs to (so for an access port it will be the access port's VLAN).
832 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
833 * significant in the selection. At some point earlier, 'wc' should
834 * have been initialized (e.g., by flow_wildcards_init_catchall()).
837 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
838 struct flow_wildcards *wc, uint16_t vlan)
840 struct bond_slave *slave;
843 ovs_rwlock_rdlock(&rwlock);
844 slave = choose_output_slave(bond, flow, wc, vlan);
/* Hand back the client's opaque handle, not our internal slave struct. */
845 aux = slave ? slave->aux : NULL;
846 ovs_rwlock_unlock(&rwlock);
/* Folds the delta of a recirc rule's byte counter into 'entry''s tx_bytes.
 * NOTE(review): the 'static void' line, the 'delta' declaration, the guard
 * around the update and closing braces appear elided from this listing. */
853 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
854 OVS_REQ_WRLOCK(rwlock)
/* Rule counters are cumulative; track only the growth since last read. */
859 delta = rule_tx_bytes - entry->pr_tx_bytes;
860 entry->tx_bytes += delta;
861 entry->pr_tx_bytes = rule_tx_bytes;
/* NOTE(review): the 'void' return-type line, the loop-index declaration,
 * the 'if (rule)' guard, the 'uint64_t n_bytes' declaration and closing
 * braces appear elided from this listing.  Code below verbatim. */
865 /* Maintain bond stats using post recirculation rule byte counters.*/
867 bond_recirculation_account(struct bond *bond)
871 ovs_rwlock_wrlock(&rwlock);
/* Walk every hash bucket and pull stats from its recirc rule, if any. */
872 for (i=0; i<=BOND_MASK; i++) {
873 struct bond_entry *entry = &bond->hash[i];
874 struct rule *rule = entry->pr_rule;
877 uint64_t n_packets OVS_UNUSED;
878 long long int used OVS_UNUSED;
/* Only byte counts matter for rebalancing; packets/used are unused. */
881 rule->ofproto->ofproto_class->rule_get_stats(
882 rule, &n_packets, &n_bytes, &used);
883 bond_entry_account(entry, n_bytes);
886 ovs_rwlock_unlock(&rwlock);
/* Reports whether 'bond' can use recirculation (TCP balancing mode) and,
 * via the optional out-parameters, its recirc ID and hash basis.
 * NOTE(review): the 'bool' return-type line, the NULL checks on the
 * out-parameters, the return statements and braces appear elided. */
890 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
893 if (bond->balance == BM_TCP) {
895 *recirc_id = bond->recirc_id;
898 *hash_bias = bond->basis;
/* Ensures every hash bucket maps to an enabled slave, then pushes the
 * resulting recirculation rules into ofproto.  NOTE(review): the 'void'
 * return-type line, the 'update_rules = true;' bookkeeping, the guard around
 * update_recirc_rules() and several braces appear elided from this listing. */
907 bond_update_post_recirc_rules(struct bond* bond, const bool force)
909 struct bond_entry *e;
910 bool update_rules = force; /* Always update rules if caller forces it. */
912 /* Make sure all bond entries are populated */
913 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
914 if (!e->slave || !e->slave->enabled) {
/* Pick a random slave; fall back to the active slave if it is disabled. */
916 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
917 struct bond_slave, hmap_node);
918 if (!e->slave->enabled) {
919 e->slave = bond->active_slave;
925 update_recirc_rules(bond);
932 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
934 return bond->rebalance_interval
935 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
938 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
940 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
943 ovs_rwlock_wrlock(&rwlock);
944 if (bond_is_balanced(bond)) {
945 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
947 ovs_rwlock_unlock(&rwlock);
950 static struct bond_slave *
951 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
953 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Debug-logs the per-slave load list built by bond_rebalance().
 * NOTE(review): the 'static void' line, the comma-guard conditions inside
 * the loops, ds_destroy() and several closing braces appear elided. */
957 log_bals(struct bond *bond, const struct list *bals)
958 OVS_REQ_RDLOCK(rwlock)
/* All of the string assembly below is skipped unless DBG logging is on. */
960 if (VLOG_IS_DBG_ENABLED()) {
961 struct ds ds = DS_EMPTY_INITIALIZER;
962 const struct bond_slave *slave;
964 LIST_FOR_EACH (slave, bal_node, bals) {
966 ds_put_char(&ds, ',');
968 ds_put_format(&ds, " %s %"PRIu64"kB",
969 slave->name, slave->tx_bytes / 1024);
971 if (!slave->enabled) {
972 ds_put_cstr(&ds, " (disabled)");
974 if (!list_is_empty(&slave->entries)) {
975 struct bond_entry *e;
977 ds_put_cstr(&ds, " (");
978 LIST_FOR_EACH (e, list_node, &slave->entries) {
979 if (&e->list_node != list_front(&slave->entries)) {
980 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' recovers the bucket index for the log line. */
982 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
983 e - bond->hash, e->tx_bytes / 1024);
985 ds_put_cstr(&ds, ")");
988 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
/* NOTE(review): the 'static void' line, the 'hash->slave = to;' assignment
 * and closing brace appear elided from this listing.  Code verbatim. */
993 /* Shifts 'hash' from its current slave to 'to'. */
995 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
996 OVS_REQ_WRLOCK(rwlock)
998 struct bond_slave *from = hash->slave;
999 struct bond *bond = from->bond;
1000 uint64_t delta = hash->tx_bytes;
1002 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1003 "from %s to %s (now carrying %"PRIu64"kB and "
1004 "%"PRIu64"kB load, respectively)",
1005 bond->name, delta / 1024, hash - bond->hash,
1006 from->name, to->name,
1007 (from->tx_bytes - delta) / 1024,
1008 (to->tx_bytes + delta) / 1024);
1010 /* Shift load away from 'from' to 'to'. */
1011 from->tx_bytes -= delta;
1012 to->tx_bytes += delta;
1014 /* Arrange for flows to be revalidated. */
1016 bond->bond_revalidate = true;
/* NOTE(review): a 'return NULL;' after the list_is_short() check, the
 * 'uint64_t delta;' declaration, 'return e;' statements, the final
 * 'return NULL;' and several braces appear elided from this listing. */
1019 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1020 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1021 * given that doing so must decrease the ratio of the load on the two slaves by
1022 * at least 0.1. Returns NULL if there is no appropriate entry.
1024 * The list of entries isn't sorted. I don't know of a reason to prefer to
1025 * shift away small hashes or large hashes. */
1026 static struct bond_entry *
1027 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1028 OVS_REQ_WRLOCK(rwlock)
1030 struct bond_entry *e;
1032 if (list_is_short(&from->entries)) {
1033 /* 'from' carries no more than one MAC hash, so shifting load away from
1034 * it would be pointless. */
1038 LIST_FOR_EACH (e, list_node, &from->entries) {
1039 double old_ratio, new_ratio;
1042 if (to_tx_bytes == 0) {
1043 /* Nothing on the new slave, move it. */
/* Compare before/after load ratios to decide if the move helps. */
1047 delta = e->tx_bytes;
1048 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1049 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1050 if (old_ratio - new_ratio > 0.1
1051 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1052 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1053 and 'to' slave have the same load. Therefore, we only move an
1054 entry if it decreases the load on 'from', and brings us closer
1055 to equal traffic load. */
/* NOTE(review): the 'static void' line, the 'break;' after finding the
 * insertion point, and closing braces appear elided from this listing. */
1063 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1066 insert_bal(struct list *bals, struct bond_slave *slave)
1068 struct bond_slave *pos;
/* Linear scan for the first element carrying less load than 'slave'. */
1070 LIST_FOR_EACH (pos, bal_node, bals) {
1071 if (slave->tx_bytes > pos->tx_bytes) {
/* Inserting before 'pos' keeps the list sorted in descending order. */
1075 list_insert(&pos->bal_node, &slave->bal_node);
1078 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1079 * that descending order of 'tx_bytes' is maintained. */
1081 reinsert_bal(struct list *bals, struct bond_slave *slave)
1083 list_remove(&slave->bal_node);
1084 insert_bal(bals, slave);
/* NOTE(review): the 'bool' return-type line, the 'bals' list declaration and
 * init, the 'overload' declaration, 'rebalanced = true;', 'goto done;'-style
 * exits, the EWMA halving in the final loop, 'return rebalanced;' and many
 * braces appear elided from this listing.  Code below verbatim. */
1087 /* If 'bond' needs rebalancing, does so.
1089 * The caller should have called bond_account() for each active flow, or in case
1090 * of recirculation is used, have called bond_recirculation_account(bond),
1091 * to ensure that flow data is consistently accounted at this point.
1093 * Return whether rebalancing took place.*/
1095 bond_rebalance(struct bond *bond)
1097 struct bond_slave *slave;
1098 struct bond_entry *e;
1100 bool rebalanced = false;
1102 ovs_rwlock_wrlock(&rwlock);
/* Bail out unless balancing is enabled and the interval has elapsed. */
1103 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1106 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1108 /* Add each bond_entry to its slave's 'entries' list.
1109 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1110 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1111 slave->tx_bytes = 0;
1112 list_init(&slave->entries);
1114 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1115 if (e->slave && e->tx_bytes) {
1116 e->slave->tx_bytes += e->tx_bytes;
1117 list_push_back(&e->slave->entries, &e->list_node);
1121 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1123 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1124 * with a proper list sort algorithm. */
1126 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1127 if (slave->enabled) {
1128 insert_bal(&bals, slave);
1131 log_bals(bond, &bals);
1133 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1134 while (!list_is_short(&bals)) {
1135 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1136 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1139 overload = from->tx_bytes - to->tx_bytes;
1140 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1141 /* The extra load on 'from' (and all less-loaded slaves), compared
1142 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1143 * it is less than ~1Mbps. No point in rebalancing. */
1147 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1148 * to move from 'from' to 'to'. */
1149 e = choose_entry_to_migrate(from, to->tx_bytes);
1151 bond_shift_load(e, to);
1153 /* Delete element from from->entries.
1155 * We don't add the element to to->hashes. That would only allow
1156 * 'e' to be migrated to another slave in this rebalancing run, and
1157 * there is no point in doing that. */
1158 list_remove(&e->list_node);
1160 /* Re-sort 'bals'. */
1161 reinsert_bal(&bals, from);
1162 reinsert_bal(&bals, to);
1165 /* Can't usefully migrate anything away from 'from'.
1166 * Don't reconsider it. */
1167 list_remove(&from->bal_node);
1171 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1172 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1173 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1174 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1179 ovs_rwlock_unlock(&rwlock);
1183 /* Bonding unixctl user interface functions. */
/* Looks up a bond by name in 'all_bonds'.  NOTE(review): the 'struct bond
 * *bond;' declaration, 'return bond;'/'return NULL;' and braces appear
 * elided from this listing. */
1185 static struct bond *
1186 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
/* Hash by name, then confirm with strcmp to handle collisions. */
1190 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1192 if (!strcmp(bond->name, name)) {
/* Looks up a slave of 'bond' by netdev name.  NOTE(review): the 'return
 * slave;'/'return NULL;' statements and closing braces appear elided. */
1199 static struct bond_slave *
1200 bond_lookup_slave(struct bond *bond, const char *slave_name)
1202 struct bond_slave *slave;
/* Slaves are hashed by client pointer, so a name lookup must scan. */
1204 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1205 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler that lists all bonds, their mode, recirc ID and slaves as
 * a tab-separated table.  NOTE(review): the 'static void' line, the comma
 * guard inside the slave loop, ds_destroy() and braces appear elided. */
1213 bond_unixctl_list(struct unixctl_conn *conn,
1214 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1215 void *aux OVS_UNUSED)
1217 struct ds ds = DS_EMPTY_INITIALIZER;
1218 const struct bond *bond;
1220 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1222 ovs_rwlock_rdlock(&rwlock);
1223 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1224 const struct bond_slave *slave;
1227 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1228 bond_mode_to_string(bond->balance), bond->recirc_id);
1231 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1233 ds_put_cstr(&ds, ", ");
1235 ds_put_cstr(&ds, slave->name);
1237 ds_put_char(&ds, '\n');
1239 ovs_rwlock_unlock(&rwlock);
/* Reply copies the string; the local ds is (presumably) destroyed after. */
1240 unixctl_command_reply(conn, ds_cstr(&ds));
1245 bond_print_details(struct ds *ds, const struct bond *bond)
1246 OVS_REQ_RDLOCK(rwlock)
1248 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1249 const struct shash_node **sorted_slaves = NULL;
1250 const struct bond_slave *slave;
1255 ds_put_format(ds, "---- %s ----\n", bond->name);
1256 ds_put_format(ds, "bond_mode: %s\n",
1257 bond_mode_to_string(bond->balance));
1259 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1260 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1261 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1263 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1265 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1266 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1268 if (bond_is_balanced(bond)) {
1269 ds_put_format(ds, "next rebalance: %lld ms\n",
1270 bond->next_rebalance - time_msec());
1273 ds_put_cstr(ds, "lacp_status: ");
1274 switch (bond->lacp_status) {
1275 case LACP_NEGOTIATED:
1276 ds_put_cstr(ds, "negotiated\n");
1278 case LACP_CONFIGURED:
1279 ds_put_cstr(ds, "configured\n");
1282 ds_put_cstr(ds, "off\n");
1285 ds_put_cstr(ds, "<unknown>\n");
1289 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1290 shash_add(&slave_shash, slave->name, slave);
1292 sorted_slaves = shash_sort(&slave_shash);
1294 for (i = 0; i < shash_count(&slave_shash); i++) {
1295 struct bond_entry *be;
1297 slave = sorted_slaves[i]->data;
1300 ds_put_format(ds, "\nslave %s: %s\n",
1301 slave->name, slave->enabled ? "enabled" : "disabled");
1302 if (slave == bond->active_slave) {
1303 ds_put_cstr(ds, "\tactive slave\n");
1305 if (slave->delay_expires != LLONG_MAX) {
1306 ds_put_format(ds, "\t%s expires in %lld ms\n",
1307 slave->enabled ? "downdelay" : "updelay",
1308 slave->delay_expires - time_msec());
1311 ds_put_format(ds, "\tmay_enable: %s\n",
1312 slave->may_enable ? "true" : "false");
1314 if (!bond_is_balanced(bond)) {
1319 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1320 int hash = be - bond->hash;
1323 if (be->slave != slave) {
1327 be_tx_k = be->tx_bytes / 1024;
1329 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1333 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1336 shash_destroy(&slave_shash);
1337 free(sorted_slaves);
1338 ds_put_cstr(ds, "\n");
1342 bond_unixctl_show(struct unixctl_conn *conn,
1343 int argc, const char *argv[],
1344 void *aux OVS_UNUSED)
1346 struct ds ds = DS_EMPTY_INITIALIZER;
1348 ovs_rwlock_rdlock(&rwlock);
1350 const struct bond *bond = bond_find(argv[1]);
1353 unixctl_command_reply_error(conn, "no such bond");
1356 bond_print_details(&ds, bond);
1358 const struct bond *bond;
1360 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1361 bond_print_details(&ds, bond);
1365 unixctl_command_reply(conn, ds_cstr(&ds));
1369 ovs_rwlock_unlock(&rwlock);
1373 bond_unixctl_migrate(struct unixctl_conn *conn,
1374 int argc OVS_UNUSED, const char *argv[],
1375 void *aux OVS_UNUSED)
1377 const char *bond_s = argv[1];
1378 const char *hash_s = argv[2];
1379 const char *slave_s = argv[3];
1381 struct bond_slave *slave;
1382 struct bond_entry *entry;
1385 ovs_rwlock_wrlock(&rwlock);
1386 bond = bond_find(bond_s);
1388 unixctl_command_reply_error(conn, "no such bond");
1392 if (bond->balance != BM_SLB) {
1393 unixctl_command_reply_error(conn, "not an SLB bond");
1397 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1398 hash = atoi(hash_s) & BOND_MASK;
1400 unixctl_command_reply_error(conn, "bad hash");
1404 slave = bond_lookup_slave(bond, slave_s);
1406 unixctl_command_reply_error(conn, "no such slave");
1410 if (!slave->enabled) {
1411 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1415 entry = &bond->hash[hash];
1416 bond->bond_revalidate = true;
1417 entry->slave = slave;
1418 unixctl_command_reply(conn, "migrated");
1421 ovs_rwlock_unlock(&rwlock);
1425 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1426 int argc OVS_UNUSED, const char *argv[],
1427 void *aux OVS_UNUSED)
1429 const char *bond_s = argv[1];
1430 const char *slave_s = argv[2];
1432 struct bond_slave *slave;
1434 ovs_rwlock_wrlock(&rwlock);
1435 bond = bond_find(bond_s);
1437 unixctl_command_reply_error(conn, "no such bond");
1441 slave = bond_lookup_slave(bond, slave_s);
1443 unixctl_command_reply_error(conn, "no such slave");
1447 if (!slave->enabled) {
1448 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1452 if (bond->active_slave != slave) {
1453 bond->bond_revalidate = true;
1454 bond->active_slave = slave;
1455 VLOG_INFO("bond %s: active interface is now %s",
1456 bond->name, slave->name);
1457 bond->send_learning_packets = true;
1458 unixctl_command_reply(conn, "done");
1460 unixctl_command_reply(conn, "no change");
1463 ovs_rwlock_unlock(&rwlock);
1467 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1469 const char *bond_s = argv[1];
1470 const char *slave_s = argv[2];
1472 struct bond_slave *slave;
1474 ovs_rwlock_wrlock(&rwlock);
1475 bond = bond_find(bond_s);
1477 unixctl_command_reply_error(conn, "no such bond");
1481 slave = bond_lookup_slave(bond, slave_s);
1483 unixctl_command_reply_error(conn, "no such slave");
1487 bond_enable_slave(slave, enable);
1488 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1491 ovs_rwlock_unlock(&rwlock);
1495 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1496 int argc OVS_UNUSED, const char *argv[],
1497 void *aux OVS_UNUSED)
1499 enable_slave(conn, argv, true);
1503 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1504 int argc OVS_UNUSED, const char *argv[],
1505 void *aux OVS_UNUSED)
1507 enable_slave(conn, argv, false);
1511 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1512 void *aux OVS_UNUSED)
1514 const char *mac_s = argv[1];
1515 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1516 const char *basis_s = argc > 3 ? argv[3] : NULL;
1517 uint8_t mac[ETH_ADDR_LEN];
1524 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1525 unixctl_command_reply_error(conn, "invalid vlan");
1533 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1534 unixctl_command_reply_error(conn, "invalid basis");
1541 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1542 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1544 hash_cstr = xasprintf("%u", hash);
1545 unixctl_command_reply(conn, hash_cstr);
1548 unixctl_command_reply_error(conn, "invalid mac");
1555 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1556 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1558 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1559 bond_unixctl_migrate, NULL);
1560 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1561 bond_unixctl_set_active_slave, NULL);
1562 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1563 bond_unixctl_enable_slave, NULL);
1564 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1565 bond_unixctl_disable_slave, NULL);
1566 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1567 bond_unixctl_hash, NULL);
1571 bond_entry_reset(struct bond *bond)
1573 if (bond->balance != BM_AB) {
1574 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1577 bond->hash = xmalloc(hash_len);
1579 memset(bond->hash, 0, hash_len);
1581 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1588 static struct bond_slave *
1589 bond_slave_lookup(struct bond *bond, const void *slave_)
1591 struct bond_slave *slave;
1593 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1595 if (slave->aux == slave_) {
1604 bond_enable_slave(struct bond_slave *slave, bool enable)
1606 slave->delay_expires = LLONG_MAX;
1607 if (enable != slave->enabled) {
1608 slave->bond->bond_revalidate = true;
1609 slave->enabled = enable;
1611 ovs_mutex_lock(&slave->bond->mutex);
1613 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1615 list_remove(&slave->list_node);
1617 ovs_mutex_unlock(&slave->bond->mutex);
1619 VLOG_INFO("interface %s: %s", slave->name,
1620 slave->enabled ? "enabled" : "disabled");
1625 bond_link_status_update(struct bond_slave *slave)
1627 struct bond *bond = slave->bond;
1630 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1631 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1632 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1633 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1634 slave->name, up ? "up" : "down");
1635 if (up == slave->enabled) {
1636 slave->delay_expires = LLONG_MAX;
1637 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1638 slave->name, up ? "disabled" : "enabled");
1640 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1641 : up ? bond->updelay : bond->downdelay);
1642 slave->delay_expires = time_msec() + delay;
1644 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1647 up ? "enabled" : "disabled",
1654 if (time_msec() >= slave->delay_expires) {
1655 bond_enable_slave(slave, up);
1660 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1662 return hash_mac(mac, vlan, basis);
1666 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1668 struct flow hash_flow = *flow;
1669 hash_flow.vlan_tci = htons(vlan);
1671 /* The symmetric quality of this hash function is not required, but
1672 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1673 * purposes, so we use it out of convenience. */
1674 return flow_hash_symmetric_l4(&hash_flow, basis);
1678 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1680 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1682 return (bond->balance == BM_TCP
1683 ? bond_hash_tcp(flow, vlan, bond->basis)
1684 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1687 static struct bond_entry *
1688 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1691 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1694 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1695 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1697 static struct bond_slave *
1698 get_enabled_slave(struct bond *bond)
1702 ovs_mutex_lock(&bond->mutex);
1703 if (list_is_empty(&bond->enabled_slaves)) {
1704 ovs_mutex_unlock(&bond->mutex);
1708 node = list_pop_front(&bond->enabled_slaves);
1709 list_push_back(&bond->enabled_slaves, node);
1710 ovs_mutex_unlock(&bond->mutex);
1712 return CONTAINER_OF(node, struct bond_slave, list_node);
1715 static struct bond_slave *
1716 choose_output_slave(const struct bond *bond, const struct flow *flow,
1717 struct flow_wildcards *wc, uint16_t vlan)
1719 struct bond_entry *e;
1722 balance = bond->balance;
1723 if (bond->lacp_status == LACP_CONFIGURED) {
1724 /* LACP has been configured on this bond but negotiations were
1725 * unsuccussful. If lacp_fallback_ab is enabled use active-
1726 * backup mode else drop all traffic. */
1727 if (!bond->lacp_fallback_ab) {
1735 return bond->active_slave;
1738 if (bond->lacp_status != LACP_NEGOTIATED) {
1739 /* Must have LACP negotiations for TCP balanced bonds. */
1743 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1748 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1750 e = lookup_bond_entry(bond, flow, vlan);
1751 if (!e->slave || !e->slave->enabled) {
1752 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1761 static struct bond_slave *
1762 bond_choose_slave(const struct bond *bond)
1764 struct bond_slave *slave, *best;
1766 /* Find an enabled slave. */
1767 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1768 if (slave->enabled) {
1773 /* All interfaces are disabled. Find an interface that will be enabled
1774 * after its updelay expires. */
1776 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1777 if (slave->delay_expires != LLONG_MAX
1778 && slave->may_enable
1779 && (!best || slave->delay_expires < best->delay_expires)) {
1787 bond_choose_active_slave(struct bond *bond)
1789 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1790 struct bond_slave *old_active_slave = bond->active_slave;
1792 bond->active_slave = bond_choose_slave(bond);
1793 if (bond->active_slave) {
1794 if (bond->active_slave->enabled) {
1795 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1796 bond->name, bond->active_slave->name);
1798 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1799 "remaining %lld ms updelay (since no interface was "
1800 "enabled)", bond->name, bond->active_slave->name,
1801 bond->active_slave->delay_expires - time_msec());
1802 bond_enable_slave(bond->active_slave, true);
1805 bond->send_learning_packets = true;
1806 } else if (old_active_slave) {
1807 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1811 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1812 * bond interface. */
1814 bond_update_fake_slave_stats(struct bond *bond)
1816 struct netdev_stats bond_stats;
1817 struct bond_slave *slave;
1818 struct netdev *bond_dev;
1820 memset(&bond_stats, 0, sizeof bond_stats);
1822 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1823 struct netdev_stats slave_stats;
1825 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1826 /* XXX: We swap the stats here because they are swapped back when
1827 * reported by the internal device. The reason for this is
1828 * internal devices normally represent packets going into the
1829 * system but when used as fake bond device they represent packets
1830 * leaving the system. We really should do this in the internal
1831 * device itself because changing it here reverses the counts from
1832 * the perspective of the switch. However, the internal device
1833 * doesn't know what type of device it represents so we have to do
1834 * it here for now. */
1835 bond_stats.tx_packets += slave_stats.rx_packets;
1836 bond_stats.tx_bytes += slave_stats.rx_bytes;
1837 bond_stats.rx_packets += slave_stats.tx_packets;
1838 bond_stats.rx_bytes += slave_stats.tx_bytes;
1842 if (!netdev_open(bond->name, "system", &bond_dev)) {
1843 netdev_set_stats(bond_dev, &bond_stats);
1844 netdev_close(bond_dev);