2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "poll-loop.h"
/* Declares the logging module (named "bond") used by the VLOG_ERR, VLOG_INFO
 * and VLOG_DBG calls in this file. */
50 VLOG_DEFINE_THIS_MODULE(bond);
/* 'rwlock' is the lock named by the OVS_GUARDED_BY, OVS_REQ_RDLOCK and
 * OVS_REQ_WRLOCK annotations in this file.  'all_bonds' is a constant pointer
 * to 'all_bonds__' that carries the guard annotation; bonds are inserted and
 * removed through it, hashed by name (see bond_reconfigure() and
 * bond_unref()). */
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
/* Number of hash buckets per bond: 0xff + 1 = 256. */
58 #define BOND_BUCKETS (BOND_MASK + 1)
59 #define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
61 /* A hash bucket for mapping a flow to a slave.
62 * "struct bond" has an array of BOND_BUCKETS of these. */
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
/* 'tx_bytes' is increased by bond_account() and bond_entry_account();
 * bond_rebalance() sums it per slave to decide which buckets to move. */
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
67 struct list list_node; /* In bond_slave's 'entries' list. */
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
78 /* A bond slave, that is, one of the links comprising a bond. */
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
81 struct list list_node; /* In struct bond's enabled_slaves list. */
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
85 struct netdev *netdev; /* Network device, owned by the client. */
/* 'change_seq' is zeroed by bond_slave_set_netdev__() when 'netdev' changes
 * and refreshed from the connectivity sequence number in bond_run(). */
86 unsigned int change_seq; /* Tracks changes in 'netdev'. */
87 ofp_port_t ofp_port; /* OpenFlow port number. */
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
91 long long delay_expires; /* Time after which 'enabled' may change. */
92 bool enabled; /* May be chosen for flows? */
93 bool may_enable; /* Client considers this slave bondable. */
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct list entries; /* 'struct bond_entry's assigned here. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
101 /* A bond, that is, a set of network devices grouped to improve performance or
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
/* Chosen by bond_choose_active_slave(); bond_run() treats a NULL or disabled
 * active slave as a reason to choose again. */
121 struct bond_slave *active_slave;
122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
124 bool bond_revalidate; /* True if flows need revalidation. */
125 uint32_t basis; /* Basis for flow hash function. */
127 /* SLB specific bonding info. */
128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
/* Set in bond_slave_unregister(); read and then cleared by
 * bond_should_send_learning_packets(). */
131 bool send_learning_packets;
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
135 /* Legacy compatibility. */
136 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
137 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
/* Initialized in bond_create(), incremented by bond_ref(), decremented by
 * bond_unref(). */
139 struct ovs_refcount ref_cnt;
142 /* What to do with a 'struct bond_pr_rule_op'. */
144 ADD, /* Add the rule to ofproto's flow table. */
145 DEL, /* Delete the rule from the ofproto's flow table. */
148 /* A rule to add to or delete from ofproto's internal flow table. */
149 struct bond_pr_rule_op {
/* In struct bond's 'pr_rule_ops' hmap, hashed by match_hash() of the match
 * (see add_pr_rule()). */
150 struct hmap_node hmap_node;
/* Port placed in the rule's OUTPUT action by update_recirc_rules(). */
152 ofp_port_t out_ofport;
/* Location that receives the installed rule: update_recirc_rules() passes it
 * to ofproto_dpif_add_internal_flow() and stores NULL through it when the op
 * is removed from the hmap. */
154 struct rule **pr_rule;
/* Forward declarations of file-local helpers.  Each OVS_REQ_RDLOCK(rwlock) or
 * OVS_REQ_WRLOCK(rwlock) annotation names the kind of hold on 'rwlock' that
 * the helper presumably expects its caller to have -- confirm against the
 * annotation macros' definitions. */
157 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
158 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
159 OVS_REQ_RDLOCK(rwlock);
160 static void bond_enable_slave(struct bond_slave *, bool enable)
161 OVS_REQ_WRLOCK(rwlock);
162 static void bond_link_status_update(struct bond_slave *)
163 OVS_REQ_WRLOCK(rwlock);
164 static void bond_choose_active_slave(struct bond *)
165 OVS_REQ_WRLOCK(rwlock);
166 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
167 uint16_t vlan, uint32_t basis);
168 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
170 static struct bond_entry *lookup_bond_entry(const struct bond *,
173 OVS_REQ_RDLOCK(rwlock);
174 static struct bond_slave *get_enabled_slave(struct bond *)
175 OVS_REQ_RDLOCK(rwlock);
176 static struct bond_slave *choose_output_slave(const struct bond *,
178 struct flow_wildcards *,
180 OVS_REQ_RDLOCK(rwlock);
181 static void bond_update_fake_slave_stats(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
184 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
185 * stores the mode in '*balance' and returns true. Otherwise returns false
186 * without modifying '*balance'. */
188 bond_mode_from_string(enum bond_mode *balance, const char *s)
/* 's' is compared against the names produced by bond_mode_to_string(), so the
 * accepted spellings are exactly "balance-tcp", "balance-slb" and
 * "active-backup". */
190 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
192 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
194 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
202 /* Returns a string representing 'balance'. */
204 bond_mode_to_string(enum bond_mode balance) {
/* These literals are also what bond_mode_from_string() matches its input
 * against (it calls this function for BM_TCP, BM_SLB and BM_AB). */
207 return "balance-tcp";
209 return "balance-slb";
211 return "active-backup";
217 /* Creates and returns a new bond whose configuration is initially taken from
220 * The caller should register each slave on the new bond by calling
221 * bond_slave_register(). */
223 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
/* The bond is allocated with xzalloc() (presumably zero-filled, so fields not
 * assigned below start at zero/NULL -- confirm against xzalloc()). */
227 bond = xzalloc(sizeof *bond);
228 bond->ofproto = ofproto;
229 hmap_init(&bond->slaves);
230 list_init(&bond->enabled_slaves);
231 ovs_mutex_init(&bond->mutex);
/* LLONG_MAX is the "disabled" value for the fake interface update timer. */
232 bond->next_fake_iface_update = LLONG_MAX;
233 ovs_refcount_init(&bond->ref_cnt);
236 hmap_init(&bond->pr_rule_ops);
/* Applies 's'; bond_reconfigure() also inserts a bond that has no name yet
 * into 'all_bonds'. */
238 bond_reconfigure(bond, s);
/* Takes an additional reference on 'bond_'.  The const qualifier is cast away
 * only so that the reference count can be incremented. */
243 bond_ref(const struct bond *bond_)
245 struct bond *bond = CONST_CAST(struct bond *, bond_);
248 ovs_refcount_ref(&bond->ref_cnt);
/* Drops one reference on 'bond'.  A NULL 'bond' is accepted.  Teardown below
 * proceeds only when ovs_refcount_unref() returns 1 (presumably the count
 * before the decrement, i.e. this was the last reference -- confirm against
 * ovs_refcount_unref()). */
255 bond_unref(struct bond *bond)
257 struct bond_slave *slave, *next_slave;
258 struct bond_pr_rule_op *pr_op, *next_op;
260 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
/* Unlink from the global table under the write lock. */
264 ovs_rwlock_wrlock(&rwlock);
265 hmap_remove(all_bonds, &bond->hmap_node);
266 ovs_rwlock_unlock(&rwlock);
/* Detach every slave; the _SAFE iterator allows removal while iterating. */
268 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
269 hmap_remove(&bond->slaves, &slave->hmap_node);
270 /* Client owns 'slave->netdev'. */
274 hmap_destroy(&bond->slaves);
276 ovs_mutex_destroy(&bond->mutex);
/* Discard any post-recirculation rule operations still recorded. */
280 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
281 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
284 hmap_destroy(&bond->pr_rule_ops);
/* A nonzero recirculation ID is handed back to the ofproto. */
286 if (bond->recirc_id) {
287 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Records in 'bond->pr_rule_ops' that a rule matching 'match' should output to
 * 'out_ofport', with 'rule' as the place to store the resulting rule pointer.
 * Entries are keyed by match_hash(match, 0); an existing entry with an equal
 * match is updated in place, otherwise a new entry is allocated and
 * inserted. */
294 add_pr_rule(struct bond *bond, const struct match *match,
295 ofp_port_t out_ofport, struct rule **rule)
297 uint32_t hash = match_hash(match, 0);
298 struct bond_pr_rule_op *pr_op;
/* Look for an entry with the same match among those sharing 'hash'. */
300 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
301 if (match_equal(&pr_op->match, match)) {
303 pr_op->out_ofport = out_ofport;
304 pr_op->pr_rule = rule;
/* No existing entry: create one. */
309 pr_op = xmalloc(sizeof *pr_op);
310 pr_op->match = *match;
312 pr_op->out_ofport = out_ofport;
313 pr_op->pr_rule = rule;
314 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Brings the ofproto's internal flow table in line with 'bond''s hash
 * buckets.  It walks the recorded operations, records one rule per bucket via
 * add_pr_rule() when the bond has both a bucket array and a recirculation ID,
 * and then applies the operations with ofproto_dpif_add_internal_flow() or
 * ofproto_dpif_delete_internal_flow() at RECIRC_RULE_PRIORITY.  Failures are
 * logged with VLOG_ERR. */
318 update_recirc_rules(struct bond *bond)
321 struct bond_pr_rule_op *pr_op, *next_op;
/* 128 / 8 = 16 uint64_t words, i.e. a 128-byte stub for the action buffer. */
322 uint64_t ofpacts_stub[128 / 8];
323 struct ofpbuf ofpacts;
326 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
328 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
332 if (bond->hash && bond->recirc_id) {
333 for (i = 0; i < BOND_BUCKETS; i++) {
334 struct bond_slave *slave = bond->hash[i].slave;
/* Match packets carrying this bond's recirculation ID whose datapath hash,
 * masked with BOND_MASK, equals the bucket index. */
337 match_init_catchall(&match);
338 match_set_recirc_id(&match, bond->recirc_id);
339 match_set_dp_hash_masked(&match, i, BOND_MASK);
341 add_pr_rule(bond, &match, slave->ofp_port,
342 &bond->hash[i].pr_rule);
/* Apply the recorded operations; the _SAFE iterator is needed because
 * entries are removed inside the loop. */
347 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
351 ofpbuf_clear(&ofpacts);
352 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
353 error = ofproto_dpif_add_internal_flow(bond->ofproto,
355 RECIRC_RULE_PRIORITY,
356 &ofpacts, pr_op->pr_rule);
358 char *err_s = match_to_string(&pr_op->match,
359 RECIRC_RULE_PRIORITY);
361 VLOG_ERR("failed to add post recirculation flow %s", err_s);
367 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
369 RECIRC_RULE_PRIORITY);
371 char *err_s = match_to_string(&pr_op->match,
372 RECIRC_RULE_PRIORITY);
374 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
/* The entry leaves the hmap and the caller-visible rule pointer is cleared. */
378 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
379 *pr_op->pr_rule = NULL;
385 ofpbuf_uninit(&ofpacts);
389 /* Updates 'bond''s overall configuration to 's'.
391 * The caller should register each slave on 'bond' by calling
392 * bond_slave_register(). This is optional if none of the slaves'
393 * configuration has changed. In any case it can't hurt.
395 * Returns true if the configuration has changed in such a way that requires
 * flow revalidation. */
399 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
/* 'revalidate' starts out false.  The write lock is taken right below and
 * released by the unlock at the end of this function. */
401 bool revalidate = false;
403 ovs_rwlock_wrlock(&rwlock);
/* New or renamed bond: (re)insert it into 'all_bonds' under the hash of the
 * new name. */
404 if (!bond->name || strcmp(bond->name, s->name)) {
406 hmap_remove(all_bonds, &bond->hmap_node);
409 bond->name = xstrdup(s->name);
410 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
413 bond->updelay = s->up_delay;
414 bond->downdelay = s->down_delay;
/* Each of the following settings is copied only when it differs from the
 * bond's current value. */
416 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
417 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
421 if (bond->rebalance_interval != s->rebalance_interval) {
422 bond->rebalance_interval = s->rebalance_interval;
426 if (bond->balance != s->balance) {
427 bond->balance = s->balance;
431 if (bond->basis != s->basis) {
432 bond->basis = s->basis;
/* The fake interface timer is either armed for "now" (if it was disabled) or
 * set back to the disabled value LLONG_MAX. */
437 if (bond->next_fake_iface_update == LLONG_MAX) {
438 bond->next_fake_iface_update = time_msec();
441 bond->next_fake_iface_update = LLONG_MAX;
444 if (bond->bond_revalidate) {
446 bond->bond_revalidate = false;
/* A recirculation ID is held only outside active-backup mode: one is
 * allocated if missing, and an existing one is released in active-backup
 * mode. */
449 if (bond->balance != BM_AB) {
450 if (!bond->recirc_id) {
451 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
453 } else if (bond->recirc_id) {
454 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* The hash buckets are reset in active-backup mode, when no bucket array
 * exists yet, or when 'revalidate' is set. */
458 if (bond->balance == BM_AB || !bond->hash || revalidate) {
459 bond_entry_reset(bond);
462 ovs_rwlock_unlock(&rwlock);
/* Points 'slave' at 'netdev'.  Nothing happens when the device is unchanged;
 * otherwise 'change_seq' is also reset to 0. */
467 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
468 OVS_REQ_WRLOCK(rwlock)
470 if (slave->netdev != netdev) {
471 slave->netdev = netdev;
472 slave->change_seq = 0;
476 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
477 * arbitrary client-provided pointer that uniquely identifies a slave within a
478 * bond. If 'slave_' already exists within 'bond' then this function
479 * reconfigures the existing slave.
481 * 'netdev' must be the network device that 'slave_' represents. It is owned
482 * by the client, so the client must not close it before either unregistering
483 * 'slave_' or destroying 'bond'. */
486 bond_slave_register(struct bond *bond, void *slave_,
487 ofp_port_t ofport, struct netdev *netdev)
/* The whole registration runs under the write lock. */
489 struct bond_slave *slave;
491 ovs_rwlock_wrlock(&rwlock);
492 slave = bond_slave_lookup(bond, slave_);
/* A slave being created is stored in 'bond->slaves' under
 * hash_pointer(slave_, 0), starts disabled with no pending delay
 * (LLONG_MAX), and is then enabled or not according to the netdev's carrier
 * state.  The bond is flagged for revalidation. */
494 slave = xzalloc(sizeof *slave);
496 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
499 slave->ofp_port = ofport;
500 slave->delay_expires = LLONG_MAX;
501 slave->name = xstrdup(netdev_get_name(netdev));
502 bond->bond_revalidate = true;
504 slave->enabled = false;
505 bond_enable_slave(slave, netdev_get_carrier(netdev));
508 bond_slave_set_netdev__(slave, netdev);
/* NOTE(review): 'slave->name' is assigned from xstrdup() here as well as in
 * the creation path above; confirm the previous string is freed before this
 * assignment (no free is visible in this listing). */
511 slave->name = xstrdup(netdev_get_name(netdev));
512 ovs_rwlock_unlock(&rwlock);
515 /* Updates the network device to be used with 'slave_' to 'netdev'.
517 * This is useful if the caller closes and re-opens the network device
518 * registered with bond_slave_register() but doesn't need to change anything
521 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
/* Locked wrapper: looks 'slave_' up under the write lock and hands the result
 * to bond_slave_set_netdev__(). */
523 struct bond_slave *slave;
525 ovs_rwlock_wrlock(&rwlock);
526 slave = bond_slave_lookup(bond, slave_);
528 bond_slave_set_netdev__(slave, netdev);
530 ovs_rwlock_unlock(&rwlock);
533 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
534 * then this function has no effect.
536 * Unregistering a slave invalidates all flows. */
538 bond_slave_unregister(struct bond *bond, const void *slave_)
540 struct bond_slave *slave;
543 ovs_rwlock_wrlock(&rwlock);
544 slave = bond_slave_lookup(bond, slave_);
/* Flag the bond for revalidation and disable the slave before removing it. */
549 bond->bond_revalidate = true;
550 bond_enable_slave(slave, false);
/* Remember whether the slave being removed is the current active slave. */
552 del_active = bond->active_slave == slave;
554 struct bond_entry *e;
/* '&bond->hash[BOND_MASK]' is the last of the BOND_BUCKETS elements, so the
 * '<=' bound visits every bucket. */
555 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
556 if (e->slave == slave) {
564 hmap_remove(&bond->slaves, &slave->hmap_node);
565 /* Client owns 'slave->netdev'. */
/* A new active slave is chosen and learning packets are requested; the
 * request is consumed by bond_should_send_learning_packets(). */
569 bond_choose_active_slave(bond);
570 bond->send_learning_packets = true;
573 ovs_rwlock_unlock(&rwlock);
576 /* Should be called on each slave in 'bond' before bond_run() to indicate
577 * whether or not 'slave_' may be enabled. This function is intended to allow
578 * other protocols to have some impact on bonding decisions. For example LACP
579 * or high level link monitoring protocols may decide that a given slave should
580 * not be able to send traffic. */
582 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
/* NOTE(review): the result of bond_slave_lookup() is dereferenced directly,
 * so 'slave_' must already be registered on 'bond'.  Confirm that
 * bond_slave_lookup() cannot return NULL for the callers of this function. */
584 ovs_rwlock_wrlock(&rwlock);
585 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
586 ovs_rwlock_unlock(&rwlock);
589 /* Performs periodic maintenance on 'bond'.
591 * Returns true if the caller should revalidate its flows.
593 * The caller should check bond_should_send_learning_packets() afterward. */
595 bond_run(struct bond *bond, enum lacp_status lacp_status)
597 struct bond_slave *slave;
600 ovs_rwlock_wrlock(&rwlock);
/* A change in LACP status is recorded and triggers revalidation. */
601 if (bond->lacp_status != lacp_status) {
602 bond->lacp_status = lacp_status;
603 bond->bond_revalidate = true;
606 /* Enable slaves based on link status and LACP feedback. */
607 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
608 bond_link_status_update(slave);
/* Remember the connectivity sequence number seen for this slave; bond_wait()
 * waits for it to change. */
609 slave->change_seq = seq_read(connectivity_seq_get());
/* Choose a new active slave when there is none or the current one is
 * disabled. */
611 if (!bond->active_slave || !bond->active_slave->enabled) {
612 bond_choose_active_slave(bond);
615 /* Update fake bond interface stats. */
/* After an update, the next one is scheduled 1000 ms later. */
616 if (time_msec() >= bond->next_fake_iface_update) {
617 bond_update_fake_slave_stats(bond);
618 bond->next_fake_iface_update = time_msec() + 1000;
/* Capture the revalidation flag and clear it while still holding the lock. */
621 revalidate = bond->bond_revalidate;
622 bond->bond_revalidate = false;
623 ovs_rwlock_unlock(&rwlock);
628 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
630 bond_wait(struct bond *bond)
632 struct bond_slave *slave;
634 ovs_rwlock_rdlock(&rwlock);
/* Per slave: wake at a pending enable/disable deadline (LLONG_MAX means none)
 * and when the connectivity sequence number moves past the value recorded in
 * 'change_seq'. */
635 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
636 if (slave->delay_expires != LLONG_MAX) {
637 poll_timer_wait_until(slave->delay_expires);
640 seq_wait(connectivity_seq_get(), slave->change_seq);
/* Wake for the fake interface statistics update unless it is disabled. */
643 if (bond->next_fake_iface_update != LLONG_MAX) {
644 poll_timer_wait_until(bond->next_fake_iface_update);
/* Pending revalidation: do not block at all. */
647 if (bond->bond_revalidate) {
648 poll_immediate_wake();
650 ovs_rwlock_unlock(&rwlock);
652 /* We don't wait for bond->next_rebalance because rebalancing can only run
653 * at a flow account checkpoint. ofproto does checkpointing on its own
654 * schedule and bond_rebalance() gets called afterward, so we'd just be
655 * waking up for no purpose. */
658 /* MAC learning table interaction. */
/* Returns true when 'bond' has an active slave and either (a) LACP is
 * disabled and the mode is SLB or active-backup, or (b) 'lacp_fallback_ab' is
 * set and the LACP status is LACP_CONFIGURED. */
661 may_send_learning_packets(const struct bond *bond)
663 return ((bond->lacp_status == LACP_DISABLED
664 && (bond->balance == BM_SLB || bond->balance == BM_AB))
665 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
666 && bond->active_slave;
669 /* Returns true if 'bond' needs the client to send out packets to assist with
670 * MAC learning on 'bond'. If this function returns true, then the client
671 * should iterate through its MAC learning table for the bridge on which 'bond'
672 * is located. For each MAC that has been learned on a port other than 'bond',
673 * it should call bond_compose_learning_packet().
675 * This function will only return true if 'bond' is in SLB or active-backup
676 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
679 * Calling this function resets the state that it checks. */
681 bond_should_send_learning_packets(struct bond *bond)
685 ovs_rwlock_wrlock(&rwlock);
/* The pending flag is cleared unconditionally, even when
 * may_send_learning_packets() is false and 'send' therefore ends up false. */
686 send = bond->send_learning_packets && may_send_learning_packets(bond);
687 bond->send_learning_packets = false;
688 ovs_rwlock_unlock(&rwlock);
692 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
694 * See bond_should_send_learning_packets() for description of usage. The
695 * caller should send the composed packet on the port associated with
696 * port_aux and takes ownership of the returned ofpbuf. */
698 bond_compose_learning_packet(struct bond *bond,
699 const uint8_t eth_src[ETH_ADDR_LEN],
700 uint16_t vlan, void **port_aux)
702 struct bond_slave *slave;
703 struct ofpbuf *packet;
706 ovs_rwlock_rdlock(&rwlock);
/* Calling this when learning packets may not be sent is a programming error.
 * The asserted condition includes a non-NULL active slave. */
707 ovs_assert(may_send_learning_packets(bond));
/* The output slave is chosen for a flow that is all zeros except for the
 * Ethernet source address; no wildcards are collected (NULL). */
708 memset(&flow, 0, sizeof flow);
709 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
710 slave = choose_output_slave(bond, &flow, NULL, vlan);
/* The packet body is produced by compose_rarp() for 'eth_src'. */
712 packet = ofpbuf_new(0);
713 compose_rarp(packet, eth_src);
715 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
/* Tell the caller which slave to send on, using its client handle. */
718 *port_aux = slave->aux;
719 ovs_rwlock_unlock(&rwlock);
723 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
724 * Ethernet destination address of 'eth_dst', should be admitted.
726 * The return value is one of the following:
728 * - BV_ACCEPT: Admit the packet.
730 * - BV_DROP: Drop the packet.
732 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
733 * Ethernet source address and VLAN. If there is none, or if the packet
734 * is on the learned port, then admit the packet. If a different port has
735 * been learned, however, drop the packet (and do not use it for MAC
 * learning). */
739 bond_check_admissibility(struct bond *bond, const void *slave_,
740 const uint8_t eth_dst[ETH_ADDR_LEN])
/* The verdict defaults to BV_DROP.  The checks below run under the read lock
 * in this order: LACP status, multicast destination, then balancing mode. */
742 enum bond_verdict verdict = BV_DROP;
743 struct bond_slave *slave;
745 ovs_rwlock_rdlock(&rwlock);
746 slave = bond_slave_lookup(bond, slave_);
751 /* LACP bonds have very loose admissibility restrictions because we can
752 * assume the remote switch is aware of the bond and will "do the right
753 * thing". However, as a precaution we drop packets on disabled slaves
754 * because no correctly implemented partner switch should be sending
757 * If LACP is configured, but LACP negotiations have been unsuccessful, we
758 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
759 switch (bond->lacp_status) {
760 case LACP_NEGOTIATED:
761 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
763 case LACP_CONFIGURED:
764 if (!bond->lacp_fallback_ab) {
771 /* Drop all multicast packets on inactive slaves. */
772 if (eth_addr_is_multicast(eth_dst)) {
773 if (bond->active_slave != slave) {
778 switch (bond->balance) {
780 /* TCP balanced bonds require successful LACP negotiations. Based on the
781 * above check, LACP is off or lacp_fallback_ab is true on this bond.
782 * If lacp_fallback_ab is true fall through to BM_AB case else, we
783 * drop all incoming traffic. */
784 if (!bond->lacp_fallback_ab) {
789 /* Drop all packets which arrive on backup slaves. This is similar to
790 * how Linux bonding handles active-backup bonds. */
791 if (bond->active_slave != slave) {
/* The debug message is rate limited through 'rl'. */
792 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
794 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
795 " slave (%s) destined for " ETH_ADDR_FMT,
796 slave->name, ETH_ADDR_ARGS(eth_dst));
803 /* Drop all packets for which we have learned a different input port,
804 * because we probably sent the packet on one slave and got it back on
805 * the other. Gratuitous ARP packets are an exception to this rule:
806 * the host has moved to another switch. The exception to the
807 * exception is if we locked the learning table to avoid reflections on
809 verdict = BV_DROP_IF_MOVED;
815 ovs_rwlock_unlock(&rwlock);
820 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
821 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
822 * NULL if the packet should be dropped because no slaves are enabled.
824 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
825 * should be a VID only (i.e. excluding the PCP bits). Second,
826 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
827 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
828 * packet belongs to (so for an access port it will be the access port's VLAN).
830 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
831 * significant in the selection. At some point earlier, 'wc' should
832 * have been initialized (e.g., by flow_wildcards_init_catchall()). */
835 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
836 struct flow_wildcards *wc, uint16_t vlan)
/* Locked wrapper around choose_output_slave(): the choice is made under the
 * read lock and translated to the slave's client handle ('aux'), or NULL when
 * no slave was chosen. */
838 struct bond_slave *slave;
841 ovs_rwlock_rdlock(&rwlock);
842 slave = choose_output_slave(bond, flow, wc, vlan);
843 aux = slave ? slave->aux : NULL;
844 ovs_rwlock_unlock(&rwlock);
/* Credits 'entry' with the bytes its post-recirculation rule has carried since
 * the previous call.  'rule_tx_bytes' is the rule's current byte counter; the
 * difference from the remembered 'pr_tx_bytes' is added to 'tx_bytes', and
 * the counter is then remembered for next time. */
851 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
852 OVS_REQ_WRLOCK(rwlock)
857 delta = rule_tx_bytes - entry->pr_tx_bytes;
858 entry->tx_bytes += delta;
859 entry->pr_tx_bytes = rule_tx_bytes;
863 /* Maintain bond stats using post recirculation rule byte counters.*/
865 bond_recirculation_account(struct bond *bond)
/* NOTE(review): this function takes the write lock itself, but
 * bond_rebalance() calls it while already holding the write lock on the same
 * 'rwlock'.  Confirm that ovs_rwlock_wrlock() tolerates re-acquisition by the
 * same thread; otherwise that path blocks or fails. */
869 ovs_rwlock_wrlock(&rwlock);
/* Visits every bucket (indices 0 through BOND_MASK inclusive). */
870 for (i=0; i<=BOND_MASK; i++) {
871 struct bond_entry *entry = &bond->hash[i];
872 struct rule *rule = entry->pr_rule;
/* Only the byte count is used; the packet count and last-used time are
 * fetched because rule_get_stats() takes pointers for all three. */
875 uint64_t n_packets OVS_UNUSED;
876 long long int used OVS_UNUSED;
879 rule->ofproto->ofproto_class->rule_get_stats(
880 rule, &n_packets, &n_bytes, &used);
881 bond_entry_account(entry, n_bytes);
884 ovs_rwlock_unlock(&rwlock);
/* Reports 'bond''s recirculation parameters: the recirculation ID is stored
 * in '*recirc_id' and the hash basis in '*hash_bias'.
 * NOTE(review): the condition below tests the 'recirc_id' pointer argument,
 * not 'bond->recirc_id'.  A caller that passes NULL, as bond_rebalance() does,
 * can never satisfy it even in BM_TCP mode -- confirm this is intended. */
888 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
891 if (bond->balance == BM_TCP && recirc_id) {
893 *recirc_id = bond->recirc_id;
896 *hash_bias = bond->basis;
/* Makes sure every hash bucket of 'bond' points at a usable slave and then
 * calls update_recirc_rules().  'force' seeds 'update_rules'. */
905 bond_update_post_recirc_rules(struct bond* bond, const bool force)
907 struct bond_entry *e;
908 bool update_rules = force; /* Always update rules if caller forces it. */
910 /* Make sure all bond entries are populated */
911 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
912 if (!e->slave || !e->slave->enabled) {
/* A randomly picked slave is tried first; if it is disabled the bucket falls
 * back to the bond's active slave.
 * NOTE(review): assumes 'bond->slaves' is non-empty here -- the result of
 * hmap_random_node() is used without a visible check; confirm. */
914 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
915 struct bond_slave, hmap_node);
916 if (!e->slave->enabled) {
917 e->slave = bond->active_slave;
923 update_recirc_rules(bond);
/* Returns true if 'bond' takes part in rebalancing: it has a nonzero
 * rebalance interval and its mode is SLB or TCP balancing. */
930 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
932 return bond->rebalance_interval
933 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
936 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
/* The bytes are added to the bucket that lookup_bond_entry() returns for the
 * flow, and only when bond_is_balanced() is true; otherwise the call has no
 * effect beyond taking and releasing the write lock. */
938 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
941 ovs_rwlock_wrlock(&rwlock);
942 if (bond_is_balanced(bond)) {
943 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
945 ovs_rwlock_unlock(&rwlock);
/* Converts 'bal', a pointer to the 'bal_node' member embedded in a
 * struct bond_slave, back into a pointer to that slave. */
948 static struct bond_slave *
949 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
951 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Logs, at debug level, the slaves on the 'bals' list with their load in kB
 * (tx_bytes / 1024), a "(disabled)" marker where applicable, and the hash
 * buckets currently listed in each slave's 'entries'.  The string is built
 * only when debug logging is enabled. */
955 log_bals(struct bond *bond, const struct list *bals)
956 OVS_REQ_RDLOCK(rwlock)
958 if (VLOG_IS_DBG_ENABLED()) {
959 struct ds ds = DS_EMPTY_INITIALIZER;
960 const struct bond_slave *slave;
962 LIST_FOR_EACH (slave, bal_node, bals) {
964 ds_put_char(&ds, ',');
966 ds_put_format(&ds, " %s %"PRIu64"kB",
967 slave->name, slave->tx_bytes / 1024);
969 if (!slave->enabled) {
970 ds_put_cstr(&ds, " (disabled)");
972 if (!list_is_empty(&slave->entries)) {
973 struct bond_entry *e;
975 ds_put_cstr(&ds, " (");
976 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Every entry except the first is preceded by " + ". */
977 if (&e->list_node != list_front(&slave->entries)) {
978 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the bucket's index within the bond's hash array. */
980 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
981 e - bond->hash, e->tx_bytes / 1024);
983 ds_put_cstr(&ds, ")");
986 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
991 /* Shifts 'hash' from its current slave to 'to'. */
993 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
994 OVS_REQ_WRLOCK(rwlock)
/* 'delta' is the bucket's byte count: it is subtracted from the source
 * slave's total and added to the destination's. */
996 struct bond_slave *from = hash->slave;
997 struct bond *bond = from->bond;
998 uint64_t delta = hash->tx_bytes;
/* The log line reports the loads as they will be after the shift, in kB;
 * 'hash - bond->hash' is the bucket index. */
1000 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1001 "from %s to %s (now carrying %"PRIu64"kB and "
1002 "%"PRIu64"kB load, respectively)",
1003 bond->name, delta / 1024, hash - bond->hash,
1004 from->name, to->name,
1005 (from->tx_bytes - delta) / 1024,
1006 (to->tx_bytes + delta) / 1024);
1008 /* Shift load away from 'from' to 'to'. */
1009 from->tx_bytes -= delta;
1010 to->tx_bytes += delta;
1012 /* Arrange for flows to be revalidated. */
1014 bond->bond_revalidate = true;
1017 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1018 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1019 * given that doing so must decrease the ratio of the load on the two slaves by
1020 * at least 0.1. Returns NULL if there is no appropriate entry.
1022 * The list of entries isn't sorted. I don't know of a reason to prefer to
1023 * shift away small hashes or large hashes. */
1024 static struct bond_entry *
1025 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1026 OVS_REQ_WRLOCK(rwlock)
1028 struct bond_entry *e;
1030 if (list_is_short(&from->entries)) {
1031 /* 'from' carries no more than one MAC hash, so shifting load away from
1032 * it would be pointless. */
1036 LIST_FOR_EACH (e, list_node, &from->entries) {
1037 double old_ratio, new_ratio;
/* The zero check below runs before 'to_tx_bytes' is used as the divisor in
 * 'old_ratio'. */
1040 if (to_tx_bytes == 0) {
1041 /* Nothing on the new slave, move it. */
/* Compare the from/to load ratio before and after moving this entry's
 * bytes. */
1045 delta = e->tx_bytes;
1046 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1047 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1048 if (old_ratio - new_ratio > 0.1
1049 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1050 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1051 and 'to' slave have the same load. Therefore, we only move an
1052 entry if it decreases the load on 'from', and brings us closer
1053 to equal traffic load. */
1061 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
 * maintained. */
1064 insert_bal(struct list *bals, struct bond_slave *slave)
/* Scans 'bals' for an element whose 'tx_bytes' is smaller than 'slave''s and
 * passes the scan position to list_insert().  (Presumably list_insert() links
 * 'slave' in just before that position -- confirm against list_insert().) */
1066 struct bond_slave *pos;
1068 LIST_FOR_EACH (pos, bal_node, bals) {
1069 if (slave->tx_bytes > pos->tx_bytes) {
1073 list_insert(&pos->bal_node, &slave->bal_node);
1076 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1077 * that descending order of 'tx_bytes' is maintained. */
1079 reinsert_bal(struct list *bals, struct bond_slave *slave)
/* bond_rebalance() uses this to re-sort a slave after bond_shift_load() has
 * changed its 'tx_bytes'. */
1081 list_remove(&slave->bal_node);
1082 insert_bal(bals, slave);
1085 /* If 'bond' needs rebalancing, does so.
1087 * The caller should have called bond_account() for each active flow, or in case
1088 * of recirculation is used, have called bond_recirculation_account(bond),
1089 * to ensure that flow data is consistently accounted at this point. */
1092 bond_rebalance(struct bond *bond)
/* The whole pass runs under the write lock taken below and released at the
 * end of the function. */
1094 struct bond_slave *slave;
1095 struct bond_entry *e;
1097 bool rebalanced = false;
1100 ovs_rwlock_wrlock(&rwlock);
/* Nothing to do for a bond that is not balanced or whose next rebalance time
 * has not arrived yet. */
1101 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1104 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* NOTE(review): bond_may_recirc() is called with a NULL 'recirc_id' argument,
 * and its condition tests that pointer, so 'use_recirc' appears to be always
 * false here -- confirm. */
1106 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1107 bond_may_recirc(bond, NULL, NULL);
/* NOTE(review): bond_recirculation_account() takes the write lock on 'rwlock'
 * itself, and that lock is already held at this point -- confirm that this
 * cannot deadlock. */
1110 bond_recirculation_account(bond);
1113 /* Add each bond_entry to its slave's 'entries' list.
1114 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1115 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1116 slave->tx_bytes = 0;
1117 list_init(&slave->entries);
/* Buckets with no slave or no recorded bytes are left off the lists. */
1119 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1120 if (e->slave && e->tx_bytes) {
1121 e->slave->tx_bytes += e->tx_bytes;
1122 list_push_back(&e->slave->entries, &e->list_node);
1126 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1128 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1129 * with a proper list sort algorithm. */
1131 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1132 if (slave->enabled) {
1133 insert_bal(&bals, slave);
1136 log_bals(bond, &bals);
1138 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
/* The front of 'bals' is the most loaded slave, the back the least loaded. */
1139 while (!list_is_short(&bals)) {
1140 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1141 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
/* 'to->tx_bytes >> 5' is 1/32 of the least-loaded slave's byte count. */
1144 overload = from->tx_bytes - to->tx_bytes;
1145 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1146 /* The extra load on 'from' (and all less-loaded slaves), compared
1147 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1148 * it is less than ~1Mbps. No point in rebalancing. */
1152 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1153 * to move from 'from' to 'to'. */
1154 e = choose_entry_to_migrate(from, to->tx_bytes);
1156 bond_shift_load(e, to);
1158 /* Delete element from from->entries.
1160 * We don't add the element to to->hashes. That would only allow
1161 * 'e' to be migrated to another slave in this rebalancing run, and
1162 * there is no point in doing that. */
1163 list_remove(&e->list_node);
1165 /* Re-sort 'bals'. */
1166 reinsert_bal(&bals, from);
1167 reinsert_bal(&bals, to);
1170 /* Can't usefully migrate anything away from 'from'.
1171 * Don't reconsider it. */
1172 list_remove(&from->bal_node);
1176 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1177 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1178 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1179 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* When recirculation is in use and something moved, the post-recirculation
 * rules are refreshed with 'force' set. */
1183 if (use_recirc && rebalanced) {
1184 bond_update_post_recirc_rules(bond,true);
1188 ovs_rwlock_unlock(&rwlock);
1191 /* Bonding unixctl user interface functions. */
/* Looks up a bond by 'name'.  The bucket is selected with
 * hash_string(name, 0), the same hash bond_reconfigure() uses on insertion,
 * and candidates are confirmed with strcmp(). */
1193 static struct bond *
1194 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1198 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1200 if (!strcmp(bond->name, name)) {
/* Searches 'bond''s slaves for one whose 'name' equals 'slave_name'.  This is
 * a linear scan with strcmp(), since 'bond->slaves' is keyed by the client
 * handle rather than by name (see bond_slave_register()). */
1207 static struct bond_slave *
1208 bond_lookup_slave(struct bond *bond, const char *slave_name)
1210 struct bond_slave *slave;
1212 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1213 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler that replies with a tab-separated table: a header line,
 * then one line per bond in 'all_bonds' giving its name, balancing mode,
 * recirculation ID and a ", "-separated list of slave names.  The table is
 * built under the read lock and sent after the lock is released. */
1221 bond_unixctl_list(struct unixctl_conn *conn,
1222 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1223 void *aux OVS_UNUSED)
1225 struct ds ds = DS_EMPTY_INITIALIZER;
1226 const struct bond *bond;
1228 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1230 ovs_rwlock_rdlock(&rwlock);
1231 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1232 const struct bond_slave *slave;
/* NOTE(review): 'bond->recirc_id' is declared uint32_t but is formatted with
 * "%d"; a PRIu32 conversion would match the type -- confirm. */
1235 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1236 bond_mode_to_string(bond->balance), bond->recirc_id);
1239 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1241 ds_put_cstr(&ds, ", ");
1243 ds_put_cstr(&ds, slave->name);
1245 ds_put_char(&ds, '\n');
1247 ovs_rwlock_unlock(&rwlock);
1248 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds': a banner with the
 * bond name, its mode, recirculation status, hash basis, updelay/downdelay,
 * LACP status, and then one section per slave.  Annotated
 * OVS_REQ_RDLOCK(rwlock): the caller must hold 'rwlock'. */
1253 bond_print_details(struct ds *ds, const struct bond *bond)
1254 OVS_REQ_RDLOCK(rwlock)
1256 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1257 const struct shash_node **sorted_slaves = NULL;
1258 const struct bond_slave *slave;
1263 ds_put_format(ds, "---- %s ----\n", bond->name);
1264 ds_put_format(ds, "bond_mode: %s\n",
1265 bond_mode_to_string(bond->balance));
/* bond_may_recirc() reports whether recirculation may be used and fills in
 * 'recirc_id'; -1 is printed as the ID when it may not. */
1267 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1268 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1269 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1271 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1273 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1274 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
/* The time remaining until the next rebalance is shown for balanced bonds. */
1276 if (bond_is_balanced(bond)) {
1277 ds_put_format(ds, "next rebalance: %lld ms\n",
1278 bond->next_rebalance - time_msec());
1281 ds_put_cstr(ds, "lacp_status: ");
1282 switch (bond->lacp_status) {
1283 case LACP_NEGOTIATED:
1284 ds_put_cstr(ds, "negotiated\n");
1286 case LACP_CONFIGURED:
1287 ds_put_cstr(ds, "configured\n");
1290 ds_put_cstr(ds, "off\n");
1293 ds_put_cstr(ds, "<unknown>\n");
/* Index the slaves by name in a temporary shash; the array obtained from
 * shash_sort() drives the per-slave output below (presumably in name order
 * -- confirm against shash_sort()). */
1297 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1298 shash_add(&slave_shash, slave->name, slave);
1300 sorted_slaves = shash_sort(&slave_shash);
1302 for (i = 0; i < shash_count(&slave_shash); i++) {
1303 struct bond_entry *be;
1305 slave = sorted_slaves[i]->data;
1308 ds_put_format(ds, "\nslave %s: %s\n",
1309 slave->name, slave->enabled ? "enabled" : "disabled");
1310 if (slave == bond->active_slave) {
1311 ds_put_cstr(ds, "\tactive slave\n");
/* A 'delay_expires' other than LLONG_MAX means a delay timer is pending: it
 * is reported as a downdelay for an enabled slave and as an updelay for a
 * disabled one. */
1313 if (slave->delay_expires != LLONG_MAX) {
1314 ds_put_format(ds, "\t%s expires in %lld ms\n",
1315 slave->enabled ? "downdelay" : "updelay",
1316 slave->delay_expires - time_msec());
1319 ds_put_format(ds, "\tmay_enable: %s\n",
1320 slave->may_enable ? "true" : "false");
1322 if (!bond_is_balanced(bond)) {
/* Walk every hash bucket (hash[0] through hash[BOND_MASK]) and report the
 * bucket's load in kB, i.e. tx_bytes / 1024. */
1327 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1328 int hash = be - bond->hash;
1331 if (be->slave != slave) {
1335 be_tx_k = be->tx_bytes / 1024;
1337 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1341 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* Release the temporary name index and the sorted array. */
1344 shash_destroy(&slave_shash);
1345 free(sorted_slaves);
1346 ds_put_cstr(ds, "\n");
/* unixctl handler registered as "bond/show [port]".  Under the read lock it
 * either looks up the bond named by argv[1] and prints its details (replying
 * "no such bond" on failure), or prints the details of every bond in
 * 'all_bonds'; the accumulated text is sent as the reply.
 *
 * NOTE(review): the test that chooses between the two paths is not visible
 * in this excerpt; presumably it depends on 'argc' -- confirm. */
1350 bond_unixctl_show(struct unixctl_conn *conn,
1351 int argc, const char *argv[],
1352 void *aux OVS_UNUSED)
1354 struct ds ds = DS_EMPTY_INITIALIZER;
1356 ovs_rwlock_rdlock(&rwlock);
1358 const struct bond *bond = bond_find(argv[1]);
1361 unixctl_command_reply_error(conn, "no such bond");
1364 bond_print_details(&ds, bond);
1366 const struct bond *bond;
1368 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1369 bond_print_details(&ds, bond);
1373 unixctl_command_reply(conn, ds_cstr(&ds));
1377 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/migrate port hash slave".  With
 * 'rwlock' held for writing, it assigns hash bucket argv[2] of the bond named
 * argv[1] to the slave named argv[3], flags the bond for revalidation and
 * replies "migrated".
 *
 * Error replies visible here: "no such bond", "not an SLB bond" (the bond's
 * mode is not BM_SLB), "bad hash", "no such slave" and "cannot migrate to
 * disabled slave". */
1381 bond_unixctl_migrate(struct unixctl_conn *conn,
1382 int argc OVS_UNUSED, const char *argv[],
1383 void *aux OVS_UNUSED)
1385 const char *bond_s = argv[1];
1386 const char *hash_s = argv[2];
1387 const char *slave_s = argv[3];
1389 struct bond_slave *slave;
1390 struct bond_entry *entry;
1393 ovs_rwlock_wrlock(&rwlock);
1394 bond = bond_find(bond_s);
1396 unixctl_command_reply_error(conn, "no such bond");
1400 if (bond->balance != BM_SLB) {
1401 unixctl_command_reply_error(conn, "not an SLB bond");
/* The hash argument must consist solely of decimal digits; its value is then
 * folded into the bucket range with '& BOND_MASK'.
 *
 * NOTE(review): an empty string also satisfies the strspn()/strlen() test
 * (0 == 0) and atoi("") yields 0, so an empty argument would select bucket 0.
 * atoi() also has undefined behavior when the value is out of range for
 * 'int'; strtoul() with an explicit range check would be safer. */
1405 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1406 hash = atoi(hash_s) & BOND_MASK;
1408 unixctl_command_reply_error(conn, "bad hash");
1412 slave = bond_lookup_slave(bond, slave_s);
1414 unixctl_command_reply_error(conn, "no such slave");
1418 if (!slave->enabled) {
1419 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
/* Point the bucket at the requested slave and flag the bond for
 * revalidation. */
1423 entry = &bond->hash[hash];
1424 bond->bond_revalidate = true;
1425 entry->slave = slave;
1426 unixctl_command_reply(conn, "migrated");
1429 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/set-active-slave port slave".  With
 * 'rwlock' held for writing, it makes the slave named argv[2] the active
 * slave of the bond named argv[1].
 *
 * If the slave is not already active, the bond is flagged for revalidation,
 * 'active_slave' is updated, the change is logged, learning packets are
 * requested and the reply is "done"; otherwise the reply is "no change".
 * Error replies visible here: "no such bond", "no such slave" and
 * "cannot make disabled slave active". */
1433 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1434 int argc OVS_UNUSED, const char *argv[],
1435 void *aux OVS_UNUSED)
1437 const char *bond_s = argv[1];
1438 const char *slave_s = argv[2];
1440 struct bond_slave *slave;
1442 ovs_rwlock_wrlock(&rwlock);
1443 bond = bond_find(bond_s);
1445 unixctl_command_reply_error(conn, "no such bond");
1449 slave = bond_lookup_slave(bond, slave_s);
1451 unixctl_command_reply_error(conn, "no such slave");
1455 if (!slave->enabled) {
1456 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1460 if (bond->active_slave != slave) {
1461 bond->bond_revalidate = true;
1462 bond->active_slave = slave;
1463 VLOG_INFO("bond %s: active interface is now %s",
1464 bond->name, slave->name);
1465 bond->send_learning_packets = true;
1466 unixctl_command_reply(conn, "done");
1468 unixctl_command_reply(conn, "no change");
1471 ovs_rwlock_unlock(&rwlock);
/* Shared implementation of the enable-slave and disable-slave handlers.
 * With 'rwlock' held for writing, looks up the bond named argv[1] and its
 * slave named argv[2], calls bond_enable_slave(slave, enable) and replies
 * "enabled" or "disabled" according to 'enable'.  Error replies visible
 * here: "no such bond" and "no such slave". */
1475 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1477 const char *bond_s = argv[1];
1478 const char *slave_s = argv[2];
1480 struct bond_slave *slave;
1482 ovs_rwlock_wrlock(&rwlock);
1483 bond = bond_find(bond_s);
1485 unixctl_command_reply_error(conn, "no such bond");
1489 slave = bond_lookup_slave(bond, slave_s);
1491 unixctl_command_reply_error(conn, "no such slave");
1495 bond_enable_slave(slave, enable);
1496 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1499 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/enable-slave port slave": forwards to
 * enable_slave() with 'enable' set to true.  'argc' and 'aux' are unused. */
1503 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1504 int argc OVS_UNUSED, const char *argv[],
1505 void *aux OVS_UNUSED)
1507 enable_slave(conn, argv, true);
/* unixctl handler registered as "bond/disable-slave port slave": forwards to
 * enable_slave() with 'enable' set to false.  'argc' and 'aux' are unused. */
1511 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1512 int argc OVS_UNUSED, const char *argv[],
1513 void *aux OVS_UNUSED)
1515 enable_slave(conn, argv, false);
/* unixctl handler registered as "bond/hash mac [vlan] [basis]".  Parses the
 * MAC address in argv[1] and, when supplied, the VLAN in argv[2] and the hash
 * basis in argv[3], then replies with bond_hash_src(mac, vlan, basis) masked
 * with BOND_MASK, printed as an unsigned decimal.  Error replies visible
 * here: "invalid vlan", "invalid basis" and "invalid mac".
 *
 * NOTE(review): the defaults used when 'vlan_s' or 'basis_s' is NULL, and
 * the release of the xasprintf() result, are not visible in this excerpt --
 * confirm both. */
1519 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1520 void *aux OVS_UNUSED)
1522 const char *mac_s = argv[1];
1523 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1524 const char *basis_s = argc > 3 ? argv[3] : NULL;
1525 uint8_t mac[ETH_ADDR_LEN];
/* NOTE(review): the VLAN is scanned with "%u" but bond_hash_src() takes a
 * uint16_t, so a value above 65535 would presumably be narrowed silently --
 * confirm the declared type of 'vlan'. */
1532 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1533 unixctl_command_reply_error(conn, "invalid vlan");
1541 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1542 unixctl_command_reply_error(conn, "invalid basis");
1549 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1550 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1552 hash_cstr = xasprintf("%u", hash);
1553 unixctl_command_reply(conn, hash_cstr);
1556 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the bonding unixctl commands.  Each call passes the command name,
 * a usage string, two integers, the handler function and an auxiliary
 * pointer (NULL where visible).
 *
 * NOTE(review): the two integers track the number of mandatory and total
 * arguments in each usage string, so they are presumably the minimum and
 * maximum argument counts -- confirm against unixctl_command_register(). */
1563 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1564 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1566 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1567 bond_unixctl_migrate, NULL);
1568 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1569 bond_unixctl_set_active_slave, NULL);
1570 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1571 bond_unixctl_enable_slave, NULL);
1572 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1573 bond_unixctl_disable_slave, NULL);
1574 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1575 bond_unixctl_hash, NULL);
/* Resets the hash-bucket table of 'bond'.  For any mode other than BM_AB the
 * table of BOND_BUCKETS entries is zero-filled and the next rebalance is
 * scheduled 'rebalance_interval' ms from now.
 *
 * NOTE(review): the condition guarding the xmalloc() call and the handling
 * of BM_AB bonds are not visible in this excerpt; presumably the table is
 * allocated only when it does not exist yet -- confirm. */
1579 bond_entry_reset(struct bond *bond)
1581 if (bond->balance != BM_AB) {
1582 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1585 bond->hash = xmalloc(hash_len);
1587 memset(bond->hash, 0, hash_len);
1589 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Finds the slave of 'bond' whose 'aux' pointer equals 'slave_'.  Only the
 * hash bucket selected by hash_pointer(slave_, 0) is scanned.
 *
 * NOTE(review): the map argument and the return statements are not visible
 * in this excerpt; presumably the match is returned, or NULL when there is
 * none -- confirm. */
1596 static struct bond_slave *
1597 bond_slave_lookup(struct bond *bond, const void *slave_)
1599 struct bond_slave *slave;
1601 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1603 if (slave->aux == slave_) {
/* Sets the enabled state of 'slave' to 'enable'.  Any pending delay timer is
 * always cancelled (delay_expires = LLONG_MAX).  When the state actually
 * changes, the owning bond is flagged for revalidation, the bond's
 * 'enabled_slaves' list is updated while holding the bond's mutex, and the
 * new state is logged.
 *
 * NOTE(review): the test choosing between list_insert() and list_remove() is
 * not visible in this excerpt; presumably it depends on 'enable' --
 * confirm. */
1612 bond_enable_slave(struct bond_slave *slave, bool enable)
1614 slave->delay_expires = LLONG_MAX;
1615 if (enable != slave->enabled) {
1616 slave->bond->bond_revalidate = true;
1617 slave->enabled = enable;
1619 ovs_mutex_lock(&slave->bond->mutex);
1621 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1623 list_remove(&slave->list_node);
1625 ovs_mutex_unlock(&slave->bond->mutex);
1627 VLOG_INFO("interface %s: %s", slave->name,
1628 slave->enabled ? "enabled" : "disabled");
/* Re-evaluates whether 'slave' should be enabled.  The slave counts as "up"
 * when its netdev reports carrier and 'may_enable' is set.  A change of
 * desired state starts a delay timer (or cancels one that is no longer
 * needed), and once the timer has run out the new state is applied with
 * bond_enable_slave(). */
1633 bond_link_status_update(struct bond_slave *slave)
1635 struct bond *bond = slave->bond;
1638 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
/* This test is true in exactly two cases: the desired state already matches
 * 'enabled' while a timer is still pending (the timer is cancelled below),
 * or the desired state differs and no timer is pending (one is started
 * below). */
1639 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1640 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1641 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1642 slave->name, up ? "up" : "down");
1643 if (up == slave->enabled) {
1644 slave->delay_expires = LLONG_MAX;
1645 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1646 slave->name, up ? "disabled" : "enabled");
/* No delay is applied unless LACP is disabled; otherwise 'updelay' is used
 * when coming up and 'downdelay' when going down. */
1648 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1649 : up ? bond->updelay : bond->downdelay);
1650 slave->delay_expires = time_msec() + delay;
1652 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1655 up ? "enabled" : "disabled",
/* Apply the desired state once the delay has elapsed. */
1662 if (time_msec() >= slave->delay_expires) {
1663 bond_enable_slave(slave, up);
/* Hash used for source-MAC based balancing: returns hash_mac() of the
 * Ethernet address 'mac' and 'vlan', seeded with 'basis'. */
1668 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1670 return hash_mac(mac, vlan, basis);
/* Hash used for TCP balancing.  Works on a local copy of 'flow' whose
 * 'vlan_tci' is overwritten with 'vlan' in network byte order, and returns
 * flow_hash_symmetric_l4() of that copy seeded with 'basis'.  The caller's
 * flow is not modified. */
1674 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1676 struct flow hash_flow = *flow;
1677 hash_flow.vlan_tci = htons(vlan);
1679 /* The symmetric quality of this hash function is not required, but
1680 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1681 * purposes, so we use it out of convenience. */
1682 return flow_hash_symmetric_l4(&hash_flow, basis);
/* Computes the balancing hash of 'flow' on 'vlan' for 'bond', seeded with
 * the bond's 'basis'.  Asserts that the bond's mode is BM_TCP or BM_SLB:
 * BM_TCP uses bond_hash_tcp() on the whole flow, BM_SLB uses bond_hash_src()
 * on the flow's source MAC. */
1686 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1688 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1690 return (bond->balance == BM_TCP
1691 ? bond_hash_tcp(flow, vlan, bond->basis)
1692 : bond_hash_src(flow->dl_src, vlan, bond->basis));
/* Returns the hash bucket of 'bond' that 'flow' maps to: bond_hash() of the
 * flow, reduced to a bucket index with '& BOND_MASK'. */
1695 static struct bond_entry *
1696 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1699 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1702 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1703 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
 * no slave is selected. */
/* Round-robin pick from the 'enabled_slaves' list of 'bond', done entirely
 * under the bond's mutex: the front node is popped, pushed onto the back, and
 * the slave containing that node is returned.  When the list is empty the
 * mutex is released without rotating anything.
 *
 * NOTE(review): the value returned for an empty list is not visible in this
 * excerpt; presumably NULL -- confirm. */
1705 static struct bond_slave *
1706 get_enabled_slave(struct bond *bond)
1710 ovs_mutex_lock(&bond->mutex);
1711 if (list_is_empty(&bond->enabled_slaves)) {
1712 ovs_mutex_unlock(&bond->mutex);
1716 node = list_pop_front(&bond->enabled_slaves);
1717 list_push_back(&bond->enabled_slaves, node);
1718 ovs_mutex_unlock(&bond->mutex);
1720 return CONTAINER_OF(node, struct bond_slave, list_node);
/* Chooses the slave that should carry 'flow' on 'vlan'.  The paths visible
 * here either return the bond's 'active_slave', or mark the fields that feed
 * the hash in 'wc' with flow_mask_hash_fields() and use the hash bucket from
 * lookup_bond_entry().  A bucket with no slave, or with a disabled one, is
 * given a new slave from get_enabled_slave().
 *
 * NOTE(review): the dispatch on 'balance' and several return statements are
 * not visible in this excerpt; confirm which path each mode takes and
 * whether 'wc' may be NULL. */
1723 static struct bond_slave *
1724 choose_output_slave(const struct bond *bond, const struct flow *flow,
1725 struct flow_wildcards *wc, uint16_t vlan)
1727 struct bond_entry *e;
1730 balance = bond->balance;
1731 if (bond->lacp_status == LACP_CONFIGURED) {
1732 /* LACP has been configured on this bond but negotiations were
1733 * unsuccessful. If lacp_fallback_ab is enabled use active-
1734 * backup mode else drop all traffic. */
1735 if (!bond->lacp_fallback_ab) {
1743 return bond->active_slave;
1746 if (bond->lacp_status != LACP_NEGOTIATED) {
1747 /* Must have LACP negotiations for TCP balanced bonds. */
1751 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1756 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1758 e = lookup_bond_entry(bond, flow, vlan);
/* 'bond' is const in this function, but get_enabled_slave() rotates the
 * bond's 'enabled_slaves' list, hence the CONST_CAST. */
1759 if (!e->slave || !e->slave->enabled) {
1760 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
/* Picks a candidate active slave for 'bond' in two passes over its slaves.
 * The first pass looks for a slave that is already enabled.  The second pass
 * considers slaves that have a pending delay timer (delay_expires !=
 * LLONG_MAX) and 'may_enable' set, preferring the earliest 'delay_expires'.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the first enabled slave wins, then 'best', then NULL --
 * confirm. */
1769 static struct bond_slave *
1770 bond_choose_slave(const struct bond *bond)
1772 struct bond_slave *slave, *best;
1774 /* Find an enabled slave. */
1775 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1776 if (slave->enabled) {
1781 /* All interfaces are disabled. Find an interface that will be enabled
1782 * after its updelay expires. */
1784 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1785 if (slave->delay_expires != LLONG_MAX
1786 && slave->may_enable
1787 && (!best || slave->delay_expires < best->delay_expires)) {
/* Replaces the active slave of 'bond' with the result of
 * bond_choose_slave().  When a slave is chosen, the change is logged and
 * learning packets are requested; a chosen slave that is not yet enabled is
 * enabled immediately, skipping the rest of its updelay.  When no slave is
 * available but there was an active slave before, "all interfaces disabled"
 * is logged.  All log messages here are rate-limited. */
1795 bond_choose_active_slave(struct bond *bond)
1797 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1798 struct bond_slave *old_active_slave = bond->active_slave;
1800 bond->active_slave = bond_choose_slave(bond);
1801 if (bond->active_slave) {
1802 if (bond->active_slave->enabled) {
1803 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1804 bond->name, bond->active_slave->name);
1806 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1807 "remaining %lld ms updelay (since no interface was "
1808 "enabled)", bond->name, bond->active_slave->name,
1809 bond->active_slave->delay_expires - time_msec());
1810 bond_enable_slave(bond->active_slave, true);
1813 bond->send_learning_packets = true;
1814 } else if (old_active_slave) {
1815 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1819 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1820 * bond interface.
 *
 * The packet and byte counters of every slave whose statistics can be read
 * are summed with rx and tx swapped, and the totals are written to the
 * netdev named after the bond if that netdev can be opened. */
1822 bond_update_fake_slave_stats(struct bond *bond)
1824 struct netdev_stats bond_stats;
1825 struct bond_slave *slave;
1826 struct netdev *bond_dev;
/* Start from all-zero totals. */
1828 memset(&bond_stats, 0, sizeof bond_stats);
1830 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1831 struct netdev_stats slave_stats;
/* A slave contributes only when netdev_get_stats() returns zero (presumably
 * meaning success -- confirm). */
1833 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1834 /* XXX: We swap the stats here because they are swapped back when
1835 * reported by the internal device. The reason for this is
1836 * internal devices normally represent packets going into the
1837 * system but when used as fake bond device they represent packets
1838 * leaving the system. We really should do this in the internal
1839 * device itself because changing it here reverses the counts from
1840 * the perspective of the switch. However, the internal device
1841 * doesn't know what type of device it represents so we have to do
1842 * it here for now. */
1843 bond_stats.tx_packets += slave_stats.rx_packets;
1844 bond_stats.tx_bytes += slave_stats.rx_bytes;
1845 bond_stats.rx_packets += slave_stats.tx_packets;
1846 bond_stats.rx_bytes += slave_stats.tx_bytes;
/* Open the "system" netdev that shares the bond's name, push the totals to
 * it, and close it again.  Nothing happens when the open fails. */
1850 if (!netdev_open(bond->name, "system", &bond_dev)) {
1851 netdev_set_stats(bond_dev, &bond_stats);
1852 netdev_close(bond_dev);