-/* Initializes 'ifaces' and adds to it the name of every interface on every
- * port of 'br'.  A port that has more than one interface and whose
- * "bonding.<port>.fake-iface" configuration key reads as true also
- * contributes its own port name.  Finally runs svec_sort_unique() on the
- * result. */
-static void
-bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces)
-{
-    size_t port_no;
-
-    svec_init(ifaces);
-    for (port_no = 0; port_no < br->n_ports; port_no++) {
-        struct port *port = br->ports[port_no];
-        size_t k;
-
-        for (k = 0; k < port->n_ifaces; k++) {
-            svec_add(ifaces, port->ifaces[k]->name);
-        }
-
-        /* Only multi-interface ports can have a fake interface. */
-        if (port->n_ifaces <= 1) {
-            continue;
-        }
-        if (cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) {
-            svec_add(ifaces, port->name);
-        }
-    }
-    svec_sort_unique(ifaces);
-}
-
-/* Re-reads every datapath port number for 'br', as a safeguard in case the
- * administrator rearranged datapath ports behind our back.
- *
- * Any interface that the datapath no longer reports is left with a 'dp_ifidx'
- * of -1.  Call this function only from a context that goes on to remove such
- * 'struct iface's from the bridge: a -1 'dp_ifidx' would cause trouble later
- * when we try to send it to the datapath, which doesn't support UINT16_MAX+1
- * ports. */
-static void
-bridge_fetch_dp_ifaces(struct bridge *br)
-{
-    struct odp_port *dpif_ports;
-    size_t n_dpif_ports;
-    size_t i;
-
-    /* Forget every datapath index we currently know about. */
-    for (i = 0; i < br->n_ports; i++) {
-        struct port *port = br->ports[i];
-        size_t k;
-
-        for (k = 0; k < port->n_ifaces; k++) {
-            port->ifaces[k]->dp_ifidx = -1;
-        }
-    }
-    port_array_clear(&br->ifaces);
-
-    /* Rebuild the mapping from what the datapath reports now. */
-    dpif_port_list(br->dpif, &dpif_ports, &n_dpif_ports);
-    for (i = 0; i < n_dpif_ports; i++) {
-        struct odp_port *p = &dpif_ports[i];
-        struct iface *iface = iface_lookup(br, p->devname);
-
-        if (!iface) {
-            /* Not one of this bridge's interfaces. */
-            continue;
-        }
-
-        if (iface->dp_ifidx >= 0) {
-            VLOG_WARN("%s reported interface %s twice",
-                      dpif_name(br->dpif), p->devname);
-            continue;
-        }
-        if (iface_from_dp_ifidx(br, p->port)) {
-            VLOG_WARN("%s reported interface %"PRIu16" twice",
-                      dpif_name(br->dpif), p->port);
-            continue;
-        }
-
-        port_array_set(&br->ifaces, p->port, iface);
-        iface->dp_ifidx = p->port;
-    }
-    free(dpif_ports);
-}
-\f
-/* Bridge packet processing functions. */
-
-/* Maps Ethernet address 'mac' to a bond hash bucket: the value of
- * hash_bytes(mac, ETH_ADDR_LEN, 0) masked with BOND_MASK. */
-static int
-bond_hash(const uint8_t mac[ETH_ADDR_LEN])
-{
-    const int bucket = hash_bytes(mac, ETH_ADDR_LEN, 0) & BOND_MASK;
-
-    return bucket;
-}
-
-/* Returns the element of 'port''s 'bond_hash' table that bond_hash() selects
- * for Ethernet address 'mac'. */
-static struct bond_entry *
-lookup_bond_entry(const struct port *port, const uint8_t mac[ETH_ADDR_LEN])
-{
-    int bucket = bond_hash(mac);
-
-    return port->bond_hash + bucket;
-}
-
-/* Returns the index within 'port''s 'ifaces' array of the first interface
- * whose 'enabled' member is set, or -1 if there is none. */
-static int
-bond_choose_iface(const struct port *port)
-{
-    size_t idx = 0;
-
-    while (idx < port->n_ifaces) {
-        const struct iface *candidate = port->ifaces[idx];
-
-        if (candidate->enabled) {
-            return idx;
-        }
-        idx++;
-    }
-    return -1;
-}
-
-/* Picks the interface of 'port' that should carry traffic whose Ethernet
- * source address is 'dl_src'.  'port' must have at least one interface.
- *
- * A port with exactly one interface always uses it.  Otherwise the choice
- * comes from the bond hash entry for 'dl_src'; if that entry's interface index
- * is out of range or names an interface that is not enabled, the entry is
- * reassigned with bond_choose_iface() and given a fresh random tag.
- *
- * On success, stores the chosen interface's datapath index in '*dp_ifidx', ORs
- * the tags the choice depends on into '*tags', and returns true.  If no
- * interface can be chosen, ORs 'port->no_ifaces_tag' into '*tags' and returns
- * false without touching '*dp_ifidx'. */
-static bool
-choose_output_iface(const struct port *port, const uint8_t *dl_src,
-                    uint16_t *dp_ifidx, tag_type *tags)
-{
-    struct iface *iface;
-
-    assert(port->n_ifaces);
-    if (port->n_ifaces != 1) {
-        struct bond_entry *entry = lookup_bond_entry(port, dl_src);
-        bool unusable = (entry->iface_idx < 0
-                         || entry->iface_idx >= port->n_ifaces
-                         || !port->ifaces[entry->iface_idx]->enabled);
-
-        if (unusable) {
-            /* XXX select interface properly.  The current interface selection
-             * is only good for testing the rebalancing code. */
-            entry->iface_idx = bond_choose_iface(port);
-            if (entry->iface_idx < 0) {
-                *tags |= port->no_ifaces_tag;
-                return false;
-            }
-            entry->iface_tag = tag_create_random();
-
-            /* 'port' is const in this function, hence the cast to set the
-             * stale flag. */
-            ((struct port *) port)->bond_compat_is_stale = true;
-        }
-        *tags |= entry->iface_tag;
-        iface = port->ifaces[entry->iface_idx];
-    } else {
-        iface = port->ifaces[0];
-    }
-
-    *dp_ifidx = iface->dp_ifidx;
-    *tags |= iface->tag;        /* Currently only used for bonding. */
-    return true;
-}
-
-/* Reacts to a carrier report for bonded interface 'iface': 'carrier' is true
- * if link was detected, false if it dropped.
- *
- * Nothing is enabled or disabled here.  The function only arms or cancels
- * 'iface->delay_expires' (LLONG_MAX means "no change pending"), honoring the
- * owning port's 'updelay' and 'downdelay'.  The updelay is skipped when the
- * port currently has no active interface. */
-static void
-bond_link_status_update(struct iface *iface, bool carrier)
-{
-    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
-    struct port *port = iface->port;
-    bool in_sync = carrier == iface->enabled;
-    bool timer_idle = iface->delay_expires == LLONG_MAX;
-    int delay;
-
-    /* Either carrier agrees with the enabled state and nothing is pending, or
-     * it disagrees and a change is already pending: nothing to do. */
-    if (in_sync == timer_idle) {
-        return;
-    }
-
-    VLOG_INFO_RL(&rl, "interface %s: carrier %s",
-                 iface->name, carrier ? "detected" : "dropped");
-
-    if (in_sync) {
-        /* Carrier returned to the current state: cancel the pending change. */
-        iface->delay_expires = LLONG_MAX;
-        VLOG_INFO_RL(&rl, "interface %s: will not be %s",
-                     iface->name, carrier ? "disabled" : "enabled");
-        return;
-    }
-
-    if (carrier && port->updelay && port->active_iface < 0) {
-        iface->delay_expires = time_msec();
-        VLOG_INFO_RL(&rl, "interface %s: skipping %d ms updelay since no "
-                     "other interface is up", iface->name, port->updelay);
-        return;
-    }
-
-    delay = carrier ? port->updelay : port->downdelay;
-    iface->delay_expires = time_msec() + delay;
-    if (delay) {
-        VLOG_INFO_RL(&rl,
-                     "interface %s: will be %s if it stays %s for %d ms",
-                     iface->name,
-                     carrier ? "enabled" : "disabled",
-                     carrier ? "up" : "down",
-                     delay);
-    }
-}
-
-/* Re-selects 'port''s active interface with bond_choose_iface(), assigns the
- * port a fresh random 'active_iface_tag', and logs the outcome. */
-static void
-bond_choose_active_iface(struct port *port)
-{
-    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
-
-    port->active_iface = bond_choose_iface(port);
-    port->active_iface_tag = tag_create_random();
-
-    if (port->active_iface < 0) {
-        VLOG_WARN_RL(&rl, "port %s: all ports disabled, no active interface",
-                     port->name);
-        return;
-    }
-    VLOG_INFO_RL(&rl, "port %s: active interface is now %s",
-                 port->name, port->ifaces[port->active_iface]->name);
-}
-
-/* Sets the enabled state of bonded interface 'iface' to 'enable' and clears
- * any pending delayed change.  Does nothing further if the state is already
- * 'enable'.
- *
- * Disabling revalidates flows tagged with the interface's tag, picks a new
- * active interface if this was the active one, and sends learning packets.
- * Enabling picks an active interface (and sends learning packets) only if the
- * port had none, then gives the interface a fresh random tag.  Either way the
- * port's bond compatibility state is refreshed at the end. */
-static void
-bond_enable_slave(struct iface *iface, bool enable)
-{
-    struct port *port = iface->port;
-    struct bridge *br = port->bridge;
-
-    iface->delay_expires = LLONG_MAX;
-    if (iface->enabled == enable) {
-        return;
-    }
-    iface->enabled = enable;
-
-    if (iface->enabled) {
-        VLOG_WARN("interface %s: enabled", iface->name);
-        if (port->active_iface < 0) {
-            ofproto_revalidate(br->ofproto, port->no_ifaces_tag);
-            bond_choose_active_iface(port);
-            bond_send_learning_packets(port);
-        }
-        iface->tag = tag_create_random();
-    } else {
-        VLOG_WARN("interface %s: disabled", iface->name);
-        ofproto_revalidate(br->ofproto, iface->tag);
-        if (iface->port_ifidx == port->active_iface) {
-            ofproto_revalidate(br->ofproto, port->active_iface_tag);
-            bond_choose_active_iface(port);
-        }
-        bond_send_learning_packets(port);
-    }
-
-    port_update_bond_compat(port);
-}
-
-/* Periodic bonding work for 'br'.  For every port, refreshes the bond
- * compatibility state if it was marked stale.  For every port with at least
- * two interfaces, toggles the enabled state of each interface whose
- * 'delay_expires' time has been reached. */
-static void
-bond_run(struct bridge *br)
-{
-    size_t port_no;
-
-    for (port_no = 0; port_no < br->n_ports; port_no++) {
-        struct port *port = br->ports[port_no];
-        size_t k;
-
-        if (port->bond_compat_is_stale) {
-            port->bond_compat_is_stale = false;
-            port_update_bond_compat(port);
-        }
-
-        if (port->n_ifaces >= 2) {
-            for (k = 0; k < port->n_ifaces; k++) {
-                struct iface *iface = port->ifaces[k];
-
-                if (time_msec() >= iface->delay_expires) {
-                    bond_enable_slave(iface, !iface->enabled);
-                }
-            }
-        }
-    }
-}
-
-/* Registers a poll timer for every interface on a port of 'br' with at least
- * two interfaces that has a delayed enable/disable pending, that is, whose
- * 'delay_expires' is not LLONG_MAX. */
-static void
-bond_wait(struct bridge *br)
-{
-    size_t port_no;
-
-    for (port_no = 0; port_no < br->n_ports; port_no++) {
-        struct port *port = br->ports[port_no];
-        size_t k;
-
-        if (port->n_ifaces >= 2) {
-            for (k = 0; k < port->n_ifaces; k++) {
-                struct iface *iface = port->ifaces[k];
-
-                if (iface->delay_expires == LLONG_MAX) {
-                    continue;
-                }
-                poll_timer_wait(iface->delay_expires - time_msec());
-            }
-        }
-    }
-}
-
-/* Fills in '*p' as the destination for sending 'flow', received on 'in_port',
- * out of 'out_port'.  ORs the tags the decision depends on into '*tags'.
- *
- * Returns false, without writing '*p', if 'out_port''s STP state is neither
- * STP_DISABLED nor STP_FORWARDING.  Otherwise sets 'p->vlan' and returns the
- * result of choose_output_iface(), which fills in 'p->dp_ifidx'. */
-static bool
-set_dst(struct dst *p, const flow_t *flow,
-        const struct port *in_port, const struct port *out_port,
-        tag_type *tags)
-{
-    /* STP handling.
-     *
-     * XXX This uses too many tags: any broadcast flow will get one tag per
-     * destination port, and thus a broadcast on a switch of any size is
-     * likely to have all tag bits set.  We should figure out a way to be
-     * smarter.
-     *
-     * This is OK when STP is disabled, because stp_state_tag is 0 then. */
-    *tags |= out_port->stp_state_tag;
-    if (!(out_port->stp_state & (STP_DISABLED | STP_FORWARDING))) {
-        return false;
-    }
-
-    if (out_port->vlan >= 0) {
-        /* 'out_port' has a VLAN of its own: output carries no VLAN. */
-        p->vlan = OFP_VLAN_NONE;
-    } else if (in_port->vlan >= 0) {
-        p->vlan = in_port->vlan;
-    } else {
-        p->vlan = ntohs(flow->dl_vlan);
-    }
-    return choose_output_iface(out_port, flow->dl_src, &p->dp_ifidx, tags);
-}
-
-/* Exchanges the contents of '*p' and '*q'. */
-static void
-swap_dst(struct dst *p, struct dst *q)
-{
-    struct dst saved = *q;
-
-    *q = *p;
-    *p = saved;
-}
-
-/* Rearranges the 'n_dsts' elements of 'dsts' so that every element with
- * vlan == 'vlan' comes before every element with a different vlan.  (This may
- * help performance by reducing the number of VLAN changes that we push to the
- * datapath.  We could in fact fully sort the array by vlan, but in most cases
- * there are at most two different vlan tags so that's possibly overkill.) */
-static void
-partition_dsts(struct dst *dsts, size_t n_dsts, int vlan)
-{
-    struct dst *lo = dsts;              /* All dsts before 'lo' match. */
-    struct dst *hi = dsts + n_dsts;     /* All dsts from 'hi' on do not. */
-
-    for (;;) {
-        /* Skip over the matching prefix. */
-        while (lo != hi && lo->vlan == vlan) {
-            lo++;
-        }
-        if (lo == hi) {
-            return;
-        }
-
-        /* Here lo < hi and lo->vlan != vlan.  Shrink the non-matching
-         * suffix until its left neighbor matches. */
-        while (hi[-1].vlan != vlan) {
-            hi--;
-            if (hi == lo) {
-                return;
-            }
-        }
-
-        /* hi[-1] matches and *lo does not: exchange them. */
-        hi--;
-        swap_dst(lo, hi);
-        lo++;
-    }
-}
-
-/* Returns ffs() of 'mask'.  The build-time assertion guarantees that 'mask'
- * is no wider than the unsigned int that the argument is checked against, so
- * no bits are lost in the call. */
-static int
-mirror_mask_ffs(mirror_mask_t mask)
-{
-    BUILD_ASSERT_DECL(sizeof(unsigned int) >= sizeof(mask));
-    const int bit = ffs(mask);
-
-    return bit;
-}
-
-/* Returns true if any of the 'n_dsts' elements of 'dsts' has the same 'vlan'
- * and 'dp_ifidx' as '*test', false otherwise. */
-static bool
-dst_is_duplicate(const struct dst *dsts, size_t n_dsts,
-                 const struct dst *test)
-{
-    const struct dst *d;
-
-    for (d = dsts; d < dsts + n_dsts; d++) {
-        if (d->dp_ifidx == test->dp_ifidx && d->vlan == test->vlan) {
-            return true;
-        }
-    }
-    return false;
-}
-
-/* Returns true if 'port' has a negative 'vlan' member and 'vlan' is set in
- * its 'trunks' bitmap. */
-static bool
-port_trunks_vlan(const struct port *port, uint16_t vlan)
-{
-    if (port->vlan >= 0) {
-        return false;
-    }
-    return bitmap_is_set(port->trunks, vlan);
-}
-
-/* Returns true if 'vlan' equals 'port''s own 'vlan' member or if 'port'
- * trunks 'vlan' (see port_trunks_vlan()). */
-static bool
-port_includes_vlan(const struct port *port, uint16_t vlan)
-{
-    if (vlan == port->vlan) {
-        return true;
-    }
-    return port_trunks_vlan(port, vlan);
-}
-
-/* Fills 'dsts' with the destinations for 'flow', which arrived on 'in_port'
- * in VLAN 'vlan' and is to be sent to 'out_port': a specific port, FLOOD_PORT
- * to flood, or NULL to send nowhere.  Destinations required by the mirrors
- * selected by 'in_port' and by the chosen output ports are appended, skipping
- * duplicates.  ORs the tags the result depends on into '*tags'.
- *
- * The result is partitioned so that destinations whose vlan equals the flow's
- * own dl_vlan come first.  Returns the number of destinations stored. */
-static size_t
-compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan,
-             const struct port *in_port, const struct port *out_port,
-             struct dst dsts[], tag_type *tags)
-{
-    mirror_mask_t mirrors = in_port->src_mirrors;
-    struct dst *dst = dsts;
-    size_t i;
-
-    *tags |= in_port->stp_state_tag;
-    if (out_port == FLOOD_PORT) {
-        /* XXX use ODP_FLOOD if no vlans or bonding. */
-        /* XXX even better, define each VLAN as a datapath port group */
-        for (i = 0; i < br->n_ports; i++) {
-            struct port *port = br->ports[i];
-
-            if (port == in_port || !port_includes_vlan(port, vlan)
-                || port->is_mirror_output_port) {
-                continue;
-            }
-            if (set_dst(dst, flow, in_port, port, tags)) {
-                mirrors |= port->dst_mirrors;
-                dst++;
-            }
-        }
-    } else if (out_port && set_dst(dst, flow, in_port, out_port, tags)) {
-        mirrors |= out_port->dst_mirrors;
-        dst++;
-    }
-
-    /* Visit each selected mirror, clearing the lowest set bit of 'mirrors'
-     * after every iteration. */
-    for (; mirrors; mirrors &= mirrors - 1) {
-        struct mirror *m = br->mirrors[mirror_mask_ffs(mirrors) - 1];
-
-        if (m->n_vlans && !vlan_is_mirrored(m, vlan)) {
-            continue;
-        }
-
-        if (m->out_port) {
-            /* Mirror to a single port. */
-            if (set_dst(dst, flow, in_port, m->out_port, tags)
-                && !dst_is_duplicate(dsts, dst - dsts, dst)) {
-                dst++;
-            }
-            continue;
-        }
-
-        /* Mirror to every port that carries the mirror's output VLAN. */
-        for (i = 0; i < br->n_ports; i++) {
-            struct port *port = br->ports[i];
-            int flow_vlan;
-
-            if (!port_includes_vlan(port, m->out_vlan)
-                || !set_dst(dst, flow, in_port, port, tags)) {
-                continue;
-            }
-
-            if (port->vlan < 0) {
-                dst->vlan = m->out_vlan;
-            }
-            if (dst_is_duplicate(dsts, dst - dsts, dst)) {
-                continue;
-            }
-
-            /* Use the vlan tag on the original flow instead of the one
-             * passed in the vlan parameter.  This ensures that we compare
-             * the vlan from before any implicit tagging takes place.  This
-             * is necessary because dst->vlan is the final vlan, after
-             * removing implicit tags. */
-            flow_vlan = ntohs(flow->dl_vlan);
-            if (flow_vlan == 0) {
-                flow_vlan = OFP_VLAN_NONE;
-            }
-            if (port == in_port && dst->vlan == flow_vlan) {
-                /* Don't send out input port on same VLAN. */
-                continue;
-            }
-            dst++;
-        }
-    }
-
-    partition_dsts(dsts, dst - dsts, ntohs(flow->dl_vlan));
-    return dst - dsts;
-}
-
-/* Debugging aid: prints the 'n' destinations in 'dsts' to stdout as
- * ">p<dp_ifidx>", followed by "v<vlan>" when the vlan is not
- * OFP_VLAN_NONE. */
-static void UNUSED
-print_dsts(const struct dst *dsts, size_t n)
-{
-    size_t i;
-
-    for (i = 0; i < n; i++) {
-        const struct dst *d = &dsts[i];
-
-        printf(">p%"PRIu16, d->dp_ifidx);
-        if (d->vlan != OFP_VLAN_NONE) {
-            printf("v%"PRIu16, d->vlan);
-        }
-    }
-}
-
-/* Appends to 'actions' the datapath actions that deliver 'flow' to the
- * destinations computed by compose_dsts() for the same arguments.
- *
- * Tracks the VLAN the packet would carry at each point, starting from the
- * flow's own dl_vlan, and emits a strip-VLAN or set-VLAN action only when the
- * next destination needs a different one, followed by an output action. */
-static void
-compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan,
-                const struct port *in_port, const struct port *out_port,
-                tag_type *tags, struct odp_actions *actions)
-{
-    struct dst dsts[DP_MAX_PORTS * (MAX_MIRRORS + 1)];
-    size_t n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts,
-                                 tags);
-    uint16_t cur_vlan = ntohs(flow->dl_vlan);
-    size_t i;
-
-    for (i = 0; i < n_dsts; i++) {
-        const struct dst *d = &dsts[i];
-        union odp_action *a;
-
-        if (d->vlan != cur_vlan) {
-            if (d->vlan != OFP_VLAN_NONE) {
-                a = odp_actions_add(actions, ODPAT_SET_VLAN_VID);
-                a->vlan_vid.vlan_vid = htons(d->vlan);
-            } else {
-                odp_actions_add(actions, ODPAT_STRIP_VLAN);
-            }
-            cur_vlan = d->vlan;
-        }
-
-        a = odp_actions_add(actions, ODPAT_OUTPUT);
-        a->output.port = d->dp_ifidx;
-    }
-}
-
-/* Returns true if 'flow' has Ethernet type ETH_TYPE_ARP and a broadcast
- * destination, 'packet' holds at least sizeof(struct arp_eth_header) bytes,
- * and the 'ar_op' member read from the start of 'packet''s data equals
- * ARP_OP_REQUEST.  'packet' must not be NULL.
- *
- * NOTE(review): the function name and the caller's comment talk about ARP
- * *replies*, but the opcode tested is ARP_OP_REQUEST, it is compared without
- * any byte-order conversion, and the header is read from 'packet->data'
- * rather than from an ARP-specific offset.  Confirm against the definitions
- * of struct arp_eth_header and struct ofpbuf whether this is intended. */
-static bool
-is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet)
-{
-    const struct arp_eth_header *arp
-        = (const struct arp_eth_header *) packet->data;
-
-    if (flow->dl_type != htons(ETH_TYPE_ARP)) {
-        return false;
-    }
-    if (!eth_addr_is_broadcast(flow->dl_dst)) {
-        return false;
-    }
-    if (packet->size < sizeof(struct arp_eth_header)) {
-        return false;
-    }
-    return arp->ar_op == ARP_OP_REQUEST;
-}
-
-/* Decides how 'br' handles 'flow' and appends the resulting datapath actions
- * to 'actions', ORing every tag that the decision depends on into '*tags'.
- * 'packet' is the packet being processed, or NULL if there is none; with a
- * NULL 'packet' the body skips MAC learning (it treats that case as
- * revalidation) and suppresses some of the log messages.
- *
- * If the composed actions may be applied to any packet in the given 'flow',
- * returns true. Otherwise, the actions should only be applied to 'packet', or
- * not at all, if 'packet' was NULL.
- *
- * Control flow: a flow whose input interface is unknown returns true with no
- * actions added. Every "goto done" below is taken while 'out_port' is still
- * NULL, so compose_actions() then runs with a NULL output port. A
- * revalidation with no MAC learning entry for the destination returns false
- * without composing any actions. */
-static bool
-process_flow(struct bridge *br, const flow_t *flow,
- const struct ofpbuf *packet, struct odp_actions *actions,
- tag_type *tags)
-{
- struct iface *in_iface;
- struct port *in_port;
- struct port *out_port = NULL; /* By default, drop the packet/flow. */
- int vlan;
-
- /* Find the interface and port structure for the received packet. */
- in_iface = iface_from_dp_ifidx(br, flow->in_port);
- if (!in_iface) {
- /* No interface? Something fishy... */
- if (packet != NULL) {
- /* Odd. A few possible reasons here:
- *
- * - We deleted an interface but there are still a few packets
- * queued up from it.
- *
- * - Someone externally added an interface (e.g. with "ovs-dpctl
- * add-if") that we don't know about.
- *
- * - Packet arrived on the local port but the local port is not
- * one of our bridge ports.
- */
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-
- VLOG_WARN_RL(&rl, "bridge %s: received packet on unknown "
- "interface %"PRIu16, br->name, flow->in_port);
- }
-
- /* Return without adding any actions, to drop packets on this flow. */
- return true;
- }
- in_port = in_iface->port;
-
- /* Figure out what VLAN this packet belongs to.
- *
- * Note that dl_vlan of 0 and of OFP_VLAN_NONE both mean that the packet
- * belongs to VLAN 0, so we should treat both cases identically. (In the
- * former case, the packet has an 802.1Q header that specifies VLAN 0,
- * presumably to allow a priority to be specified. In the latter case, the
- * packet does not have any 802.1Q header.) */
- vlan = ntohs(flow->dl_vlan);
- if (vlan == OFP_VLAN_NONE) {
- vlan = 0;
- }
- if (in_port->vlan >= 0) {
- if (vlan) {
- /* XXX support double tagging? */
- if (packet != NULL) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
- VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %"PRIu16" tagged "
- "packet received on port %s configured with "
- "implicit VLAN %"PRIu16,
- br->name, ntohs(flow->dl_vlan),
- in_port->name, in_port->vlan);
- }
- goto done;
- }
- vlan = in_port->vlan;
- } else {
- if (!port_includes_vlan(in_port, vlan)) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
- VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged "
- "packet received on port %s not configured for "
- "trunking VLAN %d",
- br->name, vlan, in_port->name, vlan);
- goto done;
- }
- }
-
- /* Drop frames for ports that STP wants entirely killed (both for
- * forwarding and for learning). Later, after we do learning, we'll drop
- * the frames that STP wants to do learning but not forwarding on. */
- if (in_port->stp_state & (STP_LISTENING | STP_BLOCKING)) {
- goto done;
- }
-
- /* Drop frames for reserved multicast addresses. */
- if (eth_addr_is_reserved(flow->dl_dst)) {
- goto done;
- }
-
- /* Drop frames on ports reserved for mirroring. */
- if (in_port->is_mirror_output_port) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
- VLOG_WARN_RL(&rl, "bridge %s: dropping packet received on port %s, "
- "which is reserved exclusively for mirroring",
- br->name, in_port->name);
- goto done;
- }
-
- /* Packets received on bonds need special attention to avoid duplicates. */
- if (in_port->n_ifaces > 1) {
- int src_idx;
-
- if (eth_addr_is_multicast(flow->dl_dst)) {
- *tags |= in_port->active_iface_tag;
- if (in_port->active_iface != in_iface->port_ifidx) {
- /* Drop all multicast packets on inactive slaves. */
- goto done;
- }
- }
-
- /* Drop all packets for which we have learned a different input
- * port, because we probably sent the packet on one slave and got
- * it back on the other. Broadcast ARP replies are an exception
- * to this rule: the host has moved to another switch.
- *
- * NOTE(review): 'br->ml' is passed to mac_learning_lookup() here
- * without the NULL check that guards the "MAC learning" section
- * below -- confirm that it cannot be NULL on this path. */
- src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
- if (src_idx != -1 && src_idx != in_port->port_idx &&
- (!packet || !is_bcast_arp_reply(flow, packet))) {
- goto done;
- }
- }
-
- /* MAC learning. */
- out_port = FLOOD_PORT;
- if (br->ml) {
- int out_port_idx;
-
- /* Learn source MAC (but don't try to learn from revalidation). */
- if (packet) {
- tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src,
- vlan, in_port->port_idx);
- if (rev_tag) {
- /* The log messages here could actually be useful in debugging,
- * so keep the rate limit relatively high. */
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30,
- 300);
- VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is "
- "on port %s in VLAN %d",
- br->name, ETH_ADDR_ARGS(flow->dl_src),
- in_port->name, vlan);
- ofproto_revalidate(br->ofproto, rev_tag);
- }
- }
-
- /* Determine output port. */
- out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan,
- tags);
- if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
- out_port = br->ports[out_port_idx];
- } else if (!packet) {
- /* If we are revalidating but don't have a learning entry then
- * eject the flow. Installing a flow that floods packets will
- * prevent us from seeing future packets and learning properly. */
- return false;
- }
- }
-
- /* Don't send packets out their input ports. Don't forward frames that STP
- * wants us to discard. */
- if (in_port == out_port || in_port->stp_state == STP_LEARNING) {
- out_port = NULL;
- }
-
-done:
- compose_actions(br, flow, vlan, in_port, out_port, tags, actions);
-
- /*
- * We send out only a single packet, instead of setting up a flow, if the
- * packet is an ARP directed to broadcast that arrived on a bonded
- * interface. In such a situation ARP requests and replies must be handled
- * differently, but OpenFlow unfortunately can't distinguish them.
- */
- return (in_port->n_ifaces < 2
- || flow->dl_type != htons(ETH_TYPE_ARP)
- || !eth_addr_is_broadcast(flow->dl_dst));
-}
-
-/* ofhooks callback for a port status change on the bridge passed as 'br_'.
- *
- * Careful: 'opp' is in host byte order and opp->port_no is an OFP port
- * number.
- *
- * Does nothing if the port does not map to a known interface.  On
- * OFPPR_DELETE, destroys the interface (and its port, if that was the port's
- * last interface) and flushes the bridge.  For any other 'reason', forwards
- * the link state to the bonding code if the port has more than one
- * interface. */
-static void
-bridge_port_changed_ofhook_cb(enum ofp_port_reason reason,
-                              const struct ofp_phy_port *opp,
-                              void *br_)
-{
-    struct bridge *br = br_;
-    struct iface *iface;
-    struct port *port;
-
-    iface = iface_from_dp_ifidx(br, ofp_port_to_odp_port(opp->port_no));
-    if (!iface) {
-        return;
-    }
-    port = iface->port;
-
-    if (reason != OFPPR_DELETE) {
-        if (port->n_ifaces > 1) {
-            bool link_up = !(opp->state & OFPPS_LINK_DOWN);
-
-            bond_link_status_update(iface, link_up);
-            port_update_bond_compat(port);
-        }
-        return;
-    }
-
-    VLOG_WARN("bridge %s: interface %s deleted unexpectedly",
-              br->name, iface->name);
-    iface_destroy(iface);
-    if (!port->n_ifaces) {
-        VLOG_WARN("bridge %s: port %s has no interfaces, dropping",
-                  br->name, port->name);
-        port_destroy(port);
-    }
-
-    bridge_flush(br);
-}
-
-/* ofhooks callback that computes the actions for 'flow' on the bridge passed
- * as 'br_'.  Bumps the bridge_process_flow coverage counter and returns the
- * result of process_flow(), which appends to 'actions' and ORs into '*tags'.
- * 'packet' may be NULL. */
-static bool
-bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet,
-                        struct odp_actions *actions, tag_type *tags, void *br_)
-{
-    struct bridge *br = br_;
-
-    COVERAGE_INC(bridge_process_flow);
-    return process_flow(br, flow, packet, actions, tags);
-}
-
-/* ofhooks callback that credits 'n_bytes' of traffic for 'flow' on the bridge
- * passed as 'br_'.  For each ODPAT_OUTPUT action among the 'n_actions'
- * elements of 'actions' whose port belongs to a port with at least two
- * interfaces, adds 'n_bytes' to the bond hash entry for the flow's Ethernet
- * source address.  Does nothing if the bridge has no bonded ports. */
-static void
-bridge_account_flow_ofhook_cb(const flow_t *flow,
-                              const union odp_action *actions,
-                              size_t n_actions, unsigned long long int n_bytes,
-                              void *br_)
-{
-    struct bridge *br = br_;
-    size_t i;
-
-    if (!br->has_bonded_ports) {
-        return;
-    }
-
-    for (i = 0; i < n_actions; i++) {
-        const union odp_action *a = &actions[i];
-        struct port *port;
-
-        if (a->type != ODPAT_OUTPUT) {
-            continue;
-        }
-        port = port_from_dp_ifidx(br, a->output.port);
-        if (port && port->n_ifaces >= 2) {
-            lookup_bond_entry(port, flow->dl_src)->tx_bytes += n_bytes;
-        }
-    }
-}
-
-/* ofhooks callback, invoked periodically for the bridge passed as 'br_'.
- * Once 'bond_next_rebalance' has been reached, schedules the next rebalance
- * 10000 ms ahead and rebalances every port that has more than one interface.
- * Does nothing if the bridge has no bonded ports. */
-static void
-bridge_account_checkpoint_ofhook_cb(void *br_)
-{
-    struct bridge *br = br_;
-    size_t port_no;
-
-    if (!br->has_bonded_ports) {
-        return;
-    }
-
-    /* The current ofproto implementation calls this callback at least once a
-     * second, so this timer implementation is sufficient. */
-    if (time_msec() >= br->bond_next_rebalance) {
-        br->bond_next_rebalance = time_msec() + 10000;
-
-        for (port_no = 0; port_no < br->n_ports; port_no++) {
-            struct port *port = br->ports[port_no];
-
-            if (port->n_ifaces > 1) {
-                bond_rebalance_port(port);
-            }
-        }
-    }
-}
-
-static struct ofhooks bridge_ofhooks = { /* Positional: keep member order. */
- bridge_port_changed_ofhook_cb, /* Port deleted or link state changed. */
- bridge_normal_ofhook_cb, /* Computes a flow's actions. */
- bridge_account_flow_ofhook_cb, /* Credits flow bytes to bond hashes. */
- bridge_account_checkpoint_ofhook_cb, /* Periodic bond rebalancing. */
-};
-\f
-/* Bonding functions. */
-
-/* Statistics for a single interface on a bonded port, used for load-based
- * bond rebalancing. bond_rebalance_port() builds an array of these on its
- * stack, one per interface of the port being rebalanced. */
-struct slave_balance {
- struct iface *iface; /* The interface these statistics describe. */
- uint64_t tx_bytes; /* Sum of hashes[*]->tx_bytes. */
-
- /* All the "bond_entry"s that are assigned to this interface, in order of
- * increasing tx_bytes. This points into an array owned by the code that
- * set up the structure; bond_shift_load() may advance the pointer or
- * compact the elements when it moves an entry away. */
- struct bond_entry **hashes;
- size_t n_hashes; /* Number of elements in 'hashes'. */
-};
-
-/* qsort() comparison function for an array of pointers to bond_entries.
- * Orders entries in ascending order by the interface index to which they are
- * assigned, and within a single interface in ascending order of bytes
- * transmitted. */
-static int
-compare_bond_entries(const void *a_, const void *b_)
-{
-    const struct bond_entry *a = *(const struct bond_entry *const *) a_;
-    const struct bond_entry *b = *(const struct bond_entry *const *) b_;
-
-    if (a->iface_idx != b->iface_idx) {
-        return a->iface_idx < b->iface_idx ? -1 : 1;
-    }
-    if (a->tx_bytes == b->tx_bytes) {
-        return 0;
-    }
-    return a->tx_bytes < b->tx_bytes ? -1 : 1;
-}
-
-/* qsort() comparison function for slave_balance structures.  Enabled
- * interfaces sort before disabled ones; among interfaces with the same
- * enabled state, the order is *descending* by number of bytes transmitted. */
-static int
-compare_slave_balance(const void *a_, const void *b_)
-{
-    const struct slave_balance *a = a_;
-    const struct slave_balance *b = b_;
-
-    if (a->iface->enabled != b->iface->enabled) {
-        return b->iface->enabled ? 1 : -1;
-    }
-    if (a->tx_bytes == b->tx_bytes) {
-        return 0;
-    }
-    return a->tx_bytes < b->tx_bytes ? 1 : -1;
-}
-
-/* Exchanges the contents of '*a' and '*b'. */
-static void
-swap_bals(struct slave_balance *a, struct slave_balance *b)
-{
-    struct slave_balance saved = *b;
-
-    *b = *a;
-    *a = saved;
-}
-
-/* Restores the 'n_bals' slave_balance structures in 'bals' to descending
- * 'tx_bytes' order, given that 'p' (and only 'p') might be in the wrong
- * location.  Only 'tx_bytes' is compared here.
- *
- * This function invalidates 'p', since its contents might now be in a
- * different memory location. */
-static void
-resort_bals(struct slave_balance *p,
-            struct slave_balance bals[], size_t n_bals)
-{
-    if (n_bals <= 1) {
-        return;
-    }
-
-    /* Bubble toward the front while heavier than the left neighbor. */
-    while (p > bals && p->tx_bytes > p[-1].tx_bytes) {
-        swap_bals(p, p - 1);
-        p--;
-    }
-
-    /* Bubble toward the back while lighter than the right neighbor. */
-    while (p < &bals[n_bals - 1] && p->tx_bytes < p[1].tx_bytes) {
-        swap_bals(p, p + 1);
-        p++;
-    }
-}
-
-/* If debug logging is enabled, logs one line that summarizes the 'n_bals'
- * elements of 'bals' for bond 'port': each interface's name and load in kB,
- * a "(disabled)" marker where applicable, and the per-hash loads that make up
- * its total.  Hashes are identified by their index in 'port->bond_hash'. */
-static void
-log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port)
-{
-    struct ds ds = DS_EMPTY_INITIALIZER;
-    size_t i;
-
-    if (!VLOG_IS_DBG_ENABLED()) {
-        return;
-    }
-
-    for (i = 0; i < n_bals; i++) {
-        const struct slave_balance *b = &bals[i];
-        size_t k;
-
-        if (i > 0) {
-            ds_put_char(&ds, ',');
-        }
-        ds_put_format(&ds, " %s %"PRIu64"kB",
-                      b->iface->name, b->tx_bytes / 1024);
-
-        if (!b->iface->enabled) {
-            ds_put_cstr(&ds, " (disabled)");
-        }
-        if (b->n_hashes == 0) {
-            continue;
-        }
-
-        ds_put_cstr(&ds, " (");
-        for (k = 0; k < b->n_hashes; k++) {
-            const struct bond_entry *e = b->hashes[k];
-
-            if (k > 0) {
-                ds_put_cstr(&ds, " + ");
-            }
-            ds_put_format(&ds, "h%td: %"PRIu64"kB",
-                          e - port->bond_hash, e->tx_bytes / 1024);
-        }
-        ds_put_cstr(&ds, ")");
-    }
-    VLOG_DBG("bond %s:%s", port->name, ds_cstr(&ds));
-    ds_destroy(&ds);
-}
-
-/* Moves the bond hash entry at index 'hash_idx' of 'from->hashes' from the
- * interface described by 'from' to the one described by 'to'; both belong to
- * the same port.
- *
- * Removes the entry from 'from->hashes', transfers its 'tx_bytes' between the
- * two running totals, revalidates flows that carry the entry's old tag, and
- * points the entry at 'to''s interface with a fresh random tag. */
-static void
-bond_shift_load(struct slave_balance *from, struct slave_balance *to,
-                int hash_idx)
-{
-    struct bond_entry *moved = from->hashes[hash_idx];
-    struct port *port = from->iface->port;
-    uint64_t delta = moved->tx_bytes;
-
-    VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
-              "from %s to %s (now carrying %"PRIu64"kB and "
-              "%"PRIu64"kB load, respectively)",
-              port->name, delta / 1024, moved - port->bond_hash,
-              from->iface->name, to->iface->name,
-              (from->tx_bytes - delta) / 1024,
-              (to->tx_bytes + delta) / 1024);
-
-    /* Delete element from from->hashes.
-     *
-     * We don't bother to add the element to to->hashes because not only would
-     * it require more work, the only purpose it would serve is to allow that
-     * hash to be migrated to another slave in this rebalancing run, and there
-     * is no point in doing that. */
-    if (hash_idx != 0) {
-        memmove(&from->hashes[hash_idx], &from->hashes[hash_idx + 1],
-                (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes);
-    } else {
-        /* Dropping the first element only needs the base pointer bumped. */
-        from->hashes++;
-    }
-    from->n_hashes--;
-
-    /* Shift load away from 'from' to 'to'. */
-    from->tx_bytes -= delta;
-    to->tx_bytes += delta;
-
-    /* Arrange for flows to be revalidated. */
-    ofproto_revalidate(port->bridge->ofproto, moved->iface_tag);
-    moved->iface_idx = to->iface->port_ifidx;
-    moved->iface_tag = tag_create_random();
-}
-
-static void
-bond_rebalance_port(struct port *port)
-{
- struct slave_balance bals[DP_MAX_PORTS];
- size_t n_bals;
- struct bond_entry *hashes[BOND_MASK + 1];
- struct slave_balance *b, *from, *to;
- struct bond_entry *e;
- size_t i;
-
- /* Sets up 'bals' to describe each of the port's interfaces, sorted in
- * descending order of tx_bytes, so that bals[0] represents the most
- * heavily loaded slave and bals[n_bals - 1] represents the least heavily
- * loaded slave.
- *
- * The code is a bit tricky: to avoid dynamically allocating a 'hashes'
- * array for each slave_balance structure, we sort our local array of
- * hashes in order by slave, so that all of the hashes for a given slave
- * become contiguous in memory, and then we point each 'hashes' members of
- * a slave_balance structure to the start of a contiguous group. */
- n_bals = port->n_ifaces;
- for (b = bals; b < &bals[n_bals]; b++) {
- b->iface = port->ifaces[b - bals];
- b->tx_bytes = 0;
- b->hashes = NULL;
- b->n_hashes = 0;
- }
- for (i = 0; i <= BOND_MASK; i++) {
- hashes[i] = &port->bond_hash[i];
- }
- qsort(hashes, BOND_MASK + 1, sizeof *hashes, compare_bond_entries);
- for (i = 0; i <= BOND_MASK; i++) {
- e = hashes[i];
- if (e->iface_idx >= 0 && e->iface_idx < port->n_ifaces) {
- b = &bals[e->iface_idx];
- b->tx_bytes += e->tx_bytes;
- if (!b->hashes) {
- b->hashes = &hashes[i];