X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=vswitchd%2Fbridge.c;h=fb6a1413fc57099aa8a9eef630f45154b0c8ff04;hb=f1bd68ab587edcbf6d6702c1843eba286db190ab;hp=ec8a1997e44419a8f18783251e0971bb32f54e54;hpb=ba09980aaff81d5ba31aa221179740bc04680787;p=sliver-openvswitch.git diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index ec8a1997e..fb6a1413f 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -48,6 +48,7 @@ #include "port-array.h" #include "proc-net-compat.h" #include "process.h" +#include "secchan/netflow.h" #include "secchan/ofproto.h" #include "socket-util.h" #include "stp.h" @@ -146,7 +147,7 @@ struct port { struct bridge { struct list node; /* Node in global list of bridges. */ char *name; /* User-specified arbitrary name. */ - struct mac_learning *ml; /* MAC learning table, or null not to learn. */ + struct mac_learning *ml; /* MAC learning table. */ bool sent_config_request; /* Successfully sent config request? */ uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */ @@ -215,6 +216,7 @@ static void bond_run(struct bridge *); static void bond_wait(struct bridge *); static void bond_rebalance_port(struct port *); static void bond_send_learning_packets(struct port *); +static void bond_enable_slave(struct iface *iface, bool enable); static void port_create(struct bridge *, const char *name); static void port_reconfigure(struct port *); @@ -242,6 +244,8 @@ static void iface_destroy(struct iface *); static struct iface *iface_lookup(const struct bridge *, const char *name); static struct iface *iface_from_dp_ifidx(const struct bridge *, uint16_t dp_ifidx); +static bool iface_is_internal(const struct bridge *, const char *name); +static void iface_set_mac(struct iface *); /* Hooks into ofproto processing. 
*/ static struct ofhooks bridge_ofhooks; @@ -461,9 +465,13 @@ bridge_reconfigure(void) for (i = 0; i < add_ifaces.n; i++) { const char *if_name = add_ifaces.names[i]; for (;;) { - int internal = cfg_get_bool(0, "iface.%s.internal", if_name); - int error = dpif_port_add(&br->dpif, if_name, next_port_no++, - internal ? ODP_PORT_INTERNAL : 0); + bool internal; + int error; + + /* Add to datapath. */ + internal = iface_is_internal(br, if_name); + error = dpif_port_add(&br->dpif, if_name, next_port_no++, + internal ? ODP_PORT_INTERNAL : 0); if (error != EEXIST) { if (next_port_no >= 256) { VLOG_ERR("ran out of valid port numbers on dp%u", @@ -488,10 +496,7 @@ bridge_reconfigure(void) uint64_t dpid; struct iface *local_iface = NULL; const char *devname; - uint8_t engine_type = br->dpif.minor; - uint8_t engine_id = br->dpif.minor; - bool add_id_to_iface = false; - struct svec nf_hosts; + struct netflow_options nf_options; bridge_fetch_dp_ifaces(br); for (i = 0; i < br->n_ports; ) { @@ -536,34 +541,46 @@ bridge_reconfigure(void) ofproto_set_datapath_id(br->ofproto, dpid); /* Set NetFlow configuration on this bridge. 
*/ + memset(&nf_options, 0, sizeof nf_options); + nf_options.engine_type = br->dpif.minor; + nf_options.engine_id = br->dpif.minor; + nf_options.active_timeout = -1; + if (cfg_has("netflow.%s.engine-type", br->name)) { - engine_type = cfg_get_int(0, "netflow.%s.engine-type", + nf_options.engine_type = cfg_get_int(0, "netflow.%s.engine-type", br->name); } if (cfg_has("netflow.%s.engine-id", br->name)) { - engine_id = cfg_get_int(0, "netflow.%s.engine-id", br->name); + nf_options.engine_id = cfg_get_int(0, "netflow.%s.engine-id", + br->name); + } + if (cfg_has("netflow.%s.active-timeout", br->name)) { + nf_options.active_timeout = cfg_get_int(0, + "netflow.%s.active-timeout", + br->name); } if (cfg_has("netflow.%s.add-id-to-iface", br->name)) { - add_id_to_iface = cfg_get_bool(0, "netflow.%s.add-id-to-iface", - br->name); + nf_options.add_id_to_iface = cfg_get_bool(0, + "netflow.%s.add-id-to-iface", + br->name); } - if (add_id_to_iface && engine_id > 0x7f) { + if (nf_options.add_id_to_iface && nf_options.engine_id > 0x7f) { VLOG_WARN("bridge %s: netflow port mangling may conflict with " "another vswitch, choose an engine id less than 128", br->name); } - if (add_id_to_iface && br->n_ports > 0x1ff) { + if (nf_options.add_id_to_iface && br->n_ports > 508) { VLOG_WARN("bridge %s: netflow port mangling will conflict with " - "another port when 512 or more ports are used", + "another port when more than 508 ports are used", br->name); } - svec_init(&nf_hosts); - cfg_get_all_keys(&nf_hosts, "netflow.%s.host", br->name); - if (ofproto_set_netflow(br->ofproto, &nf_hosts, engine_type, - engine_id, add_id_to_iface)) { + svec_init(&nf_options.collectors); + cfg_get_all_keys(&nf_options.collectors, "netflow.%s.host", br->name); + if (ofproto_set_netflow(br->ofproto, &nf_options)) { VLOG_ERR("bridge %s: problem setting netflow collectors", br->name); } + svec_destroy(&nf_options.collectors); /* Update the controller and related settings. 
It would be more * straightforward to call this from bridge_reconfigure_one(), but we @@ -579,7 +596,16 @@ bridge_reconfigure(void) LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; + port_update_vlan_compat(port); + + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx != ODPP_LOCAL + && iface_is_internal(br, iface->name)) { + iface_set_mac(iface); + } + } } } LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { @@ -820,9 +846,7 @@ bridge_wait(void) continue; } - if (br->ml) { - mac_learning_wait(br->ml); - } + mac_learning_wait(br->ml); bond_wait(br); brstp_wait(br); } @@ -835,9 +859,7 @@ bridge_flush(struct bridge *br) { COVERAGE_INC(bridge_flush); br->flush = true; - if (br->ml) { - mac_learning_flush(br->ml); - } + mac_learning_flush(br->ml); } /* Bridge unixctl user interface functions. */ @@ -846,6 +868,7 @@ bridge_unixctl_fdb_show(struct unixctl_conn *conn, const char *args) { struct ds ds = DS_EMPTY_INITIALIZER; const struct bridge *br; + const struct mac_entry *e; br = bridge_lookup(args); if (!br) { @@ -854,16 +877,13 @@ bridge_unixctl_fdb_show(struct unixctl_conn *conn, const char *args) } ds_put_cstr(&ds, " port VLAN MAC Age\n"); - if (br->ml) { - const struct mac_entry *e; - LIST_FOR_EACH (e, struct mac_entry, lru_node, &br->ml->lrus) { - if (e->port < 0 || e->port >= br->n_ports) { - continue; - } - ds_put_format(&ds, "%5d %4d "ETH_ADDR_FMT" %3d\n", - br->ports[e->port]->ifaces[0]->dp_ifidx, - e->vlan, ETH_ADDR_ARGS(e->mac), mac_entry_age(e)); + LIST_FOR_EACH (e, struct mac_entry, lru_node, &br->ml->lrus) { + if (e->port < 0 || e->port >= br->n_ports) { + continue; } + ds_put_format(&ds, "%5d %4d "ETH_ADDR_FMT" %3d\n", + br->ports[e->port]->ifaces[0]->dp_ifidx, + e->vlan, ETH_ADDR_ARGS(e->mac), mac_entry_age(e)); } unixctl_command_reply(conn, 200, ds_cstr(&ds)); ds_destroy(&ds); @@ -1005,9 +1025,7 @@ bridge_run_one(struct 
bridge *br) return error; } - if (br->ml) { - mac_learning_run(br->ml, ofproto_get_revalidate_set(br->ofproto)); - } + mac_learning_run(br->ml, ofproto_get_revalidate_set(br->ofproto)); bond_run(br); brstp_run(br); @@ -1321,9 +1339,12 @@ bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces) struct iface *iface = port->ifaces[j]; svec_add(ifaces, iface->name); } + if (port->n_ifaces > 1 + && cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) { + svec_add(ifaces, port->name); + } } - svec_sort(ifaces); - assert(svec_is_unique(ifaces)); + svec_sort_unique(ifaces); } /* For robustness, in case the administrator moves around datapath ports behind @@ -1388,13 +1409,31 @@ lookup_bond_entry(const struct port *port, const uint8_t mac[ETH_ADDR_LEN]) static int bond_choose_iface(const struct port *port) { - size_t i; + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + size_t i, best_down_slave = -1; + long long next_delay_expiration = LLONG_MAX; + for (i = 0; i < port->n_ifaces; i++) { - if (port->ifaces[i]->enabled) { + struct iface *iface = port->ifaces[i]; + + if (iface->enabled) { return i; + } else if (iface->delay_expires < next_delay_expiration) { + best_down_slave = i; + next_delay_expiration = iface->delay_expires; } } - return -1; + + if (best_down_slave != -1) { + struct iface *iface = port->ifaces[best_down_slave]; + + VLOG_INFO_RL(&rl, "interface %s: skipping remaining %lli ms updelay " + "since no other interface is up", iface->name, + iface->delay_expires - time_msec()); + bond_enable_slave(iface, true); + } + + return best_down_slave; } static bool @@ -1444,10 +1483,12 @@ bond_link_status_update(struct iface *iface, bool carrier) iface->delay_expires = LLONG_MAX; VLOG_INFO_RL(&rl, "interface %s: will not be %s", iface->name, carrier ? 
"disabled" : "enabled"); - } else if (carrier && port->updelay && port->active_iface < 0) { - iface->delay_expires = time_msec(); - VLOG_INFO_RL(&rl, "interface %s: skipping %d ms updelay since no " - "other interface is up", iface->name, port->updelay); + } else if (carrier && port->active_iface < 0) { + bond_enable_slave(iface, true); + if (port->updelay) { + VLOG_INFO_RL(&rl, "interface %s: skipping %d ms updelay since no " + "other interface is up", iface->name, port->updelay); + } } else { int delay = carrier ? port->updelay : port->downdelay; iface->delay_expires = time_msec() + delay; @@ -1484,6 +1525,12 @@ bond_enable_slave(struct iface *iface, bool enable) struct port *port = iface->port; struct bridge *br = port->bridge; + /* This acts as a recursion check. If the act of disabling a slave + * causes a different slave to be enabled, the flag will allow us to + * skip redundant work when we reenter this function. It must be + * cleared on exit to keep things safe with multiple bonds. */ + static bool moving_active_iface = false; + iface->delay_expires = LLONG_MAX; if (enable == iface->enabled) { return; @@ -1496,18 +1543,29 @@ bond_enable_slave(struct iface *iface, bool enable) if (iface->port_ifidx == port->active_iface) { ofproto_revalidate(br->ofproto, port->active_iface_tag); + + /* Disabling a slave can lead to another slave being immediately + * enabled if there will be no active slaves but one is waiting + * on an updelay. In this case we do not need to run most of the + * code for the newly enabled slave since there was no period + * without an active slave and it is redundant with the disabling + * path. 
*/ + moving_active_iface = true; bond_choose_active_iface(port); } bond_send_learning_packets(port); } else { VLOG_WARN("interface %s: enabled", iface->name); - if (port->active_iface < 0) { + if (port->active_iface < 0 && !moving_active_iface) { ofproto_revalidate(br->ofproto, port->no_ifaces_tag); bond_choose_active_iface(port); bond_send_learning_packets(port); } iface->tag = tag_create_random(); } + + moving_active_iface = false; + port->bond_compat_is_stale = true; } static void @@ -1518,20 +1576,19 @@ bond_run(struct bridge *br) for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; + if (port->n_ifaces >= 2) { + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (time_msec() >= iface->delay_expires) { + bond_enable_slave(iface, !iface->enabled); + } + } + } + if (port->bond_compat_is_stale) { port->bond_compat_is_stale = false; port_update_bond_compat(port); } - - if (port->n_ifaces < 2) { - continue; - } - for (j = 0; j < port->n_ifaces; j++) { - struct iface *iface = port->ifaces[j]; - if (time_msec() >= iface->delay_expires) { - bond_enable_slave(iface, !iface->enabled); - } - } } } @@ -1657,7 +1714,7 @@ port_includes_vlan(const struct port *port, uint16_t vlan) static size_t compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, const struct port *in_port, const struct port *out_port, - struct dst dsts[], tag_type *tags) + struct dst dsts[], tag_type *tags, uint16_t *nf_output_iface) { mirror_mask_t mirrors = in_port->src_mirrors; struct dst *dst = dsts; @@ -1676,7 +1733,9 @@ compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, dst++; } } + *nf_output_iface = NF_OUT_FLOOD; } else if (out_port && set_dst(dst, flow, in_port, out_port, tags)) { + *nf_output_iface = dst->dp_ifidx; mirrors |= out_port->dst_mirrors; dst++; } @@ -1693,14 +1752,28 @@ compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, for (i = 0; i < br->n_ports; i++) { struct port *port = 
br->ports[i]; if (port_includes_vlan(port, m->out_vlan) - && set_dst(dst, flow, in_port, port, tags) - && !dst_is_duplicate(dsts, dst - dsts, dst)) + && set_dst(dst, flow, in_port, port, tags)) { + int flow_vlan; + if (port->vlan < 0) { dst->vlan = m->out_vlan; } - if (dst->dp_ifidx == flow->in_port - && dst->vlan == vlan) { + if (dst_is_duplicate(dsts, dst - dsts, dst)) { + continue; + } + + /* Use the vlan tag on the original flow instead of + * the one passed in the vlan parameter. This ensures + * that we compare the vlan from before any implicit + * tagging takes place. This is necessary because + * dst->vlan is the final vlan, after removing implicit + * tags. */ + flow_vlan = ntohs(flow->dl_vlan); + if (flow_vlan == 0) { + flow_vlan = OFP_VLAN_NONE; + } + if (port == in_port && dst->vlan == flow_vlan) { /* Don't send out input port on same VLAN. */ continue; } @@ -1730,14 +1803,16 @@ print_dsts(const struct dst *dsts, size_t n) static void compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan, const struct port *in_port, const struct port *out_port, - tag_type *tags, struct odp_actions *actions) + tag_type *tags, struct odp_actions *actions, + uint16_t *nf_output_iface) { struct dst dsts[DP_MAX_PORTS * (MAX_MIRRORS + 1)]; size_t n_dsts; const struct dst *p; uint16_t cur_vlan; - n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags); + n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags, + nf_output_iface); cur_vlan = ntohs(flow->dl_vlan); for (p = dsts; p < &dsts[n_dsts]; p++) { @@ -1756,14 +1831,77 @@ compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan, } } +/* Returns the effective vlan of a packet, taking into account both the + * 802.1Q header and implicitly tagged ports. A value of 0 indicates that + * the packet is untagged and -1 indicates it has an invalid header and + * should be dropped. 
*/ +static int flow_get_vlan(struct bridge *br, const flow_t *flow, + struct port *in_port, bool have_packet) +{ + /* Note that dl_vlan of 0 and of OFP_VLAN_NONE both mean that the packet + * belongs to VLAN 0, so we should treat both cases identically. (In the + * former case, the packet has an 802.1Q header that specifies VLAN 0, + * presumably to allow a priority to be specified. In the latter case, the + * packet does not have any 802.1Q header.) */ + int vlan = ntohs(flow->dl_vlan); + if (vlan == OFP_VLAN_NONE) { + vlan = 0; + } + if (in_port->vlan >= 0) { + if (vlan) { + /* XXX support double tagging? */ + if (have_packet) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %"PRIu16" tagged " + "packet received on port %s configured with " + "implicit VLAN %"PRIu16, + br->name, ntohs(flow->dl_vlan), + in_port->name, in_port->vlan); + } + return -1; + } + vlan = in_port->vlan; + } else { + if (!port_includes_vlan(in_port, vlan)) { + if (have_packet) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged " + "packet received on port %s not configured for " + "trunking VLAN %d", + br->name, vlan, in_port->name, vlan); + } + return -1; + } + } + + return vlan; +} + +static void +update_learning_table(struct bridge *br, const flow_t *flow, int vlan, + struct port *in_port) +{ + tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src, + vlan, in_port->port_idx); + if (rev_tag) { + /* The log messages here could actually be useful in debugging, + * so keep the rate limit relatively high. 
*/ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, + 300); + VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is " + "on port %s in VLAN %d", + br->name, ETH_ADDR_ARGS(flow->dl_src), + in_port->name, vlan); + ofproto_revalidate(br->ofproto, rev_tag); + } +} + static bool -is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet) +is_bcast_arp_reply(const flow_t *flow) { - struct arp_eth_header *arp = (struct arp_eth_header *) packet->data; return (flow->dl_type == htons(ETH_TYPE_ARP) - && eth_addr_is_broadcast(flow->dl_dst) - && packet->size >= sizeof(struct arp_eth_header) - && arp->ar_op == ARP_OP_REQUEST); + && flow->nw_proto == ARP_OP_REPLY + && eth_addr_is_broadcast(flow->dl_dst)); } /* If the composed actions may be applied to any packet in the given 'flow', @@ -1772,12 +1910,13 @@ is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet) static bool process_flow(struct bridge *br, const flow_t *flow, const struct ofpbuf *packet, struct odp_actions *actions, - tag_type *tags) + tag_type *tags, uint16_t *nf_output_iface) { struct iface *in_iface; struct port *in_port; struct port *out_port = NULL; /* By default, drop the packet/flow. */ int vlan; + int out_port_idx; /* Find the interface and port structure for the received packet. */ in_iface = iface_from_dp_ifidx(br, flow->in_port); @@ -1805,41 +1944,9 @@ process_flow(struct bridge *br, const flow_t *flow, return true; } in_port = in_iface->port; - - /* Figure out what VLAN this packet belongs to. - * - * Note that dl_vlan of 0 and of OFP_VLAN_NONE both mean that the packet - * belongs to VLAN 0, so we should treat both cases identically. (In the - * former case, the packet has an 802.1Q header that specifies VLAN 0, - * presumably to allow a priority to be specified. In the latter case, the - * packet does not have any 802.1Q header.) 
*/ - vlan = ntohs(flow->dl_vlan); - if (vlan == OFP_VLAN_NONE) { - vlan = 0; - } - if (in_port->vlan >= 0) { - if (vlan) { - /* XXX support double tagging? */ - if (packet != NULL) { - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); - VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %"PRIu16" tagged " - "packet received on port %s configured with " - "implicit VLAN %"PRIu16, - br->name, ntohs(flow->dl_vlan), - in_port->name, in_port->vlan); - } - goto done; - } - vlan = in_port->vlan; - } else { - if (!port_includes_vlan(in_port, vlan)) { - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); - VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged " - "packet received on port %s not configured for " - "trunking VLAN %d", - br->name, vlan, in_port->name, vlan); - goto done; - } + vlan = flow_get_vlan(br, flow, in_port, !!packet); + if (vlan < 0) { + goto done; } /* Drop frames for ports that STP wants entirely killed (both for @@ -1863,63 +1970,48 @@ process_flow(struct bridge *br, const flow_t *flow, goto done; } - /* Multicast (and broadcast) packets on bonds need special attention, to - * avoid receiving duplicates. */ - if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) { - *tags |= in_port->active_iface_tag; - if (in_port->active_iface != in_iface->port_ifidx) { - /* Drop all multicast packets on inactive slaves. */ - goto done; - } else { - /* Drop all multicast packets for which we have learned a different - * input port, because we probably sent the packet on one slaves - * and got it back on the active slave. Broadcast ARP replies are - * an exception to this rule: the host has moved to another - * switch. 
*/ - int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); - if (src_idx != -1 && src_idx != in_port->port_idx) { - if (packet) { - if (!is_bcast_arp_reply(flow, packet)) { - goto done; - } - } else { - /* No way to know whether it's an ARP reply, because the - * flow entry doesn't include enough information and we - * don't have a packet. Punt. */ - return false; - } + /* Packets received on bonds need special attention to avoid duplicates. */ + if (in_port->n_ifaces > 1) { + int src_idx; + + if (eth_addr_is_multicast(flow->dl_dst)) { + *tags |= in_port->active_iface_tag; + if (in_port->active_iface != in_iface->port_ifidx) { + /* Drop all multicast packets on inactive slaves. */ + goto done; } } + + /* Drop all packets for which we have learned a different input + * port, because we probably sent the packet on one slave and got + * it back on the other. Broadcast ARP replies are an exception + * to this rule: the host has moved to another switch. */ + src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); + if (src_idx != -1 && src_idx != in_port->port_idx && + !is_bcast_arp_reply(flow)) { + goto done; + } } /* MAC learning. */ out_port = FLOOD_PORT; - if (br->ml) { - int out_port_idx; - - /* Learn source MAC (but don't try to learn from revalidation). */ - if (packet) { - tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src, - vlan, in_port->port_idx); - if (rev_tag) { - /* The log messages here could actually be useful in debugging, - * so keep the rate limit relatively high. */ - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, - 300); - VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is " - "on port %s in VLAN %d", - br->name, ETH_ADDR_ARGS(flow->dl_src), - in_port->name, vlan); - ofproto_revalidate(br->ofproto, rev_tag); - } - } - - /* Determine output port. 
*/ - out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan, - tags); - if (out_port_idx >= 0 && out_port_idx < br->n_ports) { - out_port = br->ports[out_port_idx]; - } + /* Learn source MAC (but don't try to learn from revalidation). */ + if (packet) { + update_learning_table(br, flow, vlan, in_port); + } + + /* Determine output port. */ + out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan, + tags); + if (out_port_idx >= 0 && out_port_idx < br->n_ports) { + out_port = br->ports[out_port_idx]; + } else if (!packet && !eth_addr_is_multicast(flow->dl_dst)) { + /* If we are revalidating but don't have a learning entry then + * eject the flow. Installing a flow that floods packets opens + * up a window of time where we could learn from a packet reflected + * on a bond and blackhole packets before the learning table is + * updated to reflect the correct port. */ + return false; } /* Don't send packets out their input ports. Don't forward frames that STP @@ -1929,17 +2021,10 @@ process_flow(struct bridge *br, const flow_t *flow, } done: - compose_actions(br, flow, vlan, in_port, out_port, tags, actions); + compose_actions(br, flow, vlan, in_port, out_port, tags, actions, + nf_output_iface); - /* - * We send out only a single packet, instead of setting up a flow, if the - * packet is an ARP directed to broadcast that arrived on a bonded - * interface. In such a situation ARP requests and replies must be handled - * differently, but OpenFlow unfortunately can't distinguish them. 
- */ - return (in_port->n_ifaces < 2 - || flow->dl_type != htons(ETH_TYPE_ARP) - || !eth_addr_is_broadcast(flow->dl_dst)); + return true; } /* Careful: 'opp' is in host byte order and opp->port_no is an OFP port @@ -1982,7 +2067,8 @@ bridge_port_changed_ofhook_cb(enum ofp_port_reason reason, static bool bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, - struct odp_actions *actions, tag_type *tags, void *br_) + struct odp_actions *actions, tag_type *tags, + uint16_t *nf_output_iface, void *br_) { struct bridge *br = br_; @@ -1995,7 +2081,7 @@ bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, #endif COVERAGE_INC(bridge_process_flow); - return process_flow(br, flow, packet, actions, tags); + return process_flow(br, flow, packet, actions, tags, nf_output_iface); } static void @@ -2005,17 +2091,30 @@ bridge_account_flow_ofhook_cb(const flow_t *flow, void *br_) { struct bridge *br = br_; + struct port *in_port; const union odp_action *a; + /* Feed information from the active flows back into the learning table + * to ensure that table is always in sync with what is actually flowing + * through the datapath. 
*/ + in_port = port_from_dp_ifidx(br, flow->in_port); + if (in_port) { + int vlan = flow_get_vlan(br, flow, in_port, false); + if (vlan >= 0) { + update_learning_table(br, flow, vlan, in_port); + } + } + if (!br->has_bonded_ports) { return; } for (a = actions; a < &actions[n_actions]; a++) { if (a->type == ODPAT_OUTPUT) { - struct port *port = port_from_dp_ifidx(br, a->output.port); - if (port && port->n_ifaces >= 2) { - struct bond_entry *e = lookup_bond_entry(port, flow->dl_src); + struct port *out_port = port_from_dp_ifidx(br, a->output.port); + if (out_port && out_port->n_ifaces >= 2) { + struct bond_entry *e = lookup_bond_entry(out_port, + flow->dl_src); e->tx_bytes += n_bytes; } } @@ -2170,8 +2269,9 @@ log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port) /* Shifts 'hash' from 'from' to 'to' within 'port'. */ static void bond_shift_load(struct slave_balance *from, struct slave_balance *to, - struct bond_entry *hash) + int hash_idx) { + struct bond_entry *hash = from->hashes[hash_idx]; struct port *port = from->iface->port; uint64_t delta = hash->tx_bytes; @@ -2189,12 +2289,11 @@ bond_shift_load(struct slave_balance *from, struct slave_balance *to, * it require more work, the only purpose it would be to allow that hash to * be migrated to another slave in this rebalancing run, and there is no * point in doing that. */ - if (from->hashes[0] == hash) { + if (hash_idx == 0) { from->hashes++; } else { - int i = hash - from->hashes[0]; - memmove(from->hashes + i, from->hashes + i + 1, - (from->n_hashes - (i + 1)) * sizeof *from->hashes); + memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1, + (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes); } from->n_hashes--; @@ -2279,22 +2378,60 @@ bond_rebalance_port(struct port *port) /* 'from' is carrying significantly more load than 'to', and that * load is split across at least two different hashes. 
Pick a hash * to migrate to 'to' (the least-loaded slave), given that doing so - * must not cause 'to''s load to exceed 'from''s load. + * must decrease the ratio of the load on the two slaves by at + * least 0.1. * * The sort order we use means that we prefer to shift away the * smallest hashes instead of the biggest ones. There is little * reason behind this decision; we could use the opposite sort * order to shift away big hashes ahead of small ones. */ size_t i; + bool order_swapped; for (i = 0; i < from->n_hashes; i++) { + double old_ratio, new_ratio; uint64_t delta = from->hashes[i]->tx_bytes; - if (to->tx_bytes + delta < from->tx_bytes - delta) { + + if (delta == 0 || from->tx_bytes - delta == 0) { + /* Pointless move. */ + continue; + } + + order_swapped = from->tx_bytes - delta < to->tx_bytes + delta; + + if (to->tx_bytes == 0) { + /* Nothing on the new slave, move it. */ + break; + } + + old_ratio = (double)from->tx_bytes / to->tx_bytes; + new_ratio = (double)(from->tx_bytes - delta) / + (to->tx_bytes + delta); + + if (new_ratio == 0) { + /* Should already be covered but check to prevent division + * by zero. */ + continue; + } + + if (new_ratio < 1) { + new_ratio = 1 / new_ratio; + } + + if (old_ratio - new_ratio > 0.1) { + /* Would decrease the ratio, move it. */ break; } } if (i < from->n_hashes) { - bond_shift_load(from, to, from->hashes[i]); + bond_shift_load(from, to, i); + port->bond_compat_is_stale = true; + + /* If the result of the migration changed the relative order of + * 'from' and 'to' swap them back to maintain invariants. */ + if (order_swapped) { + swap_bals(from, to); + } /* Re-sort 'bals'. Note that this may make 'from' and 'to' * point to different slave_balance structures. 
It is only @@ -2305,7 +2442,6 @@ bond_rebalance_port(struct port *port) } else { from++; } - port->bond_compat_is_stale = true; } } @@ -2324,17 +2460,14 @@ bond_send_learning_packets(struct port *port) struct ofpbuf packet; int error, n_packets, n_errors; - if (!port->n_ifaces || port->active_iface < 0 || !br->ml) { + if (!port->n_ifaces || port->active_iface < 0) { return; } ofpbuf_init(&packet, 128); error = n_packets = n_errors = 0; LIST_FOR_EACH (e, struct mac_entry, lru_node, &br->ml->lrus) { - static const char s[] = "Open vSwitch Bond Failover"; union ofp_action actions[2], *a; - struct eth_header *eth; - struct llc_snap_header *llc_snap; uint16_t dp_ifidx; tag_type tags = 0; flow_t flow; @@ -2345,23 +2478,6 @@ bond_send_learning_packets(struct port *port) continue; } - /* Compose packet to send. */ - ofpbuf_clear(&packet); - eth = ofpbuf_put_zeros(&packet, ETH_HEADER_LEN); - llc_snap = ofpbuf_put_zeros(&packet, LLC_SNAP_HEADER_LEN); - ofpbuf_put(&packet, s, sizeof s); /* Includes null byte. */ - ofpbuf_put(&packet, e->mac, ETH_ADDR_LEN); - - memcpy(eth->eth_dst, eth_addr_broadcast, ETH_ADDR_LEN); - memcpy(eth->eth_src, e->mac, ETH_ADDR_LEN); - eth->eth_type = htons(packet.size - ETH_HEADER_LEN); - - llc_snap->llc.llc_dsap = LLC_DSAP_SNAP; - llc_snap->llc.llc_ssap = LLC_SSAP_SNAP; - llc_snap->llc.llc_cntl = LLC_CNTL_SNAP; - memcpy(llc_snap->snap.snap_org, "\x00\x23\x20", 3); - llc_snap->snap.snap_type = htons(0xf177); /* Random number. */ - /* Compose actions. */ memset(actions, 0, sizeof actions); a = actions; @@ -2378,6 +2494,8 @@ bond_send_learning_packets(struct port *port) /* Send packet. 
*/ n_packets++; + compose_benign_packet(&packet, "Open vSwitch Bond Failover", 0xf177, + e->mac); flow_extract(&packet, ODPP_NONE, &flow); retval = ofproto_send_packet(br->ofproto, &flow, actions, a - actions, &packet); @@ -2493,14 +2611,10 @@ bond_unixctl_show(struct unixctl_conn *conn, const char *args) continue; } - ds_put_format(&ds, "\thash %d: %lld kB load\n", + ds_put_format(&ds, "\thash %d: %"PRIu64" kB load\n", hash, be->tx_bytes / 1024); /* MACs. */ - if (!port->bridge->ml) { - break; - } - LIST_FOR_EACH (me, struct mac_entry, lru_node, &port->bridge->ml->lrus) { uint16_t dp_ifidx; @@ -2958,14 +3072,40 @@ port_update_bond_compat(struct port *port) struct iface *iface = port->ifaces[i]; struct compat_bond_slave *slave = &bond.slaves[i]; slave->name = iface->name; - slave->up = ((iface->enabled && iface->delay_expires == LLONG_MAX) || - (!iface->enabled && iface->delay_expires != LLONG_MAX)); + + /* We need to make the same determination as the Linux bonding + * code to determine whether a slave should be considered "up". + * The Linux function bond_miimon_inspect() supports four + * BOND_LINK_* states: + * + * - BOND_LINK_UP: carrier detected, updelay has passed. + * - BOND_LINK_FAIL: carrier lost, downdelay in progress. + * - BOND_LINK_DOWN: carrier lost, downdelay has passed. + * - BOND_LINK_BACK: carrier detected, updelay in progress. + * + * The function bond_info_show_slave() only considers BOND_LINK_UP + * to be "up" and anything else to be "down". 
+ */ slave->up = iface->enabled && iface->delay_expires == LLONG_MAX; if (slave->up) { bond.up = true; } memcpy(slave->mac, iface->mac, ETH_ADDR_LEN); } + if (cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) { + struct netdev *bond_netdev; + + if (!netdev_open(port->name, NETDEV_ETH_TYPE_NONE, &bond_netdev)) { + if (bond.up) { + netdev_turn_flags_on(bond_netdev, NETDEV_UP, true); + } else { + netdev_turn_flags_off(bond_netdev, NETDEV_UP, true); + } + netdev_close(bond_netdev); + } + } + proc_net_compat_update_bond(port->name, &bond); free(bond.slaves); } @@ -3100,6 +3240,60 @@ iface_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx) { return port_array_get(&br->ifaces, dp_ifidx); } + +/* Returns true if 'iface' is the name of an "internal" interface on bridge + * 'br', that is, an interface that is entirely simulated within the datapath. + * The local port (ODPP_LOCAL) is always an internal interface. Other local + * interfaces are created by setting "iface.<iface>.internal = true". + * + * In addition, we have a kluge-y feature that creates an internal port with + * the name of a bonded port if "bonding.<port>.fake-iface = true" is set. + * This feature needs to go away in the long term. Until then, this is one + * reason why this function takes a name instead of a struct iface: the fake + * interfaces created this way do not have a struct iface. */ +static bool +iface_is_internal(const struct bridge *br, const char *iface) +{ + if (!strcmp(iface, br->name) + || cfg_get_bool(0, "iface.%s.internal", iface)) { + return true; + } + + if (cfg_get_bool(0, "bonding.%s.fake-iface", iface)) { + struct port *port = port_lookup(br, iface); + if (port && port->n_ifaces > 1) { + return true; + } + } + + return false; +} + +/* Set Ethernet address of 'iface', if one is specified in the configuration + * file. 
*/ +static void +iface_set_mac(struct iface *iface) +{ + uint64_t mac = cfg_get_mac(0, "iface.%s.mac", iface->name); + if (mac) { + static uint8_t ea[ETH_ADDR_LEN]; + + eth_addr_from_uint64(mac, ea); + if (eth_addr_is_multicast(ea)) { + VLOG_ERR("interface %s: cannot set MAC to multicast address", + iface->name); + } else if (iface->dp_ifidx == ODPP_LOCAL) { + VLOG_ERR("ignoring iface.%s.mac; use bridge.%s.mac instead", + iface->name, iface->name); + } else { + int error = netdev_nodev_set_etheraddr(iface->name, ea); + if (error) { + VLOG_ERR("interface %s: setting MAC failed (%s)", + iface->name, strerror(error)); + } + } + } +} /* Port mirroring. */ @@ -3107,7 +3301,8 @@ static void mirror_reconfigure(struct bridge *br) { struct svec old_mirrors, new_mirrors; - size_t i; + size_t i, n_rspan_vlans; + unsigned long *rspan_vlans; /* Collect old and new mirrors. */ svec_init(&old_mirrors); @@ -3156,6 +3351,29 @@ mirror_reconfigure(struct bridge *br) m->out_port->is_mirror_output_port = true; } } + + /* Update learning disabled vlans (for RSPAN). */ + rspan_vlans = NULL; + n_rspan_vlans = cfg_count("vlan.%s.disable-learning", br->name); + if (n_rspan_vlans) { + rspan_vlans = bitmap_allocate(4096); + + for (i = 0; i < n_rspan_vlans; i++) { + int vlan = cfg_get_vlan(i, "vlan.%s.disable-learning", br->name); + if (vlan >= 0) { + bitmap_set1(rspan_vlans, vlan); + VLOG_INFO("bridge %s: disabling learning on vlan %d\n", + br->name, vlan); + } else { + VLOG_ERR("bridge %s: invalid value '%s' for learning disabled " + "VLAN", br->name, + cfg_get_string(i, "vlan.%s.disable-learning", br->name)); + } + } + } + if (mac_learning_set_disabled_vlans(br->ml, rspan_vlans)) { + bridge_flush(br); + } } static void @@ -3300,6 +3518,7 @@ mirror_reconfigure_one(struct mirror *m) int *vlans; size_t i; bool mirror_all_ports; + bool any_ports_specified; /* Get output port. 
*/ out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port", @@ -3338,11 +3557,18 @@ mirror_reconfigure_one(struct mirror *m) cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx); cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx); cfg_get_all_keys(&ports, "%s.select.port", pfx); + any_ports_specified = src_ports.n || dst_ports.n || ports.n; svec_append(&src_ports, &ports); svec_append(&dst_ports, &ports); svec_destroy(&ports); prune_ports(m, &src_ports); prune_ports(m, &dst_ports); + if (any_ports_specified && !src_ports.n && !dst_ports.n) { + VLOG_ERR("%s: none of the specified ports exist; " + "disabling port mirror %s", pfx, pfx); + mirror_destroy(m); + goto exit; + } /* Get all the vlans, and drop duplicate and invalid vlans. */ svec_init(&vlan_strings); @@ -3394,6 +3620,7 @@ mirror_reconfigure_one(struct mirror *m) } /* Clean up. */ +exit: svec_destroy(&src_ports); svec_destroy(&dst_ports); free(pfx);