X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=vswitchd%2Fbridge.c;h=fda80f1a2c0a3ed93f7adedf85a872678afeaf28;hb=5422a9e189c627202a0eaa568a52d17e088d82fb;hp=10c6fee25e85ae9c28985791d7029b4b8a9b3a63;hpb=c93b1d6a4c7f96c5f75f7ec0972fe62e94d369dc;p=sliver-openvswitch.git diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 10c6fee25..fda80f1a2 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1,28 +1,16 @@ /* Copyright (c) 2008, 2009 Nicira Networks * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * In addition, as a special exception, Nicira Networks gives permission - * to link the code of its release of vswitchd with the OpenSSL project's - * "OpenSSL" library (or with modified versions of it that use the same - * license as the "OpenSSL" library), and distribute the linked - * executables. You must obey the GNU General Public License in all - * respects for all of the code used other than "OpenSSL". If you modify - * this file, you may extend this exception to your version of the file, - * but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. 
+ * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include @@ -39,6 +27,7 @@ #include #include #include +#include <sys/stat.h> #include #include "bitmap.h" #include "cfg.h" @@ -141,6 +130,7 @@ struct port { tag_type active_iface_tag; /* Tag for bcast flows. */ tag_type no_ifaces_tag; /* Tag for flows when all ifaces disabled. */ int updelay, downdelay; /* Delay before iface goes up/down, in ms. */ + bool bond_compat_is_stale; /* Need to call port_update_bond_compat()? */ /* Port mirroring info. */ mirror_mask_t src_mirrors; /* Mirrors triggered when packet received. */ @@ -203,6 +193,7 @@ enum { DP_MAX = 256 }; static struct bridge *bridge_create(const char *name); static void bridge_destroy(struct bridge *); static struct bridge *bridge_lookup(const char *name); +static void bridge_unixctl_dump_flows(struct unixctl_conn *, const char *); static int bridge_run_one(struct bridge *); static void bridge_reconfigure_one(struct bridge *); static void bridge_reconfigure_controller(struct bridge *); @@ -217,6 +208,8 @@ static uint64_t bridge_pick_datapath_id(struct bridge *, const char *devname); static uint64_t dpid_from_hash(const void *, size_t nbytes); +static void bridge_unixctl_fdb_show(struct unixctl_conn *, const char *args); + static void bond_init(void); static void bond_run(struct bridge *); static void bond_wait(struct bridge *); @@ -249,6 +242,8 @@ static void iface_destroy(struct iface *); static struct iface *iface_lookup(const struct bridge *, const char *name); static struct iface *iface_from_dp_ifidx(const struct bridge *, uint16_t dp_ifidx); +static bool iface_is_internal(const struct bridge *, const char *name); +static void 
iface_set_mac(struct iface *); /* Hooks into ofproto processing. */ static struct ofhooks bridge_ofhooks; @@ -291,6 +286,8 @@ bridge_init(void) bond_init(); + unixctl_command_register("fdb/show", bridge_unixctl_fdb_show); + for (i = 0; i < DP_MAX; i++) { struct dpif dpif; char devname[16]; @@ -310,6 +307,8 @@ bridge_init(void) } } + unixctl_command_register("bridge/dump-flows", bridge_unixctl_dump_flows); + bridge_reconfigure(); } @@ -335,6 +334,7 @@ bridge_configure_ssl(void) static char *private_key_file; static char *certificate_file; static char *cacert_file; + struct stat s; if (config_string_change("ssl.private-key", &private_key_file)) { vconn_ssl_set_private_key_file(private_key_file); @@ -344,7 +344,13 @@ bridge_configure_ssl(void) vconn_ssl_set_certificate_file(certificate_file); } - if (config_string_change("ssl.ca-cert", &cacert_file)) { + /* We assume that even if the filename hasn't changed, if the CA cert + * file has been removed, that we want to move back into + * boot-strapping mode. This opens a small security hole, because + * the old certificate will still be trusted until vSwitch is + * restarted. We may want to address this in vconn's SSL library. */ + if (config_string_change("ssl.ca-cert", &cacert_file) + || (stat(cacert_file, &s) && errno == ENOENT)) { vconn_ssl_set_ca_cert_file(cacert_file, cfg_get_bool(0, "ssl.bootstrap-ca-cert")); } @@ -457,9 +463,13 @@ bridge_reconfigure(void) for (i = 0; i < add_ifaces.n; i++) { const char *if_name = add_ifaces.names[i]; for (;;) { - int internal = cfg_get_bool(0, "iface.%s.internal", if_name); - int error = dpif_port_add(&br->dpif, if_name, next_port_no++, - internal ? ODP_PORT_INTERNAL : 0); + bool internal; + int error; + + /* Add to datapath. */ + internal = iface_is_internal(br, if_name); + error = dpif_port_add(&br->dpif, if_name, next_port_no++, + internal ? 
ODP_PORT_INTERNAL : 0); if (error != EEXIST) { if (next_port_no >= 256) { VLOG_ERR("ran out of valid port numbers on dp%u", @@ -560,6 +570,7 @@ bridge_reconfigure(void) VLOG_ERR("bridge %s: problem setting netflow collectors", br->name); } + svec_destroy(&nf_hosts); /* Update the controller and related settings. It would be more * straightforward to call this from bridge_reconfigure_one(), but we @@ -575,7 +586,16 @@ bridge_reconfigure(void) LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; + port_update_vlan_compat(port); + + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx != ODPP_LOCAL + && iface_is_internal(br, iface->name)) { + iface_set_mac(iface); + } + } } } LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { @@ -613,31 +633,75 @@ bridge_pick_local_hw_addr(struct bridge *br, uint8_t ea[ETH_ADDR_LEN], memset(ea, 0xff, sizeof ea); for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; + uint8_t iface_ea[ETH_ADDR_LEN]; + uint64_t iface_ea_u64; + struct iface *iface; + + /* Mirror output ports don't participate. */ if (port->is_mirror_output_port) { continue; } - for (j = 0; j < port->n_ifaces; j++) { - struct iface *iface = port->ifaces[j]; - uint8_t iface_ea[ETH_ADDR_LEN]; + + /* Choose the MAC address to represent the port. */ + iface_ea_u64 = cfg_get_mac(0, "port.%s.mac", port->name); + if (iface_ea_u64) { + /* User specified explicitly. */ + eth_addr_from_uint64(iface_ea_u64, iface_ea); + + /* Find the interface with this Ethernet address (if any) so that + * we can provide the correct devname to the caller. 
*/ + iface = NULL; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *candidate = port->ifaces[j]; + uint8_t candidate_ea[ETH_ADDR_LEN]; + if (!netdev_nodev_get_etheraddr(candidate->name, candidate_ea) + && eth_addr_equals(iface_ea, candidate_ea)) { + iface = candidate; + } + } + } else { + /* Choose the interface whose MAC address will represent the port. + * The Linux kernel bonding code always chooses the MAC address of + * the first slave added to a bond, and the Fedora networking + * scripts always add slaves to a bond in alphabetical order, so + * for compatibility we choose the interface with the name that is + * first in alphabetical order. */ + iface = port->ifaces[0]; + for (j = 1; j < port->n_ifaces; j++) { + struct iface *candidate = port->ifaces[j]; + if (strcmp(candidate->name, iface->name) < 0) { + iface = candidate; + } + } + + /* The local port doesn't count (since we're trying to choose its + * MAC address anyway). Other internal ports don't count because + * we really want a physical MAC if we can get it, and internal + * ports typically have randomly generated MACs. */ if (iface->dp_ifidx == ODPP_LOCAL || cfg_get_bool(0, "iface.%s.internal", iface->name)) { continue; } + + /* Grab MAC. */ error = netdev_nodev_get_etheraddr(iface->name, iface_ea); - if (!error) { - if (!eth_addr_is_multicast(iface_ea) && - !eth_addr_is_reserved(iface_ea) && - !eth_addr_is_zero(iface_ea) && - memcmp(iface_ea, ea, ETH_ADDR_LEN) < 0) { - memcpy(ea, iface_ea, ETH_ADDR_LEN); - *devname = iface->name; - } - } else { + if (error) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); VLOG_ERR_RL(&rl, "failed to obtain Ethernet address of %s: %s", iface->name, strerror(error)); + continue; } } + + /* Compare against our current choice. */ + if (!eth_addr_is_multicast(iface_ea) && + !eth_addr_is_reserved(iface_ea) && + !eth_addr_is_zero(iface_ea) && + memcmp(iface_ea, ea, ETH_ADDR_LEN) < 0) + { + memcpy(ea, iface_ea, ETH_ADDR_LEN); + *devname = iface ? 
iface->name : NULL; + } } if (eth_addr_is_multicast(ea) || eth_addr_is_vif(ea)) { memcpy(ea, br->default_ea, ETH_ADDR_LEN); @@ -732,10 +796,10 @@ bridge_pick_datapath_id(struct bridge *br, static uint64_t dpid_from_hash(const void *data, size_t n) { - uint8_t hash[SHA1HashSize]; + uint8_t hash[SHA1_DIGEST_SIZE]; BUILD_ASSERT_DECL(sizeof hash >= ETH_ADDR_LEN); - SHA1Bytes(data, n, hash); + sha1_bytes(data, n, hash); eth_addr_mark_random(hash); return eth_addr_to_uint64(hash); } @@ -792,6 +856,35 @@ bridge_flush(struct bridge *br) } } +/* Bridge unixctl user interface functions. */ +static void +bridge_unixctl_fdb_show(struct unixctl_conn *conn, const char *args) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + const struct bridge *br; + + br = bridge_lookup(args); + if (!br) { + unixctl_command_reply(conn, 501, "no such bridge"); + return; + } + + ds_put_cstr(&ds, " port VLAN MAC Age\n"); + if (br->ml) { + const struct mac_entry *e; + LIST_FOR_EACH (e, struct mac_entry, lru_node, &br->ml->lrus) { + if (e->port < 0 || e->port >= br->n_ports) { + continue; + } + ds_put_format(&ds, "%5d %4d "ETH_ADDR_FMT" %3d\n", + br->ports[e->port]->ifaces[0]->dp_ifidx, + e->vlan, ETH_ADDR_ARGS(e->mac), mac_entry_age(e)); + } + } + unixctl_command_reply(conn, 200, ds_cstr(&ds)); + ds_destroy(&ds); +} + /* Bridge reconfiguration functions. */ static struct bridge * @@ -897,6 +990,27 @@ bridge_get_datapathid(const char *name) return br ? ofproto_get_datapath_id(br->ofproto) : 0; } +/* Handle requests for a listing of all flows known by the OpenFlow + * stack, including those normally hidden. 
*/ +static void +bridge_unixctl_dump_flows(struct unixctl_conn *conn, const char *args) +{ + struct bridge *br; + struct ds results; + + br = bridge_lookup(args); + if (!br) { + unixctl_command_reply(conn, 501, "Unknown bridge"); + return; + } + + ds_init(&results); + ofproto_get_all_flows(br->ofproto, &results); + + unixctl_command_reply(conn, 200, ds_cstr(&results)); + ds_destroy(&results); +} + static int bridge_run_one(struct bridge *br) { @@ -1080,10 +1194,15 @@ bridge_reconfigure_controller(struct bridge *br) int rate_limit, burst_limit; if (!strcmp(controller, "discover")) { + bool update_resolv_conf = true; + + if (cfg_has("%s.update-resolv.conf", pfx)) { + update_resolv_conf = cfg_get_bool(0, "%s.update-resolv.conf", + pfx); + } ofproto_set_discovery(br->ofproto, true, cfg_get_string(0, "%s.accept-regex", pfx), - cfg_get_bool(0, "%s.update-resolv.conf", - pfx)); + update_resolv_conf); } else { struct netdev *netdev; bool in_band; @@ -1135,14 +1254,19 @@ bridge_reconfigure_controller(struct bridge *br) || !strcmp(fail_mode, "open"))); probe = cfg_get_int(0, "%s.inactivity-probe", pfx); - ofproto_set_probe_interval(br->ofproto, - probe ? 
probe : cfg_get_int(0, "mgmt.inactivity-probe")); + if (probe < 5) { + probe = cfg_get_int(0, "mgmt.inactivity-probe"); + if (probe < 5) { + probe = 5; + } + } + ofproto_set_probe_interval(br->ofproto, probe); max_backoff = cfg_get_int(0, "%s.max-backoff", pfx); if (!max_backoff) { max_backoff = cfg_get_int(0, "mgmt.max-backoff"); if (!max_backoff) { - max_backoff = 15; + max_backoff = 8; } } ofproto_set_max_backoff(br->ofproto, max_backoff); @@ -1213,9 +1337,12 @@ bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces) struct iface *iface = port->ifaces[j]; svec_add(ifaces, iface->name); } + if (port->n_ifaces > 1 + && cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) { + svec_add(ifaces, port->name); + } } - svec_sort(ifaces); - assert(svec_is_unique(ifaces)); + svec_sort_unique(ifaces); } /* For robustness, in case the administrator moves around datapath ports behind @@ -1310,6 +1437,7 @@ choose_output_iface(const struct port *port, const uint8_t *dl_src, return false; } e->iface_tag = tag_create_random(); + ((struct port *) port)->bond_compat_is_stale = true; } *tags |= e->iface_tag; iface = port->ifaces[e->iface_idx]; @@ -1335,6 +1463,10 @@ bond_link_status_update(struct iface *iface, bool carrier) iface->delay_expires = LLONG_MAX; VLOG_INFO_RL(&rl, "interface %s: will not be %s", iface->name, carrier ? "disabled" : "enabled"); + } else if (carrier && port->updelay && port->active_iface < 0) { + iface->delay_expires = time_msec(); + VLOG_INFO_RL(&rl, "interface %s: skipping %d ms updelay since no " + "other interface is up", iface->name, port->updelay); } else { int delay = carrier ? 
port->updelay : port->downdelay; iface->delay_expires = time_msec() + delay; @@ -1378,7 +1510,7 @@ bond_enable_slave(struct iface *iface, bool enable) iface->enabled = enable; if (!iface->enabled) { - VLOG_WARN("interface %s: enabled", iface->name); + VLOG_WARN("interface %s: disabled", iface->name); ofproto_revalidate(br->ofproto, iface->tag); if (iface->port_ifidx == port->active_iface) { ofproto_revalidate(br->ofproto, @@ -1387,7 +1519,7 @@ bond_enable_slave(struct iface *iface, bool enable) } bond_send_learning_packets(port); } else { - VLOG_WARN("interface %s: disabled", iface->name); + VLOG_WARN("interface %s: enabled", iface->name); if (port->active_iface < 0) { ofproto_revalidate(br->ofproto, port->no_ifaces_tag); bond_choose_active_iface(port); @@ -1395,6 +1527,7 @@ bond_enable_slave(struct iface *iface, bool enable) } iface->tag = tag_create_random(); } + port_update_bond_compat(port); } static void @@ -1404,6 +1537,12 @@ bond_run(struct bridge *br) for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; + + if (port->bond_compat_is_stale) { + port->bond_compat_is_stale = false; + port_update_bond_compat(port); + } + if (port->n_ifaces < 2) { continue; } @@ -1574,14 +1713,28 @@ compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; if (port_includes_vlan(port, m->out_vlan) - && set_dst(dst, flow, in_port, port, tags) - && !dst_is_duplicate(dsts, dst - dsts, dst)) + && set_dst(dst, flow, in_port, port, tags)) { + int flow_vlan; + if (port->vlan < 0) { dst->vlan = m->out_vlan; } - if (dst->dp_ifidx == flow->in_port - && dst->vlan == vlan) { + if (dst_is_duplicate(dsts, dst - dsts, dst)) { + continue; + } + + /* Use the vlan tag on the original flow instead of + * the one passed in the vlan parameter. This ensures + * that we compare the vlan from before any implicit + * tagging tags place. 
This is necessary because + * dst->vlan is the final vlan, after removing implicit + * tags. */ + flow_vlan = ntohs(flow->dl_vlan); + if (flow_vlan == 0) { + flow_vlan = OFP_VLAN_NONE; + } + if (port == in_port && dst->vlan == flow_vlan) { /* Don't send out input port on same VLAN. */ continue; } @@ -1744,26 +1897,27 @@ process_flow(struct bridge *br, const flow_t *flow, goto done; } - /* Multicast (and broadcast) packets on bonds need special attention, to - * avoid receiving duplicates. */ - if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) { - *tags |= in_port->active_iface_tag; - if (in_port->active_iface != in_iface->port_ifidx) { - /* Drop all multicast packets on inactive slaves. */ - goto done; - } else { - /* Drop all multicast packets for which we have learned a different - * input port, because we probably sent the packet on one slaves - * and got it back on the active slave. Broadcast ARP replies are - * an exception to this rule: the host has moved to another - * switch. */ - int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); - if (src_idx != -1 - && src_idx != in_port->port_idx - && !is_bcast_arp_reply(flow, packet)) { + /* Packets received on bonds need special attention to avoid duplicates. */ + if (in_port->n_ifaces > 1) { + int src_idx; + + if (eth_addr_is_multicast(flow->dl_dst)) { + *tags |= in_port->active_iface_tag; + if (in_port->active_iface != in_iface->port_ifidx) { + /* Drop all multicast packets on inactive slaves. */ goto done; } } + + /* Drop all packets for which we have learned a different input + * port, because we probably sent the packet on one slave and got + * it back on the other. Broadcast ARP replies are an exception + * to this rule: the host has moved to another switch. */ + src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); + if (src_idx != -1 && src_idx != in_port->port_idx && + (!packet || !is_bcast_arp_reply(flow, packet))) { + goto done; + } } /* MAC learning. 
*/ @@ -2044,8 +2198,9 @@ log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port) /* Shifts 'hash' from 'from' to 'to' within 'port'. */ static void bond_shift_load(struct slave_balance *from, struct slave_balance *to, - struct bond_entry *hash) + int hash_idx) { + struct bond_entry *hash = from->hashes[hash_idx]; struct port *port = from->iface->port; uint64_t delta = hash->tx_bytes; @@ -2063,12 +2218,11 @@ bond_shift_load(struct slave_balance *from, struct slave_balance *to, * it require more work, the only purpose it would be to allow that hash to * be migrated to another slave in this rebalancing run, and there is no * point in doing that. */ - if (from->hashes[0] == hash) { + if (hash_idx == 0) { from->hashes++; } else { - int i = hash - from->hashes[0]; - memmove(from->hashes + i, from->hashes + i + 1, - (from->n_hashes - (i + 1)) * sizeof *from->hashes); + memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1, + (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes); } from->n_hashes--; @@ -2153,22 +2307,60 @@ bond_rebalance_port(struct port *port) /* 'from' is carrying significantly more load than 'to', and that * load is split across at least two different hashes. Pick a hash * to migrate to 'to' (the least-loaded slave), given that doing so - * must not cause 'to''s load to exceed 'from''s load. + * must decrease the ratio of the load on the two slaves by at + * least 0.1. * * The sort order we use means that we prefer to shift away the * smallest hashes instead of the biggest ones. There is little * reason behind this decision; we could use the opposite sort * order to shift away big hashes ahead of small ones. */ size_t i; + bool order_swapped; for (i = 0; i < from->n_hashes; i++) { + double old_ratio, new_ratio; uint64_t delta = from->hashes[i]->tx_bytes; - if (to->tx_bytes + delta < from->tx_bytes - delta) { + + if (delta == 0 || from->tx_bytes - delta == 0) { + /* Pointless move. 
*/ + continue; + } + + order_swapped = from->tx_bytes - delta < to->tx_bytes + delta; + + if (to->tx_bytes == 0) { + /* Nothing on the new slave, move it. */ + break; + } + + old_ratio = (double)from->tx_bytes / to->tx_bytes; + new_ratio = (double)(from->tx_bytes - delta) / + (to->tx_bytes + delta); + + if (new_ratio == 0) { + /* Should already be covered but check to prevent division + * by zero. */ + continue; + } + + if (new_ratio < 1) { + new_ratio = 1 / new_ratio; + } + + if (old_ratio - new_ratio > 0.1) { + /* Would decrease the ratio, move it. */ break; } } if (i < from->n_hashes) { - bond_shift_load(from, to, from->hashes[i]); + bond_shift_load(from, to, i); + port->bond_compat_is_stale = true; + + /* If the result of the migration changed the relative order of + * 'from' and 'to' swap them back to maintain invariants. */ + if (order_swapped) { + swap_bals(from, to); + } /* Re-sort 'bals'. Note that this may make 'from' and 'to' * point to different slave_balance structures. It is only @@ -2204,10 +2396,7 @@ bond_send_learning_packets(struct port *port) ofpbuf_init(&packet, 128); error = n_packets = n_errors = 0; LIST_FOR_EACH (e, struct mac_entry, lru_node, &br->ml->lrus) { - static const char s[] = "Open vSwitch Bond Failover"; union ofp_action actions[2], *a; - struct eth_header *eth; - struct llc_snap_header *llc_snap; uint16_t dp_ifidx; tag_type tags = 0; flow_t flow; @@ -2218,23 +2407,6 @@ bond_send_learning_packets(struct port *port) continue; } - /* Compose packet to send. */ - ofpbuf_clear(&packet); - eth = ofpbuf_put_zeros(&packet, ETH_HEADER_LEN); - llc_snap = ofpbuf_put_zeros(&packet, LLC_SNAP_HEADER_LEN); - ofpbuf_put(&packet, s, sizeof s); /* Includes null byte. 
*/ - ofpbuf_put(&packet, e->mac, ETH_ADDR_LEN); - - memcpy(eth->eth_dst, eth_addr_broadcast, ETH_ADDR_LEN); - memcpy(eth->eth_src, e->mac, ETH_ADDR_LEN); - eth->eth_type = htons(packet.size - ETH_HEADER_LEN); - - llc_snap->llc.llc_dsap = LLC_DSAP_SNAP; - llc_snap->llc.llc_ssap = LLC_SSAP_SNAP; - llc_snap->llc.llc_cntl = LLC_CNTL_SNAP; - memcpy(llc_snap->snap.snap_org, "\x00\x23\x20", 3); - llc_snap->snap.snap_type = htons(0xf177); /* Random number. */ - /* Compose actions. */ memset(actions, 0, sizeof actions); a = actions; @@ -2251,6 +2423,8 @@ bond_send_learning_packets(struct port *port) /* Send packet. */ n_packets++; + compose_benign_packet(&packet, "Open vSwitch Bond Failover", 0xf177, + e->mac); flow_extract(&packet, ODPP_NONE, &flow); retval = ofproto_send_packet(br->ofproto, &flow, actions, a - actions, &packet); @@ -2420,8 +2594,8 @@ bond_unixctl_migrate(struct unixctl_conn *conn, const char *args_) return; } - if (sscanf(hash_s, "%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8, - &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]) == 6) { + if (sscanf(hash_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac)) + == ETH_ADDR_SCAN_COUNT) { hash = bond_hash(mac); } else if (strspn(hash_s, "0123456789") == strlen(hash_s)) { hash = atoi(hash_s) & BOND_MASK; @@ -2445,6 +2619,7 @@ bond_unixctl_migrate(struct unixctl_conn *conn, const char *args_) ofproto_revalidate(port->bridge->ofproto, entry->iface_tag); entry->iface_idx = iface->port_ifidx; entry->iface_tag = tag_create_random(); + port->bond_compat_is_stale = true; unixctl_command_reply(conn, 200, "migrated"); } @@ -2709,6 +2884,7 @@ port_destroy(struct port *port) size_t i; proc_net_compat_update_vlan(port->name, NULL, 0); + proc_net_compat_update_bond(port->name, NULL); for (i = 0; i < MAX_MIRRORS; i++) { struct mirror *m = br->mirrors[i]; @@ -2775,7 +2951,7 @@ port_update_bonding(struct port *port) if (port->bond_hash) { free(port->bond_hash); port->bond_hash = NULL; - 
proc_net_compat_update_bond(port->name, NULL); + port->bond_compat_is_stale = true; } } else { if (!port->bond_hash) { @@ -2790,36 +2966,79 @@ port_update_bonding(struct port *port) port->no_ifaces_tag = tag_create_random(); bond_choose_active_iface(port); } - port_update_bond_compat(port); + port->bond_compat_is_stale = true; } } static void port_update_bond_compat(struct port *port) { + struct compat_bond_hash compat_hashes[BOND_MASK + 1]; struct compat_bond bond; size_t i; if (port->n_ifaces < 2) { + proc_net_compat_update_bond(port->name, NULL); return; } bond.up = false; bond.updelay = port->updelay; bond.downdelay = port->downdelay; + + bond.n_hashes = 0; + bond.hashes = compat_hashes; + if (port->bond_hash) { + const struct bond_entry *e; + for (e = port->bond_hash; e <= &port->bond_hash[BOND_MASK]; e++) { + if (e->iface_idx >= 0 && e->iface_idx < port->n_ifaces) { + struct compat_bond_hash *cbh = &bond.hashes[bond.n_hashes++]; + cbh->hash = e - port->bond_hash; + cbh->netdev_name = port->ifaces[e->iface_idx]->name; + } + } + } + bond.n_slaves = port->n_ifaces; bond.slaves = xmalloc(port->n_ifaces * sizeof *bond.slaves); for (i = 0; i < port->n_ifaces; i++) { struct iface *iface = port->ifaces[i]; struct compat_bond_slave *slave = &bond.slaves[i]; slave->name = iface->name; - slave->up = ((iface->enabled && iface->delay_expires == LLONG_MAX) || - (!iface->enabled && iface->delay_expires != LLONG_MAX)); + + /* We need to make the same determination as the Linux bonding + * code to determine whether a slave should be consider "up". + * The Linux function bond_miimon_inspect() supports four + * BOND_LINK_* states: + * + * - BOND_LINK_UP: carrier detected, updelay has passed. + * - BOND_LINK_FAIL: carrier lost, downdelay in progress. + * - BOND_LINK_DOWN: carrier lost, downdelay has passed. + * - BOND_LINK_BACK: carrier detected, updelay in progress. 
+ * + * The function bond_info_show_slave() only considers BOND_LINK_UP + * to be "up" and anything else to be "down". + */ + slave->up = iface->enabled && iface->delay_expires == LLONG_MAX; if (slave->up) { bond.up = true; } memcpy(slave->mac, iface->mac, ETH_ADDR_LEN); } + + if (cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) { + struct netdev *bond_netdev; + + if (!netdev_open(port->name, NETDEV_ETH_TYPE_NONE, &bond_netdev)) { + if (bond.up) { + netdev_turn_flags_on(bond_netdev, NETDEV_UP, true); + } else { + netdev_turn_flags_off(bond_netdev, NETDEV_UP, true); + } + netdev_close(bond_netdev); + } + } + proc_net_compat_update_bond(port->name, &bond); free(bond.slaves); } @@ -2875,8 +3094,17 @@ iface_create(struct port *port, const char *name) iface->tag = tag_create_random(); iface->delay_expires = LLONG_MAX; - netdev_nodev_get_etheraddr(name, iface->mac); - netdev_nodev_get_carrier(name, &iface->enabled); + if (!cfg_get_bool(0, "iface.%s.internal", iface->name)) { + netdev_nodev_get_etheraddr(name, iface->mac); + netdev_nodev_get_carrier(name, &iface->enabled); + } else { + /* Internal interfaces are created later by the call to dpif_port_add() + * in bridge_reconfigure(). Until then, we can't obtain any + * information about them. (There's no real value in doing so, anyway, + * because the 'mac' and 'enabled' values are only used for interfaces + * that are bond slaves, and it doesn't normally make sense to bond an + * internal interface.) */ + } if (port->n_ifaces >= port->allocated_ifaces) { port->ifaces = x2nrealloc(port->ifaces, &port->allocated_ifaces, @@ -2945,6 +3173,60 @@ iface_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx) { return port_array_get(&br->ifaces, dp_ifidx); } + +/* Returns true if 'iface' is the name of an "internal" interface on bridge + * 'br', that is, an interface that is entirely simulated within the datapath. + * The local port (ODPP_LOCAL) is always an internal interface. 
Other local + * interfaces are created by setting "iface.<name>.internal = true". + * + * In addition, we have a kluge-y feature that creates an internal port with + * the name of a bonded port if "bonding.<name>.fake-iface = true" is set. + * This feature needs to go away in the long term. Until then, this is one + * reason why this function takes a name instead of a struct iface: the fake + * interfaces created this way do not have a struct iface. */ +static bool +iface_is_internal(const struct bridge *br, const char *iface) +{ + if (!strcmp(iface, br->name) + || cfg_get_bool(0, "iface.%s.internal", iface)) { + return true; + } + + if (cfg_get_bool(0, "bonding.%s.fake-iface", iface)) { + struct port *port = port_lookup(br, iface); + if (port && port->n_ifaces > 1) { + return true; + } + } + + return false; +} + +/* Set Ethernet address of 'iface', if one is specified in the configuration + * file. */ +static void +iface_set_mac(struct iface *iface) +{ + uint64_t mac = cfg_get_mac(0, "iface.%s.mac", iface->name); + if (mac) { + static uint8_t ea[ETH_ADDR_LEN]; + + eth_addr_from_uint64(mac, ea); + if (eth_addr_is_multicast(ea)) { + VLOG_ERR("interface %s: cannot set MAC to multicast address", + iface->name); + } else if (iface->dp_ifidx == ODPP_LOCAL) { + VLOG_ERR("ignoring iface.%s.mac; use bridge.%s.mac instead", + iface->name, iface->name); + } else { + int error = netdev_nodev_set_etheraddr(iface->name, ea); + if (error) { + VLOG_ERR("interface %s: setting MAC failed (%s)", + iface->name, strerror(error)); + } + } + } +} /* Port mirroring. */ @@ -3145,6 +3427,7 @@ mirror_reconfigure_one(struct mirror *m) int *vlans; size_t i; bool mirror_all_ports; + bool any_ports_specified; /* Get output port. 
*/ out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port", @@ -3183,11 +3466,18 @@ mirror_reconfigure_one(struct mirror *m) cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx); cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx); cfg_get_all_keys(&ports, "%s.select.port", pfx); + any_ports_specified = src_ports.n || dst_ports.n || ports.n; svec_append(&src_ports, &ports); svec_append(&dst_ports, &ports); svec_destroy(&ports); prune_ports(m, &src_ports); prune_ports(m, &dst_ports); + if (any_ports_specified && !src_ports.n && !dst_ports.n) { + VLOG_ERR("%s: none of the specified ports exist; " + "disabling port mirror %s", pfx, pfx); + mirror_destroy(m); + goto exit; + } /* Get all the vlans, and drop duplicate and invalid vlans. */ svec_init(&vlan_strings); @@ -3239,6 +3529,7 @@ mirror_reconfigure_one(struct mirror *m) } /* Clean up. */ +exit: svec_destroy(&src_ports); svec_destroy(&dst_ports); free(pfx);