+ ofpbuf_init(&packet, 0);
+ cfm_compose_ccm(ofport->cfm, &packet, ofport->up.pp.hw_addr);
+ send_packet(ofport, &packet);
+ ofpbuf_uninit(&packet);
+ }
+}
+
+static void
+port_run(struct ofport_dpif *ofport)
+{
+ long long int carrier_seq = netdev_get_carrier_resets(ofport->up.netdev);
+ bool carrier_changed = carrier_seq != ofport->carrier_seq;
+ bool enable = netdev_get_carrier(ofport->up.netdev);
+
+ ofport->carrier_seq = carrier_seq;
+
+ port_run_fast(ofport);
+
+ if (ofport->tnl_port
+ && tnl_port_reconfigure(&ofport->up, ofport->odp_port,
+ &ofport->tnl_port)) {
+ ofproto_dpif_cast(ofport->up.ofproto)->backer->need_revalidate =
+ REV_RECONFIGURE;
+ }
+
+ if (ofport->cfm) {
+ int cfm_opup = cfm_get_opup(ofport->cfm);
+
+ cfm_run(ofport->cfm);
+ enable = enable && !cfm_get_fault(ofport->cfm);
+
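+ /* A negative value from cfm_get_opup() means no operational state is
+ * available, in which case it does not gate 'enable'. */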
+ if (cfm_opup >= 0) {
+ enable = enable && cfm_opup;
+ }
+ }
+
+ if (ofport->bundle) {
+ enable = enable && lacp_slave_may_enable(ofport->bundle->lacp, ofport);
+ if (carrier_changed) {
+ lacp_slave_carrier_changed(ofport->bundle->lacp, ofport);
+ }
+ }
+
+ if (ofport->may_enable != enable) {
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofport->up.ofproto);
+
+ if (ofproto->has_bundle_action) {
+ ofproto->backer->need_revalidate = REV_PORT_TOGGLED;
+ }
+ }
+
+ ofport->may_enable = enable;
+}
+
+static void
+port_wait(struct ofport_dpif *ofport)
+{
+ if (ofport->cfm) {
+ cfm_wait(ofport->cfm);
+ }
+}
+
+static int
+port_query_by_name(const struct ofproto *ofproto_, const char *devname,
+ struct ofproto_port *ofproto_port)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
+ struct dpif_port dpif_port;
+ int error;
+
+ if (sset_contains(&ofproto->ghost_ports, devname)) {
+ const char *type = netdev_get_type_from_name(devname);
+
+ /* We may be called before ofproto->up.port_by_name is populated with
+ * the appropriate ofport. For this reason, we must get the name and
+ * type from the netdev layer directly. */
+ if (type) {
+ const struct ofport *ofport;
+
+ ofport = shash_find_data(&ofproto->up.port_by_name, devname);
+ ofproto_port->ofp_port = ofport ? ofport->ofp_port : OFPP_NONE;
+ ofproto_port->name = xstrdup(devname);
+ ofproto_port->type = xstrdup(type);
+ return 0;
+ }
+ return ENODEV;
+ }
+
+ if (!sset_contains(&ofproto->ports, devname)) {
+ return ENODEV;
+ }
+ error = dpif_port_query_by_name(ofproto->backer->dpif,
+ devname, &dpif_port);
+ if (!error) {
+ ofproto_port_from_dpif_port(ofproto, ofproto_port, &dpif_port);
+ }
+ return error;
+}
+
+static int
+port_add(struct ofproto *ofproto_, struct netdev *netdev)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
+ const char *dp_port_name = netdev_vport_get_dpif_port(netdev);
+ const char *devname = netdev_get_name(netdev);
+
+ if (netdev_vport_is_patch(netdev)) {
+ sset_add(&ofproto->ghost_ports, netdev_get_name(netdev));
+ return 0;
+ }
+
+ if (!dpif_port_exists(ofproto->backer->dpif, dp_port_name)) {
+ uint32_t port_no = UINT32_MAX;
+ int error;
+
+ error = dpif_port_add(ofproto->backer->dpif, netdev, &port_no);
+ if (error) {
+ return error;
+ }
+ if (netdev_get_tunnel_config(netdev)) {
+ simap_put(&ofproto->backer->tnl_backers, dp_port_name, port_no);
+ }
+ }
+
+ if (netdev_get_tunnel_config(netdev)) {
+ sset_add(&ofproto->ghost_ports, devname);
+ } else {
+ sset_add(&ofproto->ports, devname);
+ }
+ return 0;
+}
+
+static int
+port_del(struct ofproto *ofproto_, uint16_t ofp_port)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
+ struct ofport_dpif *ofport = get_ofp_port(ofproto, ofp_port);
+ int error = 0;
+
+ if (!ofport) {
+ return 0;
+ }
+
+ sset_find_and_delete(&ofproto->ghost_ports,
+ netdev_get_name(ofport->up.netdev));
+ ofproto->backer->need_revalidate = REV_RECONFIGURE;
+ if (!ofport->tnl_port) {
+ error = dpif_port_del(ofproto->backer->dpif, ofport->odp_port);
+ if (!error) {
+ /* The caller is going to close ofport->up.netdev. If this is a
+ * bonded port, then the bond is using that netdev, so remove it
+ * from the bond. The client will need to reconfigure everything
+ * after deleting ports, so then the slave will get re-added. */
+ bundle_remove(&ofport->up);
+ }
+ }
+ return error;
+}
+
+static int
+port_get_stats(const struct ofport *ofport_, struct netdev_stats *stats)
+{
+ struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
+ int error;
+
+ push_all_stats();
+
+ error = netdev_get_stats(ofport->up.netdev, stats);
+
+ if (!error && ofport_->ofp_port == OFPP_LOCAL) {
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofport->up.ofproto);
+
+ /* ofproto->stats.tx_packets represents packets that we created
+ * internally and sent to some port (e.g. packets sent with
+ * send_packet()). Account for them as if they had come from
+ * OFPP_LOCAL and got forwarded. */
+
+ if (stats->rx_packets != UINT64_MAX) {
+ stats->rx_packets += ofproto->stats.tx_packets;
+ }
+
+ if (stats->rx_bytes != UINT64_MAX) {
+ stats->rx_bytes += ofproto->stats.tx_bytes;
+ }
+
+ /* ofproto->stats.rx_packets represents packets that were received on
+ * some port and we processed internally and dropped (e.g. STP).
+ * Account for them as if they had been forwarded to OFPP_LOCAL. */
+
+ if (stats->tx_packets != UINT64_MAX) {
+ stats->tx_packets += ofproto->stats.rx_packets;
+ }
+
+ if (stats->tx_bytes != UINT64_MAX) {
+ stats->tx_bytes += ofproto->stats.rx_bytes;
+ }
+ }
+
+ return error;
+}
+
+/* Account packets for LOCAL port. */
+static void
+ofproto_update_local_port_stats(const struct ofproto *ofproto_,
+ size_t tx_size, size_t rx_size)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
+
+ if (rx_size) {
+ ofproto->stats.rx_packets++;
+ ofproto->stats.rx_bytes += rx_size;
+ }
+ if (tx_size) {
+ ofproto->stats.tx_packets++;
+ ofproto->stats.tx_bytes += tx_size;
+ }
+}
+
+struct port_dump_state {
+ uint32_t bucket; /* Iteration position within the current sset. */
+ uint32_t offset;
+ bool ghost; /* False while dumping 'ports', true for 'ghost_ports'. */
+
+ struct ofproto_port port; /* Most recently dumped port, if 'has_port'. */
+ bool has_port; /* True if 'port' must still be destroyed. */
+};
+
+static int
+port_dump_start(const struct ofproto *ofproto_ OVS_UNUSED, void **statep)
+{
+ *statep = xzalloc(sizeof(struct port_dump_state));
+ return 0;
+}
+
+static int
+port_dump_next(const struct ofproto *ofproto_, void *state_,
+ struct ofproto_port *port)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
+ struct port_dump_state *state = state_;
+ const struct sset *sset;
+ struct sset_node *node;
+
+ if (state->has_port) {
+ ofproto_port_destroy(&state->port);
+ state->has_port = false;
+ }
+ sset = state->ghost ? &ofproto->ghost_ports : &ofproto->ports;
+ while ((node = sset_at_position(sset, &state->bucket, &state->offset))) {
+ int error;
+
+ error = port_query_by_name(ofproto_, node->name, &state->port);
+ if (!error) {
+ *port = state->port;
+ state->has_port = true;
+ return 0;
+ } else if (error != ENODEV) {
+ return error;
+ }
+ }
+
+ if (!state->ghost) {
+ state->ghost = true;
+ state->bucket = 0;
+ state->offset = 0;
+ return port_dump_next(ofproto_, state_, port);
+ }
+
+ return EOF;
+}
+
+static int
+port_dump_done(const struct ofproto *ofproto_ OVS_UNUSED, void *state_)
+{
+ struct port_dump_state *state = state_;
+
+ if (state->has_port) {
+ ofproto_port_destroy(&state->port);
+ }
+ free(state);
+ return 0;
+}
+
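+/* Polls for changes in the set of ports. If the set has changed since the
+ * last call, returns 0 and stores the name of an added or removed device in
+ * '*devnamep' (the caller must free it). Returns EAGAIN if nothing has
+ * changed, or another positive errno value on error. */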
+static int
+port_poll(const struct ofproto *ofproto_, char **devnamep)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
+
+ if (ofproto->port_poll_errno) {
+ int error = ofproto->port_poll_errno;
+ ofproto->port_poll_errno = 0;
+ return error;
+ }
+
+ if (sset_is_empty(&ofproto->port_poll_set)) {
+ return EAGAIN;
+ }
+
+ *devnamep = sset_pop(&ofproto->port_poll_set);
+ return 0;
+}
+
+static void
+port_poll_wait(const struct ofproto *ofproto_)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
+ dpif_port_poll_wait(ofproto->backer->dpif);
+}
+
+static int
+port_is_lacp_current(const struct ofport *ofport_)
+{
+ const struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
+ return (ofport->bundle && ofport->bundle->lacp
+ ? lacp_slave_is_current(ofport->bundle->lacp, ofport)
+ : -1);
+}
+\f
+/* Upcall handling. */
+
+/* Flow miss batching.
+ *
+ * Some dpifs implement operations faster when you hand them off in a batch.
+ * To allow batching, "struct flow_miss" queues the dpif-related work needed
+ * for a given flow. Each "struct flow_miss" corresponds to sending one or
+ * more packets, plus possibly installing the flow in the dpif.
+ *
+ * So far we only batch the operations that affect flow setup time the most.
+ * It's possible to batch more than that, but the benefit might be minimal. */
+struct flow_miss {
+ struct hmap_node hmap_node; /* In the per-batch "todo" hmap. */
+ struct ofproto_dpif *ofproto; /* Bridge that received the packets. */
+ struct flow flow; /* Flow extracted from the packets. */
+ enum odp_key_fitness key_fitness;
+ const struct nlattr *key; /* Datapath flow key from the upcall. */
+ size_t key_len;
+ struct initial_vals initial_vals;
+ struct list packets; /* Packets awaiting this flow. */
+ enum dpif_upcall_type upcall_type;
+ uint32_t odp_in_port; /* Datapath port the packets arrived on. */
+};
+
+struct flow_miss_op {
+ struct dpif_op dpif_op;
+ void *garbage; /* Pointer to pass to free(), NULL if none. */
+ uint64_t stub[1024 / 8]; /* Temporary buffer. */
+};
+
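+/* A rough sketch of the batching pipeline implemented below: handle_upcalls()
+ * reads up to FLOW_MISS_MAX_BATCH upcalls from the dpif and defers the flow
+ * misses to handle_miss_upcalls(), which coalesces packets with identical
+ * flows into "struct flow_miss"es in a to-do hmap. handle_flow_miss() then
+ * appends a "struct flow_miss_op" (an execute and possibly a flow put) for
+ * each miss, and a single dpif_operate() call submits the whole batch to the
+ * datapath. */
+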
+/* Sends an OFPT_PACKET_IN message for 'packet' of type OFPR_NO_MATCH to each
+ * OpenFlow controller as necessary according to their individual
+ * configurations. */
+static void
+send_packet_in_miss(struct ofproto_dpif *ofproto, const struct ofpbuf *packet,
+ const struct flow *flow)
+{
+ struct ofputil_packet_in pin;
+
+ pin.packet = packet->data;
+ pin.packet_len = packet->size;
+ pin.reason = OFPR_NO_MATCH;
+ pin.controller_id = 0;
+
+ pin.table_id = 0;
+ pin.cookie = 0;
+
+ pin.send_len = 0; /* not used for flow table misses */
+
+ flow_get_metadata(flow, &pin.fmd);
+
+ connmgr_send_packet_in(ofproto->up.connmgr, &pin);
+}
+
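+/* Checks whether 'flow' on 'ofport' belongs to a control protocol that must
+ * be handled specially (CFM, LACP, or STP). If so, passes 'packet' (if
+ * nonnull) to the appropriate protocol module and returns the corresponding
+ * slow-path reason; otherwise returns 0. */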
+static enum slow_path_reason
+process_special(struct ofproto_dpif *ofproto, const struct flow *flow,
+ const struct ofport_dpif *ofport, const struct ofpbuf *packet)
+{
+ if (!ofport) {
+ return 0;
+ } else if (ofport->cfm && cfm_should_process_flow(ofport->cfm, flow)) {
+ if (packet) {
+ cfm_process_heartbeat(ofport->cfm, packet);
+ }
+ return SLOW_CFM;
+ } else if (ofport->bundle && ofport->bundle->lacp
+ && flow->dl_type == htons(ETH_TYPE_LACP)) {
+ if (packet) {
+ lacp_process_packet(ofport->bundle->lacp, ofport, packet);
+ }
+ return SLOW_LACP;
+ } else if (ofproto->stp && stp_should_process_flow(flow)) {
+ if (packet) {
+ stp_process_packet(ofport, packet);
+ }
+ return SLOW_STP;
+ } else {
+ return 0;
+ }
+}
+
+static struct flow_miss *
+flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto,
+ const struct flow *flow, uint32_t hash)
+{
+ struct flow_miss *miss;
+
+ HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) {
+ if (miss->ofproto == ofproto && flow_equal(&miss->flow, flow)) {
+ return miss;
+ }
+ }
+
+ return NULL;
+}
+
+/* Partially initializes 'op' as an "execute" operation for 'miss' and
+ * 'packet'. The caller must initialize op->actions and op->actions_len, and,
+ * if anything needs to be freed after processing the op, the caller must
+ * initialize op->garbage also. */
+static void
+init_flow_miss_execute_op(struct flow_miss *miss, struct ofpbuf *packet,
+ struct flow_miss_op *op)
+{
+ if (miss->flow.vlan_tci != miss->initial_vals.vlan_tci) {
+ /* This packet was received on a VLAN splinter port. We
+ * added a VLAN to the packet to make the packet resemble
+ * the flow, but the actions were composed assuming that
+ * the packet contained no VLAN. So, we must remove the
+ * VLAN header from the packet before trying to execute the
+ * actions. */
+ eth_pop_vlan(packet);
+ }
+
+ op->garbage = NULL;
+ op->dpif_op.type = DPIF_OP_EXECUTE;
+ op->dpif_op.u.execute.key = miss->key;
+ op->dpif_op.u.execute.key_len = miss->key_len;
+ op->dpif_op.u.execute.packet = packet;
+}
+
+/* Helper for handle_flow_miss_without_facet() and
+ * handle_flow_miss_with_facet(). */
+static void
+handle_flow_miss_common(struct rule_dpif *rule,
+ struct ofpbuf *packet, const struct flow *flow)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto);
+
+ ofproto->n_matches++;
+
+ if (rule->up.cr.priority == FAIL_OPEN_PRIORITY) {
+ /*
+ * Extra-special case for fail-open mode.
+ *
+ * We are in fail-open mode and the packet matched the fail-open
+ * rule, but we are connected to a controller too. We should send
+ * the packet up to the controller in the hope that it will try to
+ * set up a flow and thereby allow us to exit fail-open.
+ *
+ * See the top-level comment in fail-open.c for more information.
+ */
+ send_packet_in_miss(ofproto, packet, flow);
+ }
+}
+
+/* Figures out whether a flow that missed in 'ofproto', whose details are in
+ * 'miss', is likely to be worth tracking in detail in userspace and (usually)
+ * installing a datapath flow. The answer is usually "yes" (a return value of
+ * true). However, for short flows the cost of bookkeeping is much higher than
+ * the benefits, so when the datapath holds a large number of flows we impose
+ * some heuristics to decide which flows are likely to be worth tracking. */
+static bool
+flow_miss_should_make_facet(struct ofproto_dpif *ofproto,
+ struct flow_miss *miss, uint32_t hash)
+{
+ if (!ofproto->governor) {
+ size_t n_subfacets;
+
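+ /* Without a governor, install facets freely as long as the datapath
+ * holds no more than half of flow_eviction_threshold flows; beyond
+ * that, create a governor to apply per-flow heuristics. */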
+ n_subfacets = hmap_count(&ofproto->subfacets);
+ if (n_subfacets * 2 <= ofproto->up.flow_eviction_threshold) {
+ return true;
+ }
+
+ ofproto->governor = governor_create(ofproto->up.name);
+ }
+
+ return governor_should_install_flow(ofproto->governor, hash,
+ list_size(&miss->packets));
+}
+
+/* Handles 'miss', which matches 'rule', without creating a facet or subfacet
+ * or creating any datapath flow. May add an "execute" operation to 'ops' and
+ * increment '*n_ops'. */
+static void
+handle_flow_miss_without_facet(struct flow_miss *miss,
+ struct rule_dpif *rule,
+ struct flow_miss_op *ops, size_t *n_ops)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto);
+ long long int now = time_msec();
+ struct action_xlate_ctx ctx;
+ struct ofpbuf *packet;
+
+ LIST_FOR_EACH (packet, list_node, &miss->packets) {
+ struct flow_miss_op *op = &ops[*n_ops];
+ struct dpif_flow_stats stats;
+ struct ofpbuf odp_actions;
+
+ COVERAGE_INC(facet_suppress);
+
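+ /* Compose actions into the caller-provided stack stub to avoid heap
+ * allocation in the common case; if the buffer outgrows the stub,
+ * ofpbuf_get_uninit_pointer() below returns the heap copy (or NULL)
+ * to be freed with the op. */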
+ ofpbuf_use_stub(&odp_actions, op->stub, sizeof op->stub);
+
+ dpif_flow_stats_extract(&miss->flow, packet, now, &stats);
+ rule_credit_stats(rule, &stats);
+
+ action_xlate_ctx_init(&ctx, ofproto, &miss->flow,
+ &miss->initial_vals, rule, 0, packet);
+ ctx.resubmit_stats = &stats;
+ xlate_actions(&ctx, rule->up.ofpacts, rule->up.ofpacts_len,
+ &odp_actions);
+
+ if (odp_actions.size) {
+ struct dpif_execute *execute = &op->dpif_op.u.execute;
+
+ init_flow_miss_execute_op(miss, packet, op);
+ execute->actions = odp_actions.data;
+ execute->actions_len = odp_actions.size;
+ op->garbage = ofpbuf_get_uninit_pointer(&odp_actions);
+
+ (*n_ops)++;
+ } else {
+ ofpbuf_uninit(&odp_actions);
+ }
+ }
+}
+
+/* Handles 'miss', which matches 'facet'. May add any required datapath
+ * operations to 'ops', incrementing '*n_ops' for each new op.
+ *
+ * All of the packets in 'miss' are considered to have arrived at time 'now'.
+ * This is really important only for new facets: if we just called time_msec()
+ * here, then the new subfacet or its packets could look (occasionally) as
+ * though it was used some time after the facet was used. That can make a
+ * one-packet flow look like it has a nonzero duration, which looks odd in
+ * e.g. NetFlow statistics. */
+static void
+handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet,
+ long long int now,
+ struct flow_miss_op *ops, size_t *n_ops)
+{
+ struct ofproto_dpif *ofproto = ofproto_dpif_cast(facet->rule->up.ofproto);
+ enum subfacet_path want_path;
+ struct subfacet *subfacet;
+ struct ofpbuf *packet;
+
+ subfacet = subfacet_create(facet, miss, now);
+
+ LIST_FOR_EACH (packet, list_node, &miss->packets) {
+ struct flow_miss_op *op = &ops[*n_ops];
+ struct dpif_flow_stats stats;
+ struct ofpbuf odp_actions;
+
+ handle_flow_miss_common(facet->rule, packet, &miss->flow);
+
+ ofpbuf_use_stub(&odp_actions, op->stub, sizeof op->stub);
+ if (!subfacet->actions || subfacet->slow) {
+ subfacet_make_actions(subfacet, packet, &odp_actions);
+ }
+
+ dpif_flow_stats_extract(&facet->flow, packet, now, &stats);
+ subfacet_update_stats(subfacet, &stats);
+
+ if (subfacet->actions_len) {
+ struct dpif_execute *execute = &op->dpif_op.u.execute;
+
+ init_flow_miss_execute_op(miss, packet, op);
+ if (!subfacet->slow) {
+ execute->actions = subfacet->actions;
+ execute->actions_len = subfacet->actions_len;
+ ofpbuf_uninit(&odp_actions);
+ } else {
+ execute->actions = odp_actions.data;
+ execute->actions_len = odp_actions.size;
+ op->garbage = ofpbuf_get_uninit_pointer(&odp_actions);
+ }
+
+ (*n_ops)++;
+ } else {
+ ofpbuf_uninit(&odp_actions);
+ }
+ }
+
+ want_path = subfacet_want_path(subfacet->slow);
+ if (miss->upcall_type == DPIF_UC_MISS || subfacet->path != want_path) {
+ struct flow_miss_op *op = &ops[(*n_ops)++];
+ struct dpif_flow_put *put = &op->dpif_op.u.flow_put;
+
+ subfacet->path = want_path;
+
+ op->garbage = NULL;
+ op->dpif_op.type = DPIF_OP_FLOW_PUT;
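+ /* CREATE|MODIFY makes the put succeed whether or not the datapath
+ * already has this flow installed. */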
+ put->flags = DPIF_FP_CREATE | DPIF_FP_MODIFY;
+ put->key = miss->key;
+ put->key_len = miss->key_len;
+ if (want_path == SF_FAST_PATH) {
+ put->actions = subfacet->actions;
+ put->actions_len = subfacet->actions_len;
+ } else {
+ compose_slow_path(ofproto, &facet->flow, subfacet->slow,
+ op->stub, sizeof op->stub,
+ &put->actions, &put->actions_len);
+ }
+ put->stats = NULL;
+ }
+}
+
+/* Handles flow miss 'miss'. May add any required datapath operations
+ * to 'ops', incrementing '*n_ops' for each new op. */
+static void
+handle_flow_miss(struct flow_miss *miss, struct flow_miss_op *ops,
+ size_t *n_ops)
+{
+ struct ofproto_dpif *ofproto = miss->ofproto;
+ struct facet *facet;
+ long long int now;
+ uint32_t hash;
+
+ /* The caller must ensure that miss->hmap_node.hash contains
+ * flow_hash(miss->flow, 0). */
+ hash = miss->hmap_node.hash;
+
+ facet = facet_lookup_valid(ofproto, &miss->flow, hash);
+ if (!facet) {
+ struct rule_dpif *rule = rule_dpif_lookup(ofproto, &miss->flow);
+
+ if (!flow_miss_should_make_facet(ofproto, miss, hash)) {
+ handle_flow_miss_without_facet(miss, rule, ops, n_ops);
+ return;
+ }
+
+ facet = facet_create(rule, &miss->flow, hash);
+ now = facet->used;
+ } else {
+ now = time_msec();
+ }
+ handle_flow_miss_with_facet(miss, facet, now, ops, n_ops);
+}
+
+static struct drop_key *
+drop_key_lookup(const struct dpif_backer *backer, const struct nlattr *key,
+ size_t key_len)
+{
+ struct drop_key *drop_key;
+
+ HMAP_FOR_EACH_WITH_HASH (drop_key, hmap_node, hash_bytes(key, key_len, 0),
+ &backer->drop_keys) {
+ if (drop_key->key_len == key_len
+ && !memcmp(drop_key->key, key, key_len)) {
+ return drop_key;
+ }
+ }
+ return NULL;
+}
+
+static void
+drop_key_clear(struct dpif_backer *backer)
+{
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 15);
+ struct drop_key *drop_key, *next;
+
+ HMAP_FOR_EACH_SAFE (drop_key, next, hmap_node, &backer->drop_keys) {
+ int error;
+
+ error = dpif_flow_del(backer->dpif, drop_key->key, drop_key->key_len,
+ NULL);
+ if (error && !VLOG_DROP_WARN(&rl)) {
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ odp_flow_key_format(drop_key->key, drop_key->key_len, &ds);
+ VLOG_WARN("Failed to delete drop key (%s) (%s)", strerror(error),
+ ds_cstr(&ds));
+ ds_destroy(&ds);
+ }
+
+ hmap_remove(&backer->drop_keys, &drop_key->hmap_node);
+ free(drop_key->key);
+ free(drop_key);
+ }
+}
+
+/* Given a datapath, packet, and flow metadata ('backer', 'packet', and 'key'
+ * respectively), populates 'flow' with the result of odp_flow_key_to_flow().
+ * Optionally, if nonnull, populates 'fitnessp' with the fitness of 'flow' as
+ * returned by odp_flow_key_to_flow(). Also, optionally populates 'ofproto'
+ * with the ofproto_dpif, and 'odp_in_port' with the datapath in_port, that
+ * 'packet' ingressed.
+ *
+ * If 'ofproto' is nonnull, requires 'flow''s in_port to exist. Otherwise sets
+ * 'flow''s in_port to OFPP_NONE.
+ *
+ * This function does post-processing on data returned from
+ * odp_flow_key_to_flow() to help make VLAN splinters transparent to the rest
+ * of the upcall processing logic. In particular, if the extracted in_port is
+ * a VLAN splinter port, it replaces flow->in_port by the "real" port, sets
+ * flow->vlan_tci correctly for the VLAN of the VLAN splinter port, and pushes
+ * a VLAN header onto 'packet' (if it is nonnull).
+ *
+ * Optionally, if 'initial_vals' is nonnull, sets 'initial_vals->vlan_tci'
+ * to the VLAN TCI with which the packet was really received, that is, the
+ * actual VLAN TCI extracted by odp_flow_key_to_flow(). (This differs from
+ * the value returned in flow->vlan_tci only for packets received on
+ * VLAN splinters.) Also, if received on an IP tunnel, sets
+ * 'initial_vals->tunnel_ip_tos' to the tunnel's IP TOS.
+ *
+ * Similarly, this function also includes some logic to help with tunnels. It
+ * may modify 'flow' as necessary to make the tunneling implementation
+ * transparent to the upcall processing logic.
+ *
+ * Returns 0 if successful, ENODEV if the parsed flow has no associated ofport,
+ * or some other positive errno if there are other problems. */
+static int
+ofproto_receive(const struct dpif_backer *backer, struct ofpbuf *packet,
+ const struct nlattr *key, size_t key_len,
+ struct flow *flow, enum odp_key_fitness *fitnessp,
+ struct ofproto_dpif **ofproto, uint32_t *odp_in_port,
+ struct initial_vals *initial_vals)
+{
+ const struct ofport_dpif *port;
+ enum odp_key_fitness fitness;
+ int error = ENODEV;
+
+ fitness = odp_flow_key_to_flow(key, key_len, flow);
+ if (fitness == ODP_FIT_ERROR) {
+ error = EINVAL;
+ goto exit;
+ }
+
+ if (initial_vals) {
+ initial_vals->vlan_tci = flow->vlan_tci;
+ initial_vals->tunnel_ip_tos = flow->tunnel.ip_tos;
+ }
+
+ if (odp_in_port) {
+ *odp_in_port = flow->in_port;
+ }
+
+ if (tnl_port_should_receive(flow)) {
+ const struct ofport *ofport = tnl_port_receive(flow);
+ if (!ofport) {
+ flow->in_port = OFPP_NONE;
+ goto exit;
+ }
+ port = ofport_dpif_cast(ofport);
+
+ /* We can't reproduce 'key' from 'flow'. */
+ fitness = fitness == ODP_FIT_PERFECT ? ODP_FIT_TOO_MUCH : fitness;
+
+ /* XXX: Since the tunnel module is not scoped per backer, it's
+ * theoretically possible that we'll receive an ofport belonging to an
+ * entirely different datapath. In practice, this can't happen because
+ * no platform has two separate datapaths that each support
+ * tunneling. */
+ ovs_assert(ofproto_dpif_cast(port->up.ofproto)->backer == backer);
+ } else {
+ port = odp_port_to_ofport(backer, flow->in_port);
+ if (!port) {
+ flow->in_port = OFPP_NONE;
+ goto exit;
+ }
+
+ flow->in_port = port->up.ofp_port;
+ if (vsp_adjust_flow(ofproto_dpif_cast(port->up.ofproto), flow)) {
+ if (packet) {
+ /* Make the packet resemble the flow, so that it gets sent to
+ * an OpenFlow controller properly, so that it looks correct
+ * for sFlow, and so that flow_extract() will get the correct
+ * vlan_tci if it is called on 'packet'.
+ *
+ * The allocated space inside 'packet' probably also contains
+ * 'key', that is, both 'packet' and 'key' are probably part of
+ * a struct dpif_upcall (see the large comment on that
+ * structure definition), so pushing data on 'packet' is in
+ * general not a good idea since it could overwrite 'key' or
+ * free it as a side effect. However, it's OK in this special
+ * case because we know that 'packet' is inside a Netlink
+ * attribute: pushing 4 bytes will just overwrite the 4-byte
+ * "struct nlattr", which is fine since we don't need that
+ * header anymore. */
+ eth_push_vlan(packet, flow->vlan_tci);
+ }
+ /* We can't reproduce 'key' from 'flow'. */
+ fitness = fitness == ODP_FIT_PERFECT ? ODP_FIT_TOO_MUCH : fitness;
+ }
+ }
+ error = 0;
+
+ if (ofproto) {
+ *ofproto = ofproto_dpif_cast(port->up.ofproto);
+ }
+
+exit:
+ if (fitnessp) {
+ *fitnessp = fitness;
+ }
+ return error;
+}
+
+static void
+handle_miss_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls,
+ size_t n_upcalls)
+{
+ struct dpif_upcall *upcall;
+ struct flow_miss *miss;
+ struct flow_miss misses[FLOW_MISS_MAX_BATCH];
+ struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
+ struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
+ struct hmap todo;
+ int n_misses;
+ size_t n_ops;
+ size_t i;
+
+ if (!n_upcalls) {
+ return;
+ }
+
+ /* Construct the to-do list.
+ *
+ * This just amounts to extracting the flow from each packet and sticking
+ * the packets that have the same flow in the same "flow_miss" structure so
+ * that we can process them together. */
+ hmap_init(&todo);
+ n_misses = 0;
+ for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
+ struct flow_miss *miss = &misses[n_misses];
+ struct flow_miss *existing_miss;
+ struct ofproto_dpif *ofproto;
+ uint32_t odp_in_port;
+ struct flow flow;
+ uint32_t hash;
+ int error;
+
+ error = ofproto_receive(backer, upcall->packet, upcall->key,
+ upcall->key_len, &flow, &miss->key_fitness,
+ &ofproto, &odp_in_port, &miss->initial_vals);
+ if (error == ENODEV) {
+ struct drop_key *drop_key;
+
+ /* Received packet on port for which we couldn't associate
+ * an ofproto. This can happen if a port is removed while
+ * traffic is being received. Print a rate-limited message
+ * in case it happens frequently. Install a drop flow so
+ * that future packets of the flow are inexpensively dropped
+ * in the kernel. */
+ VLOG_INFO_RL(&rl, "received packet on unassociated port %"PRIu32,
+ flow.in_port);
+
+ drop_key = drop_key_lookup(backer, upcall->key, upcall->key_len);
+ if (!drop_key) {
+ drop_key = xmalloc(sizeof *drop_key);
+ drop_key->key = xmemdup(upcall->key, upcall->key_len);
+ drop_key->key_len = upcall->key_len;
+
+ hmap_insert(&backer->drop_keys, &drop_key->hmap_node,
+ hash_bytes(drop_key->key, drop_key->key_len, 0));
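+ /* Install the flow with no actions, so the datapath drops
+ * matching packets without further upcalls. */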
+ dpif_flow_put(backer->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY,
+ drop_key->key, drop_key->key_len, NULL, 0, NULL);
+ }
+ continue;
+ }
+ if (error) {
+ continue;
+ }
+
+ ofproto->n_missed++;
+ flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark,
+ &flow.tunnel, flow.in_port, &miss->flow);
+
+ /* Add other packets to a to-do list. */
+ hash = flow_hash(&miss->flow, 0);
+ existing_miss = flow_miss_find(&todo, ofproto, &miss->flow, hash);
+ if (!existing_miss) {
+ hmap_insert(&todo, &miss->hmap_node, hash);
+ miss->ofproto = ofproto;
+ miss->key = upcall->key;
+ miss->key_len = upcall->key_len;
+ miss->upcall_type = upcall->type;
+ miss->odp_in_port = odp_in_port;
+ list_init(&miss->packets);
+
+ n_misses++;
+ } else {
+ miss = existing_miss;
+ }
+ list_push_back(&miss->packets, &upcall->packet->list_node);
+ }
+
+ /* Process each element in the to-do list, constructing the set of
+ * operations to batch. */
+ n_ops = 0;
+ HMAP_FOR_EACH (miss, hmap_node, &todo) {
+ handle_flow_miss(miss, flow_miss_ops, &n_ops);
+ }
+ ovs_assert(n_ops <= ARRAY_SIZE(flow_miss_ops));
+
+ /* Execute batch. */
+ for (i = 0; i < n_ops; i++) {
+ dpif_ops[i] = &flow_miss_ops[i].dpif_op;
+ }
+ dpif_operate(backer->dpif, dpif_ops, n_ops);
+
+ /* Free memory. */
+ for (i = 0; i < n_ops; i++) {
+ free(flow_miss_ops[i].garbage);
+ }
+ hmap_destroy(&todo);
+}
+
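+/* Classifies 'upcall'. Flow misses map directly to MISS_UPCALL. "Action"
+ * upcalls are distinguished by the user action cookie embedded in their
+ * userdata: sFlow samples become SFLOW_UPCALLs and slow-path punts are
+ * treated like misses. Anything malformed is a BAD_UPCALL. */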
+static enum { SFLOW_UPCALL, MISS_UPCALL, BAD_UPCALL }
+classify_upcall(const struct dpif_upcall *upcall)
+{
+ union user_action_cookie cookie;
+
+ /* First look at the upcall type. */
+ switch (upcall->type) {
+ case DPIF_UC_ACTION:
+ break;
+
+ case DPIF_UC_MISS:
+ return MISS_UPCALL;
+
+ case DPIF_N_UC_TYPES:
+ default:
+ VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32, upcall->type);
+ return BAD_UPCALL;
+ }
+
+ /* "action" upcalls need a closer look. */
+ if (!upcall->userdata) {
+ VLOG_WARN_RL(&rl, "action upcall missing cookie");
+ return BAD_UPCALL;
+ }
+ if (nl_attr_get_size(upcall->userdata) != sizeof(cookie)) {
+ VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %zu",
+ nl_attr_get_size(upcall->userdata));
+ return BAD_UPCALL;
+ }
+ memcpy(&cookie, nl_attr_get(upcall->userdata), sizeof(cookie));
+ switch (cookie.type) {
+ case USER_ACTION_COOKIE_SFLOW:
+ return SFLOW_UPCALL;
+
+ case USER_ACTION_COOKIE_SLOW_PATH:
+ return MISS_UPCALL;
+
+ case USER_ACTION_COOKIE_UNSPEC:
+ default:
+ VLOG_WARN_RL(&rl, "invalid user cookie : 0x%"PRIx64,
+ nl_attr_get_u64(upcall->userdata));
+ return BAD_UPCALL;
+ }
+}
+
+static void
+handle_sflow_upcall(struct dpif_backer *backer,
+ const struct dpif_upcall *upcall)
+{
+ struct ofproto_dpif *ofproto;
+ union user_action_cookie cookie;
+ struct flow flow;
+ uint32_t odp_in_port;
+
+ if (ofproto_receive(backer, upcall->packet, upcall->key, upcall->key_len,
+ &flow, NULL, &ofproto, &odp_in_port, NULL)
+ || !ofproto->sflow) {
+ return;
+ }
+
+ memcpy(&cookie, nl_attr_get(upcall->userdata), sizeof(cookie));
+ dpif_sflow_received(ofproto->sflow, upcall->packet, &flow,
+ odp_in_port, &cookie);
+}
+
+static int
+handle_upcalls(struct dpif_backer *backer, unsigned int max_batch)
+{
+ struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
+ struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
+ uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
+ int n_processed;
+ int n_misses;
+ int i;
+
+ ovs_assert(max_batch <= FLOW_MISS_MAX_BATCH);
+
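+ /* First pass: receive and classify up to 'max_batch' upcalls, handling
+ * sFlow and bad upcalls immediately but deferring flow misses so that
+ * they can be processed as a single batch below. */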
+ n_misses = 0;
+ for (n_processed = 0; n_processed < max_batch; n_processed++) {
+ struct dpif_upcall *upcall = &misses[n_misses];
+ struct ofpbuf *buf = &miss_bufs[n_misses];
+ int error;
+
+ ofpbuf_use_stub(buf, miss_buf_stubs[n_misses],
+ sizeof miss_buf_stubs[n_misses]);
+ error = dpif_recv(backer->dpif, upcall, buf);
+ if (error) {
+ ofpbuf_uninit(buf);
+ break;
+ }
+
+ switch (classify_upcall(upcall)) {
+ case MISS_UPCALL:
+ /* Handle it later. */
+ n_misses++;
+ break;
+
+ case SFLOW_UPCALL:
+ handle_sflow_upcall(backer, upcall);
+ ofpbuf_uninit(buf);
+ break;
+
+ case BAD_UPCALL:
+ ofpbuf_uninit(buf);
+ break;
+ }
+ }
+
+ /* Handle deferred MISS_UPCALL processing. */
+ handle_miss_upcalls(backer, misses, n_misses);
+ for (i = 0; i < n_misses; i++) {
+ ofpbuf_uninit(&miss_bufs[i]);
+ }
+
+ return n_processed;
+}
+\f
+/* Flow expiration. */
+
+static int subfacet_max_idle(const struct ofproto_dpif *);
+static void update_stats(struct dpif_backer *);
+static void rule_expire(struct rule_dpif *);
+static void expire_subfacets(struct ofproto_dpif *, int dp_max_idle);
+
+/* This function is called periodically by run(). Its job is to collect
+ * updates for the flows that have been installed into the datapath, most
+ * importantly when they last were used, and then use that information to
+ * expire flows that have not been used recently.
+ *
+ * Returns the number of milliseconds after which it should be called again. */
+static int
+expire(struct dpif_backer *backer)
+{
+ struct ofproto_dpif *ofproto;
+ int max_idle = INT32_MAX;
+
+ /* Periodically clear out the drop keys in an effort to keep them
+ * relatively few. */
+ drop_key_clear(backer);
+
+ /* Update stats for each flow in the backer. */
+ update_stats(backer);
+
+ HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
+ struct rule *rule, *next_rule;
+ int dp_max_idle;
+
+ if (ofproto->backer != backer) {
+ continue;
+ }
+
+ /* Keep track of the max number of flows per ofproto_dpif. */
+ update_max_subfacet_count(ofproto);
+
+ /* Expire subfacets that have been idle too long. */
+ dp_max_idle = subfacet_max_idle(ofproto);
+ expire_subfacets(ofproto, dp_max_idle);
+
+ max_idle = MIN(max_idle, dp_max_idle);
+
+ /* Expire OpenFlow flows whose idle_timeout or hard_timeout
+ * has passed. */
+ LIST_FOR_EACH_SAFE (rule, next_rule, expirable,
+ &ofproto->up.expirable) {
+ rule_expire(rule_dpif_cast(rule));
+ }
+
+ /* All outstanding data in existing flows has been accounted, so it's a
+ * good time to do bond rebalancing. */
+ if (ofproto->has_bonded_bundles) {
+ struct ofbundle *bundle;
+
+ HMAP_FOR_EACH (bundle, hmap_node, &ofproto->bundles) {
+ if (bundle->bond) {
+ bond_rebalance(bundle->bond, &backer->revalidate_set);
+ }
+ }
+ }
+ }
+
+ return MIN(max_idle, 1000);
+}
+
+/* Updates flow table statistics given that the datapath just reported 'stats'
+ * as 'subfacet''s statistics. */
+static void
+update_subfacet_stats(struct subfacet *subfacet,
+ const struct dpif_flow_stats *stats)
+{
+ struct facet *facet = subfacet->facet;
+
+ if (stats->n_packets >= subfacet->dp_packet_count) {
+ uint64_t extra = stats->n_packets - subfacet->dp_packet_count;
+ facet->packet_count += extra;
+ } else {
+ VLOG_WARN_RL(&rl, "unexpected packet count from the datapath");
+ }
+
+ if (stats->n_bytes >= subfacet->dp_byte_count) {
+ facet->byte_count += stats->n_bytes - subfacet->dp_byte_count;
+ } else {
+ VLOG_WARN_RL(&rl, "unexpected byte count from datapath");
+ }
+
+ subfacet->dp_packet_count = stats->n_packets;
+ subfacet->dp_byte_count = stats->n_bytes;
+
+ facet->tcp_flags |= stats->tcp_flags;
+
+ subfacet_update_time(subfacet, stats->used);
+ if (facet->accounted_bytes < facet->byte_count) {
+ facet_learn(facet);
+ facet_account(facet);
+ facet->accounted_bytes = facet->byte_count;
+ }
+}
+
+/* 'key' with length 'key_len' bytes is a flow in 'dpif' that we know nothing
+ * about, or a flow that shouldn't be installed but was anyway. Delete it. */
+static void
+delete_unexpected_flow(struct ofproto_dpif *ofproto,
+ const struct nlattr *key, size_t key_len)
+{
+ if (!VLOG_DROP_WARN(&rl)) {
+ struct ds s;
+
+ ds_init(&s);
+ odp_flow_key_format(key, key_len, &s);
+ VLOG_WARN("unexpected flow on %s: %s", ofproto->up.name, ds_cstr(&s));
+ ds_destroy(&s);
+ }
+
+ COVERAGE_INC(facet_unexpected);
+ dpif_flow_del(ofproto->backer->dpif, key, key_len, NULL);
+}
+
+/* Update 'packet_count', 'byte_count', and 'used' members of installed facets.
+ *
+ * This function also pushes statistics updates to rules which each facet
+ * resubmits into. Generally these statistics will be accurate. However, if a
+ * facet changes the rule it resubmits into at some time in between
+ * update_stats() runs, it is possible that statistics accrued to the
+ * old rule will be incorrectly attributed to the new rule. This could be
+ * avoided by calling update_stats() whenever rules are created or
+ * deleted. However, the performance impact of making so many calls to the
+ * datapath does not justify the benefit of having perfectly accurate statistics.
+ *
+ * In addition, this function maintains per-ofproto flow hit counts. The patch
+ * port is not treated specially: for example, a packet that ingresses on br0
+ * and is patched into br1 increases the hit count of br0 by 1, but it does
+ * not affect the hit or miss counts of br1.
+ */
+static void
+update_stats(struct dpif_backer *backer)
+{
+ const struct dpif_flow_stats *stats;
+ struct dpif_flow_dump dump;
+ const struct nlattr *key;
+ size_t key_len;
+
+ dpif_flow_dump_start(&dump, backer->dpif);
+ while (dpif_flow_dump_next(&dump, &key, &key_len, NULL, NULL, &stats)) {
+ struct flow flow;
+ struct subfacet *subfacet;
+ struct ofproto_dpif *ofproto;
+ struct ofport_dpif *ofport;
+ uint32_t key_hash;
+
+ if (ofproto_receive(backer, NULL, key, key_len, &flow, NULL, &ofproto,
+ NULL, NULL)) {
+ continue;
+ }
+
+ ofproto->total_subfacet_count += hmap_count(&ofproto->subfacets);
+ ofproto->n_update_stats++;
+ update_moving_averages(ofproto);
+
+ ofport = get_ofp_port(ofproto, flow.in_port);
+ if (ofport && ofport->tnl_port) {
+ netdev_vport_inc_rx(ofport->up.netdev, stats);
+ }
+
+ key_hash = odp_flow_key_hash(key, key_len);
+ subfacet = subfacet_find(ofproto, key, key_len, key_hash);
+ switch (subfacet ? subfacet->path : SF_NOT_INSTALLED) {
+ case SF_FAST_PATH:
+ /* Update ofproto_dpif's hit count. */
+ if (stats->n_packets > subfacet->dp_packet_count) {
+ uint64_t delta = stats->n_packets - subfacet->dp_packet_count;
+ dpif_stats_update_hit_count(ofproto, delta);
+ }
+
+ update_subfacet_stats(subfacet, stats);
+ break;
+
+ case SF_SLOW_PATH:
+ /* Stats are updated per-packet. */
+ break;
+
+ case SF_NOT_INSTALLED:
+ default:
+ delete_unexpected_flow(ofproto, key, key_len);
+ break;
+ }
+ run_fast_rl();
+ }
+ dpif_flow_dump_done(&dump);
+}
+
+/* Calculates and returns the number of milliseconds of idle time after which
+ * subfacets should expire from the datapath. When a subfacet expires, we fold
+ * its statistics into its facet, and when a facet's last subfacet expires, we
+ * fold its statistic into its rule. */
+static int
+subfacet_max_idle(const struct ofproto_dpif *ofproto)
+{
+ /*
+ * Idle time histogram.
+ *
+ * Most of the time a switch has a relatively small number of subfacets.
+ * When this is the case we might as well keep statistics for all of them
+ * in userspace and cache them in the kernel datapath for performance as
+ * well.
+ *
+ * As the number of subfacets increases, the memory required to maintain
+ * statistics about them in userspace and in the kernel becomes
+ * significant. However, with a large number of subfacets it is likely
+ * that only a few of them are "heavy hitters" that consume a large amount
+ * of bandwidth. At this point, only heavy hitters are worth caching in
+ * the kernel and maintaining in userspace; other subfacets we can
+ * discard.
+ *
+ * The technique used to compute the idle time is to build a histogram with
+ * N_BUCKETS buckets whose width is BUCKET_WIDTH msecs each. Each subfacet
+ * that is installed in the kernel gets dropped in the appropriate bucket.
+ * After the histogram has been built, we compute the cutoff so that only
+ * the most-recently-used 1% of subfacets (but at least
+ * ofproto->up.flow_eviction_threshold flows) are kept cached. At least
+ * the most-recently-used bucket of subfacets is kept, so actually an
+ * arbitrary number of subfacets can be kept in any given expiration run
+ * (though the next run will delete most of those unless they receive
+ * additional data).
+ *
+ * This requires a second pass through the subfacets, in addition to the
+ * pass made by update_stats(), because the former function never looks at
+ * uninstallable subfacets.
+ */
+ enum { BUCKET_WIDTH = ROUND_UP(100, TIME_UPDATE_INTERVAL) };
+ enum { N_BUCKETS = 5000 / BUCKET_WIDTH };
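+ /* For example, if BUCKET_WIDTH works out to 100 ms, this yields 50
+ * buckets covering idle times of up to 5 seconds. */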
+ int buckets[N_BUCKETS] = { 0 };
+ int total, subtotal, bucket;
+ struct subfacet *subfacet;
+ long long int now;
+ int i;
+
+ total = hmap_count(&ofproto->subfacets);
+ if (total <= ofproto->up.flow_eviction_threshold) {
+ return N_BUCKETS * BUCKET_WIDTH;
+ }