From: Ben Pfaff Date: Thu, 19 Sep 2013 18:03:47 +0000 (-0700) Subject: ofproto-dpif-upcall: Forward packets in order of arrival. X-Git-Tag: sliver-openvswitch-2.0.90-1~13^2~21 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=04a19fb8f4b8ba19a9805906aac7b30b65b57206;p=sliver-openvswitch.git ofproto-dpif-upcall: Forward packets in order of arrival. Until now, the code in ofproto-dpif-upcall (and the code that preceded it in ofproto-dpif) obtained a batch of incoming packets, inserted them into a hash table based on hashes of their flows, processed them, and then forwarded them in hash order. Usually this maintains order within a single network connection, but because OVS's notion of a flow is so fine-grained, it can reorder packets within (e.g.) a TCP connection if two packets handled in a single batch have (e.g.) different ECN values. This commit fixes the problem by making ofproto-dpif-upcall always forward packets in the same order they were received. This is far from the minimal change necessary to avoid reordering packets, but I think that the code is easier to understand afterward. 
Reported-by: Dmitry Fleytman Signed-off-by: Ben Pfaff Acked-by: Jarno Rajahalme --- diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index b9d91b291..d75c61b61 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -381,11 +381,11 @@ udpif_dispatcher(void *arg) static void * udpif_miss_handler(void *arg) { - struct list misses = LIST_INITIALIZER(&misses); struct handler *handler = arg; set_subprogram_name("miss_handler"); for (;;) { + struct list misses = LIST_INITIALIZER(&misses); size_t i; ovs_mutex_lock(&handler->mutex); @@ -416,12 +416,6 @@ udpif_miss_handler(void *arg) static void miss_destroy(struct flow_miss *miss) { - struct upcall *upcall, *next; - - LIST_FOR_EACH_SAFE (upcall, next, list_node, &miss->upcalls) { - list_remove(&upcall->list_node); - upcall_destroy(upcall); - } xlate_out_uninit(&miss->xout); } @@ -598,105 +592,6 @@ flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto, return NULL; } -/* Executes flow miss 'miss'. May add any required datapath operations - * to 'ops', incrementing '*n_ops' for each new op. 
*/ -static void -execute_flow_miss(struct flow_miss *miss, struct dpif_op *ops, size_t *n_ops) -{ - struct ofproto_dpif *ofproto = miss->ofproto; - struct upcall *upcall; - struct flow_wildcards wc; - struct rule_dpif *rule; - struct xlate_in xin; - - memset(&miss->stats, 0, sizeof miss->stats); - miss->stats.used = time_msec(); - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - - miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow); - miss->stats.n_bytes += packet->size; - miss->stats.n_packets++; - } - - flow_wildcards_init_catchall(&wc); - rule_dpif_lookup(ofproto, &miss->flow, &wc, &rule); - rule_dpif_credit_stats(rule, &miss->stats); - xlate_in_init(&xin, ofproto, &miss->flow, rule, miss->stats.tcp_flags, - NULL); - xin.may_learn = true; - xin.resubmit_stats = &miss->stats; - xlate_actions(&xin, &miss->xout); - flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc); - - if (rule_dpif_fail_open(rule)) { - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - struct ofputil_packet_in *pin; - - /* Extra-special case for fail-open mode. - * - * We are in fail-open mode and the packet matched the fail-open - * rule, but we are connected to a controller too. We should send - * the packet up to the controller in the hope that it will try to - * set up a flow and thereby allow us to exit fail-open. - * - * See the top-level comment in fail-open.c for more information. */ - pin = xmalloc(sizeof(*pin)); - pin->packet = xmemdup(packet->data, packet->size); - pin->packet_len = packet->size; - pin->reason = OFPR_NO_MATCH; - pin->controller_id = 0; - pin->table_id = 0; - pin->cookie = 0; - pin->send_len = 0; /* Not used for flow table misses. 
*/ - flow_get_metadata(&miss->flow, &pin->fmd); - ofproto_dpif_send_packet_in(ofproto, pin); - } - } - - if (miss->xout.slow) { - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - struct xlate_in xin; - - xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); - xlate_actions_for_side_effects(&xin); - } - } - rule_dpif_unref(rule); - - if (miss->xout.odp_actions.size) { - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - struct dpif_op *op = &ops[*n_ops]; - struct dpif_execute *execute = &op->u.execute; - - if (miss->flow.in_port.ofp_port - != vsp_realdev_to_vlandev(miss->ofproto, - miss->flow.in_port.ofp_port, - miss->flow.vlan_tci)) { - /* This packet was received on a VLAN splinter port. We - * added a VLAN to the packet to make the packet resemble - * the flow, but the actions were composed assuming that - * the packet contained no VLAN. So, we must remove the - * VLAN header from the packet before trying to execute the - * actions. */ - eth_pop_vlan(packet); - } - - op->type = DPIF_OP_EXECUTE; - execute->key = miss->key; - execute->key_len = miss->key_len; - execute->packet = packet; - execute->actions = miss->xout.odp_actions.data; - execute->actions_len = miss->xout.odp_actions.size; - - (*n_ops)++; - } - } -} - static void handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) { @@ -707,92 +602,185 @@ handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) size_t n_upcalls, n_ops, i; struct flow_miss *miss; unsigned int reval_seq; + bool fail_open; - /* Construct the to-do list. + /* Extract the flow from each upcall. Construct in fmb->misses a hash + * table that maps each unique flow to a 'struct flow_miss'. 
+ * + * Most commonly there is a single packet per flow_miss, but there are + * several reasons why there might be more than one, e.g.: + * + * - The dpif packet interface does not support TSO (or UFO, etc.), so a + * large packet sent to userspace is split into a sequence of smaller + * ones. * - * This just amounts to extracting the flow from each packet and sticking - * the packets that have the same flow in the same "flow_miss" structure so - * that we can process them together. */ + * - A stream of quickly arriving packets in an established "slow-pathed" + * flow. + * + * - Rarely, a stream of quickly arriving packets in a flow not yet + * established. (This is rare because most protocols do not send + * multiple back-to-back packets before receiving a reply from the + * other end of the connection, which gives OVS a chance to set up a + * datapath flow.) + */ fmb = xmalloc(sizeof *fmb); atomic_read(&udpif->reval_seq, &fmb->reval_seq); hmap_init(&fmb->misses); n_upcalls = 0; LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { struct dpif_upcall *dupcall = &upcall->dpif_upcall; + struct ofpbuf *packet = dupcall->packet; struct flow_miss *miss = &fmb->miss_buf[n_upcalls]; struct flow_miss *existing_miss; struct ofproto_dpif *ofproto; odp_port_t odp_in_port; struct flow flow; - uint32_t hash; int error; - error = xlate_receive(udpif->backer, dupcall->packet, dupcall->key, + error = xlate_receive(udpif->backer, packet, dupcall->key, dupcall->key_len, &flow, &miss->key_fitness, &ofproto, &odp_in_port); - if (error == ENODEV) { - struct drop_key *drop_key; - - /* Received packet on datapath port for which we couldn't - * associate an ofproto. This can happen if a port is removed - * while traffic is being received. Print a rate-limited message - * in case it happens frequently. Install a drop flow so - * that future packets of the flow are inexpensively dropped - * in the kernel. 
*/ - VLOG_INFO_RL(&rl, "received packet on unassociated datapath port " - "%"PRIu32, odp_in_port); - - drop_key = xmalloc(sizeof *drop_key); - drop_key->key = xmemdup(dupcall->key, dupcall->key_len); - drop_key->key_len = dupcall->key_len; - - if (guarded_list_push_back(&udpif->drop_keys, &drop_key->list_node, - MAX_QUEUE_LENGTH)) { - seq_change(udpif->wait_seq); + if (!error) { + uint32_t hash; + + flow_extract(packet, flow.skb_priority, flow.pkt_mark, + &flow.tunnel, &flow.in_port, &miss->flow); + + hash = flow_hash(&miss->flow, 0); + existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, + hash); + if (!existing_miss) { + hmap_insert(&fmb->misses, &miss->hmap_node, hash); + miss->ofproto = ofproto; + miss->key = dupcall->key; + miss->key_len = dupcall->key_len; + miss->upcall_type = dupcall->type; + miss->stats.n_packets = 0; + miss->stats.n_bytes = 0; + miss->stats.used = time_msec(); + miss->stats.tcp_flags = 0; + + n_upcalls++; } else { - COVERAGE_INC(drop_queue_overflow); - drop_key_destroy(drop_key); + miss = existing_miss; } - continue; - } else if (error) { - continue; - } + miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow); + miss->stats.n_bytes += packet->size; + miss->stats.n_packets++; - flow_extract(dupcall->packet, flow.skb_priority, flow.pkt_mark, - &flow.tunnel, &flow.in_port, &miss->flow); - - /* Add other packets to a to-do list. */ - hash = flow_hash(&miss->flow, 0); - existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, hash); - if (!existing_miss) { - hmap_insert(&fmb->misses, &miss->hmap_node, hash); - miss->ofproto = ofproto; - miss->key = dupcall->key; - miss->key_len = dupcall->key_len; - miss->upcall_type = dupcall->type; - list_init(&miss->upcalls); - - n_upcalls++; + upcall->flow_miss = miss; } else { - miss = existing_miss; + if (error == ENODEV) { + struct drop_key *drop_key; + + /* Received packet on datapath port for which we couldn't + * associate an ofproto. 
This can happen if a port is removed + * while traffic is being received. Print a rate-limited + * message in case it happens frequently. Install a drop flow + * so that future packets of the flow are inexpensively dropped + * in the kernel. */ + VLOG_INFO_RL(&rl, "received packet on unassociated datapath " + "port %"PRIu32, odp_in_port); + + drop_key = xmalloc(sizeof *drop_key); + drop_key->key = xmemdup(dupcall->key, dupcall->key_len); + drop_key->key_len = dupcall->key_len; + + if (guarded_list_push_back(&udpif->drop_keys, + &drop_key->list_node, + MAX_QUEUE_LENGTH)) { + seq_change(udpif->wait_seq); + } else { + COVERAGE_INC(drop_queue_overflow); + drop_key_destroy(drop_key); + } + } + list_remove(&upcall->list_node); + upcall_destroy(upcall); } - list_remove(&upcall->list_node); - list_push_back(&miss->upcalls, &upcall->list_node); } - LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { - list_remove(&upcall->list_node); - upcall_destroy(upcall); - } - - /* Process each element in the to-do list, constructing the set of - * operations to batch. */ - n_ops = 0; + /* Initialize each 'struct flow_miss's ->xout. + * + * We do this per-flow_miss rather than per-packet because, most commonly, + * all the packets in a flow can use the same translation. + * + * We can't do this in the previous loop because we need the TCP flags for + * all the packets in each miss. 
*/ + fail_open = false; HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { - execute_flow_miss(miss, ops, &n_ops); + struct flow_wildcards wc; + struct rule_dpif *rule; + struct xlate_in xin; + + flow_wildcards_init_catchall(&wc); + rule_dpif_lookup(miss->ofproto, &miss->flow, &wc, &rule); + if (rule_dpif_fail_open(rule)) { + fail_open = true; + } + rule_dpif_credit_stats(rule, &miss->stats); + xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, + miss->stats.tcp_flags, NULL); + xin.may_learn = true; + xin.resubmit_stats = &miss->stats; + xlate_actions(&xin, &miss->xout); + flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc); + rule_dpif_unref(rule); + } + + /* Now handle the packets individually in order of arrival. In the common + * case each packet of a miss can share the same actions, but slow-pathed + * packets need to be translated individually: + * + * - For SLOW_CFM, SLOW_LACP, SLOW_STP, and SLOW_BFD, translation is what + * processes received packets for these protocols. + * + * - For SLOW_CONTROLLER, translation sends the packet to the OpenFlow + * controller. + * + * The loop fills 'ops' with an array of operations to execute in the + * datapath. */ + n_ops = 0; + LIST_FOR_EACH (upcall, list_node, upcalls) { + struct flow_miss *miss = upcall->flow_miss; + struct ofpbuf *packet = upcall->dpif_upcall.packet; + + if (miss->xout.slow) { + struct rule_dpif *rule; + struct xlate_in xin; + + rule_dpif_lookup(miss->ofproto, &miss->flow, NULL, &rule); + xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); + xlate_actions_for_side_effects(&xin); + rule_dpif_unref(rule); + } + + if (miss->xout.odp_actions.size) { + struct dpif_op *op; + + if (miss->flow.in_port.ofp_port + != vsp_realdev_to_vlandev(miss->ofproto, + miss->flow.in_port.ofp_port, + miss->flow.vlan_tci)) { + /* This packet was received on a VLAN splinter port. 
We + * added a VLAN to the packet to make the packet resemble + * the flow, but the actions were composed assuming that + * the packet contained no VLAN. So, we must remove the + * VLAN header from the packet before trying to execute the + * actions. */ + eth_pop_vlan(packet); + } + + op = &ops[n_ops++]; + op->type = DPIF_OP_EXECUTE; + op->u.execute.key = miss->key; + op->u.execute.key_len = miss->key_len; + op->u.execute.packet = packet; + op->u.execute.actions = miss->xout.odp_actions.data; + op->u.execute.actions_len = miss->xout.odp_actions.size; + } } - ovs_assert(n_ops <= ARRAY_SIZE(ops)); /* Execute batch. */ for (i = 0; i < n_ops; i++) { @@ -800,6 +788,32 @@ handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) } dpif_operate(udpif->dpif, opsp, n_ops); + /* Special case for fail-open mode. + * + * If we are in fail-open mode, but we are connected to a controller too, + * then we should send the packet up to the controller in the hope that it + * will try to set up a flow and thereby allow us to exit fail-open. + * + * See the top-level comment in fail-open.c for more information. */ + if (fail_open) { + LIST_FOR_EACH (upcall, list_node, upcalls) { + struct flow_miss *miss = upcall->flow_miss; + struct ofpbuf *packet = upcall->dpif_upcall.packet; + struct ofputil_packet_in *pin; + + pin = xmalloc(sizeof *pin); + pin->packet = xmemdup(packet->data, packet->size); + pin->packet_len = packet->size; + pin->reason = OFPR_NO_MATCH; + pin->controller_id = 0; + pin->table_id = 0; + pin->cookie = 0; + pin->send_len = 0; /* Not used for flow table misses. 
*/ + flow_get_metadata(&miss->flow, &pin->fmd); + ofproto_dpif_send_packet_in(miss->ofproto, pin); + } + } + atomic_read(&udpif->reval_seq, &reval_seq); if (reval_seq != fmb->reval_seq) { COVERAGE_INC(fmb_queue_revalidated); diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h index a23f7a0e2..9bd19ad52 100644 --- a/ofproto/ofproto-dpif-upcall.h +++ b/ofproto/ofproto-dpif-upcall.h @@ -59,6 +59,7 @@ enum upcall_type { /* An upcall. */ struct upcall { struct list list_node; /* For queuing upcalls. */ + struct flow_miss *flow_miss; /* This upcall's flow_miss. */ enum upcall_type type; /* Classification. */ @@ -94,8 +95,6 @@ struct flow_miss { struct dpif_flow_stats stats; struct xlate_out xout; - - struct list upcalls; /* Contains "struct upcall"s. */ }; struct flow_miss_batch {