X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=lib%2Fdpif-netdev.c;h=96c5feb428bdaaa508013c917848a5236be476ed;hb=8ba0a5227f6e6b50838c157bd303c2d5bf6f4e59;hp=18da4e8736f0372031cb91932df1f7e21df61c01;hpb=e4cfed38b159aba1ef44ed4a7b1f3e982b7358d4;p=sliver-openvswitch.git

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 18da4e873..96c5feb42 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -43,6 +43,7 @@
 #include "list.h"
 #include "meta-flow.h"
 #include "netdev.h"
+#include "netdev-dpdk.h"
 #include "netdev-vport.h"
 #include "netlink.h"
 #include "odp-execute.h"
@@ -67,6 +68,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 #define NETDEV_RULE_PRIORITY 0x8000
 
 #define NR_THREADS 1
+/* Use per thread recirc_depth to prevent recirculation loop. */
+#define MAX_RECIRC_DEPTH 5
+DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
 
 /* Configuration parameters. */
 enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
@@ -194,7 +198,7 @@ struct dp_netdev_port {
     odp_port_t port_no;
     struct netdev *netdev;
     struct netdev_saved_flags *sf;
-    struct netdev_rx *rx;
+    struct netdev_rxq **rxq;
     struct ovs_refcount ref_cnt;
     char *type;                 /* Port type as requested by user. */
 };
@@ -319,7 +323,6 @@ struct pmd_thread {
     pthread_t thread;
     int id;
    atomic_uint change_seq;
-    char *name;
 };
 
 /* Interface to netdev-based datapath. */
@@ -349,10 +352,11 @@ static int dpif_netdev_open(const struct dpif_class *, const char *name,
                             bool create, struct dpif **);
 static int dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *,
                                       int queue_no, int type,
-                                      const struct flow *,
+                                      const struct miniflow *,
                                       const struct nlattr *userdata);
 static void dp_netdev_execute_actions(struct dp_netdev *dp,
-                                      const struct flow *, struct ofpbuf *, bool may_steal,
+                                      const struct miniflow *,
+                                      struct ofpbuf *, bool may_steal,
                                       struct pkt_metadata *,
                                       const struct nlattr *actions,
                                       size_t actions_len);
@@ -675,6 +679,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
     enum netdev_flags flags;
     const char *open_type;
     int error;
+    int i;
 
     /* XXX reject devices already in some dp_netdev. */
 
@@ -696,21 +701,26 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
     port = xzalloc(sizeof *port);
     port->port_no = port_no;
     port->netdev = netdev;
+    port->rxq = xmalloc(sizeof *port->rxq * netdev_n_rxq(netdev));
     port->type = xstrdup(type);
-    error = netdev_rx_open(netdev, &port->rx);
-    if (error
-        && !(error == EOPNOTSUPP && dpif_netdev_class_is_dummy(dp->class))) {
-        VLOG_ERR("%s: cannot receive packets on this network device (%s)",
-                 devname, ovs_strerror(errno));
-        netdev_close(netdev);
-        return error;
+    for (i = 0; i < netdev_n_rxq(netdev); i++) {
+        error = netdev_rxq_open(netdev, &port->rxq[i], i);
+        if (error
+            && !(error == EOPNOTSUPP && dpif_netdev_class_is_dummy(dp->class))) {
+            VLOG_ERR("%s: cannot receive packets on this network device (%s)",
+                     devname, ovs_strerror(errno));
+            netdev_close(netdev);
+            return error;
+        }
     }
 
     error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
     if (error) {
-        netdev_rx_close(port->rx);
+        for (i = 0; i < netdev_n_rxq(netdev); i++) {
+            netdev_rxq_close(port->rxq[i]);
+        }
         netdev_close(netdev);
-        free(port->rx);
+        free(port->rxq);
         free(port);
         return error;
     }
@@ -817,9 +827,14 @@ static void
 port_unref(struct dp_netdev_port *port)
 {
     if (port && ovs_refcount_unref(&port->ref_cnt) == 1) {
+        int i;
+
         netdev_close(port->netdev);
         netdev_restore_flags(port->sf);
-        netdev_rx_close(port->rx);
+
+        for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
+            netdev_rxq_close(port->rxq[i]);
+        }
         free(port->type);
         free(port);
     }
@@ -1048,13 +1063,15 @@ dp_netdev_flow_cast(const struct cls_rule *cr)
 }
 
 static struct dp_netdev_flow *
-dp_netdev_lookup_flow(const struct dp_netdev *dp, const struct flow *flow)
+dp_netdev_lookup_flow(const struct dp_netdev *dp, const struct miniflow *key)
     OVS_EXCLUDED(dp->cls.rwlock)
 {
     struct dp_netdev_flow *netdev_flow;
+    struct cls_rule *rule;
 
     fat_rwlock_rdlock(&dp->cls.rwlock);
-    netdev_flow = dp_netdev_flow_cast(classifier_lookup(&dp->cls, flow, NULL));
+    rule = classifier_lookup_miniflow_first(&dp->cls, key);
+    netdev_flow = dp_netdev_flow_cast(rule);
     fat_rwlock_unlock(&dp->cls.rwlock);
 
     return netdev_flow;
@@ -1124,8 +1141,6 @@ dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
             return EINVAL;
         }
 
-        /* Force unwildcard the in_port. */
-        mask->in_port.odp_port = u32_to_odp(UINT32_MAX);
     } else {
         enum mf_field_id id;
         /* No mask key, unwildcard everything except fields whose
@@ -1144,6 +1159,14 @@ dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
         }
     }
 
+    /* Force unwildcard the in_port.
+     *
+     * We need to do this even in the case where we unwildcard "everything"
+     * above because "everything" only includes the 16-bit OpenFlow port number
+     * mask->in_port.ofp_port, which only covers half of the 32-bit datapath
+     * port number mask->in_port.odp_port. */
+    mask->in_port.odp_port = u32_to_odp(UINT32_MAX);
+
     return 0;
 }
 
@@ -1273,6 +1296,7 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
     struct dp_netdev *dp = get_dp_netdev(dpif);
     struct dp_netdev_flow *netdev_flow;
     struct flow flow;
+    struct miniflow miniflow;
     struct flow_wildcards wc;
     int error;
 
@@ -1286,9 +1310,10 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
     if (error) {
         return error;
     }
+    miniflow_init(&miniflow, &flow);
 
     ovs_mutex_lock(&dp->flow_mutex);
-    netdev_flow = dp_netdev_lookup_flow(dp, &flow);
+    netdev_flow = dp_netdev_lookup_flow(dp, &miniflow);
     if (!netdev_flow) {
         if (put->flags & DPIF_FP_CREATE) {
             if (hmap_count(&dp->flow_table) < MAX_FLOWS) {
@@ -1421,6 +1446,7 @@ dpif_netdev_flow_dump_next(const struct dpif *dpif, void *iter_, void *state_,
     struct dp_netdev_flow_state *state = state_;
     struct dp_netdev *dp = get_dp_netdev(dpif);
     struct dp_netdev_flow *netdev_flow;
+    struct flow_wildcards wc;
     int error;
 
     ovs_mutex_lock(&iter->mutex);
@@ -1443,29 +1469,29 @@ dpif_netdev_flow_dump_next(const struct dpif *dpif, void *iter_, void *state_,
         return error;
     }
 
+    minimask_expand(&netdev_flow->cr.match.mask, &wc);
+
     if (key) {
         struct ofpbuf buf;
 
         ofpbuf_use_stack(&buf, &state->keybuf, sizeof state->keybuf);
-        odp_flow_key_from_flow(&buf, &netdev_flow->flow,
+        odp_flow_key_from_flow(&buf, &netdev_flow->flow, &wc.masks,
                                netdev_flow->flow.in_port.odp_port);
 
-        *key = buf.data;
-        *key_len = buf.size;
+        *key = ofpbuf_data(&buf);
+        *key_len = ofpbuf_size(&buf);
     }
 
     if (key && mask) {
         struct ofpbuf buf;
-        struct flow_wildcards wc;
 
         ofpbuf_use_stack(&buf, &state->maskbuf, sizeof state->maskbuf);
-        minimask_expand(&netdev_flow->cr.match.mask, &wc);
         odp_flow_key_from_mask(&buf, &wc.masks, &netdev_flow->flow,
                                odp_to_u32(wc.masks.in_port.odp_port),
                                SIZE_MAX);
 
-        *mask = buf.data;
-        *mask_len = buf.size;
+        *mask = ofpbuf_data(&buf);
+        *mask_len = ofpbuf_size(&buf);
     }
 
     if (actions || stats) {
@@ -1501,15 +1527,17 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
     struct pkt_metadata *md = &execute->md;
-    struct flow key;
+    struct miniflow key;
+    uint32_t buf[FLOW_U32S];
 
-    if (execute->packet->size < ETH_HEADER_LEN ||
-        execute->packet->size > UINT16_MAX) {
+    if (ofpbuf_size(execute->packet) < ETH_HEADER_LEN ||
+        ofpbuf_size(execute->packet) > UINT16_MAX) {
         return EINVAL;
     }
 
     /* Extract flow key. */
-    flow_extract(execute->packet, md, &key);
+    miniflow_initialize(&key, buf);
+    miniflow_extract(execute->packet, md, &key);
 
     ovs_rwlock_rdlock(&dp->port_rwlock);
     dp_netdev_execute_actions(dp, &key, execute->packet, false, md,
@@ -1722,15 +1750,15 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
 }
 
-inline static void
-dp_netdev_process_rx_port(struct dp_netdev *dp,
+static void
+dp_netdev_process_rxq_port(struct dp_netdev *dp,
                           struct dp_netdev_port *port,
-                          struct netdev_rx *queue)
+                          struct netdev_rxq *rxq)
 {
     struct ofpbuf *packet[NETDEV_MAX_RX_BATCH];
     int error, c;
 
-    error = netdev_rx_recv(queue, packet, &c);
+    error = netdev_rxq_recv(rxq, packet, &c);
     if (!error) {
         struct pkt_metadata md = PKT_METADATA_INITIALIZER(port->port_no);
         int i;
 
@@ -1757,8 +1785,12 @@ dpif_netdev_run(struct dpif *dpif)
 
     ovs_rwlock_rdlock(&dp->port_rwlock);
     HMAP_FOR_EACH (port, node, &dp->ports) {
-        if (port->rx && !netdev_is_pmd(port->netdev)) {
-            dp_netdev_process_rx_port(dp, port, port->rx);
+        if (!netdev_is_pmd(port->netdev)) {
+            int i;
+
+            for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
+                dp_netdev_process_rxq_port(dp, port, port->rxq[i]);
+            }
         }
     }
 
@@ -1774,23 +1806,28 @@ dpif_netdev_wait(struct dpif *dpif)
 
     ovs_rwlock_rdlock(&dp->port_rwlock);
     HMAP_FOR_EACH (port, node, &dp->ports) {
-        if (port->rx && !netdev_is_pmd(port->netdev)) {
-            netdev_rx_wait(port->rx);
+        if (!netdev_is_pmd(port->netdev)) {
+            int i;
+
+            for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
+                netdev_rxq_wait(port->rxq[i]);
+            }
         }
     }
     ovs_rwlock_unlock(&dp->port_rwlock);
 }
 
-struct rx_poll {
+struct rxq_poll {
     struct dp_netdev_port *port;
+    struct netdev_rxq *rx;
 };
 
 static int
 pmd_load_queues(struct pmd_thread *f,
-                struct rx_poll **ppoll_list, int poll_cnt)
+                struct rxq_poll **ppoll_list, int poll_cnt)
 {
     struct dp_netdev *dp = f->dp;
-    struct rx_poll *poll_list = *ppoll_list;
+    struct rxq_poll *poll_list = *ppoll_list;
     struct dp_netdev_port *port;
     int id = f->id;
     int index;
@@ -1807,13 +1844,19 @@ pmd_load_queues(struct pmd_thread *f,
 
     HMAP_FOR_EACH (port, node, &f->dp->ports) {
         if (netdev_is_pmd(port->netdev)) {
-            if ((index % dp->n_pmd_threads) == id) {
-                poll_list = xrealloc(poll_list, sizeof *poll_list * (poll_cnt + 1));
+            int i;
+
+            for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
+                if ((index % dp->n_pmd_threads) == id) {
+                    poll_list = xrealloc(poll_list, sizeof *poll_list * (poll_cnt + 1));
 
-                port_ref(port);
-                poll_list[poll_cnt++].port = port;
+                    port_ref(port);
+                    poll_list[poll_cnt].port = port;
+                    poll_list[poll_cnt].rx = port->rxq[i];
+                    poll_cnt++;
+                }
+                index++;
             }
-            index++;
         }
     }
 
@@ -1828,16 +1871,15 @@ pmd_thread_main(void *f_)
     struct pmd_thread *f = f_;
     struct dp_netdev *dp = f->dp;
     unsigned int lc = 0;
-    struct rx_poll *poll_list;
+    struct rxq_poll *poll_list;
     unsigned int port_seq;
     int poll_cnt;
     int i;
 
-    f->name = xasprintf("pmd_%u", ovsthread_id_self());
-    set_subprogram_name("%s", f->name);
     poll_cnt = 0;
     poll_list = NULL;
 
+    pmd_thread_setaffinity_cpu(f->id);
 reload:
     poll_cnt = pmd_load_queues(f, &poll_list, poll_cnt);
     atomic_read(&f->change_seq, &port_seq);
@@ -1847,7 +1889,7 @@ reload:
         int i;
 
         for (i = 0; i < poll_cnt; i++) {
-            dp_netdev_process_rx_port(dp, poll_list[i].port, poll_list[i].port->rx);
+            dp_netdev_process_rxq_port(dp, poll_list[i].port, poll_list[i].rx);
         }
 
         if (lc++ > 1024) {
@@ -1873,7 +1915,6 @@ reload:
     }
 
     free(poll_list);
-    free(f->name);
     return NULL;
 }
 
@@ -1910,7 +1951,7 @@ dp_netdev_set_pmd_threads(struct dp_netdev *dp, int n)
 
         /* Each thread will distribute all devices rx-queues among
         * themselves. */
-        xpthread_create(&f->thread, NULL, pmd_thread_main, f);
+        f->thread = ovs_thread_create("pmd", pmd_thread_main, f);
     }
 }
 
@@ -1926,9 +1967,9 @@ dp_netdev_flow_stats_new_cb(void)
 
 static void
 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow,
                     const struct ofpbuf *packet,
-                    const struct flow *key)
+                    const struct miniflow *key)
 {
-    uint16_t tcp_flags = ntohs(key->tcp_flags);
+    uint16_t tcp_flags = miniflow_get_tcp_flags(key);
     long long int now = time_msec();
     struct dp_netdev_flow_stats *bucket;
@@ -1938,7 +1979,7 @@ dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow,
     ovs_mutex_lock(&bucket->mutex);
     bucket->used = MAX(now, bucket->used);
     bucket->packet_count++;
-    bucket->byte_count += packet->size;
+    bucket->byte_count += ofpbuf_size(packet);
     bucket->tcp_flags |= tcp_flags;
     ovs_mutex_unlock(&bucket->mutex);
 }
@@ -1963,17 +2004,21 @@ dp_netdev_count_packet(struct dp_netdev *dp, enum dp_stat_type type)
 }
 
 static void
-dp_netdev_port_input(struct dp_netdev *dp, struct ofpbuf *packet,
-                     struct pkt_metadata *md)
+dp_netdev_input(struct dp_netdev *dp, struct ofpbuf *packet,
+                struct pkt_metadata *md)
+    OVS_REQ_RDLOCK(dp->port_rwlock)
 {
     struct dp_netdev_flow *netdev_flow;
-    struct flow key;
+    struct miniflow key;
+    uint32_t buf[FLOW_U32S];
 
-    if (packet->size < ETH_HEADER_LEN) {
+    if (ofpbuf_size(packet) < ETH_HEADER_LEN) {
         ofpbuf_delete(packet);
         return;
     }
-    flow_extract(packet, md, &key);
+    miniflow_initialize(&key, buf);
+    miniflow_extract(packet, md, &key);
+
     netdev_flow = dp_netdev_lookup_flow(dp, &key);
     if (netdev_flow) {
         struct dp_netdev_actions *actions;
@@ -1987,15 +2032,27 @@ dp_netdev_port_input(struct dp_netdev *dp, struct ofpbuf *packet,
     } else if (dp->handler_queues) {
         dp_netdev_count_packet(dp, DP_STAT_MISS);
         dp_netdev_output_userspace(dp, packet,
-                                   flow_hash_5tuple(&key, 0) % dp->n_handlers,
+                                   miniflow_hash_5tuple(&key, 0)
+                                   % dp->n_handlers,
                                    DPIF_UC_MISS, &key, NULL);
         ofpbuf_delete(packet);
     }
 }
 
+static void
+dp_netdev_port_input(struct dp_netdev *dp, struct ofpbuf *packet,
+                     struct pkt_metadata *md)
+    OVS_REQ_RDLOCK(dp->port_rwlock)
+{
+    uint32_t *recirc_depth = recirc_depth_get();
+
+    *recirc_depth = 0;
+    dp_netdev_input(dp, packet, md);
+}
+
 static int
 dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *packet,
-                           int queue_no, int type, const struct flow *flow,
+                           int queue_no, int type, const struct miniflow *key,
                            const struct nlattr *userdata)
 {
     struct dp_netdev_queue *q;
@@ -2009,6 +2066,7 @@ dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *packet,
         struct dpif_upcall *upcall = &u->upcall;
         struct ofpbuf *buf = &u->buf;
         size_t buf_size;
+        struct flow flow;
 
         upcall->type = type;
 
@@ -2017,13 +2075,14 @@ dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *packet,
         if (userdata) {
             buf_size += NLA_ALIGN(userdata->nla_len);
         }
-        buf_size += packet->size;
+        buf_size += ofpbuf_size(packet);
         ofpbuf_init(buf, buf_size);
 
         /* Put ODP flow. */
-        odp_flow_key_from_flow(buf, flow, flow->in_port.odp_port);
-        upcall->key = buf->data;
-        upcall->key_len = buf->size;
+        miniflow_expand(key, &flow);
+        odp_flow_key_from_flow(buf, &flow, NULL, flow.in_port.odp_port);
+        upcall->key = ofpbuf_data(buf);
+        upcall->key_len = ofpbuf_size(buf);
 
         /* Put userdata. */
         if (userdata) {
@@ -2031,8 +2090,9 @@ dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *packet,
                        NLA_ALIGN(userdata->nla_len));
         }
 
-        upcall->packet.data = ofpbuf_put(buf, packet->data, packet->size);
-        upcall->packet.size = packet->size;
+        ofpbuf_set_data(&upcall->packet,
+                        ofpbuf_put(buf, ofpbuf_data(packet), ofpbuf_size(packet)));
+        ofpbuf_set_size(&upcall->packet, ofpbuf_size(packet));
 
         seq_change(q->seq);
 
@@ -2049,18 +2109,19 @@ dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *packet,
 
 struct dp_netdev_execute_aux {
     struct dp_netdev *dp;
-    const struct flow *key;
+    const struct miniflow *key;
 };
 
 static void
 dp_execute_cb(void *aux_, struct ofpbuf *packet,
-              const struct pkt_metadata *md OVS_UNUSED,
+              struct pkt_metadata *md,
               const struct nlattr *a, bool may_steal)
     OVS_NO_THREAD_SAFETY_ANALYSIS
 {
     struct dp_netdev_execute_aux *aux = aux_;
     int type = nl_attr_type(a);
     struct dp_netdev_port *p;
+    uint32_t *depth = recirc_depth_get();
 
     switch ((enum ovs_action_attr)type) {
     case OVS_ACTION_ATTR_OUTPUT:
@@ -2076,7 +2137,7 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
 
         userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
         dp_netdev_output_userspace(aux->dp, packet,
-                                   flow_hash_5tuple(aux->key, 0)
+                                   miniflow_hash_5tuple(aux->key, 0)
                                    % aux->dp->n_handlers,
                                    DPIF_UC_ACTION, aux->key,
                                    userdata);
@@ -2086,6 +2147,47 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
         }
         break;
     }
+
+    case OVS_ACTION_ATTR_HASH: {
+        const struct ovs_action_hash *hash_act;
+        uint32_t hash;
+
+        hash_act = nl_attr_get(a);
+        if (hash_act->hash_alg == OVS_HASH_ALG_L4) {
+            /* Hash need not be symmetric, nor does it need to include
+             * L2 fields. */
+            hash = miniflow_hash_5tuple(aux->key, hash_act->hash_basis);
+            if (!hash) {
+                hash = 1; /* 0 is not valid */
+            }
+
+        } else {
+            VLOG_WARN("Unknown hash algorithm specified for the hash action.");
+            hash = 2;
+        }
+
+        md->dp_hash = hash;
+        break;
+    }
+
+    case OVS_ACTION_ATTR_RECIRC:
+        if (*depth < MAX_RECIRC_DEPTH) {
+            struct pkt_metadata recirc_md = *md;
+            struct ofpbuf *recirc_packet;
+
+            recirc_packet = may_steal ? packet : ofpbuf_clone(packet);
+            recirc_md.recirc_id = nl_attr_get_u32(a);
+
+            (*depth)++;
+            dp_netdev_input(aux->dp, recirc_packet, &recirc_md);
+            (*depth)--;
+
+            break;
+        } else {
+            VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
+        }
+        break;
+
     case OVS_ACTION_ATTR_PUSH_VLAN:
     case OVS_ACTION_ATTR_POP_VLAN:
     case OVS_ACTION_ATTR_PUSH_MPLS:
@@ -2096,11 +2198,10 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
     case __OVS_ACTION_ATTR_MAX:
         OVS_NOT_REACHED();
     }
-
 }
 
 static void
-dp_netdev_execute_actions(struct dp_netdev *dp, const struct flow *key,
+dp_netdev_execute_actions(struct dp_netdev *dp, const struct miniflow *key,
                           struct ofpbuf *packet, bool may_steal,
                           struct pkt_metadata *md,
                           const struct nlattr *actions, size_t actions_len)
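
Note on the recirculation guard: OVS_ACTION_ATTR_RECIRC re-enters
dp_netdev_input(), so the patch gives every packet-processing thread its own
depth counter (DEFINE_STATIC_PER_THREAD_DATA), resets it to zero when a packet
first enters from a port, and drops the packet once MAX_RECIRC_DEPTH nested
re-entries are reached. A minimal sketch of the same pattern, using C11
_Thread_local as a stand-in for the OVS per-thread-data macro; the
process_packet(), recirculate(), and port_input() names are illustrative, not
part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_RECIRC_DEPTH 5

    /* One counter per thread, so concurrent polling threads cannot
     * disturb each other's recirculation accounting. */
    static _Thread_local uint32_t recirc_depth;

    static void process_packet(int pkt);

    /* Re-entry point used by a "recirculate" action. */
    static void recirculate(int pkt)
    {
        if (recirc_depth < MAX_RECIRC_DEPTH) {
            recirc_depth++;
            process_packet(pkt);    /* May call recirculate() again. */
            recirc_depth--;
        } else {
            fprintf(stderr, "packet dropped: recirculation depth exceeded\n");
        }
    }

    static void process_packet(int pkt)
    {
        /* Flow lookup and action execution would go here; an action
         * may call recirculate(pkt). */
        (void) pkt;
    }

    /* Entry from a port: a fresh packet always starts at depth 0. */
    static void port_input(int pkt)
    {
        recirc_depth = 0;
        process_packet(pkt);
    }

    int main(void)
    {
        port_input(42);
        return 0;
    }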
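
Note on queue distribution: pmd_load_queues() deals the rx queues of pmd
(DPDK) ports out to the polling threads round-robin, keeping one running
index across all ports and letting thread `id` of dp->n_pmd_threads claim
every queue where (index % dp->n_pmd_threads) == id. A small sketch of that
scheme with illustrative names (assign_queues() is not part of the patch):

    #include <stdio.h>

    /* Deal n_ports * n_rxq receive queues out to n_threads pollers:
     * thread `id` claims every queue whose running index lands on its
     * slot, mirroring the (index % dp->n_pmd_threads) == id test. */
    static void assign_queues(int n_ports, int n_rxq, int n_threads, int id)
    {
        int index = 0;

        for (int port = 0; port < n_ports; port++) {
            for (int q = 0; q < n_rxq; q++) {
                if (index % n_threads == id) {
                    printf("thread %d polls port %d rxq %d\n", id, port, q);
                }
                index++;
            }
        }
    }

    int main(void)
    {
        for (int id = 0; id < 2; id++) {
            assign_queues(3, 4, 2, id);  /* 3 ports x 4 rxqs, 2 threads. */
        }
        return 0;
    }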