Post-v2.0.0
---------------------
+
+ - Address prefix tracking support for flow tables. New columns
+ "prefixes" in OVS-DB table "Flow_Table" controls which packet
+ header fields are used for address prefix tracking. Prefix
+ tracking allows the classifier to skip rules with longer than
+ necessary prefixes, resulting in better wildcarding for datapath
+ flows. Default configuration is to not use any fields for prefix
+ tracking. However, if any flow tables contain both exact matches
+ and masked matches for IP address fields, OVS performance may be
+ increased by using this feature.
+ * As of now, the fields for which prefix lookup can be enabled
+ are: 'tun_id', 'tun_src', 'tun_dst', 'nw_src', 'nw_dst' (or
+ aliases 'ip_src' and 'ip_dst'), 'ipv6_src', and 'ipv6_dst'.
+ (Using this feature for 'tun_id' would only make sense if the
+ tunnel IDs have prefix structure similar to IP addresses.)
+ * There is a maximum number of fields that can be enabled for any
+ one flow table. Currently this limit is 3.
+ * Examples:
+ $ ovs-vsctl set Bridge br0 flow_tables:0=@N1 -- \
+ --id=@N1 create Flow_Table name=table0
+ $ ovs-vsctl set Bridge br0 flow_tables:1=@N1 -- \
+ --id=@N1 create Flow_Table name=table1
+ $ ovs-vsctl set Flow_Table table0 prefixes=ip_dst,ip_src
+ $ ovs-vsctl set Flow_Table table1 prefixes=[]
- TCP flags matching: OVS now supports matching of TCP flags. This
has an adverse performance impact when using OVS userspace 1.10
or older (no megaflows support) together with the new OVS kernel
#include "hash.h"
#include "odp-util.h"
#include "ofp-util.h"
-#include "packets.h"
#include "ovs-thread.h"
+#include "packets.h"
+#include "vlog.h"
+
+VLOG_DEFINE_THIS_MODULE(classifier);
+struct trie_ctx;
static struct cls_subtable *find_subtable(const struct classifier *,
const struct minimask *);
static struct cls_subtable *insert_subtable(struct classifier *,
unsigned int del_priority);
static struct cls_rule *find_match_wc(const struct cls_subtable *,
- const struct flow *,
+ const struct flow *, struct trie_ctx *,
+ unsigned int n_tries,
struct flow_wildcards *);
static struct cls_rule *find_equal(struct cls_subtable *,
const struct miniflow *, uint32_t hash);
static struct cls_rule *next_rule_in_list__(struct cls_rule *);
static struct cls_rule *next_rule_in_list(struct cls_rule *);
+
+static unsigned int minimask_get_prefix_len(const struct minimask *,
+ const struct mf_field *);
+static void trie_init(struct classifier *, int trie_idx,
+ const struct mf_field *);
+static unsigned int trie_lookup(const struct cls_trie *, const struct flow *,
+ unsigned int *checkbits);
+
+static void trie_destroy(struct trie_node *);
+static void trie_insert(struct cls_trie *, const struct cls_rule *, int mlen);
+static void trie_remove(struct cls_trie *, const struct cls_rule *, int mlen);
+static void mask_set_prefix_bits(struct flow_wildcards *, uint8_t be32ofs,
+ unsigned int nbits);
+static bool mask_prefix_bits_set(const struct flow_wildcards *,
+ uint8_t be32ofs, unsigned int nbits);
\f
/* cls_rule. */
cls->flow_segments[cls->n_flow_segments++] = *flow_segments++;
}
}
+ cls->n_tries = 0;
}
/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
if (cls) {
struct cls_subtable *partition, *next_partition;
struct cls_subtable *subtable, *next_subtable;
+ int i;
+
+ for (i = 0; i < cls->n_tries; i++) {
+ trie_destroy(cls->tries[i].root);
+ }
HMAP_FOR_EACH_SAFE (subtable, next_subtable, hmap_node,
&cls->subtables) {
}
}
+/* We use uint64_t as a set for the fields below. */
+BUILD_ASSERT_DECL(MFF_N_IDS <= 64);
+
+/* Set the fields for which prefix lookup should be performed. */
+void
+classifier_set_prefix_fields(struct classifier *cls,
+ const enum mf_field_id *trie_fields,
+ unsigned int n_fields)
+{
+ uint64_t fields = 0;
+ int i, trie;
+
+ for (i = 0, trie = 0; i < n_fields && trie < CLS_MAX_TRIES; i++) {
+ const struct mf_field *field = mf_from_id(trie_fields[i]);
+ if (field->flow_be32ofs < 0 || field->n_bits % 32) {
+ /* Incompatible field. This is the only place where we
+ * enforce these requirements, but the rest of the trie code
+ * depends on the flow_be32ofs to be non-negative and the
+ * field length to be a multiple of 32 bits. */
+ continue;
+ }
+
+ if (fields & (UINT64_C(1) << trie_fields[i])) {
+ /* Duplicate field, there is no need to build more than
+ * one index for any one field. */
+ continue;
+ }
+ fields |= UINT64_C(1) << trie_fields[i];
+
+ if (trie >= cls->n_tries || field != cls->tries[trie].field) {
+ trie_init(cls, trie, field);
+ }
+ trie++;
+ }
+
+ /* Destroy the rest. */
+ for (i = trie; i < cls->n_tries; i++) {
+ trie_init(cls, i, NULL);
+ }
+ cls->n_tries = trie;
+}
+
+static void
+trie_init(struct classifier *cls, int trie_idx,
+ const struct mf_field *field)
+{
+ struct cls_trie *trie = &cls->tries[trie_idx];
+ struct cls_subtable *subtable;
+
+ if (trie_idx < cls->n_tries) {
+ trie_destroy(trie->root);
+ }
+ trie->root = NULL;
+ trie->field = field;
+
+ /* Add existing rules to the trie. */
+ LIST_FOR_EACH (subtable, list_node, &cls->subtables_priority) {
+ unsigned int plen;
+
+ plen = field ? minimask_get_prefix_len(&subtable->mask, field) : 0;
+ /* Initialize subtable's prefix length on this field. */
+ subtable->trie_plen[trie_idx] = plen;
+
+ if (plen) {
+ struct cls_rule *head;
+
+ HMAP_FOR_EACH (head, hmap_node, &subtable->rules) {
+ struct cls_rule *rule;
+
+ FOR_EACH_RULE_IN_LIST (rule, head) {
+ trie_insert(trie, rule, plen);
+ }
+ }
+ }
+ }
+}
+
/* Returns true if 'cls' contains no classification rules, false otherwise. */
bool
classifier_is_empty(const struct classifier *cls)
old_rule = insert_rule(cls, subtable, rule);
if (!old_rule) {
+ int i;
+
if (minimask_get_metadata_mask(&rule->match.mask) == OVS_BE64_MAX) {
ovs_be64 metadata = miniflow_get_metadata(&rule->match.flow);
rule->partition = create_partition(cls, subtable, metadata);
subtable->n_rules++;
cls->n_rules++;
+
+ for (i = 0; i < cls->n_tries; i++) {
+ if (subtable->trie_plen[i]) {
+ trie_insert(&cls->tries[i], rule, subtable->trie_plen[i]);
+ }
+ }
} else {
rule->partition = old_rule->partition;
}
subtable = find_subtable(cls, &rule->match.mask);
+ for (i = 0; i < cls->n_tries; i++) {
+ if (subtable->trie_plen[i]) {
+ trie_remove(&cls->tries[i], rule, subtable->trie_plen[i]);
+ }
+ }
+
/* Remove rule node from indices. */
for (i = 0; i < subtable->n_indices; i++) {
hindex_remove(&subtable->indices[i], &rule->index_nodes[i]);
} else {
update_subtables_after_removal(cls, subtable, rule->priority);
}
+
cls->n_rules--;
}
+/* Prefix tree context. Valid when 'lookup_done' is true. Can skip all
+ * subtables which have more than 'match_plen' bits in their corresponding
+ * field at offset 'be32ofs'. If skipped, 'maskbits' prefix bits should be
+ * unwildcarded to quarantee datapath flow matches only packets it should. */
+struct trie_ctx {
+ const struct cls_trie *trie;
+ bool lookup_done; /* Status of the lookup. */
+ uint8_t be32ofs; /* U32 offset of the field in question. */
+ unsigned int match_plen; /* Longest prefix than could possibly match. */
+ unsigned int maskbits; /* Prefix length needed to avoid false matches. */
+};
+
+static void
+trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie)
+{
+ ctx->trie = trie;
+ ctx->be32ofs = trie->field->flow_be32ofs;
+ ctx->lookup_done = false;
+}
+
/* Finds and returns the highest-priority rule in 'cls' that matches 'flow'.
* Returns a null pointer if no rules in 'cls' match 'flow'. If multiple rules
* of equal priority match 'flow', returns one arbitrarily.
struct cls_subtable *subtable;
struct cls_rule *best;
tag_type tags;
+ struct trie_ctx trie_ctx[CLS_MAX_TRIES];
+ int i;
/* Determine 'tags' such that, if 'subtable->tag' doesn't intersect them,
* then 'flow' cannot possibly match in 'subtable':
hash_metadata(flow->metadata)));
tags = partition ? partition->tags : TAG_ARBITRARY;
+ /* Initialize trie contexts for match_find_wc(). */
+ for (i = 0; i < cls->n_tries; i++) {
+ trie_ctx_init(&trie_ctx[i], &cls->tries[i]);
+ }
best = NULL;
LIST_FOR_EACH (subtable, list_node, &cls->subtables_priority) {
struct cls_rule *rule;
continue;
}
- rule = find_match_wc(subtable, flow, wc);
+ rule = find_match_wc(subtable, flow, trie_ctx, cls->n_tries, wc);
if (rule) {
best = rule;
LIST_FOR_EACH_CONTINUE (subtable, list_node,
continue;
}
- rule = find_match_wc(subtable, flow, wc);
+ rule = find_match_wc(subtable, flow, trie_ctx, cls->n_tries,
+ wc);
if (rule && rule->priority > best->priority) {
best = rule;
}
break;
}
}
+
return best;
}
? tag_create_deterministic(hash)
: TAG_ALL);
+ for (i = 0; i < cls->n_tries; i++) {
+ subtable->trie_plen[i] = minimask_get_prefix_len(mask,
+ cls->tries[i].field);
+ }
+
return subtable;
}
}
}
+struct range {
+ uint8_t start;
+ uint8_t end;
+};
+
+/* Return 'true' if can skip rest of the subtable based on the prefix trie
+ * lookup results. */
+static inline bool
+check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
+ const unsigned int field_plen[CLS_MAX_TRIES],
+ const struct range ofs, const struct flow *flow,
+ struct flow_wildcards *wc)
+{
+ int j;
+
+ /* Check if we could avoid fully unwildcarding the next level of
+ * fields using the prefix tries. The trie checks are done only as
+ * needed to avoid folding in additional bits to the wildcards mask. */
+ for (j = 0; j < n_tries; j++) {
+ /* Is the trie field relevant for this subtable? */
+ if (field_plen[j]) {
+ struct trie_ctx *ctx = &trie_ctx[j];
+ uint8_t be32ofs = ctx->be32ofs;
+
+ /* Is the trie field within the current range of fields? */
+ if (be32ofs >= ofs.start && be32ofs < ofs.end) {
+ /* On-demand trie lookup. */
+ if (!ctx->lookup_done) {
+ ctx->match_plen = trie_lookup(ctx->trie, flow,
+ &ctx->maskbits);
+ ctx->lookup_done = true;
+ }
+ /* Possible to skip the rest of the subtable if subtable's
+ * prefix on the field is longer than what is known to match
+ * based on the trie lookup. */
+ if (field_plen[j] > ctx->match_plen) {
+ /* RFC: We want the trie lookup to never result in
+ * unwildcarding any bits that would not be unwildcarded
+ * otherwise. Since the trie is shared by the whole
+ * classifier, it is possible that the 'maskbits' contain
+ * bits that are irrelevant for the partition of the
+ * classifier relevant for the current flow. */
+
+ /* Can skip if the field is already unwildcarded. */
+ if (mask_prefix_bits_set(wc, be32ofs, ctx->maskbits)) {
+ return true;
+ }
+ /* Check that the trie result will not unwildcard more bits
+ * than this stage will. */
+ if (ctx->maskbits <= field_plen[j]) {
+ /* Unwildcard the bits and skip the rest. */
+ mask_set_prefix_bits(wc, be32ofs, ctx->maskbits);
+ /* Note: Prerequisite already unwildcarded, as the only
+ * prerequisite of the supported trie lookup fields is
+ * the ethertype, which is currently always
+ * unwildcarded.
+ */
+ return true;
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
static inline struct cls_rule *
find_match(const struct cls_subtable *subtable, const struct flow *flow,
uint32_t hash)
static struct cls_rule *
find_match_wc(const struct cls_subtable *subtable, const struct flow *flow,
- struct flow_wildcards * wc)
+ struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
+ struct flow_wildcards *wc)
{
uint32_t basis = 0, hash;
struct cls_rule *rule = NULL;
- uint8_t prev_u32ofs = 0;
int i;
+ struct range ofs;
if (!wc) {
return find_match(subtable, flow,
flow_hash_in_minimask(flow, &subtable->mask, 0));
}
+ ofs.start = 0;
/* Try to finish early by checking fields in segments. */
for (i = 0; i < subtable->n_indices; i++) {
struct hindex_node *inode;
+ ofs.end = subtable->index_ofs[i];
- hash = flow_hash_in_minimask_range(flow, &subtable->mask, prev_u32ofs,
- subtable->index_ofs[i], &basis);
- prev_u32ofs = subtable->index_ofs[i];
+ if (check_tries(trie_ctx, n_tries, subtable->trie_plen, ofs, flow,
+ wc)) {
+ goto range_out;
+ }
+ hash = flow_hash_in_minimask_range(flow, &subtable->mask, ofs.start,
+ ofs.end, &basis);
+ ofs.start = ofs.end;
inode = hindex_node_with_hash(&subtable->indices[i], hash);
if (!inode) {
/* No match, can stop immediately, but must fold in the mask
* covered so far. */
- flow_wildcards_fold_minimask_range(wc, &subtable->mask, 0,
- prev_u32ofs);
- return NULL;
+ goto range_out;
}
/* If we have narrowed down to a single rule already, check whether
}
}
}
-
+ ofs.end = FLOW_U32S;
+ /* Trie check for the final range. */
+ if (check_tries(trie_ctx, n_tries, subtable->trie_plen, ofs, flow, wc)) {
+ goto range_out;
+ }
if (!rule) {
/* Multiple potential matches exist, look for one. */
- hash = flow_hash_in_minimask_range(flow, &subtable->mask, prev_u32ofs,
- FLOW_U32S, &basis);
+ hash = flow_hash_in_minimask_range(flow, &subtable->mask, ofs.start,
+ ofs.end, &basis);
rule = find_match(subtable, flow, hash);
} else {
/* We already narrowed the matching candidates down to just 'rule',
rule = NULL;
}
out:
+ /* Must unwildcard all the fields, as they were looked at. */
flow_wildcards_fold_minimask(wc, &subtable->mask);
return rule;
+
+ range_out:
+ /* Must unwildcard the fields looked up so far, if any. */
+ if (ofs.start) {
+ flow_wildcards_fold_minimask_range(wc, &subtable->mask, 0, ofs.start);
+ }
+ return NULL;
}
static struct cls_rule *
struct cls_rule *old = NULL;
int i;
uint32_t basis = 0, hash;
- uint8_t prev_u32ofs = 0;
+ uint8_t prev_be32ofs = 0;
/* Add new node to segment indices. */
for (i = 0; i < subtable->n_indices; i++) {
- hash = minimatch_hash_range(&new->match, prev_u32ofs,
+ hash = minimatch_hash_range(&new->match, prev_be32ofs,
subtable->index_ofs[i], &basis);
hindex_insert(&subtable->indices[i], &new->index_nodes[i], hash);
- prev_u32ofs = subtable->index_ofs[i];
+ prev_be32ofs = subtable->index_ofs[i];
}
- hash = minimatch_hash_range(&new->match, prev_u32ofs, FLOW_U32S, &basis);
+ hash = minimatch_hash_range(&new->match, prev_be32ofs, FLOW_U32S, &basis);
head = find_equal(subtable, &new->match.flow, hash);
if (!head) {
hmap_insert(&subtable->rules, &new->hmap_node, hash);
struct cls_rule *next = next_rule_in_list__(rule);
return next->priority < rule->priority ? next : NULL;
}
+\f
+/* A longest-prefix match tree. */
+struct trie_node {
+ uint32_t prefix; /* Prefix bits for this node, MSB first. */
+ uint8_t nbits; /* Never zero, except for the root node. */
+ unsigned int n_rules; /* Number of rules that have this prefix. */
+ struct trie_node *edges[2]; /* Both NULL if leaf. */
+};
+
+/* Max bits per node. Must fit in struct trie_node's 'prefix'.
+ * Also tested with 16, 8, and 5 to stress the implementation. */
+#define TRIE_PREFIX_BITS 32
+
+/* Return at least 'plen' bits of the 'prefix', starting at bit offset 'ofs'.
+ * Prefixes are in the network byte order, and the offset 0 corresponds to
+ * the most significant bit of the first byte. The offset can be read as
+ * "how many bits to skip from the start of the prefix starting at 'pr'". */
+static uint32_t
+raw_get_prefix(const ovs_be32 pr[], unsigned int ofs, unsigned int plen)
+{
+ uint32_t prefix;
+
+ pr += ofs / 32; /* Where to start. */
+ ofs %= 32; /* How many bits to skip at 'pr'. */
+
+ prefix = ntohl(*pr) << ofs; /* Get the first 32 - ofs bits. */
+ if (plen > 32 - ofs) { /* Need more than we have already? */
+ prefix |= ntohl(*++pr) >> (32 - ofs);
+ }
+ /* Return with possible unwanted bits at the end. */
+ return prefix;
+}
+
+/* Return min(TRIE_PREFIX_BITS, plen) bits of the 'prefix', starting at bit
+ * offset 'ofs'. Prefixes are in the network byte order, and the offset 0
+ * corresponds to the most significant bit of the first byte. The offset can
+ * be read as "how many bits to skip from the start of the prefix starting at
+ * 'pr'". */
+static uint32_t
+trie_get_prefix(const ovs_be32 pr[], unsigned int ofs, unsigned int plen)
+{
+ if (!plen) {
+ return 0;
+ }
+ if (plen > TRIE_PREFIX_BITS) {
+ plen = TRIE_PREFIX_BITS; /* Get at most TRIE_PREFIX_BITS. */
+ }
+ /* Return with unwanted bits cleared. */
+ return raw_get_prefix(pr, ofs, plen) & ~0u << (32 - plen);
+}
+
+/* Return the number of equal bits in 'nbits' of 'prefix's MSBs and a 'value'
+ * starting at "MSB 0"-based offset 'ofs'. */
+static unsigned int
+prefix_equal_bits(uint32_t prefix, unsigned int nbits, const ovs_be32 value[],
+ unsigned int ofs)
+{
+ uint64_t diff = prefix ^ raw_get_prefix(value, ofs, nbits);
+ /* Set the bit after the relevant bits to limit the result. */
+ return raw_clz64(diff << 32 | UINT64_C(1) << (63 - nbits));
+}
+
+/* Return the number of equal bits in 'node' prefix and a 'prefix' of length
+ * 'plen', starting at "MSB 0"-based offset 'ofs'. */
+static unsigned int
+trie_prefix_equal_bits(const struct trie_node *node, const ovs_be32 prefix[],
+ unsigned int ofs, unsigned int plen)
+{
+ return prefix_equal_bits(node->prefix, MIN(node->nbits, plen - ofs),
+ prefix, ofs);
+}
+
+/* Return the bit at ("MSB 0"-based) offset 'ofs' as an int. 'ofs' can
+ * be greater than 31. */
+static unsigned int
+be_get_bit_at(const ovs_be32 value[], unsigned int ofs)
+{
+ return (((const uint8_t *)value)[ofs / 8] >> (7 - ofs % 8)) & 1u;
+}
+
+/* Return the bit at ("MSB 0"-based) offset 'ofs' as an int. 'ofs' must
+ * be between 0 and 31, inclusive. */
+static unsigned int
+get_bit_at(const uint32_t prefix, unsigned int ofs)
+{
+ return (prefix >> (31 - ofs)) & 1u;
+}
+
+/* Create new branch. */
+static struct trie_node *
+trie_branch_create(const ovs_be32 *prefix, unsigned int ofs, unsigned int plen,
+ unsigned int n_rules)
+{
+ struct trie_node *node = xmalloc(sizeof *node);
+
+ node->prefix = trie_get_prefix(prefix, ofs, plen);
+
+ if (plen <= TRIE_PREFIX_BITS) {
+ node->nbits = plen;
+ node->edges[0] = NULL;
+ node->edges[1] = NULL;
+ node->n_rules = n_rules;
+ } else { /* Need intermediate nodes. */
+ struct trie_node *subnode = trie_branch_create(prefix,
+ ofs + TRIE_PREFIX_BITS,
+ plen - TRIE_PREFIX_BITS,
+ n_rules);
+ int bit = get_bit_at(subnode->prefix, 0);
+ node->nbits = TRIE_PREFIX_BITS;
+ node->edges[bit] = subnode;
+ node->edges[!bit] = NULL;
+ node->n_rules = 0;
+ }
+ return node;
+}
+
+static void
+trie_node_destroy(struct trie_node *node)
+{
+ free(node);
+}
+
+static void
+trie_destroy(struct trie_node *node)
+{
+ if (node) {
+ trie_destroy(node->edges[0]);
+ trie_destroy(node->edges[1]);
+ free(node);
+ }
+}
+
+static bool
+trie_is_leaf(const struct trie_node *trie)
+{
+ return !trie->edges[0] && !trie->edges[1]; /* No children. */
+}
+
+static void
+mask_set_prefix_bits(struct flow_wildcards *wc, uint8_t be32ofs,
+ unsigned int nbits)
+{
+ ovs_be32 *mask = &((ovs_be32 *)&wc->masks)[be32ofs];
+ unsigned int i;
+
+ for (i = 0; i < nbits / 32; i++) {
+ mask[i] = OVS_BE32_MAX;
+ }
+ if (nbits % 32) {
+ mask[i] |= htonl(~0u << (32 - nbits % 32));
+ }
+}
+
+static bool
+mask_prefix_bits_set(const struct flow_wildcards *wc, uint8_t be32ofs,
+ unsigned int nbits)
+{
+ ovs_be32 *mask = &((ovs_be32 *)&wc->masks)[be32ofs];
+ unsigned int i;
+ ovs_be32 zeroes = 0;
+
+ for (i = 0; i < nbits / 32; i++) {
+ zeroes |= ~mask[i];
+ }
+ if (nbits % 32) {
+ zeroes |= ~mask[i] & htonl(~0u << (32 - nbits % 32));
+ }
+
+ return !zeroes; /* All 'nbits' bits set. */
+}
+
+static struct trie_node **
+trie_next_edge(struct trie_node *node, const ovs_be32 value[],
+ unsigned int ofs)
+{
+ return node->edges + be_get_bit_at(value, ofs);
+}
+
+static const struct trie_node *
+trie_next_node(const struct trie_node *node, const ovs_be32 value[],
+ unsigned int ofs)
+{
+ return node->edges[be_get_bit_at(value, ofs)];
+}
+
+/* Return the prefix mask length necessary to find the longest-prefix match for
+ * the '*value' in the prefix tree 'node'.
+ * '*checkbits' is set to the number of bits in the prefix mask necessary to
+ * determine a mismatch, in case there are longer prefixes in the tree below
+ * the one that matched.
+ */
+static unsigned int
+trie_lookup_value(const struct trie_node *node, const ovs_be32 value[],
+ unsigned int *checkbits)
+{
+ unsigned int plen = 0, match_len = 0;
+ const struct trie_node *prev = NULL;
+
+ for (; node; prev = node, node = trie_next_node(node, value, plen)) {
+ unsigned int eqbits;
+ /* Check if this edge can be followed. */
+ eqbits = prefix_equal_bits(node->prefix, node->nbits, value, plen);
+ plen += eqbits;
+ if (eqbits < node->nbits) { /* Mismatch, nothing more to be found. */
+ /* Bit at offset 'plen' differed. */
+ *checkbits = plen + 1; /* Includes the first mismatching bit. */
+ return match_len;
+ }
+ /* Full match, check if rules exist at this prefix length. */
+ if (node->n_rules > 0) {
+ match_len = plen;
+ }
+ }
+ /* Dead end, exclude the other branch if it exists. */
+ *checkbits = !prev || trie_is_leaf(prev) ? plen : plen + 1;
+ return match_len;
+}
+
+static unsigned int
+trie_lookup(const struct cls_trie *trie, const struct flow *flow,
+ unsigned int *checkbits)
+{
+ const struct mf_field *mf = trie->field;
+
+ /* Check that current flow matches the prerequisites for the trie
+ * field. Some match fields are used for multiple purposes, so we
+ * must check that the trie is relevant for this flow. */
+ if (mf_are_prereqs_ok(mf, flow)) {
+ return trie_lookup_value(trie->root,
+ &((ovs_be32 *)flow)[mf->flow_be32ofs],
+ checkbits);
+ }
+ *checkbits = 0; /* Value not used in this case. */
+ return UINT_MAX;
+}
+
+/* Returns the length of a prefix match mask for the field 'mf' in 'minimask'.
+ * Returns the u32 offset to the miniflow data in '*miniflow_index', if
+ * 'miniflow_index' is not NULL. */
+static unsigned int
+minimask_get_prefix_len(const struct minimask *minimask,
+ const struct mf_field *mf)
+{
+ unsigned int nbits = 0, mask_tz = 0; /* Non-zero when end of mask seen. */
+ uint8_t u32_ofs = mf->flow_be32ofs;
+ uint8_t u32_end = u32_ofs + mf->n_bytes / 4;
+
+ for (; u32_ofs < u32_end; ++u32_ofs) {
+ uint32_t mask;
+ mask = ntohl((OVS_FORCE ovs_be32)minimask_get(minimask, u32_ofs));
+
+ /* Validate mask, count the mask length. */
+ if (mask_tz) {
+ if (mask) {
+ return 0; /* No bits allowed after mask ended. */
+ }
+ } else {
+ if (~mask & (~mask + 1)) {
+ return 0; /* Mask not contiguous. */
+ }
+ mask_tz = ctz32(mask);
+ nbits += 32 - mask_tz;
+ }
+ }
+
+ return nbits;
+}
+
+/*
+ * This is called only when mask prefix is known to be CIDR and non-zero.
+ * Relies on the fact that the flow and mask have the same map, and since
+ * the mask is CIDR, the storage for the flow field exists even if it
+ * happened to be zeros.
+ */
+static const ovs_be32 *
+minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf)
+{
+ return (OVS_FORCE const ovs_be32 *)match->flow.values +
+ count_1bits(match->flow.map & ((UINT64_C(1) << mf->flow_be32ofs) - 1));
+}
+
+/* Insert rule in to the prefix tree.
+ * 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
+ * in 'rule'. */
+static void
+trie_insert(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
+{
+ const ovs_be32 *prefix = minimatch_get_prefix(&rule->match, trie->field);
+ struct trie_node *node;
+ struct trie_node **edge;
+ int ofs = 0;
+
+ /* Walk the tree. */
+ for (edge = &trie->root;
+ (node = *edge) != NULL;
+ edge = trie_next_edge(node, prefix, ofs)) {
+ unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);
+ ofs += eqbits;
+ if (eqbits < node->nbits) {
+ /* Mismatch, new node needs to be inserted above. */
+ int old_branch = get_bit_at(node->prefix, eqbits);
+
+ /* New parent node. */
+ *edge = trie_branch_create(prefix, ofs - eqbits, eqbits,
+ ofs == mlen ? 1 : 0);
+
+ /* Adjust old node for its new position in the tree. */
+ node->prefix <<= eqbits;
+ node->nbits -= eqbits;
+ (*edge)->edges[old_branch] = node;
+
+ /* Check if need a new branch for the new rule. */
+ if (ofs < mlen) {
+ (*edge)->edges[!old_branch]
+ = trie_branch_create(prefix, ofs, mlen - ofs, 1);
+ }
+ return;
+ }
+ /* Full match so far. */
+
+ if (ofs == mlen) {
+ /* Full match at the current node, rule needs to be added here. */
+ node->n_rules++;
+ return;
+ }
+ }
+ /* Must insert a new tree branch for the new rule. */
+ *edge = trie_branch_create(prefix, ofs, mlen - ofs, 1);
+}
+
+/* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
+ * in 'rule'. */
+static void
+trie_remove(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
+{
+ const ovs_be32 *prefix = minimatch_get_prefix(&rule->match, trie->field);
+ struct trie_node *node;
+ struct trie_node **edges[sizeof(union mf_value) * 8];
+ int depth = 0, ofs = 0;
+
+ /* Walk the tree. */
+ for (edges[depth] = &trie->root;
+ (node = *edges[depth]) != NULL;
+ edges[++depth] = trie_next_edge(node, prefix, ofs)) {
+ unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);
+ if (eqbits < node->nbits) {
+ /* Mismatch, nothing to be removed. This should never happen, as
+ * only rules in the classifier are ever removed. */
+ break; /* Log a warning. */
+ }
+ /* Full match so far. */
+ ofs += eqbits;
+
+ if (ofs == mlen) {
+ /* Full prefix match at the current node, remove rule here. */
+ if (!node->n_rules) {
+ break; /* Log a warning. */
+ }
+ node->n_rules--;
+
+ /* Check if can prune the tree. */
+ while (!node->n_rules && !(node->edges[0] && node->edges[1])) {
+ /* No rules and at most one child node, remove this node. */
+ struct trie_node *next;
+ next = node->edges[0] ? node->edges[0] : node->edges[1];
+
+ if (next) {
+ if (node->nbits + next->nbits > TRIE_PREFIX_BITS) {
+ break; /* Cannot combine. */
+ }
+ /* Combine node with next. */
+ next->prefix = node->prefix | next->prefix >> node->nbits;
+ next->nbits += node->nbits;
+ }
+ trie_node_destroy(node);
+ /* Update the parent's edge. */
+ *edges[depth] = next;
+ if (next || !depth) {
+ /* Branch not pruned or at root, nothing more to do. */
+ break;
+ }
+ node = *edges[--depth];
+ }
+ return;
+ }
+ }
+ /* Cannot go deeper. This should never happen, since only rules
+ * that actually exist in the classifier are ever removed. */
+ VLOG_WARN("Trying to remove non-existing rule from a prefix trie.");
+}
* =====
*
* A flow classifier holds any number of "rules", each of which specifies
- * values to match for some fields or subfields and a priority. The primary
- * design goal for the classifier is that, given a packet, it can as quickly as
- * possible find the highest-priority rule that matches the packet.
+ * values to match for some fields or subfields and a priority. Each OpenFlow
+ * table is implemented as a flow classifier.
*
- * Each OpenFlow table is implemented as a flow classifier.
+ * The classifier has two primary design goals. The first is obvious: given a
+ * set of packet headers, as quickly as possible find the highest-priority rule
+ * that matches those headers. The following section describes the second
+ * goal.
*
*
- * Basic Design
- * ============
+ * "Un-wildcarding"
+ * ================
+ *
+ * A primary goal of the flow classifier is to produce, as a side effect of a
+ * packet lookup, a wildcard mask that indicates which bits of the packet
+ * headers were essential to the classification result. Ideally, a 1-bit in
+ * any position of this mask means that, if the corresponding bit in the packet
+ * header were flipped, then the classification result might change. A 0-bit
+ * means that changing the packet header bit would have no effect. Thus, the
+ * wildcarded bits are the ones that played no role in the classification
+ * decision.
+ *
+ * Such a wildcard mask is useful with datapaths that support installing flows
+ * that wildcard fields or subfields. If an OpenFlow lookup for a TCP flow
+ * does not actually look at the TCP source or destination ports, for example,
+ * then the switch may install into the datapath a flow that wildcards the port
+ * numbers, which in turn allows the datapath to handle packets that arrive for
+ * other TCP source or destination ports without additional help from
+ * ovs-vswitchd. This is useful for the Open vSwitch software and,
+ * potentially, for ASIC-based switches as well.
+ *
+ * Some properties of the wildcard mask:
+ *
+ * - "False 1-bits" are acceptable, that is, setting a bit in the wildcard
+ * mask to 1 will never cause a packet to be forwarded the wrong way.
+ * As a corollary, a wildcard mask composed of all 1-bits will always
+ * yield correct (but often needlessly inefficient) behavior.
+ *
+ * - "False 0-bits" can cause problems, so they must be avoided. In the
+ * extreme case, a mask of all 0-bits is only correct if the classifier
+ * contains only a single flow that matches all packets.
+ *
+ * - 0-bits are desirable because they allow the datapath to act more
+ * autonomously, relying less on ovs-vswitchd to process flow setups,
+ * thereby improving performance.
+ *
+ * - We don't know a good way to generate wildcard masks with the maximum
+ * (correct) number of 0-bits. We use various approximations, described
+ * in later sections.
+ *
+ * - Wildcard masks for lookups in a given classifier yield a
+ * non-overlapping set of rules. More specifically:
+ *
+ * Consider an classifier C1 filled with an arbitrary collection of rules
+ * and an empty classifier C2. Now take a set of packet headers H and
+ * look it up in C1, yielding a highest-priority matching rule R1 and
+ * wildcard mask M. Form a new classifier rule R2 out of packet headers
+ * H and mask M, and add R2 to C2 with a fixed priority. If one were to
+ * do this for every possible set of packet headers H, then this
+ * process would not attempt to add any overlapping rules to C2, that is,
+ * any packet lookup using the rules generated by this process matches at
+ * most one rule in C2.
+ *
+ * During the lookup process, the classifier starts out with a wildcard mask
+ * that is all 0-bits, that is, fully wildcarded. As lookup proceeds, each
+ * step tends to add constraints to the wildcard mask, that is, change
+ * wildcarded 0-bits into exact-match 1-bits. We call this "un-wildcarding".
+ * A lookup step that examines a particular field must un-wildcard that field.
+ * In general, un-wildcarding is necessary for correctness but undesirable for
+ * performance.
+ *
+ *
+ * Basic Classifier Design
+ * =======================
*
* Suppose that all the rules in a classifier had the same form. For example,
* suppose that they all matched on the source and destination Ethernet address
* cls_subtable" in the classifier and tracks the highest-priority match that
* it finds. The subtables are kept in a descending priority order according
* to the highest priority rule in each subtable, which allows lookup to skip
- * over subtables that can't possibly have a higher-priority match than
- * already found.
+ * over subtables that can't possibly have a higher-priority match than already
+ * found. Eliminating lookups through priority ordering aids both classifier
+ * primary design goals: skipping lookups saves time and avoids un-wildcarding
+ * fields that those lookups would have examined.
*
* One detail: a classifier can contain multiple rules that are identical other
* than their priority. When this happens, only the highest priority rule out
* list inside that highest-priority rule.
*
*
- * Staged Lookup
- * =============
+ * Staged Lookup (Wildcard Optimization)
+ * =====================================
*
* Subtable lookup is performed in ranges defined for struct flow, starting
* from metadata (registers, in_port, etc.), then L2 header, L3, and finally
* L4 ports. Whenever it is found that there are no matches in the current
- * subtable, the rest of the subtable can be skipped. The rationale of this
- * logic is that as many fields as possible can remain wildcarded.
+ * subtable, the rest of the subtable can be skipped.
+ *
+ * Staged lookup does not reduce lookup time, and it may increase it, because
+ * it changes a single hash table lookup into multiple hash table lookups.
+ * It reduces un-wildcarding significantly in important use cases.
*
*
- * Partitioning
- * ============
+ * Prefix Tracking (Wildcard Optimization)
+ * =======================================
+ *
+ * Classifier uses prefix trees ("tries") for tracking the used
+ * address space, enabling skipping classifier tables containing
+ * longer masks than necessary for the given address. This reduces
+ * un-wildcarding for datapath flows in parts of the address space
+ * without host routes, but consulting extra data structures (the
+ * tries) may slightly increase lookup time.
+ *
+ * Trie lookup is interwoven with staged lookup, so that a trie is
+ * searched only when the configured trie field becomes relevant for
+ * the lookup. The trie lookup results are retained so that each trie
+ * is checked at most once for each classifier lookup.
+ *
+ * This implementation tracks the number of rules at each address
+ * prefix for the whole classifier. More aggressive table skipping
+ * would be possible by maintaining lists of tables that have prefixes
+ * at the lengths encountered on tree traversal, or by maintaining
+ * separate tries for subsets of rules separated by metadata fields.
+ *
+ * Prefix tracking is configured via OVSDB "Flow_Table" table,
+ * "fieldspec" column. "fieldspec" is a string map where a "prefix"
+ * key tells which fields should be used for prefix tracking. The
+ * value of the "prefix" key is a comma separated list of field names.
+ *
+ * There is a maximum number of fields that can be enabled for any one
+ * flow table. Currently this limit is 3.
+ *
+ *
+ * Partitioning (Lookup Time and Wildcard Optimization)
+ * ====================================================
*
* Suppose that a given classifier is being used to handle multiple stages in a
* pipeline using "resubmit", with metadata (that is, the OpenFlow 1.1+ field
* any cls_subtable that doesn't match on metadata. We handle that by giving
* any such cls_subtable TAG_ALL as its 'tags' so that it matches any tag.)
*
+ * Partitioning saves lookup time by reducing the number of subtable lookups.
+ * Each eliminated subtable lookup also reduces the amount of un-wildcarding.
+ *
*
* Thread-safety
* =============
#include "hmap.h"
#include "list.h"
#include "match.h"
+#include "meta-flow.h"
#include "tag.h"
#include "openflow/nicira-ext.h"
#include "openflow/openflow.h"
/* Needed only for the lock annotation in struct classifier. */
extern struct ovs_mutex ofproto_mutex;
+struct trie_node;
+
+/* Prefix trie for a 'field' */
+struct cls_trie {
+ const struct mf_field *field; /* Trie field, or NULL. */
+ struct trie_node *root; /* NULL if none. */
+};
-/* Maximum number of staged lookup indices for each subtable. */
-enum { CLS_MAX_INDICES = 3 };
+enum {
+ CLS_MAX_INDICES = 3, /* Maximum number of lookup indices per subtable. */
+ CLS_MAX_TRIES = 3 /* Maximum number of prefix trees per classifier. */
+};
/* A flow classifier. */
struct classifier {
*/
struct hmap partitions; /* Contains "struct cls_partition"s. */
struct ovs_rwlock rwlock OVS_ACQ_AFTER(ofproto_mutex);
+ struct cls_trie tries[CLS_MAX_TRIES]; /* Prefix tries. */
+ unsigned int n_tries;
};
/* A set of rules that all have the same fields wildcarded. */
uint8_t n_indices; /* How many indices to use. */
uint8_t index_ofs[CLS_MAX_INDICES]; /* u32 flow segment boundaries. */
struct hindex indices[CLS_MAX_INDICES]; /* Staged lookup indices. */
+ unsigned int trie_plen[CLS_MAX_TRIES]; /* Trie prefix length in 'mask'. */
};
/* Returns true if 'table' is a "catch-all" subtable that will match every
void classifier_init(struct classifier *cls, const uint8_t *flow_segments);
void classifier_destroy(struct classifier *);
+void classifier_set_prefix_fields(struct classifier *cls,
+ const enum mf_field_id *trie_fields,
+ unsigned int n_trie_fields)
+ OVS_REQ_WRLOCK(cls->rwlock);
+
bool classifier_is_empty(const struct classifier *cls)
OVS_REQ_RDLOCK(cls->rwlock);
int classifier_count(const struct classifier *cls)
/* Completes an initialization of 'dst' as a miniflow copy of 'src' begun by
* the caller. The caller must have already initialized 'dst->map' properly
- * to indicate the nonzero uint32_t elements of 'src'. 'n' must be the number
- * of 1-bits in 'dst->map'.
+ * to indicate the significant uint32_t elements of 'src'. 'n' must be the
+ * number of 1-bits in 'dst->map'.
+ *
+ * Normally the significant elements are the ones that are non-zero. However,
+ * when a miniflow is initialized from a (mini)mask, the values can be zeroes,
+ * so that the flow and mask always have the same maps.
*
* This function initializes 'dst->values' (either inline if possible or with
- * malloc() otherwise) and copies the nonzero uint32_t elements of 'src' into
- * it. */
+ * malloc() otherwise) and copies the uint32_t elements of 'src' indicated by
+ * 'dst->map' into it. */
static void
miniflow_init__(struct miniflow *dst, const struct flow *src, int n)
{
VLOG_DEFINE_THIS_MODULE(meta_flow);
+#define FLOW_U32OFS(FIELD) \
+ offsetof(struct flow, FIELD) % 4 ? -1 : offsetof(struct flow, FIELD) / 4
+
#define MF_FIELD_SIZES(MEMBER) \
sizeof ((union mf_value *)0)->MEMBER, \
8 * sizeof ((union mf_value *)0)->MEMBER
OXM_OF_TUNNEL_ID, "OXM_OF_TUNNEL_ID",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ FLOW_U32OFS(tunnel.tun_id),
}, {
MFF_TUN_SRC, "tun_src", NULL,
MF_FIELD_SIZES(be32),
NXM_NX_TUN_IPV4_SRC, "NXM_NX_TUN_IPV4_SRC",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ FLOW_U32OFS(tunnel.ip_src),
}, {
MFF_TUN_DST, "tun_dst", NULL,
MF_FIELD_SIZES(be32),
NXM_NX_TUN_IPV4_DST, "NXM_NX_TUN_IPV4_DST",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ FLOW_U32OFS(tunnel.ip_dst),
}, {
MFF_TUN_FLAGS, "tun_flags", NULL,
MF_FIELD_SIZES(be16),
0, NULL,
OFPUTIL_P_NONE,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_TUN_TTL, "tun_ttl", NULL,
MF_FIELD_SIZES(u8),
0, NULL,
OFPUTIL_P_NONE,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_TUN_TOS, "tun_tos", NULL,
MF_FIELD_SIZES(u8),
0, NULL,
OFPUTIL_P_NONE,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_METADATA, "metadata", NULL,
MF_FIELD_SIZES(be64),
OXM_OF_METADATA, "OXM_OF_METADATA",
OFPUTIL_P_NXM_OF11_UP,
OFPUTIL_P_NXM_OF11_UP,
+ -1,
}, {
MFF_IN_PORT, "in_port", NULL,
MF_FIELD_SIZES(be16),
NXM_OF_IN_PORT, "NXM_OF_IN_PORT",
OFPUTIL_P_ANY, /* OF11+ via mapping to 32 bits. */
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_IN_PORT_OXM, "in_port_oxm", NULL,
MF_FIELD_SIZES(be32),
OXM_OF_IN_PORT, "OXM_OF_IN_PORT",
OFPUTIL_P_OF11_UP,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_SKB_PRIORITY, "skb_priority", NULL,
MF_FIELD_SIZES(be32),
0, NULL,
OFPUTIL_P_NONE,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_PKT_MARK, "pkt_mark", NULL,
MF_FIELD_SIZES(be32),
NXM_NX_PKT_MARK, "NXM_NX_PKT_MARK",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
},
#define REGISTER(IDX) \
NXM_NX_REG(IDX), "NXM_NX_REG" #IDX, \
OFPUTIL_P_NXM_OXM_ANY, \
OFPUTIL_P_NXM_OXM_ANY, \
+ -1, \
}
#if FLOW_N_REGS > 0
REGISTER(0),
OXM_OF_ETH_SRC, "OXM_OF_ETH_SRC",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OF11_UP, /* Bitwise masking only with NXM and OF11+! */
+ -1,
}, {
MFF_ETH_DST, "eth_dst", "dl_dst",
MF_FIELD_SIZES(mac),
OXM_OF_ETH_DST, "OXM_OF_ETH_DST",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OF11_UP, /* Bitwise masking only with NXM and OF11+! */
+ -1,
}, {
MFF_ETH_TYPE, "eth_type", "dl_type",
MF_FIELD_SIZES(be16),
OXM_OF_ETH_TYPE, "OXM_OF_ETH_TYPE",
OFPUTIL_P_ANY,
OFPUTIL_P_NONE,
+ -1,
},
{
NXM_OF_VLAN_TCI, "NXM_OF_VLAN_TCI",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_DL_VLAN, "dl_vlan", NULL,
sizeof(ovs_be16), 12,
0, NULL,
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_VLAN_VID, "vlan_vid", NULL,
sizeof(ovs_be16), 12,
OXM_OF_VLAN_VID, "OXM_OF_VLAN_VID",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_DL_VLAN_PCP, "dl_vlan_pcp", NULL,
1, 3,
0, NULL,
OFPUTIL_P_ANY, /* Will be mapped to NXM and OXM. */
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_VLAN_PCP, "vlan_pcp", NULL,
1, 3,
OXM_OF_VLAN_PCP, "OXM_OF_VLAN_PCP",
OFPUTIL_P_ANY, /* Will be mapped to OF10 and NXM. */
OFPUTIL_P_NONE,
+ -1,
},
/* ## ---- ## */
OXM_OF_MPLS_LABEL, "OXM_OF_MPLS_LABEL",
OFPUTIL_P_NXM_OF11_UP,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_MPLS_TC, "mpls_tc", NULL,
1, 3,
OXM_OF_MPLS_TC, "OXM_OF_MPLS_TC",
OFPUTIL_P_NXM_OF11_UP,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_MPLS_BOS, "mpls_bos", NULL,
1, 1,
OXM_OF_MPLS_BOS, "OXM_OF_MPLS_BOS",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NONE,
+ -1,
},
/* ## -- ## */
OXM_OF_IPV4_SRC, "OXM_OF_IPV4_SRC",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OF11_UP,
+ FLOW_U32OFS(nw_src),
}, {
MFF_IPV4_DST, "ip_dst", "nw_dst",
MF_FIELD_SIZES(be32),
OXM_OF_IPV4_DST, "OXM_OF_IPV4_DST",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OF11_UP,
+ FLOW_U32OFS(nw_dst),
},
{
OXM_OF_IPV6_SRC, "OXM_OF_IPV6_SRC",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ FLOW_U32OFS(ipv6_src),
}, {
MFF_IPV6_DST, "ipv6_dst", NULL,
MF_FIELD_SIZES(ipv6),
OXM_OF_IPV6_DST, "OXM_OF_IPV6_DST",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ FLOW_U32OFS(ipv6_dst),
},
{
MFF_IPV6_LABEL, "ipv6_label", NULL,
OXM_OF_IPV6_FLABEL, "OXM_OF_IPV6_FLABEL",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
},
{
OXM_OF_IP_PROTO, "OXM_OF_IP_PROTO",
OFPUTIL_P_ANY,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_IP_DSCP, "nw_tos", NULL,
MF_FIELD_SIZES(u8),
NXM_OF_IP_TOS, "NXM_OF_IP_TOS",
OFPUTIL_P_ANY, /* Will be shifted for OXM. */
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_IP_DSCP_SHIFTED, "ip_dscp", NULL,
1, 6,
OXM_OF_IP_DSCP, "OXM_OF_IP_DSCP",
OFPUTIL_P_ANY, /* Will be shifted for non-OXM. */
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_IP_ECN, "nw_ecn", "ip_ecn",
1, 2,
OXM_OF_IP_ECN, "OXM_OF_IP_ECN",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_IP_TTL, "nw_ttl", NULL,
MF_FIELD_SIZES(u8),
NXM_NX_IP_TTL, "NXM_NX_IP_TTL",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_IP_FRAG, "ip_frag", NULL,
1, 2,
NXM_NX_IP_FRAG, "NXM_NX_IP_FRAG",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
},
{
OXM_OF_ARP_OP, "OXM_OF_ARP_OP",
OFPUTIL_P_ANY,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_ARP_SPA, "arp_spa", NULL,
MF_FIELD_SIZES(be32),
OXM_OF_ARP_SPA, "OXM_OF_ARP_SPA",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OF11_UP,
+ -1,
}, {
MFF_ARP_TPA, "arp_tpa", NULL,
MF_FIELD_SIZES(be32),
OXM_OF_ARP_TPA, "OXM_OF_ARP_TPA",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OF11_UP,
+ -1,
}, {
MFF_ARP_SHA, "arp_sha", NULL,
MF_FIELD_SIZES(mac),
OXM_OF_ARP_SHA, "OXM_OF_ARP_SHA",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_ARP_THA, "arp_tha", NULL,
MF_FIELD_SIZES(mac),
OXM_OF_ARP_THA, "OXM_OF_ARP_THA",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
},
/* ## -- ## */
OXM_OF_TCP_SRC, "OXM_OF_TCP_SRC",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_TCP_DST, "tcp_dst", "tp_dst",
MF_FIELD_SIZES(be16),
OXM_OF_TCP_DST, "OXM_OF_TCP_DST",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_TCP_FLAGS, "tcp_flags", NULL,
2, 12,
NXM_NX_TCP_FLAGS, "NXM_NX_TCP_FLAGS",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
},
{
OXM_OF_UDP_SRC, "OXM_OF_UDP_SRC",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_UDP_DST, "udp_dst", NULL,
MF_FIELD_SIZES(be16),
OXM_OF_UDP_DST, "OXM_OF_UDP_DST",
OFPUTIL_P_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
},
{
OXM_OF_SCTP_SRC, "OXM_OF_SCTP_SRC",
OFPUTIL_P_NXM_OF11_UP,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_SCTP_DST, "sctp_dst", NULL,
MF_FIELD_SIZES(be16),
OXM_OF_SCTP_DST, "OXM_OF_SCTP_DST",
OFPUTIL_P_NXM_OF11_UP,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
},
{
OXM_OF_ICMPV4_TYPE, "OXM_OF_ICMPV4_TYPE",
OFPUTIL_P_ANY,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_ICMPV4_CODE, "icmp_code", NULL,
MF_FIELD_SIZES(u8),
OXM_OF_ICMPV4_CODE, "OXM_OF_ICMPV4_CODE",
OFPUTIL_P_ANY,
OFPUTIL_P_NONE,
+ -1,
},
{
OXM_OF_ICMPV6_TYPE, "OXM_OF_ICMPV6_TYPE",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NONE,
+ -1,
}, {
MFF_ICMPV6_CODE, "icmpv6_code", NULL,
MF_FIELD_SIZES(u8),
OXM_OF_ICMPV6_CODE, "OXM_OF_ICMPV6_CODE",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NONE,
+ -1,
},
/* ## ---- ## */
OXM_OF_IPV6_ND_TARGET, "OXM_OF_IPV6_ND_TARGET",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_ND_SLL, "nd_sll", NULL,
MF_FIELD_SIZES(mac),
OXM_OF_IPV6_ND_SLL, "OXM_OF_IPV6_ND_SLL",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}, {
MFF_ND_TLL, "nd_tll", NULL,
MF_FIELD_SIZES(mac),
OXM_OF_IPV6_ND_TLL, "OXM_OF_IPV6_ND_TLL",
OFPUTIL_P_NXM_OXM_ANY,
OFPUTIL_P_NXM_OXM_ANY,
+ -1,
}
};
enum ofputil_protocol usable_protocols; /* If fully/cidr masked. */
/* If partially/non-cidr masked. */
enum ofputil_protocol usable_protocols_bitwise;
+
+ int flow_be32ofs; /* Field's be32 offset in "struct flow", if prefix tree
+ * lookup is supported for the field, or -1. */
};
/* The representation of a field's value. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
-#include "classifier.h"
#include "compiler.h"
#include "flow.h"
+#include "list.h"
#include "match.h"
#include "netdev.h"
#include "openflow/nicira-ext.h"
}
table->max_flows = s->max_flows;
- ovs_rwlock_rdlock(&table->cls.rwlock);
+ ovs_rwlock_wrlock(&table->cls.rwlock);
if (classifier_count(&table->cls) > table->max_flows
&& table->eviction_fields) {
/* 'table' contains more flows than allowed. We might not be able to
break;
}
}
+
+ classifier_set_prefix_fields(&table->cls,
+ s->prefix_fields, s->n_prefix_fields);
+
ovs_rwlock_unlock(&table->cls.rwlock);
}
\f
#include <stddef.h>
#include <stdint.h>
#include "cfm.h"
+#include "classifier.h"
#include "flow.h"
+#include "meta-flow.h"
#include "netflow.h"
#include "sset.h"
#include "stp.h"
* distinguished by different values for the subfields within 'groups'. */
struct mf_subfield *groups;
size_t n_groups;
+
+ /*
+ * Fields for which prefix trie lookup is maintained.
+ */
+ unsigned int n_prefix_fields;
+ enum mf_field_id prefix_fields[CLS_MAX_TRIES];
};
int ofproto_get_n_tables(const struct ofproto *);
])
OVS_VSWITCHD_STOP
AT_CLEANUP
+
+AT_BANNER([flow classifier prefix lookup])
+AT_SETUP([flow classifier - prefix lookup])
+OVS_VSWITCHD_START
+ADD_OF_PORTS([br0], [1], [2], [3])
+AT_CHECK([ovs-vsctl set Bridge br0 flow_tables:0=@N1 -- --id=@N1 create Flow_Table name=t0], [0], [ignore], [])
+AT_CHECK([ovs-vsctl set Flow_Table t0 prefixes=ipv6_label], [0])
+AT_CHECK([ovs-vsctl set Flow_Table t0 prefixes=nw_dst,nw_src,tun_dst,tun_src], [1], [],
+[ovs-vsctl: nw_dst,nw_src,tun_dst,tun_src: 4 value(s) specified but the maximum number is 3
+])
+AT_CHECK([ovs-vsctl set Flow_Table t0 prefixes=nw_dst,nw_dst], [1], [],
+[ovs-vsctl: nw_dst,nw_dst: set contains duplicate value
+])
+AT_CHECK([ovs-vsctl set Flow_Table t0 prefixes=nw_dst], [0])
+AT_DATA([flows.txt], [dnl
+table=0 in_port=1 priority=16,tcp,nw_dst=10.1.0.0/255.255.0.0,action=output(3)
+table=0 in_port=1 priority=32,tcp,nw_dst=10.1.2.0/255.255.255.0,tp_src=79,action=output(2)
+table=0 in_port=1 priority=33,tcp,nw_dst=10.1.2.15,tp_dst=80,action=drop
+table=0 in_port=1 priority=0,ip,action=drop
+table=0 in_port=2 priority=16,tcp,nw_dst=192.168.0.0/255.255.0.0,action=output(1)
+table=0 in_port=2 priority=0,ip,action=drop
+table=0 in_port=3 priority=16,tcp,nw_src=10.1.0.0/255.255.0.0,action=output(1)
+table=0 in_port=3 priority=0,ip,action=drop
+])
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80'], [0], [stdout])
+AT_CHECK([tail -2 stdout], [0],
+ [Relevant fields: skb_priority=0,tcp,in_port=1,nw_dst=192.168.0.0/16,nw_frag=no
+Datapath actions: drop
+])
+AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=2,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80'], [0], [stdout])
+AT_CHECK([tail -2 stdout], [0],
+ [Relevant fields: skb_priority=0,tcp,in_port=2,nw_dst=192.168.0.0/16,nw_frag=no
+Datapath actions: 1
+])
+AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=10.1.2.15,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80'], [0], [stdout])
+AT_CHECK([tail -2 stdout], [0],
+ [Relevant fields: skb_priority=0,tcp,in_port=1,nw_dst=10.1.2.15,nw_frag=no,tp_dst=80
+Datapath actions: drop
+])
+AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=10.1.2.15,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=79'], [0], [stdout])
+AT_CHECK([tail -2 stdout], [0],
+ [Relevant fields: skb_priority=0,tcp,in_port=1,nw_dst=10.1.2.15,nw_frag=no,tp_src=8,tp_dst=79
+Datapath actions: 3
+])
+OVS_VSWITCHD_STOP(["/'prefixes' with incompatible field: ipv6_label/d"])
+AT_CLEANUP
AT_SETUP([ofproto-dpif megaflow - L3 classification])
OVS_VSWITCHD_START
ADD_OF_PORTS([br0], [1], [2])
+AT_CHECK([ovs-vsctl set Bridge br0 flow_tables:0=@N1 -- --id=@N1 create Flow_Table name=t0 prefixes=nw_dst,nw_src], [0], [ignore], [])
AT_DATA([flows.txt], [dnl
table=0 in_port=1,icmp,nw_src=10.0.0.4 actions=output(2)
])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
AT_CHECK([ovs-appctl dpif/dump-flows br0 | STRIP_XOUT], [0], [dnl
-skb_priority(0),in_port(1),eth_type(0x0800),ipv4(src=10.0.0.2/255.255.255.255,dst=10.0.0.1/0.0.0.0,proto=1/0xff,tos=0/0,ttl=64/0,frag=no/0xff), packets:0, bytes:0, used:0.0s, actions: <del>
+skb_priority(0),in_port(1),eth_type(0x0800),ipv4(src=10.0.0.2/255.255.255.252,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no/0xff), packets:0, bytes:0, used:0.0s, actions: <del>
skb_priority(0),in_port(1),eth_type(0x0800),ipv4(src=10.0.0.4/255.255.255.255,dst=10.0.0.3/0.0.0.0,proto=1/0xff,tos=0/0,ttl=64/0,frag=no/0xff), packets:0, bytes:0, used:0.0s, actions: <del>
])
OVS_VSWITCHD_STOP
AT_CLEANUP
+AT_SETUP([ofproto-dpif megaflow - IPv6 classification])
+OVS_VSWITCHD_START
+ADD_OF_PORTS([br0], [1], [2])
+AT_CHECK([ovs-vsctl set Bridge br0 flow_tables:0=@N1 -- --id=@N1 create Flow_Table name=t0 prefixes=ipv6_dst,ipv6_src], [0], [ignore], [])
+AT_DATA([flows.txt], [dnl
+table=0 in_port=1,ipv6,ipv6_src=2001:db8:3c4d:1:2:3:4:5 actions=output(2)
+])
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:07,dst=50:54:00:00:00:05),eth_type(0x86dd),ipv6(src=2001:db8:3c4d:1:2:3:4:5,dst=fe80::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=no)'])
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x86dd),ipv6(src=2001:db8:3c4d:5:4:3:2:1,dst=2001:db8:3c4d:1:2:3:4:1,label=0,proto=99,tclass=0x70,hlimit=64,frag=no)'])
+AT_CHECK([ovs-appctl dpif/dump-flows br0 | STRIP_XOUT], [0], [dnl
+skb_priority(0),in_port(1),eth_type(0x86dd),ipv6(src=2001:db8:3c4d:1:2:3:4:5/ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff,dst=fe80::2/::,label=0/0,proto=10/0,tclass=0x70/0,hlimit=128/0,frag=no/0xff), packets:0, bytes:0, used:0.0s, actions: <del>
+skb_priority(0),in_port(1),eth_type(0x86dd),ipv6(src=2001:db8:3c4d:5:4:3:2:1/ffff:ffff:ffff:fffc::,dst=2001:db8:3c4d:1:2:3:4:1/::,label=0/0,proto=99/0,tclass=0x70/0,hlimit=64/0,frag=no/0xff), packets:0, bytes:0, used:0.0s, actions: <del>
+])
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
AT_SETUP([ofproto-dpif megaflow - L4 classification])
OVS_VSWITCHD_START
ADD_OF_PORTS([br0], [1], [2])
AT_SETUP([ofproto-dpif megaflow - dec_ttl])
OVS_VSWITCHD_START
ADD_OF_PORTS([br0], [1], [2])
+AT_CHECK([ovs-vsctl set Bridge br0 flow_tables:0=@N1 -- --id=@N1 create Flow_Table name=t0 prefixes=nw_dst,nw_src], [0], [ignore], [])
AT_DATA([flows.txt], [dnl
table=0 in_port=1,icmp,nw_src=10.0.0.4 actions=dec_ttl,output(2)
])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
AT_CHECK([ovs-appctl dpif/dump-flows br0 | STRIP_XOUT], [0], [dnl
-skb_priority(0),in_port(1),eth_type(0x0800),ipv4(src=10.0.0.2/255.255.255.255,dst=10.0.0.1/0.0.0.0,proto=1/0xff,tos=0/0,ttl=64/0,frag=no/0xff), packets:0, bytes:0, used:0.0s, actions: <del>
+skb_priority(0),in_port(1),eth_type(0x0800),ipv4(src=10.0.0.2/255.255.255.252,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no/0xff), packets:0, bytes:0, used:0.0s, actions: <del>
skb_priority(0),in_port(1),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no), packets:0, bytes:0, used:0.0s, actions: <del>
])
OVS_VSWITCHD_STOP
\f
/* Classifier tests. */
+static enum mf_field_id trie_fields[2] = {
+ MFF_IPV4_DST, MFF_IPV4_SRC
+};
+
/* Tests an empty classifier. */
static void
test_empty(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
struct tcls tcls;
classifier_init(&cls, flow_segment_u32s);
- ovs_rwlock_rdlock(&cls.rwlock);
+ ovs_rwlock_wrlock(&cls.rwlock);
+ classifier_set_prefix_fields(&cls, trie_fields, ARRAY_SIZE(trie_fields));
tcls_init(&tcls);
assert(classifier_is_empty(&cls));
assert(tcls_is_empty(&tcls));
classifier_init(&cls, flow_segment_u32s);
ovs_rwlock_wrlock(&cls.rwlock);
+ classifier_set_prefix_fields(&cls, trie_fields,
+ ARRAY_SIZE(trie_fields));
tcls_init(&tcls);
tcls_rule = tcls_insert(&tcls, rule);
classifier_init(&cls, flow_segment_u32s);
ovs_rwlock_wrlock(&cls.rwlock);
+ classifier_set_prefix_fields(&cls, trie_fields,
+ ARRAY_SIZE(trie_fields));
tcls_init(&tcls);
tcls_insert(&tcls, rule1);
classifier_insert(&cls, &rule1->cls_rule);
classifier_init(&cls, flow_segment_u32s);
ovs_rwlock_wrlock(&cls.rwlock);
+ classifier_set_prefix_fields(&cls, trie_fields,
+ ARRAY_SIZE(trie_fields));
tcls_init(&tcls);
for (i = 0; i < ARRAY_SIZE(ops); i++) {
classifier_init(&cls, flow_segment_u32s);
ovs_rwlock_wrlock(&cls.rwlock);
+ classifier_set_prefix_fields(&cls, trie_fields,
+ ARRAY_SIZE(trie_fields));
tcls_init(&tcls);
for (i = 0; i < N_RULES; i++) {
classifier_init(&cls, flow_segment_u32s);
ovs_rwlock_wrlock(&cls.rwlock);
+ classifier_set_prefix_fields(&cls, trie_fields,
+ ARRAY_SIZE(trie_fields));
tcls_init(&tcls);
for (i = 0; i < MAX_RULES; i++) {
{
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
int n_tables;
- int i, j;
+ int i, j, k;
n_tables = ofproto_get_n_tables(br->ofproto);
j = 0;
s.max_flows = UINT_MAX;
s.groups = NULL;
s.n_groups = 0;
+ s.n_prefix_fields = 0;
+ memset(s.prefix_fields, ~0, sizeof(s.prefix_fields));
if (j < br->cfg->n_flow_tables && i == br->cfg->key_flow_tables[j]) {
struct ovsrec_flow_table *cfg = br->cfg->value_flow_tables[j++];
}
if (cfg->overflow_policy
&& !strcmp(cfg->overflow_policy, "evict")) {
- size_t k;
s.groups = xmalloc(cfg->n_groups * sizeof *s.groups);
for (k = 0; k < cfg->n_groups; k++) {
}
}
}
+ /* Prefix lookup fields. */
+ s.n_prefix_fields = 0;
+ for (k = 0; k < cfg->n_prefixes; k++) {
+ const char *name = cfg->prefixes[k];
+ const struct mf_field *mf = mf_from_name(name);
+ if (!mf) {
+ VLOG_WARN("bridge %s: 'prefixes' with unknown field: %s",
+ br->name, name);
+ continue;
+ }
+ if (mf->flow_be32ofs < 0 || mf->n_bits % 32) {
+ VLOG_WARN("bridge %s: 'prefixes' with incompatible field: "
+ "%s", br->name, name);
+ continue;
+ }
+ if (s.n_prefix_fields >= ARRAY_SIZE(s.prefix_fields)) {
+ VLOG_WARN("bridge %s: 'prefixes' with too many fields, "
+ "field not used: %s", br->name, name);
+ continue;
+ }
+ s.prefix_fields[s.n_prefix_fields++] = mf->id;
+ }
+ if (s.n_prefix_fields > 0) {
+ int k;
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ for (k = 0; k < s.n_prefix_fields; k++) {
+ if (k) {
+ ds_put_char(&ds, ',');
+ }
+ ds_put_cstr(&ds, mf_from_id(s.prefix_fields[k])->name);
+ }
+ VLOG_INFO("bridge %s table %d: Prefix lookup with: %s.",
+ br->name, i, ds_cstr(&ds));
+ ds_destroy(&ds);
+ }
}
ofproto_configure_table(br->ofproto, i, &s);
{"name": "Open_vSwitch",
- "version": "7.3.0",
- "cksum": "2495231516 20311",
+ "version": "7.4.0",
+ "cksum": "951746691 20389",
"tables": {
"Open_vSwitch": {
"columns": {
"enum": ["set", ["refuse", "evict"]]},
"min": 0, "max": 1}},
"groups": {
- "type": {"key": "string", "min": 0, "max": "unlimited"}}}},
+ "type": {"key": "string", "min": 0, "max": "unlimited"}},
+ "prefixes": {
+ "type": {"key": "string", "min": 0, "max": 3}}}},
"QoS": {
"columns": {
"type": {
column has no effect.
</p>
</column>
+
+ <column name="prefixes">
+ <p>
+ This string set specifies which fields should be used for
+ address prefix tracking. Prefix tracking allows the
+ classifier to skip rules with longer than necessary prefixes,
+ resulting in better wildcarding for datapath flows.
+ </p>
+ <p>
+ Prefix tracking may be beneficial when a flow table contains
+ matches on IP address fields with different prefix lengths.
+ For example, when a flow table contains IP address matches on
+ both full addresses and proper prefixes, the full address
+ matches will typically cause the datapath flow to un-wildcard
+ the whole address field (depending on flow entry priorities).
+ In this case each packet with a different address gets handed
+ to the userspace for flow processing and generates its own
+ datapath flow. With prefix tracking enabled for the address
+ field in question packets with addresses matching shorter
+ prefixes would generate datapath flows where the irrelevant
+ address bits are wildcarded, allowing the same datapath flow
+ to handle all the packets within the prefix in question. In
+ this case many userspace upcalls can be avoided and the
+ overall performance can be better.
+ </p>
+ <p>
+ This is a performance optimization only, so packets will
+ receive the same treatment with or without prefix tracking.
+ </p>
+ <p>
+ The supported fields are: <code>tun_id</code>,
+ <code>tun_src</code>, <code>tun_dst</code>,
+ <code>nw_src</code>, <code>nw_dst</code> (or aliases
+ <code>ip_src</code> and <code>ip_dst</code>),
+ <code>ipv6_src</code>, and <code>ipv6_dst</code>. (Using this
+ feature for <code>tun_id</code> would only make sense if the
+ tunnel IDs have prefix structure similar to IP addresses.)
+ </p>
+ <p>
+ For example, <code>prefixes=ip_dst,ip_src</code> instructs the
+ flow classifier to track the IP destination and source
+ addresses used by the rules in this specific flow table. To
+ set the prefix fields, the flow table record needs to exist:
+ </p>
+ <dl>
+ <dt><code>ovs-vsctl set Bridge br0 flow_tables:0=@N1 -- --id=@N1 create Flow_Table name=table0</code></dt>
+ <dd>
+ Creates a flow table record for the OpenFlow table number 0.
+ </dd>
+
+ <dt><code>ovs-vsctl set Flow_Table table0 prefixes=ip_dst,ip_src</code></dt>
+ <dd>
+ Enables prefix tracking for IP source and destination
+ address fields.
+ </dd>
+ </dl>
+
+ <p>
+ There is a maximum number of fields that can be enabled for any
+ one flow table. Currently this limit is 3.
+ </p>
+ </column>
</table>
<table name="QoS" title="Quality of Service configuration">