+\f
+/* A longest-prefix match tree. */
+struct trie_node {
+ uint32_t prefix; /* Prefix bits for this node, MSB first. */
+ uint8_t nbits; /* Never zero, except for the root node. */
+ unsigned int n_rules; /* Number of rules that have this prefix. */
+ struct trie_node *edges[2]; /* Both NULL if leaf. */
+};
+
+/* Max bits per node. Must fit in struct trie_node's 'prefix'.
+ * Also tested with 16, 8, and 5 to stress the implementation. */
+#define TRIE_PREFIX_BITS 32
+
+/* Return at least 'plen' bits of the 'prefix', starting at bit offset 'ofs'.
+ * Prefixes are in the network byte order, and the offset 0 corresponds to
+ * the most significant bit of the first byte. The offset can be read as
+ * "how many bits to skip from the start of the prefix starting at 'pr'". */
+static uint32_t
+raw_get_prefix(const ovs_be32 pr[], unsigned int ofs, unsigned int plen)
+{
+ uint32_t prefix;
+
+ pr += ofs / 32; /* Where to start. */
+ ofs %= 32; /* How many bits to skip at 'pr'. */
+
+ prefix = ntohl(*pr) << ofs; /* Get the first 32 - ofs bits. */
+ if (plen > 32 - ofs) { /* Need more than we have already? */
+ prefix |= ntohl(*++pr) >> (32 - ofs);
+ }
+ /* Return with possible unwanted bits at the end. */
+ return prefix;
+}
+
+/* Return min(TRIE_PREFIX_BITS, plen) bits of the 'prefix', starting at bit
+ * offset 'ofs'. Prefixes are in the network byte order, and the offset 0
+ * corresponds to the most significant bit of the first byte. The offset can
+ * be read as "how many bits to skip from the start of the prefix starting at
+ * 'pr'". */
+static uint32_t
+trie_get_prefix(const ovs_be32 pr[], unsigned int ofs, unsigned int plen)
+{
+ if (!plen) {
+ return 0;
+ }
+ if (plen > TRIE_PREFIX_BITS) {
+ plen = TRIE_PREFIX_BITS; /* Get at most TRIE_PREFIX_BITS. */
+ }
+ /* Return with unwanted bits cleared. */
+ return raw_get_prefix(pr, ofs, plen) & ~0u << (32 - plen);
+}
+
+/* Return the number of equal bits in 'nbits' of 'prefix's MSBs and a 'value'
+ * starting at "MSB 0"-based offset 'ofs'. */
+static unsigned int
+prefix_equal_bits(uint32_t prefix, unsigned int nbits, const ovs_be32 value[],
+ unsigned int ofs)
+{
+ uint64_t diff = prefix ^ raw_get_prefix(value, ofs, nbits);
+ /* Set the bit after the relevant bits to limit the result. */
+ return raw_clz64(diff << 32 | UINT64_C(1) << (63 - nbits));
+}
+
+/* Return the number of equal bits in 'node' prefix and a 'prefix' of length
+ * 'plen', starting at "MSB 0"-based offset 'ofs'. */
+static unsigned int
+trie_prefix_equal_bits(const struct trie_node *node, const ovs_be32 prefix[],
+ unsigned int ofs, unsigned int plen)
+{
+ return prefix_equal_bits(node->prefix, MIN(node->nbits, plen - ofs),
+ prefix, ofs);
+}
+
+/* Return the bit at ("MSB 0"-based) offset 'ofs' as an int. 'ofs' can
+ * be greater than 31. */
+static unsigned int
+be_get_bit_at(const ovs_be32 value[], unsigned int ofs)
+{
+ return (((const uint8_t *)value)[ofs / 8] >> (7 - ofs % 8)) & 1u;
+}
+
+/* Return the bit at ("MSB 0"-based) offset 'ofs' as an int. 'ofs' must
+ * be between 0 and 31, inclusive. */
+static unsigned int
+get_bit_at(const uint32_t prefix, unsigned int ofs)
+{
+ return (prefix >> (31 - ofs)) & 1u;
+}
+
+/* Create new branch. */
+static struct trie_node *
+trie_branch_create(const ovs_be32 *prefix, unsigned int ofs, unsigned int plen,
+ unsigned int n_rules)
+{
+ struct trie_node *node = xmalloc(sizeof *node);
+
+ node->prefix = trie_get_prefix(prefix, ofs, plen);
+
+ if (plen <= TRIE_PREFIX_BITS) {
+ node->nbits = plen;
+ node->edges[0] = NULL;
+ node->edges[1] = NULL;
+ node->n_rules = n_rules;
+ } else { /* Need intermediate nodes. */
+ struct trie_node *subnode = trie_branch_create(prefix,
+ ofs + TRIE_PREFIX_BITS,
+ plen - TRIE_PREFIX_BITS,
+ n_rules);
+ int bit = get_bit_at(subnode->prefix, 0);
+ node->nbits = TRIE_PREFIX_BITS;
+ node->edges[bit] = subnode;
+ node->edges[!bit] = NULL;
+ node->n_rules = 0;
+ }
+ return node;
+}
+
+static void
+trie_node_destroy(struct trie_node *node)
+{
+ free(node);
+}
+
+static void
+trie_destroy(struct trie_node *node)
+{
+ if (node) {
+ trie_destroy(node->edges[0]);
+ trie_destroy(node->edges[1]);
+ free(node);
+ }
+}
+
+static bool
+trie_is_leaf(const struct trie_node *trie)
+{
+ return !trie->edges[0] && !trie->edges[1]; /* No children. */
+}
+
+static void
+mask_set_prefix_bits(struct flow_wildcards *wc, uint8_t be32ofs,
+ unsigned int nbits)
+{
+ ovs_be32 *mask = &((ovs_be32 *)&wc->masks)[be32ofs];
+ unsigned int i;
+
+ for (i = 0; i < nbits / 32; i++) {
+ mask[i] = OVS_BE32_MAX;
+ }
+ if (nbits % 32) {
+ mask[i] |= htonl(~0u << (32 - nbits % 32));
+ }
+}
+
+static bool
+mask_prefix_bits_set(const struct flow_wildcards *wc, uint8_t be32ofs,
+ unsigned int nbits)
+{
+ ovs_be32 *mask = &((ovs_be32 *)&wc->masks)[be32ofs];
+ unsigned int i;
+ ovs_be32 zeroes = 0;
+
+ for (i = 0; i < nbits / 32; i++) {
+ zeroes |= ~mask[i];
+ }
+ if (nbits % 32) {
+ zeroes |= ~mask[i] & htonl(~0u << (32 - nbits % 32));
+ }
+
+ return !zeroes; /* All 'nbits' bits set. */
+}
+
+static struct trie_node **
+trie_next_edge(struct trie_node *node, const ovs_be32 value[],
+ unsigned int ofs)
+{
+ return node->edges + be_get_bit_at(value, ofs);
+}
+
+static const struct trie_node *
+trie_next_node(const struct trie_node *node, const ovs_be32 value[],
+ unsigned int ofs)
+{
+ return node->edges[be_get_bit_at(value, ofs)];
+}
+
+/* Return the prefix mask length necessary to find the longest-prefix match for
+ * the '*value' in the prefix tree 'node'.
+ * '*checkbits' is set to the number of bits in the prefix mask necessary to
+ * determine a mismatch, in case there are longer prefixes in the tree below
+ * the one that matched.
+ */
+static unsigned int
+trie_lookup_value(const struct trie_node *node, const ovs_be32 value[],
+ unsigned int *checkbits)
+{
+ unsigned int plen = 0, match_len = 0;
+ const struct trie_node *prev = NULL;
+
+ for (; node; prev = node, node = trie_next_node(node, value, plen)) {
+ unsigned int eqbits;
+ /* Check if this edge can be followed. */
+ eqbits = prefix_equal_bits(node->prefix, node->nbits, value, plen);
+ plen += eqbits;
+ if (eqbits < node->nbits) { /* Mismatch, nothing more to be found. */
+ /* Bit at offset 'plen' differed. */
+ *checkbits = plen + 1; /* Includes the first mismatching bit. */
+ return match_len;
+ }
+ /* Full match, check if rules exist at this prefix length. */
+ if (node->n_rules > 0) {
+ match_len = plen;
+ }
+ }
+ /* Dead end, exclude the other branch if it exists. */
+ *checkbits = !prev || trie_is_leaf(prev) ? plen : plen + 1;
+ return match_len;
+}
+
+static unsigned int
+trie_lookup(const struct cls_trie *trie, const struct flow *flow,
+ unsigned int *checkbits)
+{
+ const struct mf_field *mf = trie->field;
+
+ /* Check that current flow matches the prerequisites for the trie
+ * field. Some match fields are used for multiple purposes, so we
+ * must check that the trie is relevant for this flow. */
+ if (mf_are_prereqs_ok(mf, flow)) {
+ return trie_lookup_value(trie->root,
+ &((ovs_be32 *)flow)[mf->flow_be32ofs],
+ checkbits);
+ }
+ *checkbits = 0; /* Value not used in this case. */
+ return UINT_MAX;
+}
+
+/* Returns the length of a prefix match mask for the field 'mf' in 'minimask'.
+ * Returns the u32 offset to the miniflow data in '*miniflow_index', if
+ * 'miniflow_index' is not NULL. */
+static unsigned int
+minimask_get_prefix_len(const struct minimask *minimask,
+ const struct mf_field *mf)
+{
+ unsigned int nbits = 0, mask_tz = 0; /* Non-zero when end of mask seen. */
+ uint8_t u32_ofs = mf->flow_be32ofs;
+ uint8_t u32_end = u32_ofs + mf->n_bytes / 4;
+
+ for (; u32_ofs < u32_end; ++u32_ofs) {
+ uint32_t mask;
+ mask = ntohl((OVS_FORCE ovs_be32)minimask_get(minimask, u32_ofs));
+
+ /* Validate mask, count the mask length. */
+ if (mask_tz) {
+ if (mask) {
+ return 0; /* No bits allowed after mask ended. */
+ }
+ } else {
+ if (~mask & (~mask + 1)) {
+ return 0; /* Mask not contiguous. */
+ }
+ mask_tz = ctz32(mask);
+ nbits += 32 - mask_tz;
+ }
+ }
+
+ return nbits;
+}
+
+/*
+ * This is called only when mask prefix is known to be CIDR and non-zero.
+ * Relies on the fact that the flow and mask have the same map, and since
+ * the mask is CIDR, the storage for the flow field exists even if it
+ * happened to be zeros.
+ */
+static const ovs_be32 *
+minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf)
+{
+ return (OVS_FORCE const ovs_be32 *)match->flow.values +
+ count_1bits(match->flow.map & ((UINT64_C(1) << mf->flow_be32ofs) - 1));
+}
+
+/* Insert rule in to the prefix tree.
+ * 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
+ * in 'rule'. */
+static void
+trie_insert(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
+{
+ const ovs_be32 *prefix = minimatch_get_prefix(&rule->match, trie->field);
+ struct trie_node *node;
+ struct trie_node **edge;
+ int ofs = 0;
+
+ /* Walk the tree. */
+ for (edge = &trie->root;
+ (node = *edge) != NULL;
+ edge = trie_next_edge(node, prefix, ofs)) {
+ unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);
+ ofs += eqbits;
+ if (eqbits < node->nbits) {
+ /* Mismatch, new node needs to be inserted above. */
+ int old_branch = get_bit_at(node->prefix, eqbits);
+
+ /* New parent node. */
+ *edge = trie_branch_create(prefix, ofs - eqbits, eqbits,
+ ofs == mlen ? 1 : 0);
+
+ /* Adjust old node for its new position in the tree. */
+ node->prefix <<= eqbits;
+ node->nbits -= eqbits;
+ (*edge)->edges[old_branch] = node;
+
+ /* Check if need a new branch for the new rule. */
+ if (ofs < mlen) {
+ (*edge)->edges[!old_branch]
+ = trie_branch_create(prefix, ofs, mlen - ofs, 1);
+ }
+ return;
+ }
+ /* Full match so far. */
+
+ if (ofs == mlen) {
+ /* Full match at the current node, rule needs to be added here. */
+ node->n_rules++;
+ return;
+ }
+ }
+ /* Must insert a new tree branch for the new rule. */
+ *edge = trie_branch_create(prefix, ofs, mlen - ofs, 1);
+}
+
+/* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
+ * in 'rule'. */
+static void
+trie_remove(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
+{
+ const ovs_be32 *prefix = minimatch_get_prefix(&rule->match, trie->field);
+ struct trie_node *node;
+ struct trie_node **edges[sizeof(union mf_value) * 8];
+ int depth = 0, ofs = 0;
+
+ /* Walk the tree. */
+ for (edges[depth] = &trie->root;
+ (node = *edges[depth]) != NULL;
+ edges[++depth] = trie_next_edge(node, prefix, ofs)) {
+ unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);
+ if (eqbits < node->nbits) {
+ /* Mismatch, nothing to be removed. This should never happen, as
+ * only rules in the classifier are ever removed. */
+ break; /* Log a warning. */
+ }
+ /* Full match so far. */
+ ofs += eqbits;
+
+ if (ofs == mlen) {
+ /* Full prefix match at the current node, remove rule here. */
+ if (!node->n_rules) {
+ break; /* Log a warning. */
+ }
+ node->n_rules--;
+
+ /* Check if can prune the tree. */
+ while (!node->n_rules && !(node->edges[0] && node->edges[1])) {
+ /* No rules and at most one child node, remove this node. */
+ struct trie_node *next;
+ next = node->edges[0] ? node->edges[0] : node->edges[1];
+
+ if (next) {
+ if (node->nbits + next->nbits > TRIE_PREFIX_BITS) {
+ break; /* Cannot combine. */
+ }
+ /* Combine node with next. */
+ next->prefix = node->prefix | next->prefix >> node->nbits;
+ next->nbits += node->nbits;
+ }
+ trie_node_destroy(node);
+ /* Update the parent's edge. */
+ *edges[depth] = next;
+ if (next || !depth) {
+ /* Branch not pruned or at root, nothing more to do. */
+ break;
+ }
+ node = *edges[--depth];
+ }
+ return;
+ }
+ }
+ /* Cannot go deeper. This should never happen, since only rules
+ * that actually exist in the classifier are ever removed. */
+ VLOG_WARN("Trying to remove non-existing rule from a prefix trie.");
+}