+ uint32_t i;
+
+ for (i = 0; i < count; i++) {
+ uint64_t mf_mask = mf_masks[i];
+ /* Calculate the block index for the packet metadata. */
+ uint64_t idx_bits = mf_mask & pkt_mf_bits;
+ const uint32_t pkt_idx = count_1bits(idx_bits);
+
+ /* Check if the packet has the subtable miniflow bit set. If yes, the
+ * block at the above pkt_idx will be stored, otherwise it is masked
+ * out to be zero.
+ */
+ uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
+ uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;
+
+ /* Mask packet block by table block, and mask to zero if packet
+ * doesn't actually contain this block of metadata.
+ */
+ blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
+ }
+}
+
+/* This function takes a packet and a subtable, and writes an array of
+ * uint64_t blocks. The blocks contain the metadata that the subtable matches
+ * on, in the same order as the subtable, allowing linear iteration over the
+ * blocks.
+ *
+ * To calculate the blocks contents, the netdev_flow_key_flatten_unit function
+ * is called twice, once for each "unit" of the miniflow. This call can be
+ * inlined by the compiler for performance.
+ *
+ * Note that the u0_count and u1_count variables can be compile-time
+ * constants, allowing the loop in the inlined flatten_unit() function to be
+ * compile-time unrolled, or possibly removed entirely if unrolling eliminates
+ * all loop iterations. The compile-time optimizations enabled by this design
+ * improve performance.
+ *
+ * key: packet flow key whose miniflow values are flattened.
+ * mask: subtable mask; its miniflow values provide the table blocks.
+ * mf_masks: pre-calculated per-block bit masks for the subtable.
+ * blocks_scratch: output array; receives (u0_count + u1_count) blocks.
+ * u0_count, u1_count: number of bits set in the subtable's miniflow map
+ *                     unit 0 and unit 1 respectively.
+ */
+static inline void
+netdev_flow_key_flatten(const struct netdev_flow_key *key,
+ const struct netdev_flow_key *mask,
+ const uint64_t *mf_masks,
+ uint64_t *blocks_scratch,
+ const uint32_t u0_count,
+ const uint32_t u1_count)
+{
+ /* Load mask from subtable, mask with packet mf, popcount to get idx. */
+ const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
+ const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);
+
+ /* Packet miniflow bits to be masked by pre-calculated mf_masks. */
+ const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
+ const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);
+ const uint64_t pkt_bits_u1 = key->mf.map.bits[1];
+
+ /* Unit 0 flattening */
+ netdev_flow_key_flatten_unit(&pkt_blocks[0],
+ &tbl_blocks[0],
+ &mf_masks[0],
+ &blocks_scratch[0],
+ pkt_bits_u0,
+ u0_count);
+
+ /* Unit 1 flattening:
+ * Move the pointers forward in the arrays based on u0 offsets, NOTE:
+ * 1) pkt blocks indexed by actual popcount of u0, which is NOT always
+ * the same as the amount of bits set in the subtable.
+ * 2) mf_masks, tbl_block and blocks_scratch are all "flat" arrays, so
+ * the index is always u0_count.
+ */
+ netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
+ &tbl_blocks[u0_count],
+ &mf_masks[u0_count],
+ &blocks_scratch[u0_count],
+ pkt_bits_u1,
+ u1_count);
+}
+
+/* Compares a rule and the blocks representing a key, returns 1 on a match. */
+static inline uint64_t
+netdev_rule_matches_key(const struct dpcls_rule *rule,
+ const uint32_t mf_bits_total,
+ const uint64_t *blocks_scratch)
+{
+ const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
+ const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
+ uint64_t not_match = 0;
+
+ for (int i = 0; i < mf_bits_total; i++) {
+ not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
+ }