lib/dpif-netdev-lookup-generic.c

   1 /*
   2  * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
   3  * Copyright (c) 2019 Intel Corporation.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at:
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #include <config.h>
  19 #include "dpif-netdev.h"
  20 #include "dpif-netdev-private.h"
  21
  22 #include "bitmap.h"
  23 #include "cmap.h"
  24
  25 #include "dp-packet.h"
  26 #include "dpif.h"
  27 #include "dpif-netdev-perf.h"
  28 #include "dpif-provider.h"
  29 #include "flow.h"
  30 #include "ovs-thread.h"
  31 #include "packets.h"
  32 #include "pvector.h"
  33
  34 VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic);
  35
  36 /* Lookup functions below depends on the internal structure of flowmap. */
  37 BUILD_ASSERT_DECL(FLOWMAP_UNITS == 2);
  38
  39 struct block_array {
  40     uint32_t count; /* Number of items allocated in 'blocks' */
  41     uint64_t blocks[];
  42 };
  43
  44 DEFINE_PER_THREAD_MALLOCED_DATA(struct block_array *, block_array);
  45
  46 static inline uint64_t *
  47 get_blocks_scratch(uint32_t required_count)
  48 {
  49     struct block_array *array = block_array_get();
  50
  51     /* Check if this thread already has a large enough array allocated.
  52      * This is a predictable and unlikely branch, as it occurs only once at
  53      * startup, or if a subtable with higher block count is added.
  54      */
  55     if (OVS_UNLIKELY(!array || array->count < required_count)) {
  56         array = xrealloc(array, sizeof *array +
  57                          (required_count * sizeof array->blocks[0]));
  58         array->count = required_count;
  59         block_array_set_unsafe(array);
  60         VLOG_DBG("Block array resized to %"PRIu32, required_count);
  61     }
  62
  63     return &array->blocks[0];
  64 }
  65
  66 static inline void
  67 netdev_flow_key_flatten_unit(const uint64_t *pkt_blocks,
  68                              const uint64_t *tbl_blocks,
  69                              const uint64_t *mf_masks,
  70                              uint64_t *blocks_scratch,
  71                              const uint64_t pkt_mf_bits,
  72                              const uint32_t count)
  73 {
  74     uint32_t i;
  75
  76     for (i = 0; i < count; i++) {
  77         uint64_t mf_mask = mf_masks[i];
  78         /* Calculate the block index for the packet metadata. */
  79         uint64_t idx_bits = mf_mask & pkt_mf_bits;
  80         const uint32_t pkt_idx = count_1bits(idx_bits);
  81
  82         /* Check if the packet has the subtable miniflow bit set. If yes, the
  83          * block at the above pkt_idx will be stored, otherwise it is masked
  84          * out to be zero.
  85          */
  86         uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
  87         uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;
  88
  89         /* Mask packet block by table block, and mask to zero if packet
  90          * doesn't actually contain this block of metadata.
  91          */
  92         blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
  93     }
  94 }
  95
  96 /* This function takes a packet, and subtable and writes an array of uint64_t
  97  * blocks. The blocks contain the metadata that the subtable matches on, in
  98  * the same order as the subtable, allowing linear iteration over the blocks.
  99  *
 100  * To calculate the blocks contents, the netdev_flow_key_flatten_unit function
 101  * is called twice, once for each "unit" of the miniflow. This call can be
 102  * inlined by the compiler for performance.
 103  *
 104  * Note that the u0_count and u1_count variables can be compile-time constants,
 105  * allowing the loop in the inlined flatten_unit() function to be compile-time
 106  * unrolled, or possibly removed totally by unrolling by the loop iterations.
 107  * The compile time optimizations enabled by this design improves performance.
 108  */
 109 static inline void
 110 netdev_flow_key_flatten(const struct netdev_flow_key *key,
 111                         const struct netdev_flow_key *mask,
 112                         const uint64_t *mf_masks,
 113                         uint64_t *blocks_scratch,
 114                         const uint32_t u0_count,
 115                         const uint32_t u1_count)
 116 {
 117     /* Load mask from subtable, mask with packet mf, popcount to get idx. */
 118     const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
 119     const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);
 120
 121     /* Packet miniflow bits to be masked by pre-calculated mf_masks. */
 122     const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
 123     const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);
 124     const uint64_t pkt_bits_u1 = key->mf.map.bits[1];
 125
 126     /* Unit 0 flattening */
 127     netdev_flow_key_flatten_unit(&pkt_blocks[0],
 128                                  &tbl_blocks[0],
 129                                  &mf_masks[0],
 130                                  &blocks_scratch[0],
 131                                  pkt_bits_u0,
 132                                  u0_count);
 133
 134     /* Unit 1 flattening:
 135      * Move the pointers forward in the arrays based on u0 offsets, NOTE:
 136      * 1) pkt blocks indexed by actual popcount of u0, which is NOT always
 137      *    the same as the amount of bits set in the subtable.
 138      * 2) mf_masks, tbl_block and blocks_scratch are all "flat" arrays, so
 139      *    the index is always u0_count.
 140      */
 141     netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
 142                                  &tbl_blocks[u0_count],
 143                                  &mf_masks[u0_count],
 144                                  &blocks_scratch[u0_count],
 145                                  pkt_bits_u1,
 146                                  u1_count);
 147 }
 148
 149 /* Compares a rule and the blocks representing a key, returns 1 on a match. */
 150 static inline uint64_t
 151 netdev_rule_matches_key(const struct dpcls_rule *rule,
 152                         const uint32_t mf_bits_total,
 153                         const uint64_t *blocks_scratch)
 154 {
 155     const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
 156     const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
 157     uint64_t not_match = 0;
 158
 159     for (int i = 0; i < mf_bits_total; i++) {
 160         not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
 161     }
 162
 163     /* Invert result to show match as 1. */
 164     return !not_match;
 165 }
 166
 167 /* Const prop version of the function: note that mf bits total and u0 are
 168  * explicitly passed in here, while they're also available at runtime from the
 169  * subtable pointer. By making them compile time, we enable the compiler to
 170  * unroll loops and flatten out code-sequences based on the knowledge of the
 171  * mf_bits_* compile time values. This results in improved performance.
 172  *
 173  * Note: this function is marked with ALWAYS_INLINE to ensure the compiler
 174  * inlines the below code, and then uses the compile time constants to make
 175  * specialized versions of the runtime code. Without ALWAYS_INLINE, the
 176  * compiler might decide to not inline, and performance will suffer.
 177  */
 178 static inline uint32_t ALWAYS_INLINE
 179 lookup_generic_impl(struct dpcls_subtable *subtable,
 180                     uint32_t keys_map,
 181                     const struct netdev_flow_key *keys[],
 182                     struct dpcls_rule **rules,
 183                     const uint32_t bit_count_u0,
 184                     const uint32_t bit_count_u1)
 185 {
 186     const uint32_t n_pkts = count_1bits(keys_map);
 187     ovs_assert(NETDEV_MAX_BURST >= n_pkts);
 188     uint32_t hashes[NETDEV_MAX_BURST];
 189
 190     const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
 191     const uint32_t block_count_required = bit_count_total * NETDEV_MAX_BURST;
 192     uint64_t *mf_masks = subtable->mf_masks;
 193     int i;
 194
 195     /* Blocks scratch is an optimization to re-use the same packet miniflow
 196      * block data when doing rule-verify. This reduces work done during lookup
 197      * and hence improves performance. The blocks_scratch array is stored as a
 198      * thread local variable, as each thread requires its own blocks memory.
 199      */
 200     uint64_t *blocks_scratch = get_blocks_scratch(block_count_required);
 201
 202     /* Flatten the packet metadata into the blocks_scratch[] using subtable. */
 203     ULLONG_FOR_EACH_1 (i, keys_map) {
 204             netdev_flow_key_flatten(keys[i],
 205                                     &subtable->mask,
 206                                     mf_masks,
 207                                     &blocks_scratch[i * bit_count_total],
 208                                     bit_count_u0,
 209                                     bit_count_u1);
 210     }
 211
 212     /* Hash the now linearized blocks of packet metadata. */
 213     ULLONG_FOR_EACH_1 (i, keys_map) {
 214         uint64_t *block_ptr = &blocks_scratch[i * bit_count_total];
 215         uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
 216         hashes[i] = hash_finish(hash, bit_count_total * 8);
 217     }
 218
 219     /* Lookup: this returns a bitmask of packets where the hash table had
 220      * an entry for the given hash key. Presence of a hash key does not
 221      * guarantee matching the key, as there can be hash collisions.
 222      */
 223     uint32_t found_map;
 224     const struct cmap_node *nodes[NETDEV_MAX_BURST];
 225
 226     found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
 227
 228     /* Verify that packet actually matched rule. If not found, a hash
 229      * collision has taken place, so continue searching with the next node.
 230      */
 231     ULLONG_FOR_EACH_1 (i, found_map) {
 232         struct dpcls_rule *rule;
 233
 234         CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
 235             const uint32_t cidx = i * bit_count_total;
 236             uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
 237                                                      &blocks_scratch[cidx]);
 238
 239             if (OVS_LIKELY(match)) {
 240                 rules[i] = rule;
 241                 subtable->hit_cnt++;
 242                 goto next;
 243             }
 244         }
 245
 246         /* None of the found rules was a match.  Reset the i-th bit to
 247          * keep searching this key in the next subtable. */
 248         ULLONG_SET0(found_map, i);  /* Did not match. */
 249     next:
 250         ; /* Keep Sparse happy. */
 251     }
 252
 253     return found_map;
 254 }
 255
 256 /* Generic lookup function that uses runtime provided mf bits for iterating. */
 257 uint32_t
 258 dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
 259                               uint32_t keys_map,
 260                               const struct netdev_flow_key *keys[],
 261                               struct dpcls_rule **rules)
 262 {
 263     /* Here the runtime subtable->mf_bits counts are used, which forces the
 264      * compiler to iterate normal for() loops. Due to this limitation in the
 265      * compilers available optimizations, this function has lower performance
 266      * than the below specialized functions.
 267      */
 268     return lookup_generic_impl(subtable, keys_map, keys, rules,
 269                                subtable->mf_bits_set_unit0,
 270                                subtable->mf_bits_set_unit1);
 271 }
 272
 273 /* Expand out specialized functions with U0 and U1 bit attributes. */
 274 #define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
 275     static uint32_t                                                           \
 276     dpcls_subtable_lookup_mf_u0w##U0##_u1w##U1(                               \
 277                                          struct dpcls_subtable *subtable,     \
 278                                          uint32_t keys_map,                   \
 279                                          const struct netdev_flow_key *keys[],\
 280                                          struct dpcls_rule **rules)           \
 281     {                                                                         \
 282         return lookup_generic_impl(subtable, keys_map, keys, rules, U0, U1);  \
 283     }                                                                         \
 284
 285 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 286 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 287 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
 288
 289 /* Check if a specialized function is valid for the required subtable. */
 290 #define CHECK_LOOKUP_FUNCTION(U0, U1)                                          \
 291     if (!f && u0_bits == U0 && u1_bits == U1) {                               \
 292         f = dpcls_subtable_lookup_mf_u0w##U0##_u1w##U1;                       \
 293     }
 294
 295 /* Probe function to lookup an available specialized function.
 296  * If capable to run the requested miniflow fingerprint, this function returns
 297  * the most optimal implementation for that miniflow fingerprint.
 298  * @retval Non-NULL A valid function to handle the miniflow bit pattern
 299  * @retval NULL The requested miniflow is not supported by this implementation.
 300  */
 301 dpcls_subtable_lookup_func
 302 dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t u1_bits)
 303 {
 304     dpcls_subtable_lookup_func f = NULL;
 305
 306     CHECK_LOOKUP_FUNCTION(5, 1);
 307     CHECK_LOOKUP_FUNCTION(4, 1);
 308     CHECK_LOOKUP_FUNCTION(4, 0);
 309
 310     if (f) {
 311         VLOG_DBG("Subtable using Generic Optimized for u0 %d, u1 %d\n",
 312                  u0_bits, u1_bits);
 313     }
 314     return f;
 315 }