/*
 * Copyright (c) 2020, Intel Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifdef __x86_64__
#if !defined(__CHECKER__)

#include <config.h>

#include "dpif-netdev.h"
#include "dpif-netdev-lookup.h"
#include "dpif-netdev-private.h"
#include "cmap.h"
#include "flow.h"
#include "pvector.h"
#include "openvswitch/vlog.h"

#include "immintrin.h"

/* Each AVX512 register (zmm register in assembly notation) can contain up to
 * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
 * number of miniflow blocks that can be processed in a single pass of the
 * AVX512 code at a time.
 */
#define NUM_U64_IN_ZMM_REG (8)
#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)

VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
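
/* AVX512-F has no per-lane 64-bit popcount instruction (VPOPCNTQ arrives
 * only with the AVX512-VPOPCNTDQ extension), so emulate it: look up the
 * popcount of each 4-bit nibble with an in-register shuffle (VPSHUFB, an
 * AVX512-BW instruction), add the two nibble counts per byte, then sum the
 * eight bytes of each u64 lane with VPSADBW against zero. */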
static inline __m512i
_mm512_popcnt_epi64_manual(__m512i v_in)
{
    static const uint8_t pop_lut[64] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    };
    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);

    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);

    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);

    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
}
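
/* Check an individual packet's blocks against a single candidate rule: AND
 * the linearized blocks with the rule's mask and compare the result to the
 * rule's key, all in one masked 512-bit operation. */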
static inline uint64_t
netdev_rule_matches_key(const struct dpcls_rule *rule,
                        const uint32_t mf_bits_total,
                        const uint64_t *block_cache)
{
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
    const uint32_t lane_mask = (1 << mf_bits_total) - 1;

    /* Always load a full cache line from block_cache. Other loads must be
     * trimmed to the amount of data required for mf_bits_total blocks.
     */
    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
    __m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
    __m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);

    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);

    /* Returns 1 only if the SIMD compare succeeded on all required blocks. */
    return res_mask == lane_mask;
}
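
/* Core lookup routine: for every packet selected in keys_map, gather the
 * miniflow blocks this subtable requires into block_cache, hash them, probe
 * the subtable's cmap for all packets in one batch, then verify each
 * candidate rule to rule out hash collisions. Returns the bitmask of packets
 * for which a rule was found and written to rules[]. */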
static inline uint32_t ALWAYS_INLINE
avx512_lookup_impl(struct dpcls_subtable *subtable,
                   uint32_t keys_map,
                   const struct netdev_flow_key *keys[],
                   struct dpcls_rule **rules,
                   const uint32_t bit_count_u0,
                   const uint32_t bit_count_u1)
{
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) uint64_t block_cache[BLOCKS_CACHE_SIZE];
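
    /* Each packet owns a full 8 x u64 slot in block_cache regardless of how
     * many blocks the subtable actually uses, so the stores below stay cache
     * line aligned and netdev_rule_matches_key() can always load a full
     * 64-byte line without overrunning the buffer. */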

    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
    int i;
    uint32_t hashes[NETDEV_MAX_BURST];
    const uint32_t n_pkts = __builtin_popcountll(keys_map);
    ovs_assert(NETDEV_MAX_BURST >= n_pkts);

    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];

    /* Load subtable blocks for masking later. */
    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);

    /* Load pre-created subtable masks for each block in subtable. */
    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
                                                        subtable->mf_masks);
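
    /* Per packet, the gather index of a required block N within the packet's
     * own miniflow value array is the number of packet map bits set below
     * the subtable's N-th required bit. The pre-created mf_masks select
     * exactly those lower bits, so an AND with the packet map followed by a
     * per-lane popcount yields all eight indexes at once. */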
    ULLONG_FOR_EACH_1 (i, keys_map) {
        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);

        /* Pre-create register with *PER PACKET* u0 offset. */
        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
                                                                pkt_mf_u0_pop);

        /* Broadcast u0, u1 bitmasks to 8x u64 lanes. */
        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
                                                    keys[i]->mf.map.bits[1]);

        /* Bitmask by pre-created masks. */
        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);

        /* Manual AVX512 popcount for u64 lanes. */
        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);

        /* Offset popcounts for u1 with pre-created offset register. */
        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);

        /* Gather u64 blocks from packet miniflow. */
        const __m512i v_zeros = _mm512_setzero_si512();
        const void *pkt_data = miniflow_get_values(&keys[i]->mf);
        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
                                   bit_count_total_mask, v_indexes,
                                   pkt_data, 8);

        /* Zero out bits that pkt doesn't have:
         * - 2x pext() to extract bits from packet miniflow as needed by TBL
         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
         */
        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
        uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
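
        /* zero_mask now has one bit per required block, set only where the
         * packet's miniflow actually carries that block; absent blocks must
         * compare as zero rather than as whatever the gather fetched. */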

        /* Mask blocks using AND with subtable blocks, and use the k-mask to
         * zero out lanes as required for this packet.
         */
        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
                                                v_all_blocks, v_tbl_blocks);

        /* Store to blocks cache, full cache line aligned. */
        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
    }

    /* Hash the now linearized blocks of packet metadata. */
    ULLONG_FOR_EACH_1 (i, keys_map) {
        uint64_t *block_ptr = &block_cache[i * 8];
        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
        hashes[i] = hash_finish(hash, bit_count_total * 8);
    }
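
    /* These hashes must match the ones computed over the same masked blocks
     * when rules were inserted into the subtable's cmap; otherwise the batch
     * find below cannot locate the candidate nodes. */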
    /* Lookup: this returns a bitmask of packets where the hash table had
     * an entry for the given hash key. Presence of a hash key does not
     * guarantee matching the key, as there can be hash collisions.
     */
    uint32_t found_map;
    const struct cmap_node *nodes[NETDEV_MAX_BURST];
    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);

    /* Verify that the packet actually matched the rule. If not, a hash
     * collision has taken place, so continue searching with the next node.
     */
    ULLONG_FOR_EACH_1 (i, found_map) {
        struct dpcls_rule *rule;

        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
            const uint32_t cidx = i * 8;
            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
                                                     &block_cache[cidx]);
            if (OVS_LIKELY(match)) {
                rules[i] = rule;
                subtable->hit_cnt++;
                goto next;
            }
        }

        /* None of the found rules was a match. Clear the i-th bit to
         * search for this key in the next subtable. */
        ULLONG_SET0(found_map, i);
next:
        ;       /* Keep Sparse happy. */
    }

    return found_map;
}

/* Expand out specialized functions with U0 and U1 bit attributes. */
#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
    static uint32_t                                                           \
    dpcls_avx512_gather_mf_##U0##_##U1(struct dpcls_subtable *subtable,       \
                                       uint32_t keys_map,                     \
                                       const struct netdev_flow_key *keys[],  \
                                       struct dpcls_rule **rules)             \
    {                                                                         \
        return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
    }

DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
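
/* With compile-time constant block counts the compiler can fold the lane
 * masks and trip counts; the (5,1), (4,1) and (4,0) shapes above are, by
 * assumption, the subtable layouts most commonly hit in practice. */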

/* Check if a specialized function is valid for the required subtable. */
#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
    ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG);                              \
    if (!f && u0_bits == U0 && u1_bits == U1) {                               \
        f = dpcls_avx512_gather_mf_##U0##_##U1;                               \
    }
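
/* Generic fallback: the same algorithm, but the block counts are read from
 * the subtable at runtime instead of being compile-time constants. */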
static uint32_t
dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
                           const struct netdev_flow_key *keys[],
                           struct dpcls_rule **rules)
{
    return avx512_lookup_impl(subtable, keys_map, keys, rules,
                              subtable->mf_bits_set_unit0,
                              subtable->mf_bits_set_unit1);
}

dpcls_subtable_lookup_func
dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
{
    dpcls_subtable_lookup_func f = NULL;

    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
    if (!avx512f_available || !bmi2_available) {
        return NULL;
    }

    CHECK_LOOKUP_FUNCTION(5, 1);
    CHECK_LOOKUP_FUNCTION(4, 1);
    CHECK_LOOKUP_FUNCTION(4, 0);

    if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
        f = dpcls_avx512_gather_mf_any;
        VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)\n",
                  u0_bits, u1_bits);
    }

    return f;
}

#endif /* CHECKER */
#endif /* __x86_64__ */