/*
 * Copyright (c) 2020, Intel Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef __x86_64__
#if !defined(__CHECKER__)

#include <config.h>

#include "dpif-netdev.h"
#include "dpif-netdev-lookup.h"
#include "dpif-netdev-private.h"
#include "cmap.h"
#include "flow.h"
#include "pvector.h"
#include "openvswitch/vlog.h"

#include "immintrin.h"

/* Each AVX512 register (a zmm register in assembly notation) holds 512 bits,
 * which is equivalent to 8 uint64_t values. This is the maximum number of
 * miniflow blocks that the AVX512 code can process in a single pass.
 */
#define NUM_U64_IN_ZMM_REG (8)
#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
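/* The block cache above provides one zmm-sized row (NUM_U64_IN_ZMM_REG
 * uint64_t blocks) per packet in a burst, so each packet's gathered and
 * masked blocks can later be hashed and re-checked without re-gathering. */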


VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);

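/* Emulates a per-uint64 popcount across all 8 lanes of a zmm register using a
 * per-nibble lookup table and byte shuffles, so the AVX512-VPOPCNTDQ
 * instruction set extension is not required. */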
static inline __m512i
_mm512_popcnt_epi64_manual(__m512i v_in)
{
    static const uint8_t pop_lut[64] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    };
    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);

    __m512i v_in_srl4 = _mm512_srli_epi64(v_in, 4);
    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
    __m512i v_in_hi = _mm512_and_si512(v_in_srl4, v_nibble_mask);

    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);

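    /* SAD against zero sums the eight per-byte popcounts within each 64-bit
     * lane, producing the per-uint64 popcount. */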
    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
}

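/* Checks one packet's gathered and masked blocks (already linearized in the
 * block cache) against a single rule's mask and key. Returns 1 if all
 * mf_bits_total blocks match, otherwise 0. This is the vectorized
 * counterpart of the scalar rule-match check used by the other dpcls
 * lookup implementations. */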
static inline uint64_t
netdev_rule_matches_key(const struct dpcls_rule *rule,
                        const uint32_t mf_bits_total,
                        const uint64_t *block_cache)
{
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
    const uint32_t lane_mask = (1 << mf_bits_total) - 1;

    /* Always load a full cache line from block_cache. Other loads must be
     * trimmed to the amount of data required for mf_bits_total blocks.
     */
    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
    __m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
    __m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);

    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);

    /* Returns 1 only if the SIMD compare matched all required block lanes. */
    return res_mask == lane_mask;
}

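/* Performs the subtable lookup for (up to) a burst of packets. For each
 * packet in keys_map it computes per-block gather indexes from the miniflow
 * bitmaps, gathers the packet's blocks, masks them with the subtable mask and
 * stores them to the block cache. The masked blocks are then hashed, the cmap
 * is probed in a batch, and candidate rules are verified to filter out hash
 * collisions. Returns a bitmask of the packets that matched a rule in this
 * subtable; rules[] is filled in for those packets. */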
static inline uint32_t ALWAYS_INLINE
avx512_lookup_impl(struct dpcls_subtable *subtable,
                   uint32_t keys_map,
                   const struct netdev_flow_key *keys[],
                   struct dpcls_rule **rules,
                   const uint32_t bit_count_u0,
                   const uint32_t bit_count_u1)
{
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) uint64_t block_cache[BLOCKS_CACHE_SIZE];

    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
    int i;
    uint32_t hashes[NETDEV_MAX_BURST];
    const uint32_t n_pkts = __builtin_popcountll(keys_map);
    ovs_assert(NETDEV_MAX_BURST >= n_pkts);

    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];

    /* Load subtable blocks for masking later. */
    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);

    /* Load pre-created subtable masks for each block in subtable. */
    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
                                                        subtable->mf_masks);

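    /* mf_masks is expected to hold, in lane i, a mask of the map bits less
     * significant than the i-th set bit of the subtable mask (u1 lanes are
     * relative to the start of u1). ANDing the packet's broadcast map bits
     * with these masks and popcounting gives, for each subtable block, its
     * index in the packet's miniflow value array; the per-packet u0 offset
     * added below completes the indexes for the u1 lanes. */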
    ULLONG_FOR_EACH_1 (i, keys_map) {
        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);

        /* Pre-create register with *PER PACKET* u0 offset. */
        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
                                                                pkt_mf_u0_pop);

        /* Broadcast u0, u1 bitmasks to 8x u64 lanes. */
        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
                                                    keys[i]->mf.map.bits[1]);

        /* Bitmask by pre-created masks. */
        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);

        /* Manual AVX512 popcount for u64 lanes. */
        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);

        /* Offset popcounts for u1 with pre-created offset register. */
        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);

        /* Gather u64 blocks from packet miniflow. */
        const __m512i v_zeros = _mm512_setzero_si512();
        const void *pkt_data = miniflow_get_values(&keys[i]->mf);
        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
                                   bit_count_total_mask, v_indexes,
                                   pkt_data, 8);
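        /* The gather above uses scale 8 to turn the uint64_t indexes into
         * byte offsets from pkt_data; lanes cleared in bit_count_total_mask
         * keep the zeros from v_zeros. */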

        /* Zero out bits that pkt doesn't have:
         * - 2x pext() to extract bits from packet miniflow as needed by TBL
         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
         */
        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
        uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
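        /* Hypothetical example: if tbl_u0 is 0x15 (three blocks, at bits 0,
         * 2 and 4) and this packet's u0 map is 0x11 (only bits 0 and 4
         * present), then _pext_u64(0x11, 0x15) yields 0b101, so the middle
         * lane is zeroed below while the other two keep their gathered
         * values. */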

        /* Mask blocks using AND with subtable blocks, and use the k-mask to
         * zero lanes as required for this packet (blocks it does not have).
         */
        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
                                        v_all_blocks, v_tbl_blocks);

        /* Store to blocks cache, full cache line aligned. */
        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
    }

    /* Hash the now linearized blocks of packet metadata. */
    ULLONG_FOR_EACH_1 (i, keys_map) {
        uint64_t *block_ptr = &block_cache[i * 8];
        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
        hashes[i] = hash_finish(hash, bit_count_total * 8);
    }
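    /* Hashing the masked blocks word by word is intended to reproduce the
     * hash used when the rules were inserted into the cmap, so that
     * cmap_find_batch() below probes the right buckets. */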

    /* Lookup: this returns a bitmask of packets where the hash table had
     * an entry for the given hash key. Presence of a hash entry does not
     * guarantee a match, as there can be hash collisions.
     */
    uint32_t found_map;
    const struct cmap_node *nodes[NETDEV_MAX_BURST];
    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);

    /* Verify that the packet actually matched the rule. If it did not, a
     * hash collision has occurred, so continue searching with the next node.
     */
    ULLONG_FOR_EACH_1 (i, found_map) {
        struct dpcls_rule *rule;

        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
            const uint32_t cidx = i * 8;
            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
                                                     &block_cache[cidx]);
            if (OVS_LIKELY(match)) {
                rules[i] = rule;
                subtable->hit_cnt++;
                goto next;
            }
        }

        /* None of the found rules was a match. Clear the i-th bit to
         * search for this key in the next subtable. */
        ULLONG_SET0(found_map, i);
next:
        ; /* Keep Sparse happy. */
    }

    return found_map;
}

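/* Because avx512_lookup_impl() is ALWAYS_INLINE, expanding it below with U0
 * and U1 as compile-time constants lets the compiler specialize the lane
 * masks and index math for each subtable shape. Shapes without a specialized
 * function fall back to the generic dpcls_avx512_gather_mf_any() further
 * below. */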
/* Expand out specialized functions with U0 and U1 bit attributes. */
#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
    static uint32_t                                                           \
    dpcls_avx512_gather_mf_##U0##_##U1(struct dpcls_subtable *subtable,       \
                                       uint32_t keys_map,                     \
                                       const struct netdev_flow_key *keys[],  \
                                       struct dpcls_rule **rules)             \
    {                                                                         \
        return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
    }                                                                         \

DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)

/* Check if a specialized function is valid for the required subtable. */
#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
    ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG);                              \
    if (!f && u0_bits == U0 && u1_bits == U1) {                               \
        f = dpcls_avx512_gather_mf_##U0##_##U1;                               \
    }

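/* Generic fallback: reads the subtable's block counts at runtime instead of
 * specializing on them, so it handles any shape the probe below accepts. */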
static uint32_t
dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
                           const struct netdev_flow_key *keys[],
                           struct dpcls_rule **rules)
{
    return avx512_lookup_impl(subtable, keys_map, keys, rules,
                              subtable->mf_bits_set_unit0,
                              subtable->mf_bits_set_unit1);
}

dpcls_subtable_lookup_func
dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
{
    dpcls_subtable_lookup_func f = NULL;

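    /* The lookup code relies on AVX512-F for the 512-bit mask, gather and
     * compare operations, and on BMI2 for _pext_u64(), so both ISA features
     * must be reported by the CPU at runtime. */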
    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
    if (!avx512f_available || !bmi2_available) {
        return NULL;
    }

    CHECK_LOOKUP_FUNCTION(5, 1);
    CHECK_LOOKUP_FUNCTION(4, 1);
    CHECK_LOOKUP_FUNCTION(4, 0);

    if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
        f = dpcls_avx512_gather_mf_any;
        VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)\n",
                  u0_bits, u1_bits);
    }

    return f;
}

#endif /* CHECKER */
#endif /* __x86_64__ */