]>
Commit | Line | Data |
---|---|---|
9f95a23c TL |
1 | /* SPDX-License-Identifier: BSD-3-Clause |
2 | * Copyright(c) 2010-2014 Intel Corporation | |
7c673cae FG |
3 | */ |
4 | ||
5 | #include "acl_run_sse.h" | |
6 | ||
/*
 * Match-flag mask: all eight 32-bit lanes hold RTE_ACL_NODE_MATCH.
 * search_avx2x16() hands it to acl_match_check_avx2x8() as match_mask.
 */
7 | static const rte_ymm_t ymm_match_mask = { | |
8 | .u32 = { | |
9 | RTE_ACL_NODE_MATCH, | |
10 | RTE_ACL_NODE_MATCH, | |
11 | RTE_ACL_NODE_MATCH, | |
12 | RTE_ACL_NODE_MATCH, | |
13 | RTE_ACL_NODE_MATCH, | |
14 | RTE_ACL_NODE_MATCH, | |
15 | RTE_ACL_NODE_MATCH, | |
16 | RTE_ACL_NODE_MATCH, | |
17 | }, | |
18 | }; | |
19 | ||
/*
 * All eight 32-bit lanes hold RTE_ACL_NODE_INDEX.
 * Passed to ACL_TR_CALC_ADDR by transition8().
 */
20 | static const rte_ymm_t ymm_index_mask = { | |
21 | .u32 = { | |
22 | RTE_ACL_NODE_INDEX, | |
23 | RTE_ACL_NODE_INDEX, | |
24 | RTE_ACL_NODE_INDEX, | |
25 | RTE_ACL_NODE_INDEX, | |
26 | RTE_ACL_NODE_INDEX, | |
27 | RTE_ACL_NODE_INDEX, | |
28 | RTE_ACL_NODE_INDEX, | |
29 | RTE_ACL_NODE_INDEX, | |
30 | }, | |
31 | }; | |
32 | ||
/*
 * The same four 32-bit constants are repeated in each 128-bit half.
 * Passed to ACL_TR_CALC_ADDR by transition8().
 * NOTE(review): looks like a byte-shuffle control that replicates source
 * bytes 0, 4, 8 and 12 across their 32-bit lanes - confirm against the
 * macro definition, which is not part of this file.
 */
33 | static const rte_ymm_t ymm_shuffle_input = { | |
34 | .u32 = { | |
35 | 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, | |
36 | 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, | |
37 | }, | |
38 | }; | |
39 | ||
/*
 * Sixteen 16-bit lanes, each equal to 1.
 * Passed to ACL_TR_CALC_ADDR by transition8().
 */
40 | static const rte_ymm_t ymm_ones_16 = { | |
41 | .u16 = { | |
42 | 1, 1, 1, 1, 1, 1, 1, 1, | |
43 | 1, 1, 1, 1, 1, 1, 1, 1, | |
44 | }, | |
45 | }; | |
46 | ||
/*
 * The same four 32-bit constants are repeated in each 128-bit half.
 * Passed to ACL_TR_CALC_ADDR by transition8().
 * NOTE(review): how the macro interprets these values is defined outside
 * this file - confirm there before changing them.
 */
47 | static const rte_ymm_t ymm_range_base = { | |
48 | .u32 = { | |
49 | 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, | |
50 | 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, | |
51 | }, | |
52 | }; | |
53 | ||
54 | /* | |
55 | * Process 8 transitions in parallel. | |
56 | * tr_lo contains low 32 bits for 8 transitions. | |
57 | * tr_hi contains high 32 bits for 8 transitions. | |
58 | * next_input contains up to 4 input bytes for 8 flows. | |
 * trans is the table of 64-bit transitions; the low and high halves of
 * the selected entries are gathered from it and stored to *tr_lo and
 * *tr_hi, replacing the values passed in.
 * Returns next_input with every 32-bit lane shifted right by CHAR_BIT
 * bits, which moves the next input byte into the lowest position.
59 | */ | |
9f95a23c | 60 | static __rte_always_inline ymm_t |
7c673cae FG |
61 | transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi) |
62 | { | |
63 | const int32_t *tr; | |
64 | ymm_t addr; | |
65 | ||
/*
 * View the 64-bit table as 32-bit words so that each half of an entry
 * can be fetched with a 32-bit gather.
 */
66 | tr = (const int32_t *)(uintptr_t)trans; | |
67 | ||
68 | /* Calculate the address (array index) for all 8 transitions. */ | |
69 | ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input, | |
70 | ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y, | |
71 | *tr_lo, *tr_hi); | |
72 | ||
73 | /* load lower 32 bits of 8 transitions at once.
 * The gather scale is sizeof(trans[0]), so addr indexes 64-bit entries. */ | |
74 | *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0])); | |
75 | ||
/* Shift every 32-bit lane right by one byte. */
76 | next_input = _mm256_srli_epi32(next_input, CHAR_BIT); | |
77 | ||
78 | /* load high 32 bits of 8 transitions at once.
 * Same indices and scale, but the base starts one 32-bit word further. */ | |
79 | *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0])); | |
80 | ||
81 | return next_input; | |
82 | } | |
83 | ||
84 | /* | |
85 | * Process matches for 8 flows. | |
86 | * tr_lo contains low 32 bits for 8 transitions. | |
87 | * tr_hi contains high 32 bits for 8 transitions. | |
 * slot is the index of the first of the 8 flows: lane i is passed to
 * acl_match_check() as slot + i.
 * matches selects which lanes of *tr_lo / *tr_hi are overwritten with
 * the values returned by acl_match_check(); all other lanes are kept.
88 | */ | |
89 | static inline void | |
90 | acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx, | |
91 | struct parms *parms, struct acl_flow_data *flows, uint32_t slot, | |
92 | ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi) | |
93 | { | |
94 | ymm_t t0, t1; | |
95 | ymm_t lo, hi; | |
96 | xmm_t l0, l1; | |
97 | uint32_t i; | |
98 | uint64_t tr[MAX_SEARCHES_SSE8]; | |
99 | ||
/* Split *tr_lo into its upper (l1) and lower (l0) 128-bit halves. */
100 | l1 = _mm256_extracti128_si256(*tr_lo, 1); | |
101 | l0 = _mm256_castsi256_si128(*tr_lo); | |
102 | ||
/* Each iteration handles one lane from each half: i and i + 4. */
103 | for (i = 0; i != RTE_DIM(tr) / 2; i++) { | |
104 | ||
105 | /* | |
106 | * Extract low 32bits of each transition. | |
107 | * That's enough to process the match. | |
108 | */ | |
109 | tr[i] = (uint32_t)_mm_cvtsi128_si32(l0); | |
110 | tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1); | |
111 | ||
/* Shift both halves down by 4 bytes to expose the next lane. */
112 | l0 = _mm_srli_si128(l0, sizeof(uint32_t)); | |
113 | l1 = _mm_srli_si128(l1, sizeof(uint32_t)); | |
114 | ||
/*
 * acl_match_check() is called for every lane, not only the matching
 * ones; results for non-selected lanes are dropped by the blend below.
 */
115 | tr[i] = acl_match_check(tr[i], slot + i, | |
116 | ctx, parms, flows, resolve_priority_sse); | |
117 | tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4, | |
118 | ctx, parms, flows, resolve_priority_sse); | |
119 | } | |
120 | ||
121 | /* Collect new transitions into 2 YMM registers.
 * _mm256_set_epi64x lists its arguments from the highest element down. */ | |
122 | t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]); | |
123 | t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]); | |
124 | ||
125 | /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */ | |
126 | ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi); | |
127 | ||
128 | /* Keep transitions with NOMATCH intact.
 * The blend takes a byte from lo/hi only where the top bit of the
 * corresponding byte in matches is set. */ | |
129 | *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches); | |
130 | *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches); | |
131 | } | |
132 | ||
/*
 * Test 8 flows for a match and resolve matches until none is left.
 * A lane counts as a match when all bits of match_mask are set in its
 * *tr_lo value. While at least one lane matches,
 * acl_process_matches_avx2x8() is called for the group starting at slot
 * and the updated *tr_lo is tested again.
 */
133 | static inline void | |
134 | acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms, | |
135 | struct acl_flow_data *flows, uint32_t slot, | |
136 | ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask) | |
137 | { | |
138 | uint32_t msk; | |
139 | ymm_t matches, temp; | |
140 | ||
141 | /* test for match node */ | |
142 | temp = _mm256_and_si256(match_mask, *tr_lo); | |
143 | matches = _mm256_cmpeq_epi32(temp, match_mask); | |
144 | msk = _mm256_movemask_epi8(matches); | |
145 | ||
/* msk is non-zero as long as any lane compared equal above. */
146 | while (msk != 0) { | |
147 | ||
148 | acl_process_matches_avx2x8(ctx, parms, flows, slot, | |
149 | matches, tr_lo, tr_hi); | |
/* Re-test: the replacement transitions may carry the match bits too. */
150 | temp = _mm256_and_si256(match_mask, *tr_lo); | |
151 | matches = _mm256_cmpeq_epi32(temp, match_mask); | |
152 | msk = _mm256_movemask_epi8(matches); | |
153 | } | |
154 | } | |
155 | ||
156 | /* | |
157 | * Execute trie traversal for up to 16 flows in parallel. | |
 * Flows 0-7 are kept in element [0] of input/tr_lo/tr_hi and flows 8-15
 * in element [1]; each group is advanced with the 8-wide helpers.
 * The loop runs until flows.started drops to zero.
 * Always returns 0.
158 | */ | |
159 | static inline int | |
160 | search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data, | |
161 | uint32_t *results, uint32_t total_packets, uint32_t categories) | |
162 | { | |
163 | uint32_t n; | |
164 | struct acl_flow_data flows; | |
165 | uint64_t index_array[MAX_SEARCHES_AVX16]; | |
166 | struct completion cmplt[MAX_SEARCHES_AVX16]; | |
167 | struct parms parms[MAX_SEARCHES_AVX16]; | |
168 | ymm_t input[2], tr_lo[2], tr_hi[2]; | |
169 | ymm_t t0, t1; | |
170 | ||
171 | acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, | |
172 | total_packets, categories, ctx->trans_table); | |
173 | ||
/* Clear every completion counter and fetch the initial value per flow. */
174 | for (n = 0; n < RTE_DIM(cmplt); n++) { | |
175 | cmplt[n].count = 0; | |
176 | index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); | |
177 | } | |
178 | ||
/*
 * Pack the 64-bit values of flows 0-7 and split them into low/high
 * halves. _mm256_set_epi64x lists its arguments from the highest
 * element down.
 */
179 | t0 = _mm256_set_epi64x(index_array[5], index_array[4], | |
180 | index_array[1], index_array[0]); | |
181 | t1 = _mm256_set_epi64x(index_array[7], index_array[6], | |
182 | index_array[3], index_array[2]); | |
183 | ||
184 | ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]); | |
185 | ||
/* Same packing for flows 8-15. */
186 | t0 = _mm256_set_epi64x(index_array[13], index_array[12], | |
187 | index_array[9], index_array[8]); | |
188 | t1 = _mm256_set_epi64x(index_array[15], index_array[14], | |
189 | index_array[11], index_array[10]); | |
190 | ||
191 | ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]); | |
192 | ||
193 | /* Check for any matches. */ | |
194 | acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], | |
195 | ymm_match_mask.y); | |
196 | acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], | |
197 | ymm_match_mask.y); | |
198 | ||
199 | while (flows.started > 0) { | |
200 | ||
201 | uint32_t in[MAX_SEARCHES_SSE8]; | |
202 | ||
203 | /* Gather 4 bytes of input data for first 8 flows. */ | |
204 | in[0] = GET_NEXT_4BYTES(parms, 0); | |
205 | in[4] = GET_NEXT_4BYTES(parms, 4); | |
206 | in[1] = GET_NEXT_4BYTES(parms, 1); | |
207 | in[5] = GET_NEXT_4BYTES(parms, 5); | |
208 | in[2] = GET_NEXT_4BYTES(parms, 2); | |
209 | in[6] = GET_NEXT_4BYTES(parms, 6); | |
210 | in[3] = GET_NEXT_4BYTES(parms, 3); | |
211 | in[7] = GET_NEXT_4BYTES(parms, 7); | |
212 | input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4], | |
213 | in[3], in[2], in[1], in[0]); | |
214 | ||
215 | /* Gather 4 bytes of input data for last 8 flows. */ | |
216 | in[0] = GET_NEXT_4BYTES(parms, 8); | |
217 | in[4] = GET_NEXT_4BYTES(parms, 12); | |
218 | in[1] = GET_NEXT_4BYTES(parms, 9); | |
219 | in[5] = GET_NEXT_4BYTES(parms, 13); | |
220 | in[2] = GET_NEXT_4BYTES(parms, 10); | |
221 | in[6] = GET_NEXT_4BYTES(parms, 14); | |
222 | in[3] = GET_NEXT_4BYTES(parms, 11); | |
223 | in[7] = GET_NEXT_4BYTES(parms, 15); | |
224 | input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4], | |
225 | in[3], in[2], in[1], in[0]); | |
226 | ||
/*
 * Four rounds of transitions for both groups. Every transition8()
 * call returns its input shifted right by one byte, so the four
 * rounds walk through the 4 bytes gathered above.
 */
227 | input[0] = transition8(input[0], flows.trans, | |
228 | &tr_lo[0], &tr_hi[0]); | |
229 | input[1] = transition8(input[1], flows.trans, | |
230 | &tr_lo[1], &tr_hi[1]); | |
231 | ||
232 | input[0] = transition8(input[0], flows.trans, | |
233 | &tr_lo[0], &tr_hi[0]); | |
234 | input[1] = transition8(input[1], flows.trans, | |
235 | &tr_lo[1], &tr_hi[1]); | |
236 | ||
237 | input[0] = transition8(input[0], flows.trans, | |
238 | &tr_lo[0], &tr_hi[0]); | |
239 | input[1] = transition8(input[1], flows.trans, | |
240 | &tr_lo[1], &tr_hi[1]); | |
241 | ||
242 | input[0] = transition8(input[0], flows.trans, | |
243 | &tr_lo[0], &tr_hi[0]); | |
244 | input[1] = transition8(input[1], flows.trans, | |
245 | &tr_lo[1], &tr_hi[1]); | |
246 | ||
247 | /* Check for any matches. */ | |
248 | acl_match_check_avx2x8(ctx, parms, &flows, 0, | |
249 | &tr_lo[0], &tr_hi[0], ymm_match_mask.y); | |
250 | acl_match_check_avx2x8(ctx, parms, &flows, 8, | |
251 | &tr_lo[1], &tr_hi[1], ymm_match_mask.y); | |
252 | } | |
253 | ||
254 | return 0; | |
255 | } |