4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Immediate selectors passed to _mm_shuffle_epi32() further down in this file.
 * SLOT1 / SLOT2 / SLOT3 copy 32-bit lane 1 / 2 / 3 into lane 0 and leave
 * lanes 1..3 unchanged; SWAP64 exchanges the low and high 64-bit halves.
 * NOTE(review): the enclosing declaration is not visible in this extract.
 */
38 SHUFFLE32_SLOT1
= 0xe5,
39 SHUFFLE32_SLOT2
= 0xe6,
40 SHUFFLE32_SLOT3
= 0xe7,
41 SHUFFLE32_SWAP64
= 0x4e,
/*
 * Constant handed to ACL_TR_CALC_ADDR() by transition4().
 * 32-bit lane i holds the byte value 4*i repeated in all four bytes.
 * NOTE(review): presumably a byte-shuffle control that replicates the low
 * byte of each 32-bit input lane - confirm against ACL_TR_CALC_ADDR.
 */
44 static const rte_xmm_t xmm_shuffle_input
= {
45 .u32
= {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
/*
 * Eight 16-bit lanes, each set to 1. Handed to ACL_TR_CALC_ADDR() by
 * transition4(); how the macro uses it is not visible in this extract.
 */
48 static const rte_xmm_t xmm_ones_16
= {
49 .u16
= {1, 1, 1, 1, 1, 1, 1, 1},
/*
 * Mask whose .x view both search routines pass to acl_match_check_x4() as
 * match_mask; there it is ANDed with the low 32 bits of each transition.
 * NOTE(review): the initializer is not visible in this extract.
 */
52 static const rte_xmm_t xmm_match_mask
= {
/*
 * Mask whose .x view transition4() hands to ACL_TR_CALC_ADDR().
 * NOTE(review): the initializer is not visible in this extract.
 */
61 static const rte_xmm_t xmm_index_mask
= {
/*
 * Constant whose .x view transition4() hands to ACL_TR_CALC_ADDR().
 * The four visible values are 0xffffff00 plus 0, 4, 8 and 12.
 * NOTE(review): the member designator and the closing of the initializer are
 * not visible in this extract.
 */
70 static const rte_xmm_t xmm_range_base
= {
72 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
77 * Resolve priority for multiple results (sse version).
78 * This consists of comparing the priority of the current traversal with the
79 * running set of results for the packet.
80 * For each result, keep a running array of the result (rule number) and
81 * its priority for each category.
/*
 * resolve_priority_sse: fold the results of a completed trie into the running
 * best results kept for one flow, RTE_ACL_RESULTS_MULTIPLIER categories per
 * loop iteration.
 *
 *   transition - index into p[] selecting the match-results entry to read
 *   n          - index into parms[] selecting the flow to update
 *   ctx        - ACL context; only ctx->num_tries is read here
 *   parms      - per-flow state; parms[n].cmplt holds the running
 *                results[], priority[] and count
 *   p          - array of match results (results[] and priority[] per entry)
 *
 * All vector accesses use the unaligned load/store intrinsics.
 * NOTE(review): the return type, the braces and the declarations of `x` and
 * `categories` are not visible in this extract.
 */
84 resolve_priority_sse(uint64_t transition
, int n
, const struct rte_acl_ctx
*ctx
,
85 struct parms
*parms
, const struct rte_acl_match_results
*p
,
89 xmm_t results
, priority
, results1
, priority1
, selector
;
90 xmm_t
*saved_results
, *saved_priority
;
92 for (x
= 0; x
< categories
; x
+= RTE_ACL_RESULTS_MULTIPLIER
) {
/* locations of the running results / priorities for this group of categories */
94 saved_results
= (xmm_t
*)(&parms
[n
].cmplt
->results
[x
]);
/*
 * NOTE(review): the left-hand side of this assignment is not visible in this
 * extract; saved_priority is read and written below, so presumably it is
 * assigned here - confirm.
 */
96 (xmm_t
*)(&parms
[n
].cmplt
->priority
[x
]);
98 /* get results and priorities for completed trie */
99 results
= _mm_loadu_si128(
100 (const xmm_t
*)&p
[transition
].results
[x
]);
101 priority
= _mm_loadu_si128(
102 (const xmm_t
*)&p
[transition
].priority
[x
]);
104 /* if this is not the first completed trie */
105 if (parms
[n
].cmplt
->count
!= ctx
->num_tries
) {
107 /* get running best results and their priorities */
108 results1
= _mm_loadu_si128(saved_results
);
109 priority1
= _mm_loadu_si128(saved_priority
);
111 /* select results that are highest priority */
/*
 * Signed 32-bit compare per lane: a lane of selector is all ones only where
 * the saved priority is strictly greater, so on a tie the value from the
 * newly completed trie is kept by the blends below.
 */
112 selector
= _mm_cmpgt_epi32(priority1
, priority
);
113 results
= _mm_blendv_epi8(results
, results1
, selector
);
/* NOTE(review): the last argument of this blend is not visible in this extract */
114 priority
= _mm_blendv_epi8(priority
, priority1
,
118 /* save running best results and their priorities */
119 _mm_storeu_si128(saved_results
, results
);
120 _mm_storeu_si128(saved_priority
, priority
);
125 * Extract transitions from an XMM register and check for any matches
/*
 * acl_process_matches: run acl_match_check() on the two 64-bit transitions
 * packed in *indices and store the returned transitions back into *indices.
 *
 *   indices - in/out register holding two transitions; the low half belongs
 *             to flow `slot`, the high half to flow `slot + 1`
 *   slot    - flow index of the low-half transition
 *   ctx, parms, flows - forwarded unchanged to acl_match_check()
 *
 * resolve_priority_sse is passed to acl_match_check() as its last argument.
 * NOTE(review): the return type and the braces are not visible in this
 * extract.
 */
128 acl_process_matches(xmm_t
*indices
, int slot
, const struct rte_acl_ctx
*ctx
,
129 struct parms
*parms
, struct acl_flow_data
*flows
)
131 uint64_t transition1
, transition2
;
133 /* extract transition from low 64 bits. */
134 transition1
= _mm_cvtsi128_si64(*indices
);
136 /* extract transition from high 64 bits. */
/* swap the halves so the former high half can be read from the low 64 bits */
137 *indices
= _mm_shuffle_epi32(*indices
, SHUFFLE32_SWAP64
);
138 transition2
= _mm_cvtsi128_si64(*indices
);
140 transition1
= acl_match_check(transition1
, slot
, ctx
,
141 parms
, flows
, resolve_priority_sse
);
142 transition2
= acl_match_check(transition2
, slot
+ 1, ctx
,
143 parms
, flows
, resolve_priority_sse
);
145 /* update indices with new transitions. */
/* transition1 goes back to the low half, transition2 to the high half */
146 *indices
= _mm_set_epi64x(transition2
, transition1
);
150 * Check for any match in 4 transitions (contained in 2 SSE registers)
/*
 * acl_match_check_x4: as long as any of the four transitions held in
 * *indices1 / *indices2 has a match_mask bit set in its low 32 bits, call
 * acl_process_matches() on both registers and test again.
 *
 *   slot     - flow index of the first transition; *indices1 covers flows
 *              slot and slot + 1, *indices2 covers slot + 2 and slot + 3
 *   ctx, parms, flows - forwarded unchanged to acl_process_matches()
 *   indices1, indices2 - in/out registers, two 64-bit transitions each
 *
 * NOTE(review): the declaration of `match_mask` (used below), the declaration
 * of `temp`, the trailing arguments of both _mm_shuffle_ps() calls and the
 * braces are not visible in this extract.
 */
152 static inline __attribute__((always_inline
)) void
153 acl_match_check_x4(int slot
, const struct rte_acl_ctx
*ctx
, struct parms
*parms
,
154 struct acl_flow_data
*flows
, xmm_t
*indices1
, xmm_t
*indices2
,
159 /* put low 32 bits of each transition into one register */
160 temp
= (xmm_t
)_mm_shuffle_ps((__m128
)*indices1
, (__m128
)*indices2
,
162 /* test for match node */
163 temp
= _mm_and_si128(match_mask
, temp
);
/* loop while at least one masked bit is still set */
165 while (!_mm_testz_si128(temp
, temp
)) {
166 acl_process_matches(indices1
, slot
, ctx
, parms
, flows
);
167 acl_process_matches(indices2
, slot
+ 2, ctx
, parms
, flows
);
/* re-test: acl_process_matches() has replaced the transitions in both registers */
169 temp
= (xmm_t
)_mm_shuffle_ps((__m128
)*indices1
,
172 temp
= _mm_and_si128(match_mask
, temp
);
177 * Process 4 transitions (in 2 XMM registers) in parallel
/*
 * transition4: advance four transitions by one step.
 *
 *   next_input - four 32-bit lanes of pending input, one lane per transition
 *   trans      - table of 64-bit transitions, indexed by the computed address
 *   indices1   - in/out: transitions 0 (low half) and 1 (high half)
 *   indices2   - in/out: transitions 2 (low half) and 3 (high half)
 *
 * Returns next_input with every 32-bit lane shifted right by CHAR_BIT bits.
 * NOTE(review): the trailing arguments of ACL_TR_CALC_ADDR() and the braces
 * are not visible in this extract.
 */
179 static inline __attribute__((always_inline
)) xmm_t
180 transition4(xmm_t next_input
, const uint64_t *trans
,
181 xmm_t
*indices1
, xmm_t
*indices2
)
183 xmm_t addr
, tr_lo
, tr_hi
;
184 uint64_t trans0
, trans2
;
186 /* Shuffle low 32 into tr_lo and high 32 into tr_hi */
187 ACL_TR_HILO(mm
, __m128
, *indices1
, *indices2
, tr_lo
, tr_hi
);
189 /* Calculate the address (array index) for all 4 transitions. */
190 ACL_TR_CALC_ADDR(mm
, 128, addr
, xmm_index_mask
.x
, next_input
,
191 xmm_shuffle_input
.x
, xmm_ones_16
.x
, xmm_range_base
.x
,
194 /* Gather 64 bit transitions and pack back into 2 registers. */
/*
 * Only lane 0 of addr can be read with _mm_cvtsi128_si32(), so each of the
 * other lanes is shuffled into lane 0 in turn before its table lookup.
 * NOTE(review): _mm_cvtsi128_si32() yields a signed int that is used directly
 * as the index into trans[] - confirm addresses always fit in 31 bits.
 */
196 trans0
= trans
[_mm_cvtsi128_si32(addr
)];
200 /* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
201 addr
= _mm_shuffle_epi32(addr
, SHUFFLE32_SLOT2
);
202 trans2
= trans
[_mm_cvtsi128_si32(addr
)];
206 /* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
207 addr
= _mm_shuffle_epi32(addr
, SHUFFLE32_SLOT1
);
/* high half = transition looked up for x1, low half = transition for x0 */
208 *indices1
= _mm_set_epi64x(trans
[_mm_cvtsi128_si32(addr
)], trans0
);
212 /* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
213 addr
= _mm_shuffle_epi32(addr
, SHUFFLE32_SLOT3
);
/* high half = transition looked up for x3, low half = transition for x2 */
214 *indices2
= _mm_set_epi64x(trans
[_mm_cvtsi128_si32(addr
)], trans2
);
/* shift every lane right by CHAR_BIT bits for the next call */
216 return _mm_srli_epi32(next_input
, CHAR_BIT
);
220 * Execute trie traversal with 8 traversals in parallel
/*
 * search_sse_8: classify packets with MAX_SEARCHES_SSE8 flows in flight,
 * held as four registers of two 64-bit transitions each.
 *
 *   ctx           - ACL context; ctx->trans_table is handed to acl_set_flow()
 *   data          - forwarded to acl_set_flow()
 *   results       - forwarded to acl_set_flow()
 *   total_packets - forwarded to acl_set_flow()
 *   categories    - forwarded to acl_set_flow()
 *
 * Flow: set up the flow state, start one trie per slot, check the initial
 * transitions for matches, then loop while flows.started > 0: gather 4 input
 * bytes per flow, apply transition4() four times, check for matches again.
 * NOTE(review): the return type, the declaration of `n`, the braces and any
 * return statement are not visible in this extract.
 */
223 search_sse_8(const struct rte_acl_ctx
*ctx
, const uint8_t **data
,
224 uint32_t *results
, uint32_t total_packets
, uint32_t categories
)
227 struct acl_flow_data flows
;
228 uint64_t index_array
[MAX_SEARCHES_SSE8
];
229 struct completion cmplt
[MAX_SEARCHES_SSE8
];
230 struct parms parms
[MAX_SEARCHES_SSE8
];
231 xmm_t input0
, input1
;
232 xmm_t indices1
, indices2
, indices3
, indices4
;
234 acl_set_flow(&flows
, cmplt
, RTE_DIM(cmplt
), data
, results
,
235 total_packets
, categories
, ctx
->trans_table
);
/* obtain a starting transition for every slot */
237 for (n
= 0; n
< MAX_SEARCHES_SSE8
; n
++) {
239 index_array
[n
] = acl_start_next_trie(&flows
, parms
, n
, ctx
);
243 * indices1 contains index_array[0,1]
244 * indices2 contains index_array[2,3]
245 * indices3 contains index_array[4,5]
246 * indices4 contains index_array[6,7]
249 indices1
= _mm_loadu_si128((xmm_t
*) &index_array
[0]);
250 indices2
= _mm_loadu_si128((xmm_t
*) &index_array
[2]);
252 indices3
= _mm_loadu_si128((xmm_t
*) &index_array
[4]);
253 indices4
= _mm_loadu_si128((xmm_t
*) &index_array
[6]);
255 /* Check for any matches. */
/* first call covers flows 0..3, second call covers flows 4..7 */
256 acl_match_check_x4(0, ctx
, parms
, &flows
,
257 &indices1
, &indices2
, xmm_match_mask
.x
);
258 acl_match_check_x4(4, ctx
, parms
, &flows
,
259 &indices3
, &indices4
, xmm_match_mask
.x
);
/*
 * NOTE(review): nothing in this function writes flows.started directly;
 * presumably the helpers called here update it - confirm.
 */
261 while (flows
.started
> 0) {
263 /* Gather 4 bytes of input data for each stream. */
/* input0 lanes 0..3 take flows 0..3, input1 lanes 0..3 take flows 4..7 */
264 input0
= _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms
, 0));
265 input1
= _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms
, 4));
267 input0
= _mm_insert_epi32(input0
, GET_NEXT_4BYTES(parms
, 1), 1);
268 input1
= _mm_insert_epi32(input1
, GET_NEXT_4BYTES(parms
, 5), 1);
270 input0
= _mm_insert_epi32(input0
, GET_NEXT_4BYTES(parms
, 2), 2);
271 input1
= _mm_insert_epi32(input1
, GET_NEXT_4BYTES(parms
, 6), 2);
273 input0
= _mm_insert_epi32(input0
, GET_NEXT_4BYTES(parms
, 3), 3);
274 input1
= _mm_insert_epi32(input1
, GET_NEXT_4BYTES(parms
, 7), 3);
276 /* Process the 4 bytes of input on each stream. */
/*
 * transition4() returns its input shifted right by CHAR_BIT bits per lane,
 * so four calls per register pair walk through all 4 gathered bytes.
 */
278 input0
= transition4(input0
, flows
.trans
,
279 &indices1
, &indices2
);
280 input1
= transition4(input1
, flows
.trans
,
281 &indices3
, &indices4
);
283 input0
= transition4(input0
, flows
.trans
,
284 &indices1
, &indices2
);
285 input1
= transition4(input1
, flows
.trans
,
286 &indices3
, &indices4
);
288 input0
= transition4(input0
, flows
.trans
,
289 &indices1
, &indices2
);
290 input1
= transition4(input1
, flows
.trans
,
291 &indices3
, &indices4
);
293 input0
= transition4(input0
, flows
.trans
,
294 &indices1
, &indices2
);
295 input1
= transition4(input1
, flows
.trans
,
296 &indices3
, &indices4
);
298 /* Check for any matches. */
299 acl_match_check_x4(0, ctx
, parms
, &flows
,
300 &indices1
, &indices2
, xmm_match_mask
.x
);
301 acl_match_check_x4(4, ctx
, parms
, &flows
,
302 &indices3
, &indices4
, xmm_match_mask
.x
);
309 * Execute trie traversal with 4 traversals in parallel
/*
 * search_sse_4: classify packets with MAX_SEARCHES_SSE4 flows in flight,
 * held as two registers of two 64-bit transitions each.
 *
 *   ctx           - ACL context; ctx->trans_table is handed to acl_set_flow()
 *   data          - forwarded to acl_set_flow()
 *   results       - forwarded to acl_set_flow()
 *   total_packets - forwarded to acl_set_flow()
 *   categories    - forwarded to acl_set_flow()
 *
 * Flow: set up the flow state, start one trie per slot, check the initial
 * transitions for matches, then loop while flows.started > 0: gather 4 input
 * bytes per flow, apply transition4() four times, check for matches again.
 * NOTE(review): total_packets is `int` here but `uint32_t` in search_sse_8 -
 * confirm which type the callers expect.
 * NOTE(review): the return type, the declaration of `n`, the braces and any
 * return statement are not visible in this extract.
 */
312 search_sse_4(const struct rte_acl_ctx
*ctx
, const uint8_t **data
,
313 uint32_t *results
, int total_packets
, uint32_t categories
)
316 struct acl_flow_data flows
;
317 uint64_t index_array
[MAX_SEARCHES_SSE4
];
318 struct completion cmplt
[MAX_SEARCHES_SSE4
];
319 struct parms parms
[MAX_SEARCHES_SSE4
];
320 xmm_t input
, indices1
, indices2
;
322 acl_set_flow(&flows
, cmplt
, RTE_DIM(cmplt
), data
, results
,
323 total_packets
, categories
, ctx
->trans_table
);
/* obtain a starting transition for every slot */
325 for (n
= 0; n
< MAX_SEARCHES_SSE4
; n
++) {
327 index_array
[n
] = acl_start_next_trie(&flows
, parms
, n
, ctx
);
/* indices1 takes index_array[0] and [1], indices2 takes index_array[2] and [3] */
330 indices1
= _mm_loadu_si128((xmm_t
*) &index_array
[0]);
331 indices2
= _mm_loadu_si128((xmm_t
*) &index_array
[2]);
333 /* Check for any matches. */
334 acl_match_check_x4(0, ctx
, parms
, &flows
,
335 &indices1
, &indices2
, xmm_match_mask
.x
);
/*
 * NOTE(review): nothing in this function writes flows.started directly;
 * presumably the helpers called here update it - confirm.
 */
337 while (flows
.started
> 0) {
339 /* Gather 4 bytes of input data for each stream. */
340 input
= _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms
, 0));
341 input
= _mm_insert_epi32(input
, GET_NEXT_4BYTES(parms
, 1), 1);
342 input
= _mm_insert_epi32(input
, GET_NEXT_4BYTES(parms
, 2), 2);
343 input
= _mm_insert_epi32(input
, GET_NEXT_4BYTES(parms
, 3), 3);
345 /* Process the 4 bytes of input on each stream. */
/*
 * transition4() returns its input shifted right by CHAR_BIT bits per lane,
 * so four calls walk through all 4 gathered bytes.
 */
346 input
= transition4(input
, flows
.trans
, &indices1
, &indices2
);
347 input
= transition4(input
, flows
.trans
, &indices1
, &indices2
);
348 input
= transition4(input
, flows
.trans
, &indices1
, &indices2
);
349 input
= transition4(input
, flows
.trans
, &indices1
, &indices2
);
351 /* Check for any matches. */
352 acl_match_check_x4(0, ctx
, parms
, &flows
,
353 &indices1
, &indices2
, xmm_match_mask
.x
);