1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2015 Cavium, Inc
8 struct _neon_acl_const
{
9 rte_xmm_t xmm_shuffle_input
;
10 rte_xmm_t xmm_index_mask
;
12 } neon_acl_const
__attribute__((aligned(RTE_CACHE_LINE_SIZE
))) = {
14 .u32
= {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}
17 .u32
= {RTE_ACL_NODE_INDEX
, RTE_ACL_NODE_INDEX
,
18 RTE_ACL_NODE_INDEX
, RTE_ACL_NODE_INDEX
}
21 .u32
= {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c}
/*
 * Resolve priority for multiple results (neon version).
 * This consists of comparing the priority of the current traversal with the
 * running set of results for the packet.
 * For each result, keep a running array of the result (rule number) and
 * its priority for each category.
 */
static inline void
resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
		      struct parms *parms,
		      const struct rte_acl_match_results *p,
		      uint32_t categories)
{
	uint32_t x;
	int32x4_t results, priority, results1, priority1;
	uint32x4_t selector;
	int32_t *saved_results, *saved_priority;

	/* Four categories are processed per iteration (one Q register). */
	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
		saved_results = (int32_t *)(&parms[n].cmplt->results[x]);
		saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]);

		/* get results and priorities for completed trie */
		results = vld1q_s32(
			(const int32_t *)&p[transition].results[x]);
		priority = vld1q_s32(
			(const int32_t *)&p[transition].priority[x]);

		/* if this is not the first completed trie */
		if (parms[n].cmplt->count != ctx->num_tries) {
			/* get running best results and their priorities */
			results1 = vld1q_s32(saved_results);
			priority1 = vld1q_s32(saved_priority);

			/* select results that are highest priority */
			selector = vcgtq_s32(priority1, priority);
			results = vbslq_s32(selector, results1, results);
			priority = vbslq_s32(selector, priority1, priority);
		}

		/* save running best results and their priorities */
		vst1q_s32(saved_results, results);
		vst1q_s32(saved_priority, priority);
	}
}
72 * Check for any match in 4 transitions
74 static __rte_always_inline
uint32_t
75 check_any_match_x4(uint64_t val
[])
77 return (val
[0] | val
[1] | val
[2] | val
[3]) & RTE_ACL_NODE_MATCH
;
80 static __rte_always_inline
void
81 acl_match_check_x4(int slot
, const struct rte_acl_ctx
*ctx
, struct parms
*parms
,
82 struct acl_flow_data
*flows
, uint64_t transitions
[])
84 while (check_any_match_x4(transitions
)) {
85 transitions
[0] = acl_match_check(transitions
[0], slot
, ctx
,
86 parms
, flows
, resolve_priority_neon
);
87 transitions
[1] = acl_match_check(transitions
[1], slot
+ 1, ctx
,
88 parms
, flows
, resolve_priority_neon
);
89 transitions
[2] = acl_match_check(transitions
[2], slot
+ 2, ctx
,
90 parms
, flows
, resolve_priority_neon
);
91 transitions
[3] = acl_match_check(transitions
[3], slot
+ 3, ctx
,
92 parms
, flows
, resolve_priority_neon
);
/*
 * Process 4 transitions (in 2 NEON Q registers) in parallel.
 * Computes the next-node address for all four flows from the packed
 * transition words, loads the next transitions, and returns the input
 * shifted right by one byte (consumed byte per flow).
 */
static __rte_always_inline int32x4_t
transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[])
{
	int32x4x2_t tr_hi_lo;
	int32x4_t t, in, r;
	uint32x4_t index_msk, node_type, addr;
	uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs;

	/* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */
	tr_hi_lo = vld2q_s32((const int32_t *)transitions);

	/* Calculate the address (array index) for all 4 transitions. */

	index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask);

	/* Calc node type and node addr */
	node_type = vbicq_s32(tr_hi_lo.val[0], index_msk);
	addr = vandq_s32(tr_hi_lo.val[0], index_msk);

	/* t = 0 (x ^ x) */
	t = veorq_s32(node_type, node_type);

	/* mask for DFA type(0) nodes */
	dfa_msk = vceqq_u32(node_type, t);

	/* Gather the current input byte of each flow into one register. */
	mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input);
	in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask);

	/* DFA calculations. */
	r = vshrq_n_u32(in, 30); /* div by 64 */
	mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base);
	r = vaddq_u8(r, mask);
	t = vshrq_n_u32(in, 24);
	r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r);
	dfa_ofs = vsubq_s32(t, r);

	/* QUAD/SINGLE calculations. */
	t = vcgtq_s8(in, tr_hi_lo.val[1]);
	/* NOTE(review): the two reduction steps below were dropped by the
	 * extraction; restored so vpaddlq_u16() yields 32-bit lane offsets. */
	t = vabsq_s8(t);	/* 0xff -> 0x01 per matching byte */
	t = vpaddlq_u8(t);	/* pairwise widen u8 -> u16 */
	quad_ofs = vpaddlq_u16(t);	/* pairwise widen u16 -> u32 */

	/* blend DFA and QUAD/SINGLE. */
	t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs);

	/* calculate address for next transitions */
	addr = vaddq_u32(addr, t);

	/* Fill next transitions */
	transitions[0] = trans[vgetq_lane_u32(addr, 0)];
	transitions[1] = trans[vgetq_lane_u32(addr, 1)];
	transitions[2] = trans[vgetq_lane_u32(addr, 2)];
	transitions[3] = trans[vgetq_lane_u32(addr, 3)];

	/* Shift the consumed byte out of each flow's input word. */
	return vshrq_n_u32(next_input, CHAR_BIT);
}
/*
 * Execute trie traversal with 8 traversals in parallel.
 * Two NEON Q registers carry 4 bytes of input for 4 flows each; each flow
 * advances through four transition4() steps per gathered word.
 */
static inline int
search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
	      uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[8];
	struct completion cmplt[8];
	struct parms parms[8];
	int32x4_t input0, input1;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		     total_packets, categories, ctx->trans_table);

	for (n = 0; n < 8; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	/* Check for any matches. */
	acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
	acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);

	while (flows.started > 0) {
		/* Gather 4 bytes of input data for each stream. */
		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);

		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);

		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);

		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);

		/* Process the 4 bytes of input on each stream. */

		input0 = transition4(input0, flows.trans, &index_array[0]);
		input1 = transition4(input1, flows.trans, &index_array[4]);

		input0 = transition4(input0, flows.trans, &index_array[0]);
		input1 = transition4(input1, flows.trans, &index_array[4]);

		input0 = transition4(input0, flows.trans, &index_array[0]);
		input1 = transition4(input1, flows.trans, &index_array[4]);

		input0 = transition4(input0, flows.trans, &index_array[0]);
		input1 = transition4(input1, flows.trans, &index_array[4]);

		/* Check for any matches. */
		acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
		acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);
	}

	return 0;
}
/*
 * Execute trie traversal with 4 traversals in parallel.
 * Same scheme as search_neon_8() but with a single NEON Q register of
 * input (one 4-byte word per flow, four transition4() steps per word).
 */
static inline int
search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
	      uint32_t *results, int total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[4];
	struct completion cmplt[4];
	struct parms parms[4];
	int32x4_t input;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		     total_packets, categories, ctx->trans_table);

	for (n = 0; n < 4; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	/* Check for any matches. */
	acl_match_check_x4(0, ctx, parms, &flows, index_array);

	while (flows.started > 0) {
		/* Gather 4 bytes of input data for each stream. */
		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);

		/* Process the 4 bytes of input on each stream. */
		input = transition4(input, flows.trans, index_array);
		input = transition4(input, flows.trans, index_array);
		input = transition4(input, flows.trans, index_array);
		input = transition4(input, flows.trans, index_array);

		/* Check for any matches. */
		acl_match_check_x4(0, ctx, parms, &flows, index_array);
	}

	return 0;
}