[ceph.git] / ceph / src / seastar / dpdk / lib / librte_acl / acl_run_avx2.h

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include "acl_run_sse.h"

/*
 * RTE_ACL_NODE_MATCH replicated into all eight 32-bit elements.
 * Used as the match_mask argument of acl_match_check_avx2x8().
 */
static const rte_ymm_t ymm_match_mask = {
	.u32 = {
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
	},
};

/*
 * RTE_ACL_NODE_INDEX replicated into all eight 32-bit elements.
 * Handed to ACL_TR_CALC_ADDR() by transition8() as the index mask.
 */
static const rte_ymm_t ymm_index_mask = {
	.u32 = {
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
	},
};

/*
 * Shuffle control handed to ACL_TR_CALC_ADDR() by transition8().
 * Every 32-bit element holds one byte index (0x00, 0x04, 0x08, 0x0c)
 * repeated in all four of its bytes; elements 4-7 repeat elements 0-3.
 * NOTE(review): presumably used to broadcast the lowest input byte of each
 * flow across its 32-bit element - confirm against ACL_TR_CALC_ADDR().
 */
static const rte_ymm_t ymm_shuffle_input = {
	.u32 = {
		/* elements 0-3 */
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		/* elements 4-7 */
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	},
};

/*
 * The value 1 in each of the sixteen 16-bit elements.
 * Handed to ACL_TR_CALC_ADDR() by transition8().
 */
static const rte_ymm_t ymm_ones_16 = {
	.u16 = {
		1, 1, 1, 1,
		1, 1, 1, 1,
		1, 1, 1, 1,
		1, 1, 1, 1,
	},
};

/*
 * Range base handed to ACL_TR_CALC_ADDR() by transition8().
 * Elements 4-7 repeat elements 0-3; only the lowest byte of each element
 * (0x00, 0x04, 0x08, 0x0c) differs, the upper three bytes are all 0xff.
 * NOTE(review): how the macro consumes these values is not visible in this
 * file - see acl_run_sse.h.
 */
static const rte_ymm_t ymm_range_base = {
	.u32 = {
		/* elements 0-3 */
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		/* elements 4-7 */
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	},
};

/*
 * Advance 8 flows by one transition each.
 *
 * next_input - up to 4 pending input bytes per flow, one flow per
 *              32-bit element.
 * trans      - table of 64-bit transitions.
 * tr_lo      - in:  low 32 bits of the 8 current transitions;
 *              out: low 32 bits of the 8 transitions just loaded.
 * tr_hi      - same as tr_lo, for the high 32 bits.
 *
 * Returns next_input with every 32-bit element shifted right by CHAR_BIT.
 */
static __rte_always_inline ymm_t
transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
{
	/*
	 * View the table as 32-bit words: with a gather scale of
	 * sizeof(*trans) the first pointer selects the low half of an entry
	 * and the second one (one word further) selects its high half.
	 */
	const int32_t *const lo_words = (const int32_t *)(uintptr_t)trans;
	const int32_t *const hi_words = lo_words + 1;
	ymm_t addr;

	/* Work out the table index of the next transition for all 8 flows. */
	ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
		ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
		*tr_lo, *tr_hi);

	/* Fetch both halves of the 8 new transitions. */
	*tr_lo = _mm256_i32gather_epi32(lo_words, addr, sizeof(*trans));
	*tr_hi = _mm256_i32gather_epi32(hi_words, addr, sizeof(*trans));

	/* Drop the lowest input byte of every flow. */
	return _mm256_srli_epi32(next_input, CHAR_BIT);
}

/*
 * Run acl_match_check() for each of 8 flows and merge the outcome back
 * into the transition registers.
 *
 * slot    - index of the first of the 8 flows; flow k uses slot + k.
 * matches - per-element mask; only elements selected by it receive the
 *           new transition, all others are left untouched.
 * tr_lo   - low 32 bits of the 8 transitions (updated in place).
 * tr_hi   - high 32 bits of the 8 transitions (updated in place).
 */
static inline void
acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
	struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
	ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)
{
	uint64_t next[MAX_SEARCHES_SSE8];
	xmm_t half0, half1;
	ymm_t pack0, pack1;
	ymm_t new_lo, new_hi;
	uint32_t k;

	/* Split tr_lo into its two 128-bit halves (flows 0-3 and 4-7). */
	half0 = _mm256_castsi256_si128(*tr_lo);
	half1 = _mm256_extracti128_si256(*tr_lo, 1);

	for (k = 0; k != RTE_DIM(next) / 2; k++) {

		/*
		 * The low 32 bits of a transition are all that is needed
		 * to process the match; take them from the bottom element
		 * of each half.
		 */
		const uint64_t cur0 = (uint32_t)_mm_cvtsi128_si32(half0);
		const uint64_t cur1 = (uint32_t)_mm_cvtsi128_si32(half1);

		/* Move the next flow's element down to the bottom. */
		half0 = _mm_srli_si128(half0, sizeof(uint32_t));
		half1 = _mm_srli_si128(half1, sizeof(uint32_t));

		next[k] = acl_match_check(cur0, slot + k,
			ctx, parms, flows, resolve_priority_sse);
		next[k + 4] = acl_match_check(cur1, slot + k + 4,
			ctx, parms, flows, resolve_priority_sse);
	}

	/* Pack the 8 resulting 64-bit transitions into 2 YMM registers. */
	pack0 = _mm256_set_epi64x(next[5], next[4], next[1], next[0]);
	pack1 = _mm256_set_epi64x(next[7], next[6], next[3], next[2]);

	/* Separate them into a low-32-bit and a high-32-bit vector. */
	ACL_TR_HILO(mm256, __m256, pack0, pack1, new_lo, new_hi);

	/* Keep transitions with NOMATCH intact. */
	*tr_lo = _mm256_blendv_epi8(*tr_lo, new_lo, matches);
	*tr_hi = _mm256_blendv_epi8(*tr_hi, new_hi, matches);
}

/*
 * Keep processing matches for 8 flows until none of the transitions in
 * tr_lo has all match_mask bits set any more.
 *
 * slot       - index of the first of the 8 flows.
 * tr_lo      - low 32 bits of the 8 transitions (updated in place).
 * tr_hi      - high 32 bits of the 8 transitions (updated in place).
 * match_mask - bits that mark a transition as a match, replicated into
 *              every 32-bit element.
 */
static inline void
acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
	struct acl_flow_data *flows, uint32_t slot,
	ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)
{
	for (;;) {
		/* An element is a match when all match_mask bits are set. */
		const ymm_t node_bits = _mm256_and_si256(match_mask, *tr_lo);
		const ymm_t matches = _mm256_cmpeq_epi32(node_bits, match_mask);

		if (_mm256_movemask_epi8(matches) == 0)
			break;

		/* This rewrites *tr_lo / *tr_hi, so test them again. */
		acl_process_matches_avx2x8(ctx, parms, flows, slot,
			matches, tr_lo, tr_hi);
	}
}

/*
 * Execute trie traversal for up to 16 flows in parallel.
 */
static inline int
search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	uint32_t n;
	struct acl_flow_data flows;
	uint64_t index_array[MAX_SEARCHES_AVX16];
	struct completion cmplt[MAX_SEARCHES_AVX16];
	struct parms parms[MAX_SEARCHES_AVX16];
	ymm_t input[2], tr_lo[2], tr_hi[2];
	ymm_t t0, t1;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		total_packets, categories, ctx->trans_table);

	for (n = 0; n < RTE_DIM(cmplt); n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	t0 = _mm256_set_epi64x(index_array[5], index_array[4],
		index_array[1], index_array[0]);
	t1 = _mm256_set_epi64x(index_array[7], index_array[6],
		index_array[3], index_array[2]);

	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);

	t0 = _mm256_set_epi64x(index_array[13], index_array[12],
		index_array[9], index_array[8]);
	t1 = _mm256_set_epi64x(index_array[15], index_array[14],
		index_array[11], index_array[10]);

	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);

	 /* Check for any matches. */
	acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
		ymm_match_mask.y);
	acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],
		ymm_match_mask.y);

	while (flows.started > 0) {

		uint32_t in[MAX_SEARCHES_SSE8];

		/* Gather 4 bytes of input data for first 8 flows. */
		in[0] = GET_NEXT_4BYTES(parms, 0);
		in[4] = GET_NEXT_4BYTES(parms, 4);
		in[1] = GET_NEXT_4BYTES(parms, 1);
		in[5] = GET_NEXT_4BYTES(parms, 5);
		in[2] = GET_NEXT_4BYTES(parms, 2);
		in[6] = GET_NEXT_4BYTES(parms, 6);
		in[3] = GET_NEXT_4BYTES(parms, 3);
		in[7] = GET_NEXT_4BYTES(parms, 7);
		input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
			in[3], in[2], in[1], in[0]);

		/* Gather 4 bytes of input data for last 8 flows. */
		in[0] = GET_NEXT_4BYTES(parms, 8);
		in[4] = GET_NEXT_4BYTES(parms, 12);
		in[1] = GET_NEXT_4BYTES(parms, 9);
		in[5] = GET_NEXT_4BYTES(parms, 13);
		in[2] = GET_NEXT_4BYTES(parms, 10);
		in[6] = GET_NEXT_4BYTES(parms, 14);
		in[3] = GET_NEXT_4BYTES(parms, 11);
		in[7] = GET_NEXT_4BYTES(parms, 15);
		input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
			in[3], in[2], in[1], in[0]);

		input[0] = transition8(input[0], flows.trans,
			&tr_lo[0], &tr_hi[0]);
		input[1] = transition8(input[1], flows.trans,
			&tr_lo[1], &tr_hi[1]);

		input[0] = transition8(input[0], flows.trans,
			&tr_lo[0], &tr_hi[0]);
		input[1] = transition8(input[1], flows.trans,
			&tr_lo[1], &tr_hi[1]);

		input[0] = transition8(input[0], flows.trans,
			&tr_lo[0], &tr_hi[0]);
		input[1] = transition8(input[1], flows.trans,
			&tr_lo[1], &tr_hi[1]);

		input[0] = transition8(input[0], flows.trans,
			&tr_lo[0], &tr_hi[0]);
		input[1] = transition8(input[1], flows.trans,
			&tr_lo[1], &tr_hi[1]);

		 /* Check for any matches. */
		acl_match_check_avx2x8(ctx, parms, &flows, 0,
			&tr_lo[0], &tr_hi[0], ymm_match_mask.y);
		acl_match_check_avx2x8(ctx, parms, &flows, 8,
			&tr_lo[1], &tr_hi[1], ymm_match_mask.y);
	}

	return 0;
}
Commit	Line	Data
9f95a23c TL	1	/* SPDX-License-Identifier: BSD-3-Clause
9f95a23c TL	2	* Copyright(c) 2010-2014 Intel Corporation
7c673cae FG	3	*/
	4
	5	#include "acl_run_sse.h"
	6
	7	static const rte_ymm_t ymm_match_mask = {
	8	.u32 = {
	9	RTE_ACL_NODE_MATCH,
	10	RTE_ACL_NODE_MATCH,
	11	RTE_ACL_NODE_MATCH,
	12	RTE_ACL_NODE_MATCH,
	13	RTE_ACL_NODE_MATCH,
	14	RTE_ACL_NODE_MATCH,
	15	RTE_ACL_NODE_MATCH,
	16	RTE_ACL_NODE_MATCH,
	17	},
	18	};
	19
	20	static const rte_ymm_t ymm_index_mask = {
	21	.u32 = {
	22	RTE_ACL_NODE_INDEX,
	23	RTE_ACL_NODE_INDEX,
	24	RTE_ACL_NODE_INDEX,
	25	RTE_ACL_NODE_INDEX,
	26	RTE_ACL_NODE_INDEX,
	27	RTE_ACL_NODE_INDEX,
	28	RTE_ACL_NODE_INDEX,
	29	RTE_ACL_NODE_INDEX,
	30	},
	31	};
	32
	33	static const rte_ymm_t ymm_shuffle_input = {
	34	.u32 = {
	35	0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	36	0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	37	},
	38	};
	39
	40	static const rte_ymm_t ymm_ones_16 = {
	41	.u16 = {
	42	1, 1, 1, 1, 1, 1, 1, 1,
	43	1, 1, 1, 1, 1, 1, 1, 1,
	44	},
	45	};
	46
	47	static const rte_ymm_t ymm_range_base = {
	48	.u32 = {
	49	0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	50	0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	51	},
	52	};
	53
	54	/*
	55	* Process 8 transitions in parallel.
	56	* tr_lo contains low 32 bits for 8 transition.
	57	* tr_hi contains high 32 bits for 8 transition.
	58	* next_input contains up to 4 input bytes for 8 flows.
	59	*/
9f95a23c	60	static __rte_always_inline ymm_t
7c673cae FG	61	transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
	62	{
	63	const int32_t *tr;
	64	ymm_t addr;
	65
	66	tr = (const int32_t *)(uintptr_t)trans;
	67
	68	/* Calculate the address (array index) for all 8 transitions. */
	69	ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
	70	ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
	71	*tr_lo, *tr_hi);
	72
	73	/* load lower 32 bits of 8 transactions at once. */
	74	*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
	75
	76	next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
	77
	78	/* load high 32 bits of 8 transactions at once. */
	79	*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
	80
	81	return next_input;
	82	}
	83
	84	/*
	85	* Process matches for 8 flows.
	86	* tr_lo contains low 32 bits for 8 transition.
	87	* tr_hi contains high 32 bits for 8 transition.
	88	*/
	89	static inline void
	90	acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
	91	struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
	92	ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)
	93	{
	94	ymm_t t0, t1;
	95	ymm_t lo, hi;
	96	xmm_t l0, l1;
	97	uint32_t i;
	98	uint64_t tr[MAX_SEARCHES_SSE8];
	99
	100	l1 = _mm256_extracti128_si256(*tr_lo, 1);
	101	l0 = _mm256_castsi256_si128(*tr_lo);
	102
	103	for (i = 0; i != RTE_DIM(tr) / 2; i++) {
	104
	105	/*
	106	* Extract low 32bits of each transition.
	107	* That's enough to process the match.
	108	*/
	109	tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
	110	tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
	111
	112	l0 = _mm_srli_si128(l0, sizeof(uint32_t));
	113	l1 = _mm_srli_si128(l1, sizeof(uint32_t));
	114
	115	tr[i] = acl_match_check(tr[i], slot + i,
	116	ctx, parms, flows, resolve_priority_sse);
	117	tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4,
	118	ctx, parms, flows, resolve_priority_sse);
	119	}
	120
	121	/* Collect new transitions into 2 YMM registers. */
	122	t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
	123	t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
	124
125	/* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
126	ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
127
128	/* Keep transitions wth NOMATCH intact. */
129	*tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
130	*tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
131	}
132
133	static inline void
	134	acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
	135	struct acl_flow_data *flows, uint32_t slot,
	136	ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)
137	{
138	uint32_t msk;
139	ymm_t matches, temp;
140
141	/* test for match node */
142	temp = _mm256_and_si256(match_mask, *tr_lo);
143	matches = _mm256_cmpeq_epi32(temp, match_mask);
144	msk = _mm256_movemask_epi8(matches);
145
146	while (msk != 0) {
147
148	acl_process_matches_avx2x8(ctx, parms, flows, slot,
149	matches, tr_lo, tr_hi);
150	temp = _mm256_and_si256(match_mask, *tr_lo);
151	matches = _mm256_cmpeq_epi32(temp, match_mask);
152	msk = _mm256_movemask_epi8(matches);
153	}
154	}
155
156	/*
157	* Execute trie traversal for up to 16 flows in parallel.
158	*/
159	static inline int
	160	search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
161	uint32_t *results, uint32_t total_packets, uint32_t categories)
162	{
163	uint32_t n;
164	struct acl_flow_data flows;
165	uint64_t index_array[MAX_SEARCHES_AVX16];
166	struct completion cmplt[MAX_SEARCHES_AVX16];
167	struct parms parms[MAX_SEARCHES_AVX16];
168	ymm_t input[2], tr_lo[2], tr_hi[2];
169	ymm_t t0, t1;
170
171	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
172	total_packets, categories, ctx->trans_table);
173
174	for (n = 0; n < RTE_DIM(cmplt); n++) {
175	cmplt[n].count = 0;
176	index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
177	}
178
179	t0 = _mm256_set_epi64x(index_array[5], index_array[4],
180	index_array[1], index_array[0]);
181	t1 = _mm256_set_epi64x(index_array[7], index_array[6],
182	index_array[3], index_array[2]);
183
184	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
185
186	t0 = _mm256_set_epi64x(index_array[13], index_array[12],
187	index_array[9], index_array[8]);
188	t1 = _mm256_set_epi64x(index_array[15], index_array[14],
189	index_array[11], index_array[10]);
190
191	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
192
193	/* Check for any matches. */
194	acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
195	ymm_match_mask.y);
196	acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],
197	ymm_match_mask.y);
198
199	while (flows.started > 0) {
200
201	uint32_t in[MAX_SEARCHES_SSE8];
202
203	/* Gather 4 bytes of input data for first 8 flows. */
204	in[0] = GET_NEXT_4BYTES(parms, 0);
205	in[4] = GET_NEXT_4BYTES(parms, 4);
206	in[1] = GET_NEXT_4BYTES(parms, 1);
207	in[5] = GET_NEXT_4BYTES(parms, 5);
208	in[2] = GET_NEXT_4BYTES(parms, 2);
209	in[6] = GET_NEXT_4BYTES(parms, 6);
210	in[3] = GET_NEXT_4BYTES(parms, 3);
211	in[7] = GET_NEXT_4BYTES(parms, 7);
212	input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
213	in[3], in[2], in[1], in[0]);
214
215	/* Gather 4 bytes of input data for last 8 flows. */
216	in[0] = GET_NEXT_4BYTES(parms, 8);
217	in[4] = GET_NEXT_4BYTES(parms, 12);
218	in[1] = GET_NEXT_4BYTES(parms, 9);
219	in[5] = GET_NEXT_4BYTES(parms, 13);
220	in[2] = GET_NEXT_4BYTES(parms, 10);
221	in[6] = GET_NEXT_4BYTES(parms, 14);
222	in[3] = GET_NEXT_4BYTES(parms, 11);
223	in[7] = GET_NEXT_4BYTES(parms, 15);
224	input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
225	in[3], in[2], in[1], in[0]);
226
227	input[0] = transition8(input[0], flows.trans,
228	&tr_lo[0], &tr_hi[0]);
229	input[1] = transition8(input[1], flows.trans,
230	&tr_lo[1], &tr_hi[1]);
231
232	input[0] = transition8(input[0], flows.trans,
233	&tr_lo[0], &tr_hi[0]);
234	input[1] = transition8(input[1], flows.trans,
235	&tr_lo[1], &tr_hi[1]);
236
237	input[0] = transition8(input[0], flows.trans,
238	&tr_lo[0], &tr_hi[0]);
239	input[1] = transition8(input[1], flows.trans,
240	&tr_lo[1], &tr_hi[1]);
241
242	input[0] = transition8(input[0], flows.trans,
243	&tr_lo[0], &tr_hi[0]);
244	input[1] = transition8(input[1], flows.trans,
245	&tr_lo[1], &tr_hi[1]);
246
247	/* Check for any matches. */
248	acl_match_check_avx2x8(ctx, parms, &flows, 0,
249	&tr_lo[0], &tr_hi[0], ymm_match_mask.y);
250	acl_match_check_avx2x8(ctx, parms, &flows, 8,
251	&tr_lo[1], &tr_hi[1], ymm_match_mask.y);
252	}
253
254	return 0;
255	}