/*
 * SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) IBM Corporation 2014.
 */

#ifndef _RTE_MEMCPY_PPC_64_H_
#define _RTE_MEMCPY_PPC_64_H_

#include <stdint.h>
#include <string.h>

#include "rte_altivec.h"
#include "rte_common.h"

#ifdef __cplusplus
extern "C" {
#endif

#include "generic/rte_memcpy.h"
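
/*
 * generic/rte_memcpy.h declares the architecture-independent rte_mov*()
 * and rte_memcpy() interfaces; this header supplies the PowerPC (VSX)
 * implementations of them.
 */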

#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
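
/*
 * Note: GCC 9.0 through 9.3 can emit spurious -Warray-bounds warnings for
 * the VSX load/store intrinsics used below, so the warning is silenced for
 * exactly that compiler version range and popped again near the end of
 * this header.
 */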

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
}
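
/*
 * vec_vsx_ld()/vec_vsx_st() perform unaligned 16-byte VSX loads and stores
 * at the given byte offset, so these helpers impose no alignment
 * requirements on src or dst; rte_mov32() through rte_mov256() below are
 * simply unrolled sequences of the same 16-byte transfer.
 */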

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
	vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
	vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
	vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
	vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}
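
/*
 * If the copy size is a compile-time constant, expand a plain memcpy(),
 * which the compiler can often inline and optimize; otherwise dispatch to
 * the VSX-based rte_memcpy_func() below.
 */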
#define rte_memcpy(dst, src, n) \
	__extension__ ({ \
	(__builtin_constant_p(n)) ? \
	memcpy((dst), (src), (n)) : \
	rte_memcpy_func((dst), (src), (n)); })
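
/*
 * Example usage (hypothetical buffers and length, for illustration only):
 *
 *	uint8_t in[256], out[256];
 *	rte_memcpy(out, in, sizeof(in));	constant n: expands to memcpy()
 *	rte_memcpy(out, in, len);		runtime n: calls rte_memcpy_func()
 */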

static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using vector registers, so do it manually. */
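	/*
	 * Each set bit of n selects one scalar copy, e.g. n == 11 (0b1011)
	 * copies 1 + 2 + 8 bytes as dst/src advance past each chunk.
	 */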
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;
		return ret;
	}

	/* Special fast cases for <= 128 bytes */
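	/*
	 * Each case below copies a fixed-size head starting at dst and a
	 * same-size block ending exactly at dst + n; the two stores may
	 * overlap, which is harmless because the overlapping bytes are
	 * simply written twice with identical data.
	 */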
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies > 128 bytes. This combination of 256-, 64- and
	 * 16-byte copies was found to be faster than doing 128- and 32-byte
	 * copies as well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64-byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 */
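	/*
	 * Worked example: with n == 200 remaining, n >> 6 == 3, so the switch
	 * value is 0 and case 0x00 falls through all three 64-byte copies,
	 * leaving n == 8 for the tail handling below.
	 */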
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	default:
		;
	}

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16-byte (2^4) chunks, using the same switch structure as above.
	 */
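	/*
	 * Continuing the example above: with n == 8, 3 - (8 >> 4) == 3, so no
	 * case label matches and the switch body is skipped entirely.
	 */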
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	default:
		;
	}

	/* Copy any remaining bytes, without going beyond end of buffers */
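	/*
	 * This trailing 16-byte store ends exactly at the original dst + n;
	 * when 0 < n < 16 it rewrites a few already-copied bytes rather than
	 * reading or writing past either buffer.
	 */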
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_PPC_64_H_ */