#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

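/*
 * Building blocks for the unrolled XOR loops below.  All operands are
 * 16-byte chunks addressed relative to the target/source pointers
 * p1..p6:
 *
 *   OFFS(x)    - byte offset of the x'th 16-byte chunk
 *   PF_OFFS(x) - the same chunk one 256-byte loop iteration ahead
 *   PF0..PF5   - non-temporal prefetch (prefetchnta) from p1..p6
 *   LD/ST      - movaps load/store between p1 and %xmm<y>
 *   XO1..XO5   - xorps of a chunk of p2..p6 into %xmm<y>
 */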
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x)		" prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x, y)	" xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"

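/*
 * XOR one source block (p2) into the target (p1).  Each loop
 * iteration handles 256 bytes: four unrolled BLOCKs of 4 x 16-byte
 * movaps/xorps operations through %xmm0-%xmm3, with prefetchnta
 * pulling in the next iteration's data using a non-temporal hint to
 * limit cache pollution.  kernel_fpu_begin()/kernel_fpu_end() make
 * the SSE registers safe to use in kernel context.
 */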
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0)	\
	LD(i + 1, 1)	\
	PF1(i)		\
	PF1(i + 2)	\
	LD(i + 2, 2)	\
	LD(i + 3, 3)	\
	PF0(i + 4)	\
	PF0(i + 6)	\
	XO1(i, 0)	\
	XO1(i + 1, 1)	\
	XO1(i + 2, 2)	\
	XO1(i + 3, 3)	\
	ST(i, 0)	\
	ST(i + 1, 1)	\
	ST(i + 2, 2)	\
	ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

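/*
 * XOR two source blocks (p2, p3) into the target (p1); same 256-byte
 * loop structure as xor_sse_2 with an extra xorps pass per block.
 */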
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();
	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	PF1(i)		\
	PF1(i + 2)	\
	LD(i, 0)	\
	LD(i + 1, 1)	\
	LD(i + 2, 2)	\
	LD(i + 3, 3)	\
	PF2(i)		\
	PF2(i + 2)	\
	PF0(i + 4)	\
	PF0(i + 6)	\
	XO1(i, 0)	\
	XO1(i + 1, 1)	\
	XO1(i + 2, 2)	\
	XO1(i + 3, 3)	\
	XO2(i, 0)	\
	XO2(i + 1, 1)	\
	XO2(i + 2, 2)	\
	XO2(i + 3, 3)	\
	ST(i, 0)	\
	ST(i + 1, 1)	\
	ST(i + 2, 2)	\
	ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");
	kernel_fpu_end();
}

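/* XOR three source blocks (p2, p3, p4) into the target (p1). */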
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	PF1(i)		\
	PF1(i + 2)	\
	LD(i, 0)	\
	LD(i + 1, 1)	\
	LD(i + 2, 2)	\
	LD(i + 3, 3)	\
	PF2(i)		\
	PF2(i + 2)	\
	XO1(i, 0)	\
	XO1(i + 1, 1)	\
	XO1(i + 2, 2)	\
	XO1(i + 3, 3)	\
	PF3(i)		\
	PF3(i + 2)	\
	PF0(i + 4)	\
	PF0(i + 6)	\
	XO2(i, 0)	\
	XO2(i + 1, 1)	\
	XO2(i + 2, 2)	\
	XO2(i + 3, 3)	\
	XO3(i, 0)	\
	XO3(i + 1, 1)	\
	XO3(i + 2, 2)	\
	XO3(i + 3, 3)	\
	ST(i, 0)	\
	ST(i + 1, 1)	\
	ST(i + 2, 2)	\
	ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" addq %[inc], %[p4] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

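/* XOR four source blocks (p2, p3, p4, p5) into the target (p1). */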
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	PF1(i)		\
	PF1(i + 2)	\
	LD(i, 0)	\
	LD(i + 1, 1)	\
	LD(i + 2, 2)	\
	LD(i + 3, 3)	\
	PF2(i)		\
	PF2(i + 2)	\
	XO1(i, 0)	\
	XO1(i + 1, 1)	\
	XO1(i + 2, 2)	\
	XO1(i + 3, 3)	\
	PF3(i)		\
	PF3(i + 2)	\
	XO2(i, 0)	\
	XO2(i + 1, 1)	\
	XO2(i + 2, 2)	\
	XO2(i + 3, 3)	\
	PF4(i)		\
	PF4(i + 2)	\
	PF0(i + 4)	\
	PF0(i + 6)	\
	XO3(i, 0)	\
	XO3(i + 1, 1)	\
	XO3(i + 2, 2)	\
	XO3(i + 3, 3)	\
	XO4(i, 0)	\
	XO4(i + 1, 1)	\
	XO4(i + 2, 2)	\
	XO4(i + 3, 3)	\
	ST(i, 0)	\
	ST(i + 1, 1)	\
	ST(i + 2, 2)	\
	ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" addq %[inc], %[p4] ;\n"
	" addq %[inc], %[p5] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

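/*
 * Describe the routines above to the generic XOR framework; do_N
 * combines N blocks (one target plus N - 1 sources).
 */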
static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};


/* Also try the AVX routines */
#include "xor_avx.h"

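/*
 * XOR_TRY_TEMPLATES is invoked by the generic XOR calibration code to
 * benchmark each candidate; xor_speed() times one template, and
 * AVX_XOR_SPEED (from xor_avx.h) does the same for the AVX routines
 * when the CPU supports them.
 */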
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	AVX_XOR_SPEED;				\
	xor_speed(&xor_block_sse);		\
} while (0)

/*
 * We force the use of the SSE xor block because it can write around
 * the L2 cache.  We may also be able to load into the L1 cache only,
 * depending on how the CPU deals with a load to a line that is being
 * prefetched.
 */
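/*
 * AVX_SELECT (from xor_avx.h) picks the AVX template when the CPU
 * supports AVX and otherwise returns its argument, so the benchmark
 * winner (FASTEST) is intentionally ignored here.
 */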
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */