/* arch/x86/include/asm/xor_64.h */

#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

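/*
 * These macros expand to inline-asm string fragments: OFFS(x) is a 16-byte
 * (one %xmm register) offset within the current 256-byte chunk, and
 * PF_OFFS(x) is the same offset one chunk (256 bytes) further ahead, for
 * prefetching.  LD/ST move 16 bytes between p1 and an %xmm register, PFn
 * prefetches from buffer n+1 (PF0 from p1, PF1 from p2, ...), and XOn xors
 * 16 bytes of buffer n+1 into an %xmm register.
 */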
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"


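/*
 * xor_sse_2(): p1[i] ^= p2[i] over 'bytes' bytes, processed 256 bytes (four
 * unrolled BLOCKs of four 16-byte xmm words) per loop iteration; callers are
 * expected to pass a multiple of 256.  kernel_fpu_begin()/kernel_fpu_end()
 * bracket the use of the SSE registers.
 */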
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        LD(i, 0)        \
        LD(i + 1, 1)    \
        PF1(i)          \
        PF1(i + 2)      \
        LD(i + 2, 2)    \
        LD(i + 3, 3)    \
        PF0(i + 4)      \
        PF0(i + 6)      \
        XO1(i, 0)       \
        XO1(i + 1, 1)   \
        XO1(i + 2, 2)   \
        XO1(i + 3, 3)   \
        ST(i, 0)        \
        ST(i + 1, 1)    \
        ST(i + 2, 2)    \
        ST(i + 3, 3)    \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        : [inc] "r" (256UL)
        : "memory");

        kernel_fpu_end();
}

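/*
 * xor_sse_3/4/5() follow the same pattern with three, four and five buffers:
 * p1[i] ^= p2[i] ^ ... ^ pN[i], with prefetches issued one 256-byte
 * iteration ahead for every buffer involved.
 */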
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 8;

        kernel_fpu_begin();
        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i)          \
        PF1(i + 2)      \
        LD(i, 0)        \
        LD(i + 1, 1)    \
        LD(i + 2, 2)    \
        LD(i + 3, 3)    \
        PF2(i)          \
        PF2(i + 2)      \
        PF0(i + 4)      \
        PF0(i + 6)      \
        XO1(i, 0)       \
        XO1(i + 1, 1)   \
        XO1(i + 2, 2)   \
        XO1(i + 3, 3)   \
        XO2(i, 0)       \
        XO2(i + 1, 1)   \
        XO2(i + 2, 2)   \
        XO2(i + 3, 3)   \
        ST(i, 0)        \
        ST(i + 1, 1)    \
        ST(i + 2, 2)    \
        ST(i + 3, 3)    \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] "r" (256UL)
        : "memory");
        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned int lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i)          \
        PF1(i + 2)      \
        LD(i, 0)        \
        LD(i + 1, 1)    \
        LD(i + 2, 2)    \
        LD(i + 3, 3)    \
        PF2(i)          \
        PF2(i + 2)      \
        XO1(i, 0)       \
        XO1(i + 1, 1)   \
        XO1(i + 2, 2)   \
        XO1(i + 3, 3)   \
        PF3(i)          \
        PF3(i + 2)      \
        PF0(i + 4)      \
        PF0(i + 6)      \
        XO2(i, 0)       \
        XO2(i + 1, 1)   \
        XO2(i + 2, 2)   \
        XO2(i + 3, 3)   \
        XO3(i, 0)       \
        XO3(i + 1, 1)   \
        XO3(i + 2, 2)   \
        XO3(i + 3, 3)   \
        ST(i, 0)        \
        ST(i + 1, 1)    \
        ST(i + 2, 2)    \
        ST(i + 3, 3)    \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " addq %[inc], %[p4] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] "r" (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i)          \
        PF1(i + 2)      \
        LD(i, 0)        \
        LD(i + 1, 1)    \
        LD(i + 2, 2)    \
        LD(i + 3, 3)    \
        PF2(i)          \
        PF2(i + 2)      \
        XO1(i, 0)       \
        XO1(i + 1, 1)   \
        XO1(i + 2, 2)   \
        XO1(i + 3, 3)   \
        PF3(i)          \
        PF3(i + 2)      \
        XO2(i, 0)       \
        XO2(i + 1, 1)   \
        XO2(i + 2, 2)   \
        XO2(i + 3, 3)   \
        PF4(i)          \
        PF4(i + 2)      \
        PF0(i + 4)      \
        PF0(i + 6)      \
        XO3(i, 0)       \
        XO3(i + 1, 1)   \
        XO3(i + 2, 2)   \
        XO3(i + 3, 3)   \
        XO4(i, 0)       \
        XO4(i + 1, 1)   \
        XO4(i + 2, 2)   \
        XO4(i + 3, 3)   \
        ST(i, 0)        \
        ST(i + 1, 1)    \
        ST(i + 2, 2)    \
        ST(i + 3, 3)    \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " addq %[inc], %[p4] ;\n"
        " addq %[inc], %[p5] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        : [inc] "r" (256UL)
        : "memory");

        kernel_fpu_end();
}

static struct xor_block_template xor_block_sse = {
        .name = "generic_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};


/* Also try the AVX routines */
#include <asm/xor_avx.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
do {                                            \
        AVX_XOR_SPEED;                          \
        xor_speed(&xor_block_sse);              \
} while (0)
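
/*
 * Illustrative sketch only (not part of this header): the generic xor code
 * in crypto/xor.c benchmarks the templates named by XOR_TRY_TEMPLATES via
 * xor_speed() at boot and then dispatches through the selected one, roughly
 *
 *	void *srcs[1] = { src };
 *	xor_blocks(1, PAGE_SIZE, dest, srcs);
 *
 * which, when this template is selected, ends up calling
 * xor_sse_2(PAGE_SIZE, dest, src).
 */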

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */