/* arch/x86/include/asm/xor_64.h */
#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

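/*
 * Helper macros used to build the inline-asm loops below.  All of them
 * expand to assembler string fragments:
 *
 *   OFFS(x)     - byte offset of the x'th 16-byte chunk in the current
 *                 256-byte block
 *   PF_OFFS(x)  - the same offset one 256-byte block ahead (the prefetch
 *                 distance)
 *   PF0..PF5    - prefetchnta from source buffer p1..p6
 *   LD(x, y)    - load chunk x of p1 into %xmm<y>
 *   ST(x, y)    - store %xmm<y> back to chunk x of p1
 *   XO1..XO5    - xorps chunk x of p2..p6 into %xmm<y>
 */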
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])	;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])	;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])	;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])	;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])	;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])	;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define XO5(x, y)	"	xorps "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"

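/*
 * Each xor_sse_N() routine XORs N source buffers together, 256 bytes
 * (sixteen 16-byte chunks) per loop iteration, using %xmm0-%xmm3 as
 * scratch registers.  Data is prefetched one 256-byte block ahead with
 * prefetchnta so the streamed buffers pollute the caches as little as
 * possible, and the result is written back into p1.  kernel_fpu_begin()/
 * kernel_fpu_end() bracket the SSE usage so the current FPU/SSE state is
 * preserved.
 */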
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

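/*
 * xor_sse_3/4/5 follow the same pattern as xor_sse_2, folding in one more
 * source buffer each: every additional source adds another pair of
 * prefetches (PF2..PF5) and another round of xorps (XO2..XO5) before the
 * final stores.
 */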
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();
	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");
	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	addq %[inc], %[p4]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	addq %[inc], %[p4]	;\n"
	"	addq %[inc], %[p5]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};


/* Also try the AVX routines */
#include <asm/xor_avx.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	AVX_XOR_SPEED;				\
	xor_speed(&xor_block_sse);		\
} while (0)
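/*
 * The generic XOR calibration code (crypto/xor.c) expands XOR_TRY_TEMPLATES
 * while benchmarking the available implementations, so xor_speed() times the
 * SSE template above as well as the AVX one registered by AVX_XOR_SPEED
 * (which is expected to be a no-op when AVX is not usable).
 */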

/*
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only depending on how the cpu
 * deals with a load to a line that is being prefetched.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(&xor_block_sse)
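/*
 * AVX_SELECT() (see asm/xor_avx.h) is expected to return the AVX template
 * when the CPU supports AVX and to fall back to its argument otherwise, so
 * xor_block_sse remains the fallback here regardless of which template the
 * benchmark found fastest.
 */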

#endif /* _ASM_X86_XOR_64_H */