;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%ifndef __MEMCPY_ASM__
%define __MEMCPY_ASM__

%include "reg_sizes.asm"

; This section defines a series of macros to copy small to medium amounts
; of data from memory to memory, where the size is variable but limited.
;
; The macros are all called as:
;      memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
; with the parameters defined as:
;      DST   : register: pointer to dst (not modified)
;      SRC   : register: pointer to src (not modified)
;      SIZE  : register: length in bytes (not modified)
;      TMP0  : 64-bit temp GPR (clobbered)
;      TMP1  : 64-bit temp GPR (clobbered)
;      XTMP0 : temp XMM (clobbered)
;      XTMP1 : temp XMM (clobbered)
;      XTMP2 : temp XMM (clobbered)
;      XTMP3 : temp XMM (clobbered)
;
; The name indicates the options. The name is of the form:
;      memcpy_<VEC>_<SZ><ZERO><RET>
; where:
;      <VEC>  is either "sse" or "avx" or "avx2"
;      <SZ>   is either "64" or "128" and defines the largest value of SIZE
;      <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
;      <RET>  is blank or "_ret". If blank, the code falls through. If "_ret",
;             it does a "ret" at the end
;
; For the avx2 versions, the temp XMM registers need to be YMM registers.
; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
;      memcpy_avx2_64  DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
;      memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;
; For example:
;      memcpy_sse_64      : SSE,  0 <= size < 64, falls through
;      memcpy_avx_64_1    : AVX1, 1 <= size < 64, falls through
;      memcpy_sse_128_ret : SSE,  0 <= size < 128, ends with ret
;      memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
;

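; Illustrative usage sketch (not part of the original interface description):
; the registers and SIMD temps below are arbitrary assumptions chosen only to
; make the calling convention concrete. With rdi = dst, rsi = src and
; rdx = size (0 <= size < 128), a caller could write:
;
;      memcpy_sse_128 rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
;      ; execution falls through to here once the copy is complete
;
; or, for the AVX2 variant limited to sizes below 64 bytes (two YMM temps):
;
;      memcpy_avx2_64 rdi, rsi, rdx, rax, r10, ymm0, ymm1
;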
%macro memcpy_sse_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
%endm

%macro memcpy_sse_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
%endm

%macro memcpy_sse_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
%endm

%macro memcpy_sse_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
%endm

%macro memcpy_sse_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
%endm

%macro memcpy_sse_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
%endm

%macro memcpy_sse_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
%endm

%macro memcpy_sse_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
%endm


%macro memcpy_sse_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
%endm

%macro memcpy_sse_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
%endm

%macro memcpy_avx_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
%endm

%macro memcpy_avx_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
%endm

%macro memcpy_avx_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
%endm

%macro memcpy_avx_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
%endm

%macro memcpy_avx_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
%endm

%macro memcpy_avx_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
%endm

%macro memcpy_avx_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
%endm


%macro memcpy_avx_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
%endm

%macro memcpy_avx_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx2_64 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
%endm

%macro memcpy_avx2_64_1 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
%endm

%macro memcpy_avx2_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
%endm

%macro memcpy_avx2_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
%endm

%macro memcpy_avx2_64_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
%endm

%macro memcpy_avx2_64_1_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
%endm

%macro memcpy_avx2_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
%endm

%macro memcpy_avx2_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
%endm


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


%macro __memcpy_int 13
%define %%DST     %1  ; register: pointer to dst (not modified)
%define %%SRC     %2  ; register: pointer to src (not modified)
%define %%SIZE    %3  ; register: length in bytes (not modified)
%define %%TMP0    %4  ; 64-bit temp GPR (clobbered)
%define %%TMP1    %5  ; 64-bit temp GPR (clobbered)
%define %%XTMP0   %6  ; temp XMM (clobbered)
%define %%XTMP1   %7  ; temp XMM (clobbered)
%define %%XTMP2   %8  ; temp XMM (clobbered)
%define %%XTMP3   %9  ; temp XMM (clobbered)
%define %%NOT0    %10 ; if not 0, then assume size cannot be zero
%define %%MAXSIZE %11 ; 128, 64, etc
%define %%USERET  %12 ; if not 0, use "ret" at end
%define %%USEAVX  %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2

%if (%%USERET != 0)
        %define %%DONE ret
%else
        %define %%DONE jmp %%end
%endif

%if (%%USEAVX != 0)
        %define %%MOVDQU vmovdqu
%else
        %define %%MOVDQU movdqu
%endif

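; Implementation note: each branch below tests one power-of-two bit of %%SIZE.
; When the %%MAXSIZE/2 bit is set, that branch completes the whole copy by
; moving a block from the start of the buffers and an equal-sized block ending
; at offset %%SIZE; for sizes below %%MAXSIZE the two blocks overlap, but every
; destination byte still receives its correct source byte, so the branch can
; finish with %%DONE. Smaller sizes fall through to the next narrower case.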
%if (%%MAXSIZE >= 128)
        test    %%SIZE, 64
        jz      %%lt64
        %if (%%USEAVX >= 2)
        %%MOVDQU %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU %%XTMP1, [%%SRC + 1*32]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]

        %%MOVDQU [%%DST + 0*32], %%XTMP0
        %%MOVDQU [%%DST + 1*32], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
        %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU %%XTMP2, [%%SRC + 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + 3*16]
        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + 1*16], %%XTMP1
        %%MOVDQU [%%DST + 2*16], %%XTMP2
        %%MOVDQU [%%DST + 3*16], %%XTMP3

        %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
        %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 64)
%%lt64:
        test    %%SIZE, 32
        jz      %%lt32
        %if (%%USEAVX >= 2)
        %%MOVDQU %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
        %%MOVDQU [%%DST + 0*32], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
        %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + 1*16], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
        %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 32)
%%lt32:
        test    %%SIZE, 16
        jz      %%lt16
        %if (%%USEAVX >= 2)
        %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
        %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
        %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
        %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
        %endif
        %%DONE
%endif

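; For fewer than 16 remaining bytes the same front/back scheme continues with
; GPR moves (qword, dword, then word plus trailing byte), and the final
; single-byte test is skipped when %%NOT0 guarantees SIZE is at least 1.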
%if (%%MAXSIZE >= 16)
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        mov     %%TMP0, [%%SRC]
        mov     %%TMP1, [%%SRC + %%SIZE - 8]
        mov     [%%DST], %%TMP0
        mov     [%%DST + %%SIZE - 8], %%TMP1
        %%DONE
%endif

%if (%%MAXSIZE >= 8)
%%lt8:
        test    %%SIZE, 4
        jz      %%lt4
        mov     DWORD(%%TMP0), [%%SRC]
        mov     DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
        mov     [%%DST], DWORD(%%TMP0)
        mov     [%%DST + %%SIZE - 4], DWORD(%%TMP1)
        %%DONE
%endif

%if (%%MAXSIZE >= 4)
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        movzx   DWORD(%%TMP0), word [%%SRC]
        movzx   DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
        mov     [%%DST], WORD(%%TMP0)
        mov     [%%DST + %%SIZE - 1], BYTE(%%TMP1)
        %%DONE
%endif

%%lt2:
%if (%%NOT0 == 0)
        test    %%SIZE, 1
        jz      %%end
%endif
        movzx   DWORD(%%TMP0), byte [%%SRC]
        mov     [%%DST], BYTE(%%TMP0)
%%end:
%if (%%USERET != 0)
        ret
%endif
%endm


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Utility macro to assist with SIMD shifting
%macro _PSRLDQ 3
%define %%VEC   %1
%define %%REG   %2
%define %%IMM   %3

%ifidn %%VEC, SSE
        psrldq  %%REG, %%IMM
%else
        vpsrldq %%REG, %%REG, %%IMM
%endif
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This section defines a series of macros to store small to medium amounts
; of data from SIMD registers to memory, where the size is variable but limited.
;
; The macros are all called as:
;      simd_store_<VEC> DST, SRC, SIZE, TMP, IDX
; with the parameters defined as:
;      DST  : register: pointer to dst (not modified)
;      SRC  : register: src data (clobbered)
;      SIZE : register: length in bytes (not modified)
;      TMP  : 64-bit temp GPR (clobbered)
;      IDX  : 64-bit GPR to store dst index/offset (clobbered)
;
; The name indicates the options. The name is of the form:
;      simd_store_<VEC>
; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"


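; Illustrative usage sketch (register choices here are arbitrary assumptions):
; store the low rdx bytes (0 <= rdx <= 16) of xmm0 to the buffer at rdi,
; using rax as scratch and r10 as the running dst offset:
;
;      simd_store_sse rdi, xmm0, rdx, rax, r10
;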
%macro simd_store_sse 5
        __simd_store %1,%2,%3,%4,%5,SSE
%endm

%macro simd_store_avx 5
        __simd_store %1,%2,%3,%4,%5,AVX
%endm

%macro __simd_store 6
%define %%DST      %1 ; register: pointer to dst (not modified)
%define %%SRC      %2 ; register: src data (clobbered)
%define %%SIZE     %3 ; register: length in bytes (not modified)
%define %%TMP      %4 ; 64-bit temp GPR (clobbered)
%define %%IDX      %5 ; 64-bit temp GPR to store dst idx (clobbered)
%define %%SIMDTYPE %6 ; "SSE" or "AVX"

%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,

%ifidn %%SIMDTYPE, SSE
        %define %%MOVDQU movdqu
        %define %%MOVQ movq
%else
        %define %%MOVDQU vmovdqu
        %define %%MOVQ vmovq
%endif

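; Store strategy: if (%%SIZE & 16) is non-zero the whole register is stored
; and the macro is done; otherwise an 8-byte chunk may be stored and %%SRC
; shifted right by 8, after which the remaining (at most 7) bytes are moved
; into %%TMP and flushed with 4-, 2- and 1-byte stores, with %%IDX tracking
; the destination offset.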
        xor     %%IDX, %%IDX            ; zero idx

        test    %%SIZE, 16
        jz      %%lt16
        %%MOVDQU [%%DST], %%SRC
        jmp     %%end
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        %%MOVQ  [%%DST + %%IDX], %%SRC
        %%PSRLDQ %%SRC, 8
        add     %%IDX, 8
%%lt8:
        %%MOVQ  %%TMP, %%SRC            ; use GPR from now on

        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm

%endif ; ifndef __MEMCPY_ASM__