1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29 %ifndef __MEMCPY_ASM__
30 %define __MEMCPY_ASM__
32 %include "reg_sizes.asm"
35 ; This file defines a series of macros to copy small to medium amounts
36 ; of data from memory to memory, where the size is variable but limited.
38 ; The macros are all called as:
39 ; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
40 ; with the parameters defined as:
41 ; DST : register: pointer to dst (not modified)
42 ; SRC : register: pointer to src (not modified)
43 ; SIZE : register: length in bytes (not modified)
44 ; TMP0 : 64-bit temp GPR (clobbered)
45 ; TMP1 : 64-bit temp GPR (clobbered)
46 ; XTMP0 : temp XMM (clobbered)
47 ; XTMP1 : temp XMM (clobbered)
48 ; XTMP2 : temp XMM (clobbered)
49 ; XTMP3 : temp XMM (clobbered)
51 ; The name indicates the options. The name is of the form:
52 ; memcpy_<VEC>_<SZ><ZERO><RET>
54 ; <VEC> is either "sse" or "avx" or "avx2"
55 ; <SZ> is either "64" or "128" and defines largest value of SIZE
56 ; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
57 ; <RET> is blank or "_ret". If blank, the code falls through. If "ret"
58 ; it does a "ret" at the end
60 ; For the avx2 versions, the temp XMM registers need to be YMM registers
61 ; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
62 ; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
63 ; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
66 ; memcpy_sse_64 : SSE, 0 <= size < 64, falls through
67 ; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through
68 ; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret
69 ; memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
;; SSE wrappers: each expands __memcpy_int with the trailing flags
;; (NOT0, MAXSIZE, USERET, USEAVX=0 for SSE) fixed per the macro name.
;; Args: DST, SRC, SIZE, TMP0, TMP1, XTMP0-XTMP3 (16-byte variants take
;; only DST, SRC, SIZE, TMP0, TMP1 and pass empty XMM slots).
;; NOTE(review): this numbered listing appears truncated — the %endmacro
;; line after each body is missing from view; confirm against full file.
72 %macro memcpy_sse_64 9
73 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
76 %macro memcpy_sse_64_1 9
77 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
80 %macro memcpy_sse_128 9
81 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
84 %macro memcpy_sse_128_1 9
85 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
88 %macro memcpy_sse_64_ret 9
89 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
92 %macro memcpy_sse_64_1_ret 9
93 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
96 %macro memcpy_sse_128_ret 9
97 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
100 %macro memcpy_sse_128_1_ret 9
101 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
;; 16-byte variants: GPR-only copies, no XMM temps needed (empty args).
105 %macro memcpy_sse_16 5
106 	__memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
109 %macro memcpy_sse_16_1 5
110 	__memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
113 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AVX1 wrappers: identical shape to the SSE set above, but the final
;; __memcpy_int argument (USEAVX) is 1, selecting vmovdqu.
;; NOTE(review): %endmacro lines are missing from this numbered listing.
115 %macro memcpy_avx_64 9
116 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
119 %macro memcpy_avx_64_1 9
120 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
123 %macro memcpy_avx_128 9
124 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
127 %macro memcpy_avx_128_1 9
128 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
131 %macro memcpy_avx_64_ret 9
132 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
135 %macro memcpy_avx_64_1_ret 9
136 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
139 %macro memcpy_avx_128_ret 9
140 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
143 %macro memcpy_avx_128_1_ret 9
144 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
;; 16-byte variants: GPR-only path, no XMM temps (empty args).
148 %macro memcpy_avx_16 5
149 	__memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
152 %macro memcpy_avx_16_1 5
153 	__memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
156 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AVX2 wrappers (USEAVX=2): temps are YMM registers (see header notes).
;; The 64-byte variants take only 7 args because the avx2 64-byte path
;; uses just two YMM temps; "--" fills the unused XTMP2/XTMP3 slots and
;; must never be referenced by the expansion.
;; NOTE(review): %endmacro lines are missing from this numbered listing.
158 %macro memcpy_avx2_64 7
159 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
162 %macro memcpy_avx2_64_1 7
163 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
;; 128-byte variants need all four YMM temps, so %8/%9 are passed through.
166 %macro memcpy_avx2_128 9
167 	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
170 %macro memcpy_avx2_128_1 9
171 	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
174 %macro memcpy_avx2_64_ret 7
175 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
178 %macro memcpy_avx2_64_1_ret 7
179 	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
;; memcpy_avx2_128_ret / memcpy_avx2_128_1_ret: AVX2 copy of up to 128
;; bytes, ending in "ret".  Called with 9 args:
;;   DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;; BUGFIX: these previously passed "--,--" for args 8/9, discarding the
;; caller's YTMP2/YTMP3.  The MAXSIZE>=128 path in __memcpy_int
;; references %%XTMP2/%%XTMP3, so "--" would be emitted as an operand
;; and fail to assemble.  Pass %8/%9 through, matching memcpy_avx2_128
;; and memcpy_avx2_128_1 above.
182 %macro memcpy_avx2_128_ret 9
183 	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 1, 2
186 %macro memcpy_avx2_128_1_ret 9
187 	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 1, 2
192 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
193 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
194 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
195 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Internal worker behind all memcpy_* macros above.  Copies SIZE bytes
;; (0/1 <= SIZE < MAXSIZE) from SRC to DST using the head+tail
;; overlapping-store technique: load the first chunk(s) and the last
;; chunk(s), then store both; the overlap makes any size in the bracket
;; correct without a byte loop.
;; NOTE(review): this numbered listing is truncated — the size-compare /
;; conditional-jump lines, the %if %%USEAVX selectors between the YMM
;; (32-byte) and XMM (16-byte) variants, the %%end label, and %endmacro
;; are all missing from view.  Do not edit logic without the full file.
198 %macro __memcpy_int 13
199 %define %%DST %1 ; register: pointer to dst (not modified)
200 %define %%SRC %2 ; register: pointer to src (not modified)
201 %define %%SIZE %3 ; register: length in bytes (not modified)
202 %define %%TMP0 %4 ; 64-bit temp GPR (clobbered)
203 %define %%TMP1 %5 ; 64-bit temp GPR (clobbered)
204 %define %%XTMP0 %6 ; temp XMM (clobbered)
205 %define %%XTMP1 %7 ; temp XMM (clobbered)
206 %define %%XTMP2 %8 ; temp XMM (clobbered)
207 %define %%XTMP3 %9 ; temp XMM (clobbered)
208 %define %%NOT0 %10 ; if not 0, then assume size cannot be zero
209 %define %%MAXSIZE %11 ; 128, 64, etc
210 %define %%USERET %12 ; if not 0, use "ret" at end
211 %define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2
;; %%DONE = "jmp %%end" here; presumably the USERET!=0 branch defines it
;; as "ret" on a line not shown in this listing — TODO confirm.
216 %define %%DONE jmp %%end
;; MOVDQU selection: vmovdqu for AVX1/AVX2, movdqu for SSE (the %if/%else
;; lines selecting between them are missing from this listing).
220 %define %%MOVDQU vmovdqu
222 %define %%MOVDQU movdqu
;; --- 64 <= size < 128 ---------------------------------------------------
225 %if (%%MAXSIZE >= 128)
;; AVX2 variant: 2x32B head + 2x32B tail with four YMM temps.
229 %%MOVDQU %%XTMP0, [%%SRC + 0*32]
230 %%MOVDQU %%XTMP1, [%%SRC + 1*32]
231 %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
232 %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]
234 %%MOVDQU [%%DST + 0*32], %%XTMP0
235 %%MOVDQU [%%DST + 1*32], %%XTMP1
236 %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
237 %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
;; SSE/AVX1 variant: 4x16B head + 4x16B tail with four XMM temps.
239 %%MOVDQU %%XTMP0, [%%SRC + 0*16]
240 %%MOVDQU %%XTMP1, [%%SRC + 1*16]
241 %%MOVDQU %%XTMP2, [%%SRC + 2*16]
242 %%MOVDQU %%XTMP3, [%%SRC + 3*16]
243 %%MOVDQU [%%DST + 0*16], %%XTMP0
244 %%MOVDQU [%%DST + 1*16], %%XTMP1
245 %%MOVDQU [%%DST + 2*16], %%XTMP2
246 %%MOVDQU [%%DST + 3*16], %%XTMP3
248 %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
249 %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
250 %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
251 %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
252 %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
253 %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
254 %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
255 %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
;; --- 32 <= size < 64 ----------------------------------------------------
260 %if (%%MAXSIZE >= 64)
;; AVX2 variant: one 32B head + one 32B tail, only two YMM temps.
265 %%MOVDQU %%XTMP0, [%%SRC + 0*32]
266 %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
267 %%MOVDQU [%%DST + 0*32], %%XTMP0
268 %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
;; SSE/AVX1 variant: 2x16B head + 2x16B tail.
270 %%MOVDQU %%XTMP0, [%%SRC + 0*16]
271 %%MOVDQU %%XTMP1, [%%SRC + 1*16]
272 %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
273 %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
274 %%MOVDQU [%%DST + 0*16], %%XTMP0
275 %%MOVDQU [%%DST + 1*16], %%XTMP1
276 %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
277 %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
;; --- 16 <= size < 32 ----------------------------------------------------
282 %if (%%MAXSIZE >= 32)
;; AVX2 variant: XWORD() narrows the YMM temps to their XMM halves.
287 %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
288 %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
289 %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
290 %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
292 %%MOVDQU %%XTMP0, [%%SRC + 0*16]
293 %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
294 %%MOVDQU [%%DST + 0*16], %%XTMP0
295 %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
;; --- 8 <= size < 16: GPR head/tail copy (head load not shown) -----------
300 %if (%%MAXSIZE >= 16)
305 mov %%TMP1, [%%SRC + %%SIZE - 8]
307 mov [%%DST + %%SIZE - 8], %%TMP1
;; --- 4 <= size < 8: two overlapping dword moves -------------------------
315 mov DWORD(%%TMP0), [%%SRC]
316 mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
317 mov [%%DST], DWORD(%%TMP0)
318 mov [%%DST + %%SIZE - 4], DWORD(%%TMP1)
;; --- 2 <= size < 4: word head + byte tail -------------------------------
326 movzx DWORD(%%TMP0), word [%%SRC]
327 movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
328 mov [%%DST], WORD(%%TMP0)
329 mov [%%DST + %%SIZE - 1], BYTE(%%TMP1)
;; --- size == 1: single byte ---------------------------------------------
338 movzx DWORD(%%TMP0), byte [%%SRC]
339 mov [%%DST], BYTE(%%TMP0)
346 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
347 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
348 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
349 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
351 ;; Utility macro to assist with SIMD shifting
;; NOTE(review): only one body line of this macro is visible here; the
;; "%macro _PSRLDQ" header, the SSE (psrldq) branch, and %endmacro are
;; missing from this numbered listing — confirm against the full file.
;; Shifts %%REG right by %%IMM bytes (AVX form shown).
360 	vpsrldq %%REG, %%REG, %%IMM
364 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
366 ; This section defines a series of macros to store small to medium amounts
367 ; of data from SIMD registers to memory, where the size is variable but limited.
369 ; The macros are all called as:
370 ; memcpy DST, SRC, SIZE, TMP, IDX
371 ; with the parameters defined as:
372 ; DST : register: pointer to dst (not modified)
373 ; SRC : register: src data (clobbered)
374 ; SIZE : register: length in bytes (not modified)
375 ; TMP : 64-bit temp GPR (clobbered)
376 ; IDX : 64-bit GPR to store dst index/offset (clobbered)
378 ; The name indicates the options. The name is of the form:
380 ; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
;; simd_store wrappers: forward DST, SRC, SIZE, TMP, IDX to __simd_store
;; with the SIMD type fixed; the _15 variants cap the max length at 15.
;; NOTE(review): %endmacro lines are missing from this numbered listing.
383 %macro simd_store_sse 5
384 	__simd_store %1,%2,%3,%4,%5,SSE
387 %macro simd_store_avx 5
388 	__simd_store %1,%2,%3,%4,%5,AVX
391 %macro simd_store_sse_15 5
392 	__simd_store %1,%2,%3,%4,%5,SSE,15
395 %macro simd_store_avx_15 5
396 	__simd_store %1,%2,%3,%4,%5,AVX,15
;; Internal worker for simd_store_*: stores SIZE (<= MAX_LEN <= 16) bytes
;; from XMM register SRC to [DST], descending through 16/8/4/2/1-byte
;; stores and advancing IDX.
;; NOTE(review): this numbered listing is truncated — the size tests and
;; conditional jumps between store steps, the %%MOVQ/%%PSRLDQ register
;; shifts after each partial store, the %%end label, and %endmacro are
;; missing from view.  Do not edit logic without the full file.
399 %macro __simd_store 6-7
400 %define %%DST %1 ; register: pointer to dst (not modified)
401 %define %%SRC %2 ; register: src data (clobbered)
402 %define %%SIZE %3 ; register: length in bytes (not modified)
403 %define %%TMP %4 ; 64-bit temp GPR (clobbered)
404 %define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered)
405 %define %%MAX_LEN %7 ; [optional] maximum length to be stored, default 16
406 %define %%SIMDTYPE %6 ; "SSE" or "AVX"
;; bind the type-dispatching shift helper with SIMDTYPE pre-applied
408 %define %%PSRLDQ _PSRLDQ %%SIMDTYPE,
410 %ifidn %%SIMDTYPE, SSE
411 %define %%MOVDQU movdqu
414 %define %%MOVDQU vmovdqu
418 ;; determine max byte size for store operation
420 %assign max_length_to_store %%MAX_LEN
422 %assign max_length_to_store 16
;; guard: compile-time error rather than silent bad code for MAX_LEN > 16
425 %if max_length_to_store > 16
426 %error "__simd_store macro invoked with MAX_LEN bigger than 16!"
429 xor %%IDX, %%IDX ; zero idx
431 %if max_length_to_store == 16
;; full 16-byte store: whole XMM in one unaligned move
434 %%MOVDQU [%%DST], %%SRC
439 %if max_length_to_store >= 8
442 %%MOVQ [%%DST + %%IDX], %%SRC
;; remaining (< 8) bytes are handled through a GPR
448 %%MOVQ %%TMP, %%SRC ; use GPR from now on
450 %if max_length_to_store >= 4
453 mov [%%DST + %%IDX], DWORD(%%TMP)
461 mov [%%DST + %%IDX], WORD(%%TMP)
467 mov [%%DST + %%IDX], BYTE(%%TMP)
471 ; This section defines a series of macros to load small to medium amounts
472 ; (from 0 to 16 bytes) of data from memory to SIMD registers,
473 ; where the size is variable but limited.
475 ; The macros are all called as:
476 ; simd_load DST, SRC, SIZE
477 ; with the parameters defined as:
478 ; DST : register: destination XMM register
479 ; SRC : register: pointer to src data (not modified)
480 ; SIZE : register: length in bytes (not modified)
482 ; The name indicates the options. The name is of the form:
483 ; simd_load_<VEC>_<SZ><ZERO>
485 ; <VEC> is either "sse" or "avx"
486 ; <SZ> is either "15" or "16" and defines largest value of SIZE
487 ; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
490 ; simd_load_sse_16 : SSE, 0 <= size <= 16
491 ; simd_load_avx_15_1 : AVX, 1 <= size <= 15
;; simd_load wrappers: forward DST, SRC, SIZE to __simd_load with
;; ACCEPT_0 (min size may be 0), ACCEPT_16 (max size is 16) and the SIMD
;; type fixed per the macro name.
;; NOTE(review): %endmacro lines are missing from this numbered listing.
493 %macro simd_load_sse_15_1 3
494 	__simd_load %1,%2,%3,0,0,SSE
496 %macro simd_load_sse_15 3
497 	__simd_load %1,%2,%3,1,0,SSE
499 %macro simd_load_sse_16_1 3
500 	__simd_load %1,%2,%3,0,1,SSE
502 %macro simd_load_sse_16 3
503 	__simd_load %1,%2,%3,1,1,SSE
506 %macro simd_load_avx_15_1 3
507 	__simd_load %1,%2,%3,0,0,AVX
509 %macro simd_load_avx_15 3
510 	__simd_load %1,%2,%3,1,0,AVX
512 %macro simd_load_avx_16_1 3
513 	__simd_load %1,%2,%3,0,1,AVX
515 %macro simd_load_avx_16 3
516 	__simd_load %1,%2,%3,1,1,AVX
;; Internal worker for simd_load_*: loads SIZE (0-16) bytes from [SRC]
;; into XMM register DST, zeroing the unused high bytes.
;; NOTE(review): this numbered listing is truncated — the "%macro
;; __simd_load" header line itself, the size compare/jump ladder that
;; selects how many %%PINSRB steps execute, the %%PXOR define, the %%end
;; label, and %endmacro are all missing from view.  Do not edit logic
;; without the full file.
520 %define %%DST %1 ; [out] destination XMM register
521 %define %%SRC %2 ; [in] pointer to src data
522 %define %%SIZE %3 ; [in] length in bytes (0-16 bytes)
523 %define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0
524 %define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16
525 %define %%SIMDTYPE %6 ; "SSE" or "AVX"
527 %ifidn %%SIMDTYPE, SSE
528 %define %%MOVDQU movdqu
529 %define %%PINSRB pinsrb
530 %define %%PINSRQ pinsrq
533 %define %%MOVDQU vmovdqu
534 %define %%PINSRB vpinsrb
535 %define %%PINSRQ vpinsrq
539 %if (%%ACCEPT_16 != 0)
;; SIZE == 16 fast path: single unaligned 16-byte load
542 %%MOVDQU %%DST, [%%SRC]
547 %%PXOR %%DST, %%DST ; clear XMM register
548 %if (%%ACCEPT_0 != 0)
;; byte-insertion ladder: inserts bytes from highest index downward;
;; entry point depends on SIZE (the dispatch jumps are not shown here).
;; Bytes 8..14 are inserted individually, then bytes 0..7 via one PINSRQ.
582 %%PINSRB %%DST, [%%SRC + 14], 14
584 %%PINSRB %%DST, [%%SRC + 13], 13
586 %%PINSRB %%DST, [%%SRC + 12], 12
588 %%PINSRB %%DST, [%%SRC + 11], 11
590 %%PINSRB %%DST, [%%SRC + 10], 10
592 %%PINSRB %%DST, [%%SRC + 9], 9
594 %%PINSRB %%DST, [%%SRC + 8], 8
596 %%PINSRQ %%DST, [%%SRC], 0
;; sizes < 8: insert bytes 6 down to 0 one at a time
599 %%PINSRB %%DST, [%%SRC + 6], 6
601 %%PINSRB %%DST, [%%SRC + 5], 5
603 %%PINSRB %%DST, [%%SRC + 4], 4
605 %%PINSRB %%DST, [%%SRC + 3], 3
607 %%PINSRB %%DST, [%%SRC + 2], 2
609 %%PINSRB %%DST, [%%SRC + 1], 1
611 %%PINSRB %%DST, [%%SRC + 0], 0
615 %endif ; ifndef __MEMCPY_ASM__