;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%ifndef __MEMCPY_ASM__
%define __MEMCPY_ASM__

%include "reg_sizes.asm"

; This section defines a series of macros to copy small to medium amounts
; of data from memory to memory, where the size is variable but limited.
;
; The macros are all called as:
;      memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
; with the parameters defined as:
;      DST   : register: pointer to dst (not modified)
;      SRC   : register: pointer to src (not modified)
;      SIZE  : register: length in bytes (not modified)
;      TMP0  : 64-bit temp GPR (clobbered)
;      TMP1  : 64-bit temp GPR (clobbered)
;      XTMP0 : temp XMM (clobbered)
;      XTMP1 : temp XMM (clobbered)
;      XTMP2 : temp XMM (clobbered)
;      XTMP3 : temp XMM (clobbered)
;
; The name indicates the options. The name is of the form:
;      memcpy_<VEC>_<SZ><ZERO><RET>
; where:
;      <VEC>  is either "sse" or "avx" or "avx2"
;      <SZ>   is either "64" or "128" and defines the largest value of SIZE
;      <ZERO> is blank or "_1". If "_1", then the minimum SIZE is 1 (otherwise 0)
;      <RET>  is blank or "_ret". If blank, the code falls through. If "_ret",
;             it does a "ret" at the end
;
; For the avx2 versions, the temp XMM registers need to be YMM registers.
; If SZ is 64, then only two YMM temps are needed, i.e. they are called as:
;      memcpy_avx2_64  DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
;      memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;
; The memcpy_sse_16 and memcpy_avx_16 variants (largest SIZE is 16) take only
; DST, SRC, SIZE, TMP0, TMP1, as no XMM temps are needed.
;
; For example:
;      memcpy_sse_64        : SSE,  0 <= size < 64, falls through
;      memcpy_avx_64_1      : AVX1, 1 <= size < 64, falls through
;      memcpy_sse_128_ret   : SSE,  0 <= size < 128, ends with ret
;      memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
;

%macro memcpy_sse_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
%endm

%macro memcpy_sse_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
%endm

%macro memcpy_sse_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
%endm

%macro memcpy_sse_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
%endm

%macro memcpy_sse_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
%endm

%macro memcpy_sse_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
%endm

%macro memcpy_sse_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
%endm

%macro memcpy_sse_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
%endm


%macro memcpy_sse_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
%endm

%macro memcpy_sse_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
%endm

%macro memcpy_avx_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
%endm

%macro memcpy_avx_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
%endm

%macro memcpy_avx_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
%endm

%macro memcpy_avx_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
%endm

%macro memcpy_avx_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
%endm

%macro memcpy_avx_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
%endm

%macro memcpy_avx_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
%endm


%macro memcpy_avx_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
%endm

%macro memcpy_avx_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx2_64 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
%endm

%macro memcpy_avx2_64_1 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
%endm

%macro memcpy_avx2_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 2
%endm

%macro memcpy_avx2_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 2
%endm

%macro memcpy_avx2_64_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
%endm

%macro memcpy_avx2_64_1_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
%endm

%macro memcpy_avx2_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
%endm

%macro memcpy_avx2_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
%endm


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro __memcpy_int 13
%define %%DST     %1    ; register: pointer to dst (not modified)
%define %%SRC     %2    ; register: pointer to src (not modified)
%define %%SIZE    %3    ; register: length in bytes (not modified)
%define %%TMP0    %4    ; 64-bit temp GPR (clobbered)
%define %%TMP1    %5    ; 64-bit temp GPR (clobbered)
%define %%XTMP0   %6    ; temp XMM (clobbered)
%define %%XTMP1   %7    ; temp XMM (clobbered)
%define %%XTMP2   %8    ; temp XMM (clobbered)
%define %%XTMP3   %9    ; temp XMM (clobbered)
%define %%NOT0    %10   ; if not 0, then assume size cannot be zero
%define %%MAXSIZE %11   ; 128, 64, etc
%define %%USERET  %12   ; if not 0, use "ret" at end
%define %%USEAVX  %13   ; 0 = SSE, 1 = AVX1, 2 = AVX2

%if (%%USERET != 0)
 %define %%DONE ret
%else
 %define %%DONE jmp %%end
%endif

%if (%%USEAVX != 0)
 %define %%MOVDQU vmovdqu
%else
 %define %%MOVDQU movdqu
%endif

%if (%%MAXSIZE >= 128)
        test    %%SIZE, 64
        jz      %%lt64
 %if (%%USEAVX >= 2)
        %%MOVDQU %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU %%XTMP1, [%%SRC + 1*32]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]

        %%MOVDQU [%%DST + 0*32], %%XTMP0
        %%MOVDQU [%%DST + 1*32], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
 %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU %%XTMP2, [%%SRC + 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + 3*16]
        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + 1*16], %%XTMP1
        %%MOVDQU [%%DST + 2*16], %%XTMP2
        %%MOVDQU [%%DST + 3*16], %%XTMP3

        %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 64)
%%lt64:
        test    %%SIZE, 32
        jz      %%lt32
 %if (%%USEAVX >= 2)
        %%MOVDQU %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
        %%MOVDQU [%%DST + 0*32], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
 %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + 1*16], %%XTMP1
        %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 32)
%%lt32:
        test    %%SIZE, 16
        jz      %%lt16
 %if (%%USEAVX >= 2)
        %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
        %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
        %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
 %else
        %%MOVDQU %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU [%%DST + 0*16], %%XTMP0
        %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 16)
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        mov     %%TMP0, [%%SRC]
        mov     %%TMP1, [%%SRC + %%SIZE - 8]
        mov     [%%DST], %%TMP0
        mov     [%%DST + %%SIZE - 8], %%TMP1
        %%DONE
%endif

%if (%%MAXSIZE >= 8)
%%lt8:
        test    %%SIZE, 4
        jz      %%lt4
        mov     DWORD(%%TMP0), [%%SRC]
        mov     DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
        mov     [%%DST], DWORD(%%TMP0)
        mov     [%%DST + %%SIZE - 4], DWORD(%%TMP1)
        %%DONE
%endif

%if (%%MAXSIZE >= 4)
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        movzx   DWORD(%%TMP0), word [%%SRC]
        movzx   DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
        mov     [%%DST], WORD(%%TMP0)
        mov     [%%DST + %%SIZE - 1], BYTE(%%TMP1)
        %%DONE
%endif

%%lt2:
%if (%%NOT0 == 0)
        test    %%SIZE, 1
        jz      %%end
%endif
        movzx   DWORD(%%TMP0), byte [%%SRC]
        mov     [%%DST], BYTE(%%TMP0)
%%end:
%if (%%USERET != 0)
        ret
%endif
%endm


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Utility macro to emit a SIMD byte right-shift with a unified interface:
;; SSE psrldq uses the destructive two-operand form, while AVX vpsrldq
;; takes a separate destination operand
%macro _PSRLDQ 3
%define %%VEC %1
%define %%REG %2
%define %%IMM %3

%ifidn %%VEC, SSE
        psrldq  %%REG, %%IMM
%else
        vpsrldq %%REG, %%REG, %%IMM
%endif
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This section defines a series of macros to store small to medium amounts
; of data from SIMD registers to memory, where the size is variable but
; limited (0 <= SIZE <= 16, i.e. the contents of one XMM register).
;
; The macros are all called as:
;      simd_store DST, SRC, SIZE, TMP, IDX
; with the parameters defined as:
;      DST  : register: pointer to dst (not modified)
;      SRC  : register: src data (clobbered)
;      SIZE : register: length in bytes (not modified)
;      TMP  : 64-bit temp GPR (clobbered)
;      IDX  : 64-bit GPR to store dst index/offset (clobbered)
;
; The name indicates the options. The name is of the form:
;      simd_store_<VEC>
; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"

%macro simd_store_sse 5
        __simd_store %1,%2,%3,%4,%5,SSE
%endm

%macro simd_store_avx 5
        __simd_store %1,%2,%3,%4,%5,AVX
%endm

%macro __simd_store 6
%define %%DST      %1   ; register: pointer to dst (not modified)
%define %%SRC      %2   ; register: src data (clobbered)
%define %%SIZE     %3   ; register: length in bytes (not modified)
%define %%TMP      %4   ; 64-bit temp GPR (clobbered)
%define %%IDX      %5   ; 64-bit temp GPR to store dst idx (clobbered)
%define %%SIMDTYPE %6   ; "SSE" or "AVX"

%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%MOVQ movq
%else
 %define %%MOVDQU vmovdqu
 %define %%MOVQ vmovq
%endif

        xor     %%IDX, %%IDX    ; zero idx

        test    %%SIZE, 16
        jz      %%lt16
        %%MOVDQU [%%DST], %%SRC
        jmp     %%end
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        %%MOVQ  [%%DST + %%IDX], %%SRC
        %%PSRLDQ %%SRC, 8
        add     %%IDX, 8
%%lt8:
        %%MOVQ  %%TMP, %%SRC    ; use GPR from now on

        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm

%endif ; ifndef __MEMCPY_ASM__