]>
git.proxmox.com Git - ceph.git/blob - ceph/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h
1 /**********************************************************************
2 Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
/**
 *  @file  memcpy_inline.h
 *  @brief Defines intrinsic memcpy functions used by the new hashing API
 */
40 #if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
42 #include "intrinreg.h"
51 #if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
/*
 * Map the generic copy/clear entry points onto the SSE implementations
 * defined in this header.  These aliases sit under the architecture check
 * that immediately precedes them (x86 / x86-64 compiler macros).
 */
#define memcpy_varlen   memcpy_sse_varlen
#define memcpy_fixedlen memcpy_sse_fixedlen

#define memclr_varlen   memclr_sse_varlen
#define memclr_fixedlen memclr_sse_fixedlen
/*
 * Forward declarations of the SSE copy and clear helpers defined further
 * down in this header, so that they can reference one another regardless
 * of definition order.
 *
 * Naming: "lte32" helpers assert nbytes <= 32, "gte16" helpers assert
 * nbytes >= 16, and the unqualified names pick between the two.
 */

/* Copy helpers, "fixedlen" flavour. */
static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes);

/* Copy helpers, "varlen" flavour. */
static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes);

/* Clear (zero-fill) helpers, "fixedlen" flavour. */
static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes);
static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes);
static inline void memclr_sse_fixedlen(void *dst, size_t nbytes);

/* Clear (zero-fill) helpers, "varlen" flavour. */
static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes);
static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes);
static inline void memclr_sse_varlen(void *dst, size_t nbytes);
/*
 * Copy nbytes bytes from src to dst, where N <= nbytes <= 2*N (asserted),
 * using N-byte wide loads and stores (intrinreg<N>, load_intrinreg<N>,
 * store_intrinreg<N>).
 *
 *   - If N == 1, or fixedwidth is non-zero and nbytes == N, a single
 *     N-byte load/store pair is issued.
 *   - Otherwise two N-byte moves are issued: one at offset 0 ("head") and
 *     one at offset nbytes - N ("tail").  The two moves overlap whenever
 *     nbytes < 2*N; both loads are performed before either store.
 *
 * N must be a literal suitable for token pasting (1, 2, 4, 8, 16).
 * dst/src/nbytes are evaluated more than once, so pass side-effect-free
 * expressions.
 *
 * NOTE(review): the do { ... } while (0) wrapper, the head/tail
 * declarations and the "else" were missing from the extracted text and
 * have been reconstructed from the surviving statements - confirm
 * against the upstream header.
 */
#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes)             \
	do {                                                                       \
		intrinreg##N head;                                                 \
		intrinreg##N tail;                                                 \
		assert(N <= (nbytes) && (nbytes) <= 2*N);                          \
		if (N == 1 || ((fixedwidth) && (nbytes) == N)) {                   \
			head = load_intrinreg##N(src);                             \
			store_intrinreg##N(dst, head);                             \
		} else {                                                           \
			head = load_intrinreg##N(src);                             \
			tail = load_intrinreg##N(                                  \
				(const void *)((const char *)(src) + ((nbytes) - N))); \
			store_intrinreg##N(dst, head);                             \
			store_intrinreg##N(                                        \
				(void *)((char *)(dst) + ((nbytes) - N)), tail);   \
		}                                                                  \
	} while (0)
/*
 * Zero nbytes bytes at dst, where N <= nbytes <= 2*N (asserted), using
 * N-byte wide stores of a zero-initialised intrinreg<N>.
 *
 *   - If N == 1, or fixedwidth is non-zero and nbytes == N, a single
 *     N-byte store is issued.
 *   - Otherwise two N-byte stores are issued, at offset 0 and at offset
 *     nbytes - N; they overlap whenever nbytes < 2*N.
 *
 * N must be a literal suitable for token pasting (1, 2, 4, 8, 16).
 * dst/nbytes are evaluated more than once, so pass side-effect-free
 * expressions.
 *
 * NOTE(review): the do { ... } while (0) wrapper and the "else" were
 * missing from the extracted text and have been reconstructed from the
 * surviving statements - confirm against the upstream header.
 */
#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes)                  \
	do {                                                                       \
		const intrinreg##N zero = {0};                                     \
		assert(N <= (nbytes) && (nbytes) <= 2*N);                          \
		if (N == 1 || ((fixedwidth) && (nbytes) == N)) {                   \
			store_intrinreg##N(dst, zero);                             \
		} else {                                                           \
			store_intrinreg##N(dst, zero);                             \
			store_intrinreg##N(                                        \
				(void *)((char *)(dst) + ((nbytes) - N)), zero);   \
		}                                                                  \
	} while (0)
// Define load/store functions uniformly.
//
// The 16-byte variants forward to the SSE unaligned load/store intrinsics;
// the pointer argument is reinterpreted as a float pointer because that is
// the parameter type those intrinsics take.  Arguments are parenthesised so
// that pointer expressions such as "p + off" are cast as a whole.
#define load_intrinreg16(src)       _mm_loadu_ps((const float *)(src))
#define store_intrinreg16(dst, val) _mm_storeu_ps((float *)(dst), (val))
112 static inline intrinreg8
load_intrinreg8(const void *src
)
114 return *(intrinreg8
*) src
;
117 static inline void store_intrinreg8(void *dst
, intrinreg8 val
)
119 *(intrinreg8
*) dst
= val
;
122 static inline intrinreg4
load_intrinreg4(const void *src
)
124 return *(intrinreg4
*) src
;
127 static inline void store_intrinreg4(void *dst
, intrinreg4 val
)
129 *(intrinreg4
*) dst
= val
;
132 static inline intrinreg2
load_intrinreg2(const void *src
)
134 return *(intrinreg2
*) src
;
137 static inline void store_intrinreg2(void *dst
, intrinreg2 val
)
139 *(intrinreg2
*) dst
= val
;
142 static inline intrinreg1
load_intrinreg1(const void *src
)
144 return *(intrinreg1
*) src
;
147 static inline void store_intrinreg1(void *dst
, intrinreg1 val
)
149 *(intrinreg1
*) dst
= val
;
152 static inline void memcpy_gte16_sse_fixedlen(void *dst
, const void *src
, size_t nbytes
)
157 size_t remaining_moves
;
160 assert(nbytes
>= 16);
162 for (i
= 0; i
+ 16 * 4 <= nbytes
; i
+= 16 * 4) {
163 for (j
= 0; j
< 4; j
++)
165 load_intrinreg16((const void *)((const char *)src
+ i
+ 16 * j
));
166 for (j
= 0; j
< 4; j
++)
167 store_intrinreg16((void *)((char *)dst
+ i
+ 16 * j
), pool
[j
]);
170 remaining_moves
= (nbytes
- i
) / 16;
171 tail_offset
= nbytes
- 16;
172 do_tail
= (tail_offset
& (16 - 1));
174 for (j
= 0; j
< remaining_moves
; j
++)
175 pool
[j
] = load_intrinreg16((const void *)((const char *)src
+ i
+ 16 * j
));
178 pool
[j
] = load_intrinreg16((const void *)((const char *)src
+ tail_offset
));
180 for (j
= 0; j
< remaining_moves
; j
++)
181 store_intrinreg16((void *)((char *)dst
+ i
+ 16 * j
), pool
[j
]);
184 store_intrinreg16((void *)((char *)dst
+ tail_offset
), pool
[j
]);
187 static inline void memclr_gte16_sse_fixedlen(void *dst
, size_t nbytes
)
191 const intrinreg16 zero
= { 0 };
192 size_t remaining_moves
;
195 assert(nbytes
>= 16);
197 for (i
= 0; i
+ 16 * 4 <= nbytes
; i
+= 16 * 4)
198 for (j
= 0; j
< 4; j
++)
199 store_intrinreg16((void *)((char *)dst
+ i
+ 16 * j
), zero
);
201 remaining_moves
= (nbytes
- i
) / 16;
202 tail_offset
= nbytes
- 16;
203 do_tail
= (tail_offset
& (16 - 1));
205 for (j
= 0; j
< remaining_moves
; j
++)
206 store_intrinreg16((void *)((char *)dst
+ i
+ 16 * j
), zero
);
209 store_intrinreg16((void *)((char *)dst
+ tail_offset
), zero
);
/*
 * Copy nbytes (<= 32, asserted) bytes from src to dst.
 *
 * The length is bucketed by the largest power of two N in {16, 8, 4, 2, 1}
 * with N <= nbytes, and MEMCPY_BETWEEN_N_AND_2N_BYTES is invoked for that N
 * with fixedwidth = 1, so a length of exactly N uses a single N-byte move.
 * nbytes == 0 copies nothing.
 *
 * @param dst     destination buffer, at least nbytes bytes
 * @param src     source buffer, at least nbytes bytes
 * @param nbytes  number of bytes to copy; must be <= 32
 *
 * NOTE(review): the leading "if (nbytes >= 16)" was missing from the
 * extracted text; it is reconstructed from the "else if" chain and the
 * macro's own N <= nbytes assertion - confirm against the upstream header.
 */
static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes >= 16) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes);
	} else if (nbytes >= 8) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes);
	} else if (nbytes >= 4) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes);
	} else if (nbytes >= 2) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes);
	} else if (nbytes >= 1) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes);
	}
}
/*
 * Zero nbytes (<= 32, asserted) bytes at dst.
 *
 * The length is bucketed by the largest power of two N in {16, 8, 4, 2, 1}
 * with N <= nbytes, and MEMCLR_BETWEEN_N_AND_2N_BYTES is invoked for that N
 * with fixedwidth = 1, so a length of exactly N uses a single N-byte store.
 * nbytes == 0 clears nothing.
 *
 * @param dst     buffer to clear, at least nbytes bytes
 * @param nbytes  number of bytes to clear; must be <= 32
 *
 * NOTE(review): the leading "if (nbytes >= 16)" was missing from the
 * extracted text; it is reconstructed from the "else if" chain and the
 * macro's own N <= nbytes assertion - confirm against the upstream header.
 */
static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes >= 16) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes);
	} else if (nbytes >= 8) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes);
	} else if (nbytes >= 4) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes);
	} else if (nbytes >= 2) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes);
	} else if (nbytes >= 1) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes);
	}
}
/*
 * Copy nbytes (<= 32, asserted) bytes from src to dst.
 *
 * Same bucketing as the fixedlen variant, but MEMCPY_BETWEEN_N_AND_2N_BYTES
 * is invoked with fixedwidth = 0: for N > 1 the head and tail moves are
 * always both issued, without testing whether nbytes == N.
 * nbytes == 0 copies nothing.
 *
 * @param dst     destination buffer, at least nbytes bytes
 * @param src     source buffer, at least nbytes bytes
 * @param nbytes  number of bytes to copy; must be <= 32
 *
 * NOTE(review): the leading "if (nbytes >= 16)" was missing from the
 * extracted text; it is reconstructed from the "else if" chain and the
 * macro's own N <= nbytes assertion - confirm against the upstream header.
 */
static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes >= 16) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes);
	} else if (nbytes >= 8) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes);
	} else if (nbytes >= 4) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes);
	} else if (nbytes >= 2) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes);
	} else if (nbytes >= 1) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes);
	}
}
/*
 * Zero nbytes (<= 32, asserted) bytes at dst.
 *
 * Same bucketing as the fixedlen variant, but MEMCLR_BETWEEN_N_AND_2N_BYTES
 * is invoked with fixedwidth = 0: for N > 1 both the head and the tail
 * store are always issued, without testing whether nbytes == N.
 * nbytes == 0 clears nothing.
 *
 * @param dst     buffer to clear, at least nbytes bytes
 * @param nbytes  number of bytes to clear; must be <= 32
 *
 * NOTE(review): the leading "if (nbytes >= 16)" was missing from the
 * extracted text; it is reconstructed from the "else if" chain and the
 * macro's own N <= nbytes assertion - confirm against the upstream header.
 */
static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes >= 16) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes);
	} else if (nbytes >= 8) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes);
	} else if (nbytes >= 4) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes);
	} else if (nbytes >= 2) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes);
	} else if (nbytes >= 1) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes);
	}
}
272 static inline void memcpy_gte16_sse_varlen(void *dst
, const void *src
, size_t nbytes
)
277 assert(nbytes
>= 16);
279 while (i
+ 128 <= nbytes
) {
280 memcpy_gte16_sse_fixedlen((void *)((char *)dst
+ i
),
281 (const void *)((const char *)src
+ i
), 128);
284 if (i
+ 64 <= nbytes
) {
285 memcpy_gte16_sse_fixedlen((void *)((char *)dst
+ i
),
286 (const void *)((const char *)src
+ i
), 64);
289 if (i
+ 32 <= nbytes
) {
290 memcpy_gte16_sse_fixedlen((void *)((char *)dst
+ i
),
291 (const void *)((const char *)src
+ i
), 32);
294 if (i
+ 16 <= nbytes
) {
295 memcpy_gte16_sse_fixedlen((void *)((char *)dst
+ i
),
296 (const void *)((const char *)src
+ i
), 16);
300 tail
= load_intrinreg16((const void *)((const char *)src
+ i
));
301 store_intrinreg16((void *)((char *)dst
+ i
), tail
);
304 static inline void memclr_gte16_sse_varlen(void *dst
, size_t nbytes
)
307 const intrinreg16 zero
= { 0 };
309 assert(nbytes
>= 16);
311 while (i
+ 128 <= nbytes
) {
312 memclr_gte16_sse_fixedlen((void *)((char *)dst
+ i
), 128);
315 if (i
+ 64 <= nbytes
) {
316 memclr_gte16_sse_fixedlen((void *)((char *)dst
+ i
), 64);
319 if (i
+ 32 <= nbytes
) {
320 memclr_gte16_sse_fixedlen((void *)((char *)dst
+ i
), 32);
323 if (i
+ 16 <= nbytes
) {
324 memclr_gte16_sse_fixedlen((void *)((char *)dst
+ i
), 16);
328 store_intrinreg16((void *)((char *)dst
+ i
), zero
);
/*
 * Copy nbytes bytes from src to dst, dispatching on length between the
 * two "fixedlen" helpers: memcpy_gte16_sse_fixedlen (which asserts
 * nbytes >= 16) and memcpy_lte32_sse_fixedlen (which asserts nbytes <= 32).
 *
 * @param dst     destination buffer, at least nbytes bytes
 * @param src     source buffer, at least nbytes bytes
 * @param nbytes  number of bytes to copy
 *
 * NOTE(review): the selecting condition was missing from the extracted
 * text.  "nbytes >= 16" is reconstructed as the threshold because it
 * satisfies both helpers' assertions - confirm against the upstream header.
 */
static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
	if (nbytes >= 16)
		memcpy_gte16_sse_fixedlen(dst, src, nbytes);
	else
		memcpy_lte32_sse_fixedlen(dst, src, nbytes);
}
/*
 * Zero nbytes bytes at dst, dispatching on length between the two
 * "fixedlen" helpers: memclr_gte16_sse_fixedlen (which asserts
 * nbytes >= 16) and memclr_lte32_sse_fixedlen (which asserts nbytes <= 32).
 *
 * @param dst     buffer to clear, at least nbytes bytes
 * @param nbytes  number of bytes to clear
 *
 * NOTE(review): the selecting condition was missing from the extracted
 * text.  "nbytes >= 16" is reconstructed as the threshold because it
 * satisfies both helpers' assertions - confirm against the upstream header.
 */
static inline void memclr_sse_fixedlen(void *dst, size_t nbytes)
{
	if (nbytes >= 16)
		memclr_gte16_sse_fixedlen(dst, nbytes);
	else
		memclr_lte32_sse_fixedlen(dst, nbytes);
}
/*
 * Copy nbytes bytes from src to dst, dispatching on length between the
 * two "varlen" helpers: memcpy_gte16_sse_varlen (which asserts
 * nbytes >= 16) and memcpy_lte32_sse_varlen (which asserts nbytes <= 32).
 *
 * @param dst     destination buffer, at least nbytes bytes
 * @param src     source buffer, at least nbytes bytes
 * @param nbytes  number of bytes to copy
 *
 * NOTE(review): the selecting condition was missing from the extracted
 * text.  "nbytes >= 16" is reconstructed as the threshold because it
 * satisfies both helpers' assertions - confirm against the upstream header.
 */
static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes)
{
	if (nbytes >= 16)
		memcpy_gte16_sse_varlen(dst, src, nbytes);
	else
		memcpy_lte32_sse_varlen(dst, src, nbytes);
}
/*
 * Zero nbytes bytes at dst, dispatching on length between the two
 * "varlen" helpers: memclr_gte16_sse_varlen (which asserts nbytes >= 16)
 * and memclr_lte32_sse_varlen (which asserts nbytes <= 32).
 *
 * @param dst     buffer to clear, at least nbytes bytes
 * @param nbytes  number of bytes to clear
 *
 * NOTE(review): the selecting condition was missing from the extracted
 * text.  "nbytes >= 16" is reconstructed as the threshold because it
 * satisfies both helpers' assertions - confirm against the upstream header.
 */
static inline void memclr_sse_varlen(void *dst, size_t nbytes)
{
	if (nbytes >= 16)
		memclr_gte16_sse_varlen(dst, nbytes);
	else
		memclr_lte32_sse_varlen(dst, nbytes);
}
/*
 * Fallback mapping of the generic copy/clear entry points onto the C
 * library: copies go to memcpy, clears go to memset with a zero fill.
 *
 * NOTE(review): these names are also defined as SSE aliases earlier in the
 * file, so this group must sit in the alternative branch of a preprocessor
 * conditional.  The directives that delimit that branch are not present
 * in this view - confirm they exist in the real header.
 */
#define memcpy_varlen   memcpy
#define memcpy_fixedlen memcpy

#define memclr_varlen(dst, n)   memset((dst), 0, (n))
#define memclr_fixedlen(dst, n) memset((dst), 0, (n))