]>
Commit | Line | Data |
---|---|---|
a4b75251 TL |
1 | /* SPDX-License-Identifier: BSD-3-Clause */ |
2 | /* Copyright 2017-2020, Intel Corporation */ | |
3 | ||
4 | #ifndef PMEM2_MEMCPY_SSE2_H | |
5 | #define PMEM2_MEMCPY_SSE2_H | |
6 | ||
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <xmmintrin.h>

#include "out.h"
12 | ||
13 | static force_inline void | |
14 | memmove_small_sse2_noflush(char *dest, const char *src, size_t len) | |
15 | { | |
16 | ASSERT(len <= 64); | |
17 | ||
18 | if (len <= 8) | |
19 | goto le8; | |
20 | if (len <= 32) | |
21 | goto le32; | |
22 | ||
23 | if (len > 48) { | |
24 | /* 49..64 */ | |
25 | __m128i xmm0 = _mm_loadu_si128((__m128i *)src); | |
26 | __m128i xmm1 = _mm_loadu_si128((__m128i *)(src + 16)); | |
27 | __m128i xmm2 = _mm_loadu_si128((__m128i *)(src + 32)); | |
28 | __m128i xmm3 = _mm_loadu_si128((__m128i *)(src + len - 16)); | |
29 | ||
30 | _mm_storeu_si128((__m128i *)dest, xmm0); | |
31 | _mm_storeu_si128((__m128i *)(dest + 16), xmm1); | |
32 | _mm_storeu_si128((__m128i *)(dest + 32), xmm2); | |
33 | _mm_storeu_si128((__m128i *)(dest + len - 16), xmm3); | |
34 | return; | |
35 | } | |
36 | ||
37 | /* 33..48 */ | |
38 | __m128i xmm0 = _mm_loadu_si128((__m128i *)src); | |
39 | __m128i xmm1 = _mm_loadu_si128((__m128i *)(src + 16)); | |
40 | __m128i xmm2 = _mm_loadu_si128((__m128i *)(src + len - 16)); | |
41 | ||
42 | _mm_storeu_si128((__m128i *)dest, xmm0); | |
43 | _mm_storeu_si128((__m128i *)(dest + 16), xmm1); | |
44 | _mm_storeu_si128((__m128i *)(dest + len - 16), xmm2); | |
45 | return; | |
46 | ||
47 | le32: | |
48 | if (len > 16) { | |
49 | /* 17..32 */ | |
50 | __m128i xmm0 = _mm_loadu_si128((__m128i *)src); | |
51 | __m128i xmm1 = _mm_loadu_si128((__m128i *)(src + len - 16)); | |
52 | ||
53 | _mm_storeu_si128((__m128i *)dest, xmm0); | |
54 | _mm_storeu_si128((__m128i *)(dest + len - 16), xmm1); | |
55 | return; | |
56 | } | |
57 | ||
58 | /* 9..16 */ | |
59 | uint64_t d80 = *(ua_uint64_t *)src; | |
60 | uint64_t d81 = *(ua_uint64_t *)(src + len - 8); | |
61 | ||
62 | *(ua_uint64_t *)dest = d80; | |
63 | *(ua_uint64_t *)(dest + len - 8) = d81; | |
64 | return; | |
65 | ||
66 | le8: | |
67 | if (len <= 2) | |
68 | goto le2; | |
69 | ||
70 | if (len > 4) { | |
71 | /* 5..8 */ | |
72 | uint32_t d40 = *(ua_uint32_t *)src; | |
73 | uint32_t d41 = *(ua_uint32_t *)(src + len - 4); | |
74 | ||
75 | *(ua_uint32_t *)dest = d40; | |
76 | *(ua_uint32_t *)(dest + len - 4) = d41; | |
77 | return; | |
78 | } | |
79 | ||
80 | /* 3..4 */ | |
81 | uint16_t d20 = *(ua_uint16_t *)src; | |
82 | uint16_t d21 = *(ua_uint16_t *)(src + len - 2); | |
83 | ||
84 | *(ua_uint16_t *)dest = d20; | |
85 | *(ua_uint16_t *)(dest + len - 2) = d21; | |
86 | return; | |
87 | ||
88 | le2: | |
89 | if (len == 2) { | |
90 | *(ua_uint16_t *)dest = *(ua_uint16_t *)src; | |
91 | return; | |
92 | } | |
93 | ||
94 | *(uint8_t *)dest = *(uint8_t *)src; | |
95 | } | |
96 | ||
97 | static force_inline void | |
98 | memmove_small_sse2(char *dest, const char *src, size_t len, flush_fn flush) | |
99 | { | |
100 | /* | |
101 | * pmemcheck complains about "overwritten stores before they were made | |
102 | * persistent" for overlapping stores (last instruction in each code | |
103 | * path) in the optimized version. | |
104 | * libc's memcpy also does that, so we can't use it here. | |
105 | */ | |
106 | if (On_pmemcheck) { | |
107 | memmove_nodrain_generic(dest, src, len, PMEM2_F_MEM_NOFLUSH, | |
108 | NULL); | |
109 | } else { | |
110 | memmove_small_sse2_noflush(dest, src, len); | |
111 | } | |
112 | ||
113 | flush(dest, len); | |
114 | } | |
115 | ||
116 | #endif |