git.proxmox.com Git - ceph.git/blob - ceph/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_t_sse2.c
import ceph 16.2.7
[ceph.git] / ceph / src / pmdk / src / libpmem2 / x86_64 / memcpy / memcpy_t_sse2.c
1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2017-2020, Intel Corporation */
3
4 #include <immintrin.h>
5 #include <stddef.h>
6 #include <stdint.h>
7
8 #include "pmem2_arch.h"
9 #include "flush.h"
10 #include "memcpy_memset.h"
11 #include "memcpy_sse2.h"
12 #include "out.h"
13
14 static force_inline __m128i
15 mm_loadu_si128(const char *src, unsigned idx)
16 {
17 return _mm_loadu_si128((const __m128i *)src + idx);
18 }
19
20 static force_inline void
21 mm_store_si128(char *dest, unsigned idx, __m128i src)
22 {
23 _mm_store_si128((__m128i *)dest + idx, src);
24 }
25
26 static force_inline void
27 memmove_mov4x64b(char *dest, const char *src, flush64b_fn flush64b)
28 {
29 __m128i xmm0 = mm_loadu_si128(src, 0);
30 __m128i xmm1 = mm_loadu_si128(src, 1);
31 __m128i xmm2 = mm_loadu_si128(src, 2);
32 __m128i xmm3 = mm_loadu_si128(src, 3);
33 __m128i xmm4 = mm_loadu_si128(src, 4);
34 __m128i xmm5 = mm_loadu_si128(src, 5);
35 __m128i xmm6 = mm_loadu_si128(src, 6);
36 __m128i xmm7 = mm_loadu_si128(src, 7);
37 __m128i xmm8 = mm_loadu_si128(src, 8);
38 __m128i xmm9 = mm_loadu_si128(src, 9);
39 __m128i xmm10 = mm_loadu_si128(src, 10);
40 __m128i xmm11 = mm_loadu_si128(src, 11);
41 __m128i xmm12 = mm_loadu_si128(src, 12);
42 __m128i xmm13 = mm_loadu_si128(src, 13);
43 __m128i xmm14 = mm_loadu_si128(src, 14);
44 __m128i xmm15 = mm_loadu_si128(src, 15);
45
46 mm_store_si128(dest, 0, xmm0);
47 mm_store_si128(dest, 1, xmm1);
48 mm_store_si128(dest, 2, xmm2);
49 mm_store_si128(dest, 3, xmm3);
50 mm_store_si128(dest, 4, xmm4);
51 mm_store_si128(dest, 5, xmm5);
52 mm_store_si128(dest, 6, xmm6);
53 mm_store_si128(dest, 7, xmm7);
54 mm_store_si128(dest, 8, xmm8);
55 mm_store_si128(dest, 9, xmm9);
56 mm_store_si128(dest, 10, xmm10);
57 mm_store_si128(dest, 11, xmm11);
58 mm_store_si128(dest, 12, xmm12);
59 mm_store_si128(dest, 13, xmm13);
60 mm_store_si128(dest, 14, xmm14);
61 mm_store_si128(dest, 15, xmm15);
62
63 flush64b(dest + 0 * 64);
64 flush64b(dest + 1 * 64);
65 flush64b(dest + 2 * 64);
66 flush64b(dest + 3 * 64);
67 }
68
69 static force_inline void
70 memmove_mov2x64b(char *dest, const char *src, flush64b_fn flush64b)
71 {
72 __m128i xmm0 = mm_loadu_si128(src, 0);
73 __m128i xmm1 = mm_loadu_si128(src, 1);
74 __m128i xmm2 = mm_loadu_si128(src, 2);
75 __m128i xmm3 = mm_loadu_si128(src, 3);
76 __m128i xmm4 = mm_loadu_si128(src, 4);
77 __m128i xmm5 = mm_loadu_si128(src, 5);
78 __m128i xmm6 = mm_loadu_si128(src, 6);
79 __m128i xmm7 = mm_loadu_si128(src, 7);
80
81 mm_store_si128(dest, 0, xmm0);
82 mm_store_si128(dest, 1, xmm1);
83 mm_store_si128(dest, 2, xmm2);
84 mm_store_si128(dest, 3, xmm3);
85 mm_store_si128(dest, 4, xmm4);
86 mm_store_si128(dest, 5, xmm5);
87 mm_store_si128(dest, 6, xmm6);
88 mm_store_si128(dest, 7, xmm7);
89
90 flush64b(dest + 0 * 64);
91 flush64b(dest + 1 * 64);
92 }
93
94 static force_inline void
95 memmove_mov1x64b(char *dest, const char *src, flush64b_fn flush64b)
96 {
97 __m128i xmm0 = mm_loadu_si128(src, 0);
98 __m128i xmm1 = mm_loadu_si128(src, 1);
99 __m128i xmm2 = mm_loadu_si128(src, 2);
100 __m128i xmm3 = mm_loadu_si128(src, 3);
101
102 mm_store_si128(dest, 0, xmm0);
103 mm_store_si128(dest, 1, xmm1);
104 mm_store_si128(dest, 2, xmm2);
105 mm_store_si128(dest, 3, xmm3);
106
107 flush64b(dest + 0 * 64);
108 }
109
110 static force_inline void
111 memmove_mov_sse_fw(char *dest, const char *src, size_t len,
112 flush_fn flush, flush64b_fn flush64b)
113 {
114 size_t cnt = (uint64_t)dest & 63;
115 if (cnt > 0) {
116 cnt = 64 - cnt;
117
118 if (cnt > len)
119 cnt = len;
120
121 memmove_small_sse2(dest, src, cnt, flush);
122
123 dest += cnt;
124 src += cnt;
125 len -= cnt;
126 }
127
128 while (len >= 4 * 64) {
129 memmove_mov4x64b(dest, src, flush64b);
130 dest += 4 * 64;
131 src += 4 * 64;
132 len -= 4 * 64;
133 }
134
135 if (len >= 2 * 64) {
136 memmove_mov2x64b(dest, src, flush64b);
137 dest += 2 * 64;
138 src += 2 * 64;
139 len -= 2 * 64;
140 }
141
142 if (len >= 1 * 64) {
143 memmove_mov1x64b(dest, src, flush64b);
144
145 dest += 1 * 64;
146 src += 1 * 64;
147 len -= 1 * 64;
148 }
149
150 if (len)
151 memmove_small_sse2(dest, src, len, flush);
152 }
153
154 static force_inline void
155 memmove_mov_sse_bw(char *dest, const char *src, size_t len,
156 flush_fn flush, flush64b_fn flush64b)
157 {
158 dest += len;
159 src += len;
160
161 size_t cnt = (uint64_t)dest & 63;
162 if (cnt > 0) {
163 if (cnt > len)
164 cnt = len;
165
166 dest -= cnt;
167 src -= cnt;
168 len -= cnt;
169 memmove_small_sse2(dest, src, cnt, flush);
170 }
171
172 while (len >= 4 * 64) {
173 dest -= 4 * 64;
174 src -= 4 * 64;
175 len -= 4 * 64;
176 memmove_mov4x64b(dest, src, flush64b);
177 }
178
179 if (len >= 2 * 64) {
180 dest -= 2 * 64;
181 src -= 2 * 64;
182 len -= 2 * 64;
183 memmove_mov2x64b(dest, src, flush64b);
184 }
185
186 if (len >= 1 * 64) {
187 dest -= 1 * 64;
188 src -= 1 * 64;
189 len -= 1 * 64;
190 memmove_mov1x64b(dest, src, flush64b);
191 }
192
193 if (len)
194 memmove_small_sse2(dest - len, src - len, len, flush);
195 }
196
197 static force_inline void
198 memmove_mov_sse2(char *dest, const char *src, size_t len,
199 flush_fn flush, flush64b_fn flush64b)
200 {
201 if ((uintptr_t)dest - (uintptr_t)src >= len)
202 memmove_mov_sse_fw(dest, src, len, flush, flush64b);
203 else
204 memmove_mov_sse_bw(dest, src, len, flush, flush64b);
205 }
206
/*
 * memmove_mov_sse2_noflush -- memmove_mov_sse2 bound to the noflush and
 * noflush64b callbacks
 *
 * Emits a LOG(15, ...) trace of the arguments, then moves len bytes from
 * src to dest.
 */
void
memmove_mov_sse2_noflush(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_mov_sse2(dest, src, len, noflush, noflush64b);
}
214
/*
 * memmove_mov_sse2_empty -- memmove_mov_sse2 bound to the flush_empty_nolog
 * and flush64b_empty callbacks
 *
 * Emits a LOG(15, ...) trace of the arguments, then moves len bytes from
 * src to dest.
 */
void
memmove_mov_sse2_empty(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_mov_sse2(dest, src, len, flush_empty_nolog, flush64b_empty);
}
222
/*
 * memmove_mov_sse2_clflush -- memmove_mov_sse2 bound to the
 * flush_clflush_nolog and pmem_clflush callbacks
 *
 * Emits a LOG(15, ...) trace of the arguments, then moves len bytes from
 * src to dest.
 */
void
memmove_mov_sse2_clflush(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_mov_sse2(dest, src, len, flush_clflush_nolog, pmem_clflush);
}
230
/*
 * memmove_mov_sse2_clflushopt -- memmove_mov_sse2 bound to the
 * flush_clflushopt_nolog and pmem_clflushopt callbacks
 *
 * Emits a LOG(15, ...) trace of the arguments, then moves len bytes from
 * src to dest.
 */
void
memmove_mov_sse2_clflushopt(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_mov_sse2(dest, src, len, flush_clflushopt_nolog,
			pmem_clflushopt);
}
239
/*
 * memmove_mov_sse2_clwb -- memmove_mov_sse2 bound to the flush_clwb_nolog
 * and pmem_clwb callbacks
 *
 * Emits a LOG(15, ...) trace of the arguments, then moves len bytes from
 * src to dest.
 */
void
memmove_mov_sse2_clwb(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_mov_sse2(dest, src, len, flush_clwb_nolog, pmem_clwb);
}