]>
Commit | Line | Data |
---|---|---|
a4b75251 TL |
1 | // SPDX-License-Identifier: BSD-3-Clause |
2 | /* Copyright 2017-2020, Intel Corporation */ | |
3 | ||
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "pmem2_arch.h"
#include "flush.h"
#include "memcpy_memset.h"
#include "memcpy_sse2.h"
#include "valgrind_internal.h"
13 | ||
14 | static force_inline __m128i | |
15 | mm_loadu_si128(const char *src, unsigned idx) | |
16 | { | |
17 | return _mm_loadu_si128((const __m128i *)src + idx); | |
18 | } | |
19 | ||
20 | static force_inline void | |
21 | mm_stream_si128(char *dest, unsigned idx, __m128i src) | |
22 | { | |
23 | _mm_stream_si128((__m128i *)dest + idx, src); | |
24 | barrier(); | |
25 | } | |
26 | ||
27 | static force_inline void | |
28 | memmove_movnt4x64b(char *dest, const char *src) | |
29 | { | |
30 | __m128i xmm0 = mm_loadu_si128(src, 0); | |
31 | __m128i xmm1 = mm_loadu_si128(src, 1); | |
32 | __m128i xmm2 = mm_loadu_si128(src, 2); | |
33 | __m128i xmm3 = mm_loadu_si128(src, 3); | |
34 | __m128i xmm4 = mm_loadu_si128(src, 4); | |
35 | __m128i xmm5 = mm_loadu_si128(src, 5); | |
36 | __m128i xmm6 = mm_loadu_si128(src, 6); | |
37 | __m128i xmm7 = mm_loadu_si128(src, 7); | |
38 | __m128i xmm8 = mm_loadu_si128(src, 8); | |
39 | __m128i xmm9 = mm_loadu_si128(src, 9); | |
40 | __m128i xmm10 = mm_loadu_si128(src, 10); | |
41 | __m128i xmm11 = mm_loadu_si128(src, 11); | |
42 | __m128i xmm12 = mm_loadu_si128(src, 12); | |
43 | __m128i xmm13 = mm_loadu_si128(src, 13); | |
44 | __m128i xmm14 = mm_loadu_si128(src, 14); | |
45 | __m128i xmm15 = mm_loadu_si128(src, 15); | |
46 | ||
47 | mm_stream_si128(dest, 0, xmm0); | |
48 | mm_stream_si128(dest, 1, xmm1); | |
49 | mm_stream_si128(dest, 2, xmm2); | |
50 | mm_stream_si128(dest, 3, xmm3); | |
51 | mm_stream_si128(dest, 4, xmm4); | |
52 | mm_stream_si128(dest, 5, xmm5); | |
53 | mm_stream_si128(dest, 6, xmm6); | |
54 | mm_stream_si128(dest, 7, xmm7); | |
55 | mm_stream_si128(dest, 8, xmm8); | |
56 | mm_stream_si128(dest, 9, xmm9); | |
57 | mm_stream_si128(dest, 10, xmm10); | |
58 | mm_stream_si128(dest, 11, xmm11); | |
59 | mm_stream_si128(dest, 12, xmm12); | |
60 | mm_stream_si128(dest, 13, xmm13); | |
61 | mm_stream_si128(dest, 14, xmm14); | |
62 | mm_stream_si128(dest, 15, xmm15); | |
63 | } | |
64 | ||
65 | static force_inline void | |
66 | memmove_movnt2x64b(char *dest, const char *src) | |
67 | { | |
68 | __m128i xmm0 = mm_loadu_si128(src, 0); | |
69 | __m128i xmm1 = mm_loadu_si128(src, 1); | |
70 | __m128i xmm2 = mm_loadu_si128(src, 2); | |
71 | __m128i xmm3 = mm_loadu_si128(src, 3); | |
72 | __m128i xmm4 = mm_loadu_si128(src, 4); | |
73 | __m128i xmm5 = mm_loadu_si128(src, 5); | |
74 | __m128i xmm6 = mm_loadu_si128(src, 6); | |
75 | __m128i xmm7 = mm_loadu_si128(src, 7); | |
76 | ||
77 | mm_stream_si128(dest, 0, xmm0); | |
78 | mm_stream_si128(dest, 1, xmm1); | |
79 | mm_stream_si128(dest, 2, xmm2); | |
80 | mm_stream_si128(dest, 3, xmm3); | |
81 | mm_stream_si128(dest, 4, xmm4); | |
82 | mm_stream_si128(dest, 5, xmm5); | |
83 | mm_stream_si128(dest, 6, xmm6); | |
84 | mm_stream_si128(dest, 7, xmm7); | |
85 | } | |
86 | ||
87 | static force_inline void | |
88 | memmove_movnt1x64b(char *dest, const char *src) | |
89 | { | |
90 | __m128i xmm0 = mm_loadu_si128(src, 0); | |
91 | __m128i xmm1 = mm_loadu_si128(src, 1); | |
92 | __m128i xmm2 = mm_loadu_si128(src, 2); | |
93 | __m128i xmm3 = mm_loadu_si128(src, 3); | |
94 | ||
95 | mm_stream_si128(dest, 0, xmm0); | |
96 | mm_stream_si128(dest, 1, xmm1); | |
97 | mm_stream_si128(dest, 2, xmm2); | |
98 | mm_stream_si128(dest, 3, xmm3); | |
99 | } | |
100 | ||
101 | static force_inline void | |
102 | memmove_movnt1x32b(char *dest, const char *src) | |
103 | { | |
104 | __m128i xmm0 = mm_loadu_si128(src, 0); | |
105 | __m128i xmm1 = mm_loadu_si128(src, 1); | |
106 | ||
107 | mm_stream_si128(dest, 0, xmm0); | |
108 | mm_stream_si128(dest, 1, xmm1); | |
109 | } | |
110 | ||
111 | static force_inline void | |
112 | memmove_movnt1x16b(char *dest, const char *src) | |
113 | { | |
114 | __m128i xmm0 = mm_loadu_si128(src, 0); | |
115 | ||
116 | mm_stream_si128(dest, 0, xmm0); | |
117 | } | |
118 | ||
119 | static force_inline void | |
120 | memmove_movnt1x8b(char *dest, const char *src) | |
121 | { | |
122 | _mm_stream_si64((long long *)dest, *(long long *)src); | |
123 | } | |
124 | ||
125 | static force_inline void | |
126 | memmove_movnt1x4b(char *dest, const char *src) | |
127 | { | |
128 | _mm_stream_si32((int *)dest, *(int *)src); | |
129 | } | |
130 | ||
/*
 * memmove_movnt_sse_fw -- forward (low-to-high address) memmove using
 * non-temporal SSE2 stores for the bulk of the copy.
 *
 * Strategy: copy an unaligned head with regular (cached) stores +
 * explicit flush, then stream whole cache lines from a 64-byte-aligned
 * dest, then handle the tail. perf_barrier is invoked between
 * PERF_BARRIER_SIZE chunks; presumably it drains write-combining
 * buffers to smooth out bandwidth -- defined by the caller.
 */
static force_inline void
memmove_movnt_sse_fw(char *dest, const char *src, size_t len, flush_fn flush,
		perf_barrier_fn perf_barrier)
{
	/* bytes needed to bring dest up to a 64-byte (cache line) boundary */
	size_t cnt = (uint64_t)dest & 63;
	if (cnt > 0) {
		cnt = 64 - cnt;

		if (cnt > len)
			cnt = len;

		/* unaligned head: regular stores, flushed explicitly */
		memmove_small_sse2(dest, src, cnt, flush);

		dest += cnt;
		src += cnt;
		len -= cnt;
	}

	const char *srcend = src + len;
	prefetch_ini_fw(src, len);

	/* main loop: three 4x64B streams per iteration, then a barrier */
	while (len >= PERF_BARRIER_SIZE) {
		prefetch_next_fw(src, srcend);

		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;

		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;

		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;

		/* loop body consumes exactly PERF_BARRIER_SIZE bytes */
		COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (4 + 4 + 4) * 64);

		/* no barrier after the very last chunk */
		if (len)
			perf_barrier();
	}

	/* remaining full 256-byte groups */
	while (len >= 4 * 64) {
		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;
	}

	if (len >= 2 * 64) {
		memmove_movnt2x64b(dest, src);
		dest += 2 * 64;
		src += 2 * 64;
		len -= 2 * 64;
	}

	if (len >= 1 * 64) {
		memmove_movnt1x64b(dest, src);

		dest += 1 * 64;
		src += 1 * 64;
		len -= 1 * 64;
	}

	if (len == 0)
		return;

	/* There's no point in using more than 1 nt store for 1 cache line. */
	if (util_is_pow2(len)) {
		if (len == 32)
			memmove_movnt1x32b(dest, src);
		else if (len == 16)
			memmove_movnt1x16b(dest, src);
		else if (len == 8)
			memmove_movnt1x8b(dest, src);
		else if (len == 4)
			memmove_movnt1x4b(dest, src);
		else
			goto nonnt;

		return;
	}

nonnt:
	/* odd-sized tail: regular stores + explicit flush */
	memmove_small_sse2(dest, src, len, flush);
}
220 | ||
/*
 * memmove_movnt_sse_bw -- backward (high-to-low address) memmove using
 * non-temporal SSE2 stores; used when the ranges overlap such that a
 * forward copy would clobber unread source bytes.
 *
 * Mirror of memmove_movnt_sse_fw: dest/src start at the END of the
 * ranges and every step decrements before copying.
 */
static force_inline void
memmove_movnt_sse_bw(char *dest, const char *src, size_t len, flush_fn flush,
		perf_barrier_fn perf_barrier)
{
	dest += len;
	src += len;

	/* bytes above the last 64-byte boundary: the unaligned tail */
	size_t cnt = (uint64_t)dest & 63;
	if (cnt > 0) {
		if (cnt > len)
			cnt = len;

		dest -= cnt;
		src -= cnt;
		len -= cnt;

		/* tail is copied with regular stores + explicit flush */
		memmove_small_sse2(dest, src, cnt, flush);
	}

	const char *srcbegin = src - len;
	prefetch_ini_bw(src, len);

	/* main loop: three 4x64B streams per iteration, then a barrier */
	while (len >= PERF_BARRIER_SIZE) {
		prefetch_next_bw(src, srcbegin);

		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);

		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);

		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);

		/* loop body consumes exactly PERF_BARRIER_SIZE bytes */
		COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (4 + 4 + 4) * 64);

		/* no barrier after the very last chunk */
		if (len)
			perf_barrier();
	}

	/* remaining full 256-byte groups */
	while (len >= 4 * 64) {
		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);
	}

	if (len >= 2 * 64) {
		dest -= 2 * 64;
		src -= 2 * 64;
		len -= 2 * 64;
		memmove_movnt2x64b(dest, src);
	}

	if (len >= 1 * 64) {
		dest -= 1 * 64;
		src -= 1 * 64;
		len -= 1 * 64;
		memmove_movnt1x64b(dest, src);
	}

	if (len == 0)
		return;

	/* There's no point in using more than 1 nt store for 1 cache line. */
	if (util_is_pow2(len)) {
		if (len == 32) {
			dest -= 32;
			src -= 32;
			memmove_movnt1x32b(dest, src);
		} else if (len == 16) {
			dest -= 16;
			src -= 16;
			memmove_movnt1x16b(dest, src);
		} else if (len == 8) {
			dest -= 8;
			src -= 8;
			memmove_movnt1x8b(dest, src);
		} else if (len == 4) {
			dest -= 4;
			src -= 4;
			memmove_movnt1x4b(dest, src);
		} else {
			goto nonnt;
		}

		return;
	}

nonnt:
	/* odd-sized head: regular stores + explicit flush */
	dest -= len;
	src -= len;
	memmove_small_sse2(dest, src, len, flush);
}
321 | ||
322 | static force_inline void | |
323 | memmove_movnt_sse2(char *dest, const char *src, size_t len, flush_fn flush, | |
324 | barrier_fn barrier, perf_barrier_fn perf_barrier) | |
325 | { | |
326 | if ((uintptr_t)dest - (uintptr_t)src >= len) | |
327 | memmove_movnt_sse_fw(dest, src, len, flush, perf_barrier); | |
328 | else | |
329 | memmove_movnt_sse_bw(dest, src, len, flush, perf_barrier); | |
330 | ||
331 | barrier(); | |
332 | ||
333 | VALGRIND_DO_FLUSH(dest, len); | |
334 | } | |
335 | ||
336 | /* variants without perf_barrier */ | |
337 | ||
/*
 * memmove_movnt_sse2_noflush_nobarrier -- public NT-memmove variant:
 * noflush for head/tail, barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_sse2_noflush_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, noflush, barrier_after_ntstores,
		no_barrier);
}
346 | ||
/*
 * memmove_movnt_sse2_empty_nobarrier -- public NT-memmove variant:
 * flush_empty_nolog for head/tail, barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_sse2_empty_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_empty_nolog,
		barrier_after_ntstores, no_barrier);
}
355 | ||
/*
 * memmove_movnt_sse2_clflush_nobarrier -- public NT-memmove variant:
 * clflush for head/tail, barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_sse2_clflush_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_clflush_nolog,
		barrier_after_ntstores, no_barrier);
}
364 | ||
/*
 * memmove_movnt_sse2_clflushopt_nobarrier -- public NT-memmove variant:
 * clflushopt for head/tail, no_barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_sse2_clflushopt_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_clflushopt_nolog,
		no_barrier_after_ntstores, no_barrier);
}
373 | ||
/*
 * memmove_movnt_sse2_clwb_nobarrier -- public NT-memmove variant:
 * clwb for head/tail, no_barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_sse2_clwb_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_clwb_nolog,
		no_barrier_after_ntstores, no_barrier);
}
382 | ||
383 | /* variants with perf_barrier */ | |
384 | ||
/*
 * memmove_movnt_sse2_noflush_wcbarrier -- public NT-memmove variant:
 * noflush for head/tail, barrier_after_ntstores, wc_barrier between chunks.
 */
void
memmove_movnt_sse2_noflush_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, noflush, barrier_after_ntstores,
		wc_barrier);
}
393 | ||
/*
 * memmove_movnt_sse2_empty_wcbarrier -- public NT-memmove variant:
 * flush_empty_nolog, barrier_after_ntstores, wc_barrier between chunks.
 */
void
memmove_movnt_sse2_empty_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_empty_nolog,
		barrier_after_ntstores, wc_barrier);
}
402 | ||
/*
 * memmove_movnt_sse2_clflush_wcbarrier -- public NT-memmove variant:
 * clflush for head/tail, barrier_after_ntstores, wc_barrier between chunks.
 */
void
memmove_movnt_sse2_clflush_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_clflush_nolog,
		barrier_after_ntstores, wc_barrier);
}
411 | ||
/*
 * memmove_movnt_sse2_clflushopt_wcbarrier -- public NT-memmove variant:
 * clflushopt, no_barrier_after_ntstores, wc_barrier between chunks.
 */
void
memmove_movnt_sse2_clflushopt_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_clflushopt_nolog,
		no_barrier_after_ntstores, wc_barrier);
}
420 | ||
/*
 * memmove_movnt_sse2_clwb_wcbarrier -- public NT-memmove variant:
 * clwb for head/tail, no_barrier_after_ntstores, wc_barrier between chunks.
 */
void
memmove_movnt_sse2_clwb_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_sse2(dest, src, len, flush_clwb_nolog,
		no_barrier_after_ntstores, wc_barrier);
}