Source: ceph.git — ceph/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx.c (imported with ceph 16.2.7)
1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2017-2020, Intel Corporation */
3
4 #include <immintrin.h>
5 #include <stddef.h>
6 #include <stdint.h>
7
8 #include "pmem2_arch.h"
9 #include "avx.h"
10 #include "flush.h"
11 #include "memcpy_memset.h"
12 #include "memcpy_avx.h"
13 #include "valgrind_internal.h"
14
15 static force_inline __m256i
16 mm256_loadu_si256(const char *src, unsigned idx)
17 {
18 return _mm256_loadu_si256((const __m256i *)src + idx);
19 }
20
21 static force_inline void
22 mm256_stream_si256(char *dest, unsigned idx, __m256i src)
23 {
24 _mm256_stream_si256((__m256i *)dest + idx, src);
25 barrier();
26 }
27
28 static force_inline void
29 memmove_movnt8x64b(char *dest, const char *src)
30 {
31 __m256i ymm0 = mm256_loadu_si256(src, 0);
32 __m256i ymm1 = mm256_loadu_si256(src, 1);
33 __m256i ymm2 = mm256_loadu_si256(src, 2);
34 __m256i ymm3 = mm256_loadu_si256(src, 3);
35 __m256i ymm4 = mm256_loadu_si256(src, 4);
36 __m256i ymm5 = mm256_loadu_si256(src, 5);
37 __m256i ymm6 = mm256_loadu_si256(src, 6);
38 __m256i ymm7 = mm256_loadu_si256(src, 7);
39 __m256i ymm8 = mm256_loadu_si256(src, 8);
40 __m256i ymm9 = mm256_loadu_si256(src, 9);
41 __m256i ymm10 = mm256_loadu_si256(src, 10);
42 __m256i ymm11 = mm256_loadu_si256(src, 11);
43 __m256i ymm12 = mm256_loadu_si256(src, 12);
44 __m256i ymm13 = mm256_loadu_si256(src, 13);
45 __m256i ymm14 = mm256_loadu_si256(src, 14);
46 __m256i ymm15 = mm256_loadu_si256(src, 15);
47
48 mm256_stream_si256(dest, 0, ymm0);
49 mm256_stream_si256(dest, 1, ymm1);
50 mm256_stream_si256(dest, 2, ymm2);
51 mm256_stream_si256(dest, 3, ymm3);
52 mm256_stream_si256(dest, 4, ymm4);
53 mm256_stream_si256(dest, 5, ymm5);
54 mm256_stream_si256(dest, 6, ymm6);
55 mm256_stream_si256(dest, 7, ymm7);
56 mm256_stream_si256(dest, 8, ymm8);
57 mm256_stream_si256(dest, 9, ymm9);
58 mm256_stream_si256(dest, 10, ymm10);
59 mm256_stream_si256(dest, 11, ymm11);
60 mm256_stream_si256(dest, 12, ymm12);
61 mm256_stream_si256(dest, 13, ymm13);
62 mm256_stream_si256(dest, 14, ymm14);
63 mm256_stream_si256(dest, 15, ymm15);
64 }
65
66 static force_inline void
67 memmove_movnt4x64b(char *dest, const char *src)
68 {
69 __m256i ymm0 = mm256_loadu_si256(src, 0);
70 __m256i ymm1 = mm256_loadu_si256(src, 1);
71 __m256i ymm2 = mm256_loadu_si256(src, 2);
72 __m256i ymm3 = mm256_loadu_si256(src, 3);
73 __m256i ymm4 = mm256_loadu_si256(src, 4);
74 __m256i ymm5 = mm256_loadu_si256(src, 5);
75 __m256i ymm6 = mm256_loadu_si256(src, 6);
76 __m256i ymm7 = mm256_loadu_si256(src, 7);
77
78 mm256_stream_si256(dest, 0, ymm0);
79 mm256_stream_si256(dest, 1, ymm1);
80 mm256_stream_si256(dest, 2, ymm2);
81 mm256_stream_si256(dest, 3, ymm3);
82 mm256_stream_si256(dest, 4, ymm4);
83 mm256_stream_si256(dest, 5, ymm5);
84 mm256_stream_si256(dest, 6, ymm6);
85 mm256_stream_si256(dest, 7, ymm7);
86 }
87
88 static force_inline void
89 memmove_movnt2x64b(char *dest, const char *src)
90 {
91 __m256i ymm0 = mm256_loadu_si256(src, 0);
92 __m256i ymm1 = mm256_loadu_si256(src, 1);
93 __m256i ymm2 = mm256_loadu_si256(src, 2);
94 __m256i ymm3 = mm256_loadu_si256(src, 3);
95
96 mm256_stream_si256(dest, 0, ymm0);
97 mm256_stream_si256(dest, 1, ymm1);
98 mm256_stream_si256(dest, 2, ymm2);
99 mm256_stream_si256(dest, 3, ymm3);
100 }
101
102 static force_inline void
103 memmove_movnt1x64b(char *dest, const char *src)
104 {
105 __m256i ymm0 = mm256_loadu_si256(src, 0);
106 __m256i ymm1 = mm256_loadu_si256(src, 1);
107
108 mm256_stream_si256(dest, 0, ymm0);
109 mm256_stream_si256(dest, 1, ymm1);
110 }
111
112 static force_inline void
113 memmove_movnt1x32b(char *dest, const char *src)
114 {
115 __m256i ymm0 = _mm256_loadu_si256((__m256i *)src);
116
117 mm256_stream_si256(dest, 0, ymm0);
118 }
119
120 static force_inline void
121 memmove_movnt1x16b(char *dest, const char *src)
122 {
123 __m128i xmm0 = _mm_loadu_si128((__m128i *)src);
124
125 _mm_stream_si128((__m128i *)dest, xmm0);
126 }
127
128 static force_inline void
129 memmove_movnt1x8b(char *dest, const char *src)
130 {
131 _mm_stream_si64((long long *)dest, *(long long *)src);
132 }
133
134 static force_inline void
135 memmove_movnt1x4b(char *dest, const char *src)
136 {
137 _mm_stream_si32((int *)dest, *(int *)src);
138 }
139
/*
 * memmove_movnt_avx_fw -- forward (low-to-high address) non-temporal copy.
 *
 * First copies up to 63 bytes with regular stores to align dest to a
 * 64-byte cache line, then streams whole cache lines with progressively
 * smaller movnt helpers, and finally copies any remainder that is not a
 * power-of-2 nt-store size with regular (flushed) stores.
 *
 * flush        - how to flush the head/tail copied with regular stores
 * perf_barrier - optional barrier between bursts of nt stores
 */
static force_inline void
memmove_movnt_avx_fw(char *dest, const char *src, size_t len, flush_fn flush,
		perf_barrier_fn perf_barrier)
{
	/* bytes needed to bring dest up to 64-byte alignment */
	size_t cnt = (uint64_t)dest & 63;
	if (cnt > 0) {
		cnt = 64 - cnt;

		if (cnt > len)
			cnt = len;

		/* unaligned head: regular stores + explicit flush */
		memmove_small_avx(dest, src, cnt, flush);

		dest += cnt;
		src += cnt;
		len -= cnt;
	}

	const char *srcend = src + len;
	prefetch_ini_fw(src, len);

	/* main loop: 12 cache lines (8 + 4) per iteration */
	while (len >= PERF_BARRIER_SIZE) {
		prefetch_next_fw(src, srcend);

		memmove_movnt8x64b(dest, src);
		dest += 8 * 64;
		src += 8 * 64;
		len -= 8 * 64;

		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;

		COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (8 + 4) * 64);

		/* barrier between iterations, but not after the last one */
		if (len)
			perf_barrier();
	}

	/* remaining full cache lines, largest chunk first */
	if (len >= 8 * 64) {
		memmove_movnt8x64b(dest, src);
		dest += 8 * 64;
		src += 8 * 64;
		len -= 8 * 64;
	}

	if (len >= 4 * 64) {
		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;
	}

	if (len >= 2 * 64) {
		memmove_movnt2x64b(dest, src);
		dest += 2 * 64;
		src += 2 * 64;
		len -= 2 * 64;
	}

	if (len >= 1 * 64) {
		memmove_movnt1x64b(dest, src);

		dest += 1 * 64;
		src += 1 * 64;
		len -= 1 * 64;
	}

	if (len == 0)
		goto end;

	/* There's no point in using more than 1 nt store for 1 cache line. */
	if (util_is_pow2(len)) {
		if (len == 32)
			memmove_movnt1x32b(dest, src);
		else if (len == 16)
			memmove_movnt1x16b(dest, src);
		else if (len == 8)
			memmove_movnt1x8b(dest, src);
		else if (len == 4)
			memmove_movnt1x4b(dest, src);
		else
			goto nonnt;

		goto end;
	}

nonnt:
	/* tail that doesn't match a single nt store: regular stores + flush */
	memmove_small_avx(dest, src, len, flush);
end:
	/* leave AVX state clean to avoid SSE/AVX transition penalties */
	avx_zeroupper();
}
233
/*
 * memmove_movnt_avx_bw -- backward (high-to-low address) non-temporal copy,
 * used when the ranges overlap such that a forward copy would clobber
 * unread source bytes. Mirror image of memmove_movnt_avx_fw: pointers
 * start one past the end and are decremented before each chunk copy.
 */
static force_inline void
memmove_movnt_avx_bw(char *dest, const char *src, size_t len, flush_fn flush,
		perf_barrier_fn perf_barrier)
{
	dest += len;
	src += len;

	/* bytes needed to bring the end of dest down to 64-byte alignment */
	size_t cnt = (uint64_t)dest & 63;
	if (cnt > 0) {
		if (cnt > len)
			cnt = len;

		dest -= cnt;
		src -= cnt;
		len -= cnt;

		/* unaligned tail: regular stores + explicit flush */
		memmove_small_avx(dest, src, cnt, flush);
	}

	const char *srcbegin = src - len;
	prefetch_ini_bw(src, len);

	/* main loop: 12 cache lines (8 + 4) per iteration */
	while (len >= PERF_BARRIER_SIZE) {
		prefetch_next_bw(src, srcbegin);

		dest -= 8 * 64;
		src -= 8 * 64;
		len -= 8 * 64;
		memmove_movnt8x64b(dest, src);

		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);

		COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (8 + 4) * 64);

		/* barrier between iterations, but not after the last one */
		if (len)
			perf_barrier();
	}

	/* remaining full cache lines, largest chunk first */
	if (len >= 8 * 64) {
		dest -= 8 * 64;
		src -= 8 * 64;
		len -= 8 * 64;
		memmove_movnt8x64b(dest, src);
	}

	if (len >= 4 * 64) {
		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);
	}

	if (len >= 2 * 64) {
		dest -= 2 * 64;
		src -= 2 * 64;
		len -= 2 * 64;
		memmove_movnt2x64b(dest, src);
	}

	if (len >= 1 * 64) {
		dest -= 1 * 64;
		src -= 1 * 64;
		len -= 1 * 64;
		memmove_movnt1x64b(dest, src);
	}

	if (len == 0)
		goto end;

	/* There's no point in using more than 1 nt store for 1 cache line. */
	if (util_is_pow2(len)) {
		if (len == 32) {
			dest -= 32;
			src -= 32;
			memmove_movnt1x32b(dest, src);
		} else if (len == 16) {
			dest -= 16;
			src -= 16;
			memmove_movnt1x16b(dest, src);
		} else if (len == 8) {
			dest -= 8;
			src -= 8;
			memmove_movnt1x8b(dest, src);
		} else if (len == 4) {
			dest -= 4;
			src -= 4;
			memmove_movnt1x4b(dest, src);
		} else {
			goto nonnt;
		}

		goto end;
	}

nonnt:
	/* head that doesn't match a single nt store: regular stores + flush */
	dest -= len;
	src -= len;
	memmove_small_avx(dest, src, len, flush);
end:
	/* leave AVX state clean to avoid SSE/AVX transition penalties */
	avx_zeroupper();
}
338
/*
 * memmove_movnt_avx -- non-temporal memmove dispatcher.
 *
 * The overlap test relies on unsigned wraparound: dest - src >= len is
 * true both when dest is at least len bytes above src and when dest is
 * below src (the subtraction wraps to a huge value) -- exactly the cases
 * where a forward copy never overwrites not-yet-read source bytes.
 * Otherwise the ranges overlap with dest inside [src, src+len), so copy
 * backward.
 */
static force_inline void
memmove_movnt_avx(char *dest, const char *src, size_t len, flush_fn flush,
		barrier_fn barrier, perf_barrier_fn perf_barrier)
{
	if ((uintptr_t)dest - (uintptr_t)src >= len)
		memmove_movnt_avx_fw(dest, src, len, flush, perf_barrier);
	else
		memmove_movnt_avx_bw(dest, src, len, flush, perf_barrier);

	/* order the nt stores before any subsequent stores */
	barrier();

	VALGRIND_DO_FLUSH(dest, len);
}
352
353 /* variants without perf_barrier */
354
/*
 * memmove_movnt_avx_noflush_nobarrier -- nt memmove variant:
 * no flushing of head/tail, barrier after nt stores, no perf barrier.
 */
void
memmove_movnt_avx_noflush_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, noflush, barrier_after_ntstores,
			no_barrier);
}
363
/*
 * memmove_movnt_avx_empty_nobarrier -- nt memmove variant:
 * empty (no-op) flush for head/tail, barrier after nt stores,
 * no perf barrier.
 */
void
memmove_movnt_avx_empty_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_empty_nolog,
			barrier_after_ntstores, no_barrier);
}
/*
 * memmove_movnt_avx_clflush_nobarrier -- nt memmove variant:
 * clflush for head/tail, barrier after nt stores, no perf barrier.
 */
void
memmove_movnt_avx_clflush_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflush_nolog,
			barrier_after_ntstores, no_barrier);
}
380
/*
 * memmove_movnt_avx_clflushopt_nobarrier -- nt memmove variant:
 * clflushopt for head/tail, no barrier after nt stores (callers handle
 * ordering), no perf barrier.
 */
void
memmove_movnt_avx_clflushopt_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflushopt_nolog,
			no_barrier_after_ntstores, no_barrier);
}
389
/*
 * memmove_movnt_avx_clwb_nobarrier -- nt memmove variant:
 * clwb for head/tail, no barrier after nt stores (callers handle
 * ordering), no perf barrier.
 */
void
memmove_movnt_avx_clwb_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clwb_nolog,
			no_barrier_after_ntstores, no_barrier);
}
398
399 /* variants with perf_barrier */
400
/*
 * memmove_movnt_avx_noflush_wcbarrier -- nt memmove variant:
 * no flushing of head/tail, barrier after nt stores, write-combining
 * perf barrier between nt-store bursts.
 */
void
memmove_movnt_avx_noflush_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, noflush, barrier_after_ntstores,
			wc_barrier);
}
409
/*
 * memmove_movnt_avx_empty_wcbarrier -- nt memmove variant:
 * empty (no-op) flush for head/tail, barrier after nt stores,
 * write-combining perf barrier between nt-store bursts.
 */
void
memmove_movnt_avx_empty_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_empty_nolog,
			barrier_after_ntstores, wc_barrier);
}
/*
 * memmove_movnt_avx_clflush_wcbarrier -- nt memmove variant:
 * clflush for head/tail, barrier after nt stores, write-combining
 * perf barrier between nt-store bursts.
 */
void
memmove_movnt_avx_clflush_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflush_nolog,
			barrier_after_ntstores, wc_barrier);
}
426
/*
 * memmove_movnt_avx_clflushopt_wcbarrier -- nt memmove variant:
 * clflushopt for head/tail, no barrier after nt stores, write-combining
 * perf barrier between nt-store bursts.
 */
void
memmove_movnt_avx_clflushopt_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflushopt_nolog,
			no_barrier_after_ntstores, wc_barrier);
}
435
/*
 * memmove_movnt_avx_clwb_wcbarrier -- nt memmove variant:
 * clwb for head/tail, no barrier after nt stores, write-combining
 * perf barrier between nt-store bursts.
 */
void
memmove_movnt_avx_clwb_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clwb_nolog,
			no_barrier_after_ntstores, wc_barrier);
}