1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2017-2020, Intel Corporation */
8 #include "pmem2_arch.h"
11 #include "memcpy_memset.h"
12 #include "memcpy_avx.h"
13 #include "valgrind_internal.h"
15 static force_inline __m256i
16 mm256_loadu_si256(const char *src
, unsigned idx
)
18 return _mm256_loadu_si256((const __m256i
*)src
+ idx
);
/*
 * mm256_stream_si256 -- write one 32-byte chunk to the idx-th slot of dest
 * with a non-temporal (write-combining, cache-bypassing) store.
 * The destination slot must be 32-byte aligned.
 */
static force_inline void
mm256_stream_si256(char *dest, unsigned idx, __m256i src)
{
	_mm256_stream_si256((__m256i *)dest + idx, src);
	/*
	 * barrier() (from memcpy_memset.h) -- presumably a compiler-only
	 * fence keeping consecutive NT stores in program order; TODO(review)
	 * confirm its definition.
	 */
	barrier();
}
28 static force_inline
void
29 memmove_movnt8x64b(char *dest
, const char *src
)
31 __m256i ymm0
= mm256_loadu_si256(src
, 0);
32 __m256i ymm1
= mm256_loadu_si256(src
, 1);
33 __m256i ymm2
= mm256_loadu_si256(src
, 2);
34 __m256i ymm3
= mm256_loadu_si256(src
, 3);
35 __m256i ymm4
= mm256_loadu_si256(src
, 4);
36 __m256i ymm5
= mm256_loadu_si256(src
, 5);
37 __m256i ymm6
= mm256_loadu_si256(src
, 6);
38 __m256i ymm7
= mm256_loadu_si256(src
, 7);
39 __m256i ymm8
= mm256_loadu_si256(src
, 8);
40 __m256i ymm9
= mm256_loadu_si256(src
, 9);
41 __m256i ymm10
= mm256_loadu_si256(src
, 10);
42 __m256i ymm11
= mm256_loadu_si256(src
, 11);
43 __m256i ymm12
= mm256_loadu_si256(src
, 12);
44 __m256i ymm13
= mm256_loadu_si256(src
, 13);
45 __m256i ymm14
= mm256_loadu_si256(src
, 14);
46 __m256i ymm15
= mm256_loadu_si256(src
, 15);
48 mm256_stream_si256(dest
, 0, ymm0
);
49 mm256_stream_si256(dest
, 1, ymm1
);
50 mm256_stream_si256(dest
, 2, ymm2
);
51 mm256_stream_si256(dest
, 3, ymm3
);
52 mm256_stream_si256(dest
, 4, ymm4
);
53 mm256_stream_si256(dest
, 5, ymm5
);
54 mm256_stream_si256(dest
, 6, ymm6
);
55 mm256_stream_si256(dest
, 7, ymm7
);
56 mm256_stream_si256(dest
, 8, ymm8
);
57 mm256_stream_si256(dest
, 9, ymm9
);
58 mm256_stream_si256(dest
, 10, ymm10
);
59 mm256_stream_si256(dest
, 11, ymm11
);
60 mm256_stream_si256(dest
, 12, ymm12
);
61 mm256_stream_si256(dest
, 13, ymm13
);
62 mm256_stream_si256(dest
, 14, ymm14
);
63 mm256_stream_si256(dest
, 15, ymm15
);
66 static force_inline
void
67 memmove_movnt4x64b(char *dest
, const char *src
)
69 __m256i ymm0
= mm256_loadu_si256(src
, 0);
70 __m256i ymm1
= mm256_loadu_si256(src
, 1);
71 __m256i ymm2
= mm256_loadu_si256(src
, 2);
72 __m256i ymm3
= mm256_loadu_si256(src
, 3);
73 __m256i ymm4
= mm256_loadu_si256(src
, 4);
74 __m256i ymm5
= mm256_loadu_si256(src
, 5);
75 __m256i ymm6
= mm256_loadu_si256(src
, 6);
76 __m256i ymm7
= mm256_loadu_si256(src
, 7);
78 mm256_stream_si256(dest
, 0, ymm0
);
79 mm256_stream_si256(dest
, 1, ymm1
);
80 mm256_stream_si256(dest
, 2, ymm2
);
81 mm256_stream_si256(dest
, 3, ymm3
);
82 mm256_stream_si256(dest
, 4, ymm4
);
83 mm256_stream_si256(dest
, 5, ymm5
);
84 mm256_stream_si256(dest
, 6, ymm6
);
85 mm256_stream_si256(dest
, 7, ymm7
);
88 static force_inline
void
89 memmove_movnt2x64b(char *dest
, const char *src
)
91 __m256i ymm0
= mm256_loadu_si256(src
, 0);
92 __m256i ymm1
= mm256_loadu_si256(src
, 1);
93 __m256i ymm2
= mm256_loadu_si256(src
, 2);
94 __m256i ymm3
= mm256_loadu_si256(src
, 3);
96 mm256_stream_si256(dest
, 0, ymm0
);
97 mm256_stream_si256(dest
, 1, ymm1
);
98 mm256_stream_si256(dest
, 2, ymm2
);
99 mm256_stream_si256(dest
, 3, ymm3
);
102 static force_inline
void
103 memmove_movnt1x64b(char *dest
, const char *src
)
105 __m256i ymm0
= mm256_loadu_si256(src
, 0);
106 __m256i ymm1
= mm256_loadu_si256(src
, 1);
108 mm256_stream_si256(dest
, 0, ymm0
);
109 mm256_stream_si256(dest
, 1, ymm1
);
112 static force_inline
void
113 memmove_movnt1x32b(char *dest
, const char *src
)
115 __m256i ymm0
= _mm256_loadu_si256((__m256i
*)src
);
117 mm256_stream_si256(dest
, 0, ymm0
);
120 static force_inline
void
121 memmove_movnt1x16b(char *dest
, const char *src
)
123 __m128i xmm0
= _mm_loadu_si128((__m128i
*)src
);
125 _mm_stream_si128((__m128i
*)dest
, xmm0
);
128 static force_inline
void
129 memmove_movnt1x8b(char *dest
, const char *src
)
131 _mm_stream_si64((long long *)dest
, *(long long *)src
);
134 static force_inline
void
135 memmove_movnt1x4b(char *dest
, const char *src
)
137 _mm_stream_si32((int *)dest
, *(int *)src
);
/*
 * memmove_movnt_avx_fw -- forward (ascending-address) copy of len bytes
 * from src to dest using non-temporal AVX stores.
 *
 * flush is applied by memmove_small_avx to the head/tail bytes that are
 * copied with regular stores; perf_barrier is invoked between
 * PERF_BARRIER_SIZE-byte groups of NT stores -- presumably to bound
 * outstanding write-combining traffic (see the *_wcbarrier variants);
 * TODO(review) confirm against memcpy_memset.h.
 */
static force_inline void
memmove_movnt_avx_fw(char *dest, const char *src, size_t len, flush_fn flush,
		perf_barrier_fn perf_barrier)
{
	/* distance of dest past the previous 64 B cache-line boundary */
	size_t cnt = (uint64_t)dest & 63;
	if (cnt > 0) {
		/* bytes needed to reach the next cache-line boundary */
		cnt = 64 - cnt;

		if (cnt > len)
			cnt = len;

		/* unaligned head: regular stores + explicit flush */
		memmove_small_avx(dest, src, cnt, flush);

		dest += cnt;
		src += cnt;
		len -= cnt;
	}

	const char *srcend = src + len;
	prefetch_ini_fw(src, len);

	/* main loop: 8 + 4 cache lines per iteration, dest now aligned */
	while (len >= PERF_BARRIER_SIZE) {
		prefetch_next_fw(src, srcend);

		memmove_movnt8x64b(dest, src);
		dest += 8 * 64;
		src += 8 * 64;
		len -= 8 * 64;

		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;

		/* the loop body above must consume exactly one barrier unit */
		COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (8 + 4) * 64);

		/* no barrier after the last group -- caller fences anyway */
		if (len)
			perf_barrier();
	}

	/* tail: progressively smaller whole-cache-line NT copies */
	if (len >= 8 * 64) {
		memmove_movnt8x64b(dest, src);
		dest += 8 * 64;
		src += 8 * 64;
		len -= 8 * 64;
	}

	if (len >= 4 * 64) {
		memmove_movnt4x64b(dest, src);
		dest += 4 * 64;
		src += 4 * 64;
		len -= 4 * 64;
	}

	if (len >= 2 * 64) {
		memmove_movnt2x64b(dest, src);
		dest += 2 * 64;
		src += 2 * 64;
		len -= 2 * 64;
	}

	if (len >= 1 * 64) {
		memmove_movnt1x64b(dest, src);

		dest += 1 * 64;
		src += 1 * 64;
		len -= 1 * 64;
	}

	if (len == 0)
		goto end;

	/* There's no point in using more than 1 nt store for 1 cache line. */
	if (util_is_pow2(len)) {
		if (len == 32)
			memmove_movnt1x32b(dest, src);
		else if (len == 16)
			memmove_movnt1x16b(dest, src);
		else if (len == 8)
			memmove_movnt1x8b(dest, src);
		else if (len == 4)
			memmove_movnt1x4b(dest, src);
		else
			goto nonnt;

		goto end;
	}

nonnt:
	/* odd-sized remainder: regular stores + explicit flush */
	memmove_small_avx(dest, src, len, flush);
end:
	avx_zeroupper();
}
/*
 * memmove_movnt_avx_bw -- backward (descending-address) copy of len bytes
 * from src to dest using non-temporal AVX stores.
 *
 * Mirror image of memmove_movnt_avx_fw: pointers start one past the end
 * and are decremented BEFORE each chunk copy, so overlapping regions with
 * dest inside [src, src + len) are copied safely.
 */
static force_inline void
memmove_movnt_avx_bw(char *dest, const char *src, size_t len, flush_fn flush,
		perf_barrier_fn perf_barrier)
{
	dest += len;
	src += len;

	/* distance of (one-past-end) dest past a 64 B boundary */
	size_t cnt = (uint64_t)dest & 63;
	if (cnt > 0) {
		if (cnt > len)
			cnt = len;

		dest -= cnt;
		src -= cnt;
		len -= cnt;

		/* unaligned tail: regular stores + explicit flush */
		memmove_small_avx(dest, src, cnt, flush);
	}

	const char *srcbegin = src - len;
	prefetch_ini_bw(src, len);

	/* main loop: 8 + 4 cache lines per iteration, dest now aligned */
	while (len >= PERF_BARRIER_SIZE) {
		prefetch_next_bw(src, srcbegin);

		dest -= 8 * 64;
		src -= 8 * 64;
		len -= 8 * 64;
		memmove_movnt8x64b(dest, src);

		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);

		/* the loop body above must consume exactly one barrier unit */
		COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (8 + 4) * 64);

		/* no barrier after the last group -- caller fences anyway */
		if (len)
			perf_barrier();
	}

	/* remainder: progressively smaller whole-cache-line NT copies */
	if (len >= 8 * 64) {
		dest -= 8 * 64;
		src -= 8 * 64;
		len -= 8 * 64;
		memmove_movnt8x64b(dest, src);
	}

	if (len >= 4 * 64) {
		dest -= 4 * 64;
		src -= 4 * 64;
		len -= 4 * 64;
		memmove_movnt4x64b(dest, src);
	}

	if (len >= 2 * 64) {
		dest -= 2 * 64;
		src -= 2 * 64;
		len -= 2 * 64;
		memmove_movnt2x64b(dest, src);
	}

	if (len >= 1 * 64) {
		dest -= 1 * 64;
		src -= 1 * 64;
		len -= 1 * 64;
		memmove_movnt1x64b(dest, src);
	}

	if (len == 0)
		goto end;

	/* There's no point in using more than 1 nt store for 1 cache line. */
	if (util_is_pow2(len)) {
		if (len == 32) {
			dest -= 32;
			src -= 32;
			memmove_movnt1x32b(dest, src);
		} else if (len == 16) {
			dest -= 16;
			src -= 16;
			memmove_movnt1x16b(dest, src);
		} else if (len == 8) {
			dest -= 8;
			src -= 8;
			memmove_movnt1x8b(dest, src);
		} else if (len == 4) {
			dest -= 4;
			src -= 4;
			memmove_movnt1x4b(dest, src);
		} else {
			goto nonnt;
		}

		goto end;
	}

nonnt:
	dest -= len;
	src -= len;
	/* odd-sized remainder: regular stores + explicit flush */
	memmove_small_avx(dest, src, len, flush);
end:
	avx_zeroupper();
}
/*
 * memmove_movnt_avx -- memmove with non-temporal AVX stores: chooses the
 * copy direction so overlapping buffers are handled correctly, runs the
 * supplied barrier afterwards, and reports the range to Valgrind.
 */
static force_inline void
memmove_movnt_avx(char *dest, const char *src, size_t len, flush_fn flush,
		barrier_fn barrier, perf_barrier_fn perf_barrier)
{
	/*
	 * One unsigned comparison covers both "dest below src" and
	 * "no overlap": copy forward unless dest lies inside
	 * [src, src + len), in which case copy backward.
	 */
	if ((uintptr_t)dest - (uintptr_t)src >= len)
		memmove_movnt_avx_fw(dest, src, len, flush, perf_barrier);
	else
		memmove_movnt_avx_bw(dest, src, len, flush, perf_barrier);

	/* caller-selected fence ordering the NT stores (barrier_fn arg) */
	barrier();

	/* pmemcheck/Valgrind annotation -- see valgrind_internal.h */
	VALGRIND_DO_FLUSH(dest, len);
}
353 /* variants without perf_barrier */
/*
 * memmove_movnt_avx_noflush_nobarrier -- public NT-AVX memmove variant:
 * noflush callback, barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_avx_noflush_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, noflush, barrier_after_ntstores,
			no_barrier);
}
/*
 * memmove_movnt_avx_empty_nobarrier -- public NT-AVX memmove variant:
 * flush_empty_nolog callback, barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_avx_empty_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_empty_nolog,
			barrier_after_ntstores, no_barrier);
}
/*
 * memmove_movnt_avx_clflush_nobarrier -- public NT-AVX memmove variant:
 * flush_clflush_nolog callback, barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_avx_clflush_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflush_nolog,
			barrier_after_ntstores, no_barrier);
}
/*
 * memmove_movnt_avx_clflushopt_nobarrier -- public NT-AVX memmove variant:
 * flush_clflushopt_nolog callback, no_barrier_after_ntstores, no perf
 * barrier.
 */
void
memmove_movnt_avx_clflushopt_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflushopt_nolog,
			no_barrier_after_ntstores, no_barrier);
}
/*
 * memmove_movnt_avx_clwb_nobarrier -- public NT-AVX memmove variant:
 * flush_clwb_nolog callback, no_barrier_after_ntstores, no perf barrier.
 */
void
memmove_movnt_avx_clwb_nobarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clwb_nolog,
			no_barrier_after_ntstores, no_barrier);
}
399 /* variants with perf_barrier */
/*
 * memmove_movnt_avx_noflush_wcbarrier -- public NT-AVX memmove variant:
 * noflush callback, barrier_after_ntstores, wc_barrier as the perf
 * barrier.
 */
void
memmove_movnt_avx_noflush_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, noflush, barrier_after_ntstores,
			wc_barrier);
}
/*
 * memmove_movnt_avx_empty_wcbarrier -- public NT-AVX memmove variant:
 * flush_empty_nolog callback, barrier_after_ntstores, wc_barrier as the
 * perf barrier.
 */
void
memmove_movnt_avx_empty_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_empty_nolog,
			barrier_after_ntstores, wc_barrier);
}
/*
 * memmove_movnt_avx_clflush_wcbarrier -- public NT-AVX memmove variant:
 * flush_clflush_nolog callback, barrier_after_ntstores, wc_barrier as
 * the perf barrier.
 */
void
memmove_movnt_avx_clflush_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflush_nolog,
			barrier_after_ntstores, wc_barrier);
}
/*
 * memmove_movnt_avx_clflushopt_wcbarrier -- public NT-AVX memmove variant:
 * flush_clflushopt_nolog callback, no_barrier_after_ntstores, wc_barrier
 * as the perf barrier.
 */
void
memmove_movnt_avx_clflushopt_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clflushopt_nolog,
			no_barrier_after_ntstores, wc_barrier);
}
/*
 * memmove_movnt_avx_clwb_wcbarrier -- public NT-AVX memmove variant:
 * flush_clwb_nolog callback, no_barrier_after_ntstores, wc_barrier as
 * the perf barrier.
 */
void
memmove_movnt_avx_clwb_wcbarrier(char *dest, const char *src, size_t len)
{
	LOG(15, "dest %p src %p len %zu", dest, src, len);

	memmove_movnt_avx(dest, src, len, flush_clwb_nolog,
			no_barrier_after_ntstores, wc_barrier);
}