1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2017-2020, Intel Corporation */
8 #include "pmem2_arch.h"
11 #include "memcpy_memset.h"
12 #include "memcpy_avx512f.h"
13 #include "valgrind_internal.h"
/*
 * mm512_loadu_si512 -- load the idx-th 64-byte vector starting at src
 *
 * src is reinterpreted as an array of __m512i, so idx counts 64-byte
 * elements, not bytes. The unaligned load intrinsic is used, so src itself
 * does not have to be 64-byte aligned.
 */
15 static force_inline __m512i
16 mm512_loadu_si512(const char *src
, unsigned idx
)
18 return _mm512_loadu_si512((const __m512i
*)src
+ idx
);
/*
 * mm512_stream_si512 -- non-temporal store of src into the idx-th 64-byte
 * slot starting at dest
 *
 * dest is reinterpreted as an array of __m512i, so idx counts 64-byte
 * elements. Per the Intel intrinsics documentation the streaming store
 * expects a 64-byte aligned address, so callers are expected to pass an
 * aligned dest.
 *
 * NOTE(review): the embedded numbering suggests this listing may not show
 * every line of the body -- confirm against the complete file.
 */
21 static force_inline
void
22 mm512_stream_si512(char *dest
, unsigned idx
, __m512i src
)
24 _mm512_stream_si512((__m512i
*)dest
+ idx
, src
);
/*
 * memmove_movnt32x64b -- move 32 x 64 bytes (2048 bytes) from src to dest
 * using non-temporal stores
 *
 * All 32 vectors are loaded into locals before the first store is issued.
 * Presumably this is what keeps the chunk correct when the source and
 * destination ranges overlap -- confirm against the callers.
 */
28 static force_inline
void
29 memmove_movnt32x64b(char *dest
, const char *src
)
/* read phase: vectors 0..31 of the source chunk */
31 __m512i zmm0
= mm512_loadu_si512(src
, 0);
32 __m512i zmm1
= mm512_loadu_si512(src
, 1);
33 __m512i zmm2
= mm512_loadu_si512(src
, 2);
34 __m512i zmm3
= mm512_loadu_si512(src
, 3);
35 __m512i zmm4
= mm512_loadu_si512(src
, 4);
36 __m512i zmm5
= mm512_loadu_si512(src
, 5);
37 __m512i zmm6
= mm512_loadu_si512(src
, 6);
38 __m512i zmm7
= mm512_loadu_si512(src
, 7);
39 __m512i zmm8
= mm512_loadu_si512(src
, 8);
40 __m512i zmm9
= mm512_loadu_si512(src
, 9);
41 __m512i zmm10
= mm512_loadu_si512(src
, 10);
42 __m512i zmm11
= mm512_loadu_si512(src
, 11);
43 __m512i zmm12
= mm512_loadu_si512(src
, 12);
44 __m512i zmm13
= mm512_loadu_si512(src
, 13);
45 __m512i zmm14
= mm512_loadu_si512(src
, 14);
46 __m512i zmm15
= mm512_loadu_si512(src
, 15);
47 __m512i zmm16
= mm512_loadu_si512(src
, 16);
48 __m512i zmm17
= mm512_loadu_si512(src
, 17);
49 __m512i zmm18
= mm512_loadu_si512(src
, 18);
50 __m512i zmm19
= mm512_loadu_si512(src
, 19);
51 __m512i zmm20
= mm512_loadu_si512(src
, 20);
52 __m512i zmm21
= mm512_loadu_si512(src
, 21);
53 __m512i zmm22
= mm512_loadu_si512(src
, 22);
54 __m512i zmm23
= mm512_loadu_si512(src
, 23);
55 __m512i zmm24
= mm512_loadu_si512(src
, 24);
56 __m512i zmm25
= mm512_loadu_si512(src
, 25);
57 __m512i zmm26
= mm512_loadu_si512(src
, 26);
58 __m512i zmm27
= mm512_loadu_si512(src
, 27);
59 __m512i zmm28
= mm512_loadu_si512(src
, 28);
60 __m512i zmm29
= mm512_loadu_si512(src
, 29);
61 __m512i zmm30
= mm512_loadu_si512(src
, 30);
62 __m512i zmm31
= mm512_loadu_si512(src
, 31);
/* write phase: stream each vector to the matching slot of dest */
64 mm512_stream_si512(dest
, 0, zmm0
);
65 mm512_stream_si512(dest
, 1, zmm1
);
66 mm512_stream_si512(dest
, 2, zmm2
);
67 mm512_stream_si512(dest
, 3, zmm3
);
68 mm512_stream_si512(dest
, 4, zmm4
);
69 mm512_stream_si512(dest
, 5, zmm5
);
70 mm512_stream_si512(dest
, 6, zmm6
);
71 mm512_stream_si512(dest
, 7, zmm7
);
72 mm512_stream_si512(dest
, 8, zmm8
);
73 mm512_stream_si512(dest
, 9, zmm9
);
74 mm512_stream_si512(dest
, 10, zmm10
);
75 mm512_stream_si512(dest
, 11, zmm11
);
76 mm512_stream_si512(dest
, 12, zmm12
);
77 mm512_stream_si512(dest
, 13, zmm13
);
78 mm512_stream_si512(dest
, 14, zmm14
);
79 mm512_stream_si512(dest
, 15, zmm15
);
80 mm512_stream_si512(dest
, 16, zmm16
);
81 mm512_stream_si512(dest
, 17, zmm17
);
82 mm512_stream_si512(dest
, 18, zmm18
);
83 mm512_stream_si512(dest
, 19, zmm19
);
84 mm512_stream_si512(dest
, 20, zmm20
);
85 mm512_stream_si512(dest
, 21, zmm21
);
86 mm512_stream_si512(dest
, 22, zmm22
);
87 mm512_stream_si512(dest
, 23, zmm23
);
88 mm512_stream_si512(dest
, 24, zmm24
);
89 mm512_stream_si512(dest
, 25, zmm25
);
90 mm512_stream_si512(dest
, 26, zmm26
);
91 mm512_stream_si512(dest
, 27, zmm27
);
92 mm512_stream_si512(dest
, 28, zmm28
);
93 mm512_stream_si512(dest
, 29, zmm29
);
94 mm512_stream_si512(dest
, 30, zmm30
);
95 mm512_stream_si512(dest
, 31, zmm31
);
/*
 * memmove_movnt16x64b -- move 16 x 64 bytes (1024 bytes) from src to dest
 * using non-temporal stores
 *
 * All 16 vectors are loaded before the first store is issued; presumably
 * this is for overlap safety within the chunk -- confirm against callers.
 */
98 static force_inline
void
99 memmove_movnt16x64b(char *dest
, const char *src
)
/* read phase: vectors 0..15 of the source chunk */
101 __m512i zmm0
= mm512_loadu_si512(src
, 0);
102 __m512i zmm1
= mm512_loadu_si512(src
, 1);
103 __m512i zmm2
= mm512_loadu_si512(src
, 2);
104 __m512i zmm3
= mm512_loadu_si512(src
, 3);
105 __m512i zmm4
= mm512_loadu_si512(src
, 4);
106 __m512i zmm5
= mm512_loadu_si512(src
, 5);
107 __m512i zmm6
= mm512_loadu_si512(src
, 6);
108 __m512i zmm7
= mm512_loadu_si512(src
, 7);
109 __m512i zmm8
= mm512_loadu_si512(src
, 8);
110 __m512i zmm9
= mm512_loadu_si512(src
, 9);
111 __m512i zmm10
= mm512_loadu_si512(src
, 10);
112 __m512i zmm11
= mm512_loadu_si512(src
, 11);
113 __m512i zmm12
= mm512_loadu_si512(src
, 12);
114 __m512i zmm13
= mm512_loadu_si512(src
, 13);
115 __m512i zmm14
= mm512_loadu_si512(src
, 14);
116 __m512i zmm15
= mm512_loadu_si512(src
, 15);
/* write phase: stream each vector to the matching slot of dest */
118 mm512_stream_si512(dest
, 0, zmm0
);
119 mm512_stream_si512(dest
, 1, zmm1
);
120 mm512_stream_si512(dest
, 2, zmm2
);
121 mm512_stream_si512(dest
, 3, zmm3
);
122 mm512_stream_si512(dest
, 4, zmm4
);
123 mm512_stream_si512(dest
, 5, zmm5
);
124 mm512_stream_si512(dest
, 6, zmm6
);
125 mm512_stream_si512(dest
, 7, zmm7
);
126 mm512_stream_si512(dest
, 8, zmm8
);
127 mm512_stream_si512(dest
, 9, zmm9
);
128 mm512_stream_si512(dest
, 10, zmm10
);
129 mm512_stream_si512(dest
, 11, zmm11
);
130 mm512_stream_si512(dest
, 12, zmm12
);
131 mm512_stream_si512(dest
, 13, zmm13
);
132 mm512_stream_si512(dest
, 14, zmm14
);
133 mm512_stream_si512(dest
, 15, zmm15
);
/*
 * memmove_movnt8x64b -- move 8 x 64 bytes (512 bytes) from src to dest
 * using non-temporal stores
 *
 * All 8 vectors are loaded before the first store is issued; presumably
 * this is for overlap safety within the chunk -- confirm against callers.
 */
136 static force_inline
void
137 memmove_movnt8x64b(char *dest
, const char *src
)
/* read phase: vectors 0..7 of the source chunk */
139 __m512i zmm0
= mm512_loadu_si512(src
, 0);
140 __m512i zmm1
= mm512_loadu_si512(src
, 1);
141 __m512i zmm2
= mm512_loadu_si512(src
, 2);
142 __m512i zmm3
= mm512_loadu_si512(src
, 3);
143 __m512i zmm4
= mm512_loadu_si512(src
, 4);
144 __m512i zmm5
= mm512_loadu_si512(src
, 5);
145 __m512i zmm6
= mm512_loadu_si512(src
, 6);
146 __m512i zmm7
= mm512_loadu_si512(src
, 7);
/* write phase: stream each vector to the matching slot of dest */
148 mm512_stream_si512(dest
, 0, zmm0
);
149 mm512_stream_si512(dest
, 1, zmm1
);
150 mm512_stream_si512(dest
, 2, zmm2
);
151 mm512_stream_si512(dest
, 3, zmm3
);
152 mm512_stream_si512(dest
, 4, zmm4
);
153 mm512_stream_si512(dest
, 5, zmm5
);
154 mm512_stream_si512(dest
, 6, zmm6
);
155 mm512_stream_si512(dest
, 7, zmm7
);
/*
 * memmove_movnt4x64b -- move 4 x 64 bytes (256 bytes) from src to dest
 * using non-temporal stores
 *
 * All 4 vectors are loaded before the first store is issued; presumably
 * this is for overlap safety within the chunk -- confirm against callers.
 */
158 static force_inline
void
159 memmove_movnt4x64b(char *dest
, const char *src
)
/* read phase: vectors 0..3 of the source chunk */
161 __m512i zmm0
= mm512_loadu_si512(src
, 0);
162 __m512i zmm1
= mm512_loadu_si512(src
, 1);
163 __m512i zmm2
= mm512_loadu_si512(src
, 2);
164 __m512i zmm3
= mm512_loadu_si512(src
, 3);
/* write phase: stream each vector to the matching slot of dest */
166 mm512_stream_si512(dest
, 0, zmm0
);
167 mm512_stream_si512(dest
, 1, zmm1
);
168 mm512_stream_si512(dest
, 2, zmm2
);
169 mm512_stream_si512(dest
, 3, zmm3
);
/*
 * memmove_movnt2x64b -- move 2 x 64 bytes (128 bytes) from src to dest
 * using non-temporal stores
 *
 * Both vectors are loaded before the first store is issued; presumably
 * this is for overlap safety within the chunk -- confirm against callers.
 */
172 static force_inline
void
173 memmove_movnt2x64b(char *dest
, const char *src
)
175 __m512i zmm0
= mm512_loadu_si512(src
, 0);
176 __m512i zmm1
= mm512_loadu_si512(src
, 1);
178 mm512_stream_si512(dest
, 0, zmm0
);
179 mm512_stream_si512(dest
, 1, zmm1
);
/*
 * memmove_movnt1x64b -- move a single 64-byte vector from src to dest
 * using one unaligned load and one non-temporal store
 */
182 static force_inline
void
183 memmove_movnt1x64b(char *dest
, const char *src
)
185 __m512i zmm0
= mm512_loadu_si512(src
, 0);
187 mm512_stream_si512(dest
, 0, zmm0
);
/*
 * memmove_movnt1x32b -- move 32 bytes from src to dest using one unaligned
 * 256-bit load and one 256-bit non-temporal store
 *
 * NOTE(review): the local is a __m256i but is named zmm0; ymm0 would match
 * its width. The cast on src also drops the const qualifier.
 */
190 static force_inline
void
191 memmove_movnt1x32b(char *dest
, const char *src
)
193 __m256i zmm0
= _mm256_loadu_si256((__m256i
*)src
);
195 _mm256_stream_si256((__m256i
*)dest
, zmm0
);
/*
 * memmove_movnt1x16b -- move 16 bytes from src to dest using one unaligned
 * 128-bit load and one 128-bit non-temporal store
 *
 * NOTE(review): the local is a __m128i but is named ymm0; xmm0 would match
 * its width. The cast on src also drops the const qualifier.
 */
198 static force_inline
void
199 memmove_movnt1x16b(char *dest
, const char *src
)
201 __m128i ymm0
= _mm_loadu_si128((__m128i
*)src
);
203 _mm_stream_si128((__m128i
*)dest
, ymm0
);
/*
 * memmove_movnt1x8b -- move 8 bytes from src to dest with a single 64-bit
 * non-temporal store
 *
 * NOTE(review): src is read through a (long long *) cast, which drops const
 * and type-puns the char buffer; it also assumes src is suitably aligned
 * for a long long access -- confirm the callers guarantee this.
 */
206 static force_inline
void
207 memmove_movnt1x8b(char *dest
, const char *src
)
209 _mm_stream_si64((long long *)dest
, *(long long *)src
);
/*
 * memmove_movnt1x4b -- move 4 bytes from src to dest with a single 32-bit
 * non-temporal store
 *
 * NOTE(review): src is read through an (int *) cast, which drops const and
 * type-puns the char buffer; it also assumes src is suitably aligned for an
 * int access -- confirm the callers guarantee this.
 */
212 static force_inline
void
213 memmove_movnt1x4b(char *dest
, const char *src
)
215 _mm_stream_si32((int *)dest
, *(int *)src
);
/*
 * memmove_movnt_avx512f_fw -- copy len bytes from src to dest with
 * non-temporal stores; called by memmove_movnt_avx512f when
 * (uintptr_t)dest - (uintptr_t)src >= len
 *
 * Visible sequence of work:
 *  - cnt is derived from the offset of dest within a 64-byte block and a
 *    cnt-byte piece is handed to memmove_small_avx512f together with flush,
 *  - 2048-byte chunks are moved in a loop while len >= 32 * 64,
 *  - progressively smaller chunk helpers (16, 8, 4, 2, 1 x 64 bytes) follow,
 *  - when the remaining len is a power of two, one of the 32/16/8/4-byte
 *    single-store helpers is used,
 *  - memmove_small_avx512f is called with the remaining len and flush.
 *
 * NOTE(review): this listing omits parts of the function -- the end of the
 * parameter list (flush is used but its declaration is not shown), the
 * updates of dest/src/len between steps, and most of the branch conditions.
 * Verify the details against the complete file before relying on them.
 */
218 static force_inline
void
219 memmove_movnt_avx512f_fw(char *dest
, const char *src
, size_t len
,
/* dest & 63 is the offset of dest inside its 64-byte block */
222 size_t cnt
= (uint64_t)dest
& 63;
229 memmove_small_avx512f(dest
, src
, cnt
, flush
);
/* bulk phase: largest chunk size first */
236 while (len
>= 32 * 64) {
237 memmove_movnt32x64b(dest
, src
);
243 if (len
>= 16 * 64) {
244 memmove_movnt16x64b(dest
, src
);
251 memmove_movnt8x64b(dest
, src
);
258 memmove_movnt4x64b(dest
, src
);
265 memmove_movnt2x64b(dest
, src
);
272 memmove_movnt1x64b(dest
, src
);
282 /* There's no point in using more than 1 nt store for 1 cache line. */
283 if (util_is_pow2(len
)) {
285 memmove_movnt1x32b(dest
, src
);
287 memmove_movnt1x16b(dest
, src
);
289 memmove_movnt1x8b(dest
, src
);
291 memmove_movnt1x4b(dest
, src
);
/* remaining bytes go through the generic small-copy routine */
299 memmove_small_avx512f(dest
, src
, len
, flush
);
/*
 * memmove_movnt_avx512f_bw -- copy len bytes from src to dest with
 * non-temporal stores; the counterpart of memmove_movnt_avx512f_fw that
 * memmove_movnt_avx512f calls for the remaining (overlapping, dest at or
 * after src) case
 *
 * Visible sequence of work:
 *  - cnt is derived from the offset of dest within a 64-byte block and a
 *    cnt-byte piece is handed to memmove_small_avx512f together with flush,
 *  - 2048-byte chunks are moved in a loop while len >= 32 * 64,
 *  - progressively smaller chunk helpers (16, 8, 4, 2, 1 x 64 bytes) follow,
 *  - when the remaining len is a power of two, the 32-byte helper or the
 *    16/8/4-byte helper matching len == 16 / 8 / 4 is used,
 *  - memmove_small_avx512f is called with the remaining len and flush.
 *
 * NOTE(review): this listing omits parts of the function -- the end of the
 * parameter list (flush is used but its declaration is not shown), the
 * updates of dest/src/len between steps (presumably moving from the end of
 * the buffers towards the start, given the name), and several branch
 * conditions. Verify the details against the complete file.
 */
304 static force_inline
void
305 memmove_movnt_avx512f_bw(char *dest
, const char *src
, size_t len
,
/* dest & 63 is the offset of dest inside its 64-byte block */
311 size_t cnt
= (uint64_t)dest
& 63;
320 memmove_small_avx512f(dest
, src
, cnt
, flush
);
/* bulk phase: largest chunk size first */
323 while (len
>= 32 * 64) {
327 memmove_movnt32x64b(dest
, src
);
330 if (len
>= 16 * 64) {
334 memmove_movnt16x64b(dest
, src
);
341 memmove_movnt8x64b(dest
, src
);
348 memmove_movnt4x64b(dest
, src
);
355 memmove_movnt2x64b(dest
, src
);
363 memmove_movnt1x64b(dest
, src
);
369 /* There's no point in using more than 1 nt store for 1 cache line. */
370 if (util_is_pow2(len
)) {
374 memmove_movnt1x32b(dest
, src
);
375 } else if (len
== 16) {
378 memmove_movnt1x16b(dest
, src
);
379 } else if (len
== 8) {
382 memmove_movnt1x8b(dest
, src
);
383 } else if (len
== 4) {
386 memmove_movnt1x4b(dest
, src
);
/* remaining bytes go through the generic small-copy routine */
398 memmove_small_avx512f(dest
, src
, len
, flush
);
/*
 * memmove_movnt_avx512f -- pick the forward or backward non-temporal copy
 * for the given buffers, then report the written range to Valgrind
 *
 * The test is done on unsigned pointer values: dest - src >= len holds when
 * dest is below src (the subtraction wraps to a large value) or when dest
 * starts at or after src + len. In both cases the forward variant is
 * called; otherwise dest lies inside [src, src + len) and the backward
 * variant is called.
 *
 * NOTE(review): the parameter list continues after flush; the wrappers in
 * this file pass a fifth argument (barrier_after_ntstores or
 * no_barrier_after_ntstores), but its declaration and use are not visible
 * in this listing.
 */
403 static force_inline
void
404 memmove_movnt_avx512f(char *dest
, const char *src
, size_t len
, flush_fn flush
,
407 if ((uintptr_t)dest
- (uintptr_t)src
>= len
)
408 memmove_movnt_avx512f_fw(dest
, src
, len
, flush
);
410 memmove_movnt_avx512f_bw(dest
, src
, len
, flush
);
414 VALGRIND_DO_FLUSH(dest
, len
);
/*
 * memmove_movnt_avx512f_noflush -- log the call at level 15 and run
 * memmove_movnt_avx512f with the noflush callback and
 * barrier_after_ntstores
 */
418 memmove_movnt_avx512f_noflush(char *dest
, const char *src
, size_t len
)
420 LOG(15, "dest %p src %p len %zu", dest
, src
, len
);
422 memmove_movnt_avx512f(dest
, src
, len
, noflush
, barrier_after_ntstores
);
/*
 * memmove_movnt_avx512f_empty -- log the call at level 15 and run
 * memmove_movnt_avx512f with the flush_empty_nolog callback and
 * barrier_after_ntstores
 */
426 memmove_movnt_avx512f_empty(char *dest
, const char *src
, size_t len
)
428 LOG(15, "dest %p src %p len %zu", dest
, src
, len
);
430 memmove_movnt_avx512f(dest
, src
, len
, flush_empty_nolog
,
431 barrier_after_ntstores
);
/*
 * memmove_movnt_avx512f_clflush -- log the call at level 15 and run
 * memmove_movnt_avx512f with the flush_clflush_nolog callback and
 * barrier_after_ntstores
 */
435 memmove_movnt_avx512f_clflush(char *dest
, const char *src
, size_t len
)
437 LOG(15, "dest %p src %p len %zu", dest
, src
, len
);
439 memmove_movnt_avx512f(dest
, src
, len
, flush_clflush_nolog
,
440 barrier_after_ntstores
);
/*
 * memmove_movnt_avx512f_clflushopt -- log the call at level 15 and run
 * memmove_movnt_avx512f with the flush_clflushopt_nolog callback and
 * no_barrier_after_ntstores
 */
444 memmove_movnt_avx512f_clflushopt(char *dest
, const char *src
, size_t len
)
446 LOG(15, "dest %p src %p len %zu", dest
, src
, len
);
448 memmove_movnt_avx512f(dest
, src
, len
, flush_clflushopt_nolog
,
449 no_barrier_after_ntstores
);
/*
 * memmove_movnt_avx512f_clwb -- log the call at level 15 and run
 * memmove_movnt_avx512f with the flush_clwb_nolog callback and
 * no_barrier_after_ntstores
 */
453 memmove_movnt_avx512f_clwb(char *dest
, const char *src
, size_t len
)
455 LOG(15, "dest %p src %p len %zu", dest
, src
, len
);
457 memmove_movnt_avx512f(dest
, src
, len
, flush_clwb_nolog
,
458 no_barrier_after_ntstores
);