]> git.proxmox.com Git - ceph.git/blob - ceph/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx512f.c
import ceph 16.2.7
[ceph.git] / ceph / src / pmdk / src / libpmem2 / x86_64 / memcpy / memcpy_nt_avx512f.c
1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2017-2020, Intel Corporation */
3
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "pmem2_arch.h"
#include "avx.h"
#include "flush.h"
#include "memcpy_memset.h"
#include "memcpy_avx512f.h"
#include "valgrind_internal.h"
14
15 static force_inline __m512i
16 mm512_loadu_si512(const char *src, unsigned idx)
17 {
18 return _mm512_loadu_si512((const __m512i *)src + idx);
19 }
20
21 static force_inline void
22 mm512_stream_si512(char *dest, unsigned idx, __m512i src)
23 {
24 _mm512_stream_si512((__m512i *)dest + idx, src);
25 barrier();
26 }
27
28 static force_inline void
29 memmove_movnt32x64b(char *dest, const char *src)
30 {
31 __m512i zmm0 = mm512_loadu_si512(src, 0);
32 __m512i zmm1 = mm512_loadu_si512(src, 1);
33 __m512i zmm2 = mm512_loadu_si512(src, 2);
34 __m512i zmm3 = mm512_loadu_si512(src, 3);
35 __m512i zmm4 = mm512_loadu_si512(src, 4);
36 __m512i zmm5 = mm512_loadu_si512(src, 5);
37 __m512i zmm6 = mm512_loadu_si512(src, 6);
38 __m512i zmm7 = mm512_loadu_si512(src, 7);
39 __m512i zmm8 = mm512_loadu_si512(src, 8);
40 __m512i zmm9 = mm512_loadu_si512(src, 9);
41 __m512i zmm10 = mm512_loadu_si512(src, 10);
42 __m512i zmm11 = mm512_loadu_si512(src, 11);
43 __m512i zmm12 = mm512_loadu_si512(src, 12);
44 __m512i zmm13 = mm512_loadu_si512(src, 13);
45 __m512i zmm14 = mm512_loadu_si512(src, 14);
46 __m512i zmm15 = mm512_loadu_si512(src, 15);
47 __m512i zmm16 = mm512_loadu_si512(src, 16);
48 __m512i zmm17 = mm512_loadu_si512(src, 17);
49 __m512i zmm18 = mm512_loadu_si512(src, 18);
50 __m512i zmm19 = mm512_loadu_si512(src, 19);
51 __m512i zmm20 = mm512_loadu_si512(src, 20);
52 __m512i zmm21 = mm512_loadu_si512(src, 21);
53 __m512i zmm22 = mm512_loadu_si512(src, 22);
54 __m512i zmm23 = mm512_loadu_si512(src, 23);
55 __m512i zmm24 = mm512_loadu_si512(src, 24);
56 __m512i zmm25 = mm512_loadu_si512(src, 25);
57 __m512i zmm26 = mm512_loadu_si512(src, 26);
58 __m512i zmm27 = mm512_loadu_si512(src, 27);
59 __m512i zmm28 = mm512_loadu_si512(src, 28);
60 __m512i zmm29 = mm512_loadu_si512(src, 29);
61 __m512i zmm30 = mm512_loadu_si512(src, 30);
62 __m512i zmm31 = mm512_loadu_si512(src, 31);
63
64 mm512_stream_si512(dest, 0, zmm0);
65 mm512_stream_si512(dest, 1, zmm1);
66 mm512_stream_si512(dest, 2, zmm2);
67 mm512_stream_si512(dest, 3, zmm3);
68 mm512_stream_si512(dest, 4, zmm4);
69 mm512_stream_si512(dest, 5, zmm5);
70 mm512_stream_si512(dest, 6, zmm6);
71 mm512_stream_si512(dest, 7, zmm7);
72 mm512_stream_si512(dest, 8, zmm8);
73 mm512_stream_si512(dest, 9, zmm9);
74 mm512_stream_si512(dest, 10, zmm10);
75 mm512_stream_si512(dest, 11, zmm11);
76 mm512_stream_si512(dest, 12, zmm12);
77 mm512_stream_si512(dest, 13, zmm13);
78 mm512_stream_si512(dest, 14, zmm14);
79 mm512_stream_si512(dest, 15, zmm15);
80 mm512_stream_si512(dest, 16, zmm16);
81 mm512_stream_si512(dest, 17, zmm17);
82 mm512_stream_si512(dest, 18, zmm18);
83 mm512_stream_si512(dest, 19, zmm19);
84 mm512_stream_si512(dest, 20, zmm20);
85 mm512_stream_si512(dest, 21, zmm21);
86 mm512_stream_si512(dest, 22, zmm22);
87 mm512_stream_si512(dest, 23, zmm23);
88 mm512_stream_si512(dest, 24, zmm24);
89 mm512_stream_si512(dest, 25, zmm25);
90 mm512_stream_si512(dest, 26, zmm26);
91 mm512_stream_si512(dest, 27, zmm27);
92 mm512_stream_si512(dest, 28, zmm28);
93 mm512_stream_si512(dest, 29, zmm29);
94 mm512_stream_si512(dest, 30, zmm30);
95 mm512_stream_si512(dest, 31, zmm31);
96 }
97
98 static force_inline void
99 memmove_movnt16x64b(char *dest, const char *src)
100 {
101 __m512i zmm0 = mm512_loadu_si512(src, 0);
102 __m512i zmm1 = mm512_loadu_si512(src, 1);
103 __m512i zmm2 = mm512_loadu_si512(src, 2);
104 __m512i zmm3 = mm512_loadu_si512(src, 3);
105 __m512i zmm4 = mm512_loadu_si512(src, 4);
106 __m512i zmm5 = mm512_loadu_si512(src, 5);
107 __m512i zmm6 = mm512_loadu_si512(src, 6);
108 __m512i zmm7 = mm512_loadu_si512(src, 7);
109 __m512i zmm8 = mm512_loadu_si512(src, 8);
110 __m512i zmm9 = mm512_loadu_si512(src, 9);
111 __m512i zmm10 = mm512_loadu_si512(src, 10);
112 __m512i zmm11 = mm512_loadu_si512(src, 11);
113 __m512i zmm12 = mm512_loadu_si512(src, 12);
114 __m512i zmm13 = mm512_loadu_si512(src, 13);
115 __m512i zmm14 = mm512_loadu_si512(src, 14);
116 __m512i zmm15 = mm512_loadu_si512(src, 15);
117
118 mm512_stream_si512(dest, 0, zmm0);
119 mm512_stream_si512(dest, 1, zmm1);
120 mm512_stream_si512(dest, 2, zmm2);
121 mm512_stream_si512(dest, 3, zmm3);
122 mm512_stream_si512(dest, 4, zmm4);
123 mm512_stream_si512(dest, 5, zmm5);
124 mm512_stream_si512(dest, 6, zmm6);
125 mm512_stream_si512(dest, 7, zmm7);
126 mm512_stream_si512(dest, 8, zmm8);
127 mm512_stream_si512(dest, 9, zmm9);
128 mm512_stream_si512(dest, 10, zmm10);
129 mm512_stream_si512(dest, 11, zmm11);
130 mm512_stream_si512(dest, 12, zmm12);
131 mm512_stream_si512(dest, 13, zmm13);
132 mm512_stream_si512(dest, 14, zmm14);
133 mm512_stream_si512(dest, 15, zmm15);
134 }
135
136 static force_inline void
137 memmove_movnt8x64b(char *dest, const char *src)
138 {
139 __m512i zmm0 = mm512_loadu_si512(src, 0);
140 __m512i zmm1 = mm512_loadu_si512(src, 1);
141 __m512i zmm2 = mm512_loadu_si512(src, 2);
142 __m512i zmm3 = mm512_loadu_si512(src, 3);
143 __m512i zmm4 = mm512_loadu_si512(src, 4);
144 __m512i zmm5 = mm512_loadu_si512(src, 5);
145 __m512i zmm6 = mm512_loadu_si512(src, 6);
146 __m512i zmm7 = mm512_loadu_si512(src, 7);
147
148 mm512_stream_si512(dest, 0, zmm0);
149 mm512_stream_si512(dest, 1, zmm1);
150 mm512_stream_si512(dest, 2, zmm2);
151 mm512_stream_si512(dest, 3, zmm3);
152 mm512_stream_si512(dest, 4, zmm4);
153 mm512_stream_si512(dest, 5, zmm5);
154 mm512_stream_si512(dest, 6, zmm6);
155 mm512_stream_si512(dest, 7, zmm7);
156 }
157
158 static force_inline void
159 memmove_movnt4x64b(char *dest, const char *src)
160 {
161 __m512i zmm0 = mm512_loadu_si512(src, 0);
162 __m512i zmm1 = mm512_loadu_si512(src, 1);
163 __m512i zmm2 = mm512_loadu_si512(src, 2);
164 __m512i zmm3 = mm512_loadu_si512(src, 3);
165
166 mm512_stream_si512(dest, 0, zmm0);
167 mm512_stream_si512(dest, 1, zmm1);
168 mm512_stream_si512(dest, 2, zmm2);
169 mm512_stream_si512(dest, 3, zmm3);
170 }
171
172 static force_inline void
173 memmove_movnt2x64b(char *dest, const char *src)
174 {
175 __m512i zmm0 = mm512_loadu_si512(src, 0);
176 __m512i zmm1 = mm512_loadu_si512(src, 1);
177
178 mm512_stream_si512(dest, 0, zmm0);
179 mm512_stream_si512(dest, 1, zmm1);
180 }
181
182 static force_inline void
183 memmove_movnt1x64b(char *dest, const char *src)
184 {
185 __m512i zmm0 = mm512_loadu_si512(src, 0);
186
187 mm512_stream_si512(dest, 0, zmm0);
188 }
189
190 static force_inline void
191 memmove_movnt1x32b(char *dest, const char *src)
192 {
193 __m256i zmm0 = _mm256_loadu_si256((__m256i *)src);
194
195 _mm256_stream_si256((__m256i *)dest, zmm0);
196 }
197
198 static force_inline void
199 memmove_movnt1x16b(char *dest, const char *src)
200 {
201 __m128i ymm0 = _mm_loadu_si128((__m128i *)src);
202
203 _mm_stream_si128((__m128i *)dest, ymm0);
204 }
205
206 static force_inline void
207 memmove_movnt1x8b(char *dest, const char *src)
208 {
209 _mm_stream_si64((long long *)dest, *(long long *)src);
210 }
211
212 static force_inline void
213 memmove_movnt1x4b(char *dest, const char *src)
214 {
215 _mm_stream_si32((int *)dest, *(int *)src);
216 }
217
218 static force_inline void
219 memmove_movnt_avx512f_fw(char *dest, const char *src, size_t len,
220 flush_fn flush)
221 {
222 size_t cnt = (uint64_t)dest & 63;
223 if (cnt > 0) {
224 cnt = 64 - cnt;
225
226 if (cnt > len)
227 cnt = len;
228
229 memmove_small_avx512f(dest, src, cnt, flush);
230
231 dest += cnt;
232 src += cnt;
233 len -= cnt;
234 }
235
236 while (len >= 32 * 64) {
237 memmove_movnt32x64b(dest, src);
238 dest += 32 * 64;
239 src += 32 * 64;
240 len -= 32 * 64;
241 }
242
243 if (len >= 16 * 64) {
244 memmove_movnt16x64b(dest, src);
245 dest += 16 * 64;
246 src += 16 * 64;
247 len -= 16 * 64;
248 }
249
250 if (len >= 8 * 64) {
251 memmove_movnt8x64b(dest, src);
252 dest += 8 * 64;
253 src += 8 * 64;
254 len -= 8 * 64;
255 }
256
257 if (len >= 4 * 64) {
258 memmove_movnt4x64b(dest, src);
259 dest += 4 * 64;
260 src += 4 * 64;
261 len -= 4 * 64;
262 }
263
264 if (len >= 2 * 64) {
265 memmove_movnt2x64b(dest, src);
266 dest += 2 * 64;
267 src += 2 * 64;
268 len -= 2 * 64;
269 }
270
271 if (len >= 1 * 64) {
272 memmove_movnt1x64b(dest, src);
273
274 dest += 1 * 64;
275 src += 1 * 64;
276 len -= 1 * 64;
277 }
278
279 if (len == 0)
280 goto end;
281
282 /* There's no point in using more than 1 nt store for 1 cache line. */
283 if (util_is_pow2(len)) {
284 if (len == 32)
285 memmove_movnt1x32b(dest, src);
286 else if (len == 16)
287 memmove_movnt1x16b(dest, src);
288 else if (len == 8)
289 memmove_movnt1x8b(dest, src);
290 else if (len == 4)
291 memmove_movnt1x4b(dest, src);
292 else
293 goto nonnt;
294
295 goto end;
296 }
297
298 nonnt:
299 memmove_small_avx512f(dest, src, len, flush);
300 end:
301 avx_zeroupper();
302 }
303
304 static force_inline void
305 memmove_movnt_avx512f_bw(char *dest, const char *src, size_t len,
306 flush_fn flush)
307 {
308 dest += len;
309 src += len;
310
311 size_t cnt = (uint64_t)dest & 63;
312 if (cnt > 0) {
313 if (cnt > len)
314 cnt = len;
315
316 dest -= cnt;
317 src -= cnt;
318 len -= cnt;
319
320 memmove_small_avx512f(dest, src, cnt, flush);
321 }
322
323 while (len >= 32 * 64) {
324 dest -= 32 * 64;
325 src -= 32 * 64;
326 len -= 32 * 64;
327 memmove_movnt32x64b(dest, src);
328 }
329
330 if (len >= 16 * 64) {
331 dest -= 16 * 64;
332 src -= 16 * 64;
333 len -= 16 * 64;
334 memmove_movnt16x64b(dest, src);
335 }
336
337 if (len >= 8 * 64) {
338 dest -= 8 * 64;
339 src -= 8 * 64;
340 len -= 8 * 64;
341 memmove_movnt8x64b(dest, src);
342 }
343
344 if (len >= 4 * 64) {
345 dest -= 4 * 64;
346 src -= 4 * 64;
347 len -= 4 * 64;
348 memmove_movnt4x64b(dest, src);
349 }
350
351 if (len >= 2 * 64) {
352 dest -= 2 * 64;
353 src -= 2 * 64;
354 len -= 2 * 64;
355 memmove_movnt2x64b(dest, src);
356 }
357
358 if (len >= 1 * 64) {
359 dest -= 1 * 64;
360 src -= 1 * 64;
361 len -= 1 * 64;
362
363 memmove_movnt1x64b(dest, src);
364 }
365
366 if (len == 0)
367 goto end;
368
369 /* There's no point in using more than 1 nt store for 1 cache line. */
370 if (util_is_pow2(len)) {
371 if (len == 32) {
372 dest -= 32;
373 src -= 32;
374 memmove_movnt1x32b(dest, src);
375 } else if (len == 16) {
376 dest -= 16;
377 src -= 16;
378 memmove_movnt1x16b(dest, src);
379 } else if (len == 8) {
380 dest -= 8;
381 src -= 8;
382 memmove_movnt1x8b(dest, src);
383 } else if (len == 4) {
384 dest -= 4;
385 src -= 4;
386 memmove_movnt1x4b(dest, src);
387 } else {
388 goto nonnt;
389 }
390
391 goto end;
392 }
393
394 nonnt:
395 dest -= len;
396 src -= len;
397
398 memmove_small_avx512f(dest, src, len, flush);
399 end:
400 avx_zeroupper();
401 }
402
403 static force_inline void
404 memmove_movnt_avx512f(char *dest, const char *src, size_t len, flush_fn flush,
405 barrier_fn barrier)
406 {
407 if ((uintptr_t)dest - (uintptr_t)src >= len)
408 memmove_movnt_avx512f_fw(dest, src, len, flush);
409 else
410 memmove_movnt_avx512f_bw(dest, src, len, flush);
411
412 barrier();
413
414 VALGRIND_DO_FLUSH(dest, len);
415 }
416
417 void
418 memmove_movnt_avx512f_noflush(char *dest, const char *src, size_t len)
419 {
420 LOG(15, "dest %p src %p len %zu", dest, src, len);
421
422 memmove_movnt_avx512f(dest, src, len, noflush, barrier_after_ntstores);
423 }
424
425 void
426 memmove_movnt_avx512f_empty(char *dest, const char *src, size_t len)
427 {
428 LOG(15, "dest %p src %p len %zu", dest, src, len);
429
430 memmove_movnt_avx512f(dest, src, len, flush_empty_nolog,
431 barrier_after_ntstores);
432 }
433
434 void
435 memmove_movnt_avx512f_clflush(char *dest, const char *src, size_t len)
436 {
437 LOG(15, "dest %p src %p len %zu", dest, src, len);
438
439 memmove_movnt_avx512f(dest, src, len, flush_clflush_nolog,
440 barrier_after_ntstores);
441 }
442
443 void
444 memmove_movnt_avx512f_clflushopt(char *dest, const char *src, size_t len)
445 {
446 LOG(15, "dest %p src %p len %zu", dest, src, len);
447
448 memmove_movnt_avx512f(dest, src, len, flush_clflushopt_nolog,
449 no_barrier_after_ntstores);
450 }
451
452 void
453 memmove_movnt_avx512f_clwb(char *dest, const char *src, size_t len)
454 {
455 LOG(15, "dest %p src %p len %zu", dest, src, len);
456
457 memmove_movnt_avx512f(dest, src, len, flush_clwb_nolog,
458 no_barrier_after_ntstores);
459 }