]> git.proxmox.com Git - ceph.git/blame - ceph/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_sse2.c
import ceph 16.2.7
[ceph.git] / ceph / src / pmdk / src / libpmem2 / x86_64 / memcpy / memcpy_nt_sse2.c
CommitLineData
a4b75251
TL
1// SPDX-License-Identifier: BSD-3-Clause
2/* Copyright 2017-2020, Intel Corporation */
3
4#include <immintrin.h>
5#include <stddef.h>
6#include <stdint.h>
7
8#include "pmem2_arch.h"
9#include "flush.h"
10#include "memcpy_memset.h"
11#include "memcpy_sse2.h"
12#include "valgrind_internal.h"
13
14static force_inline __m128i
15mm_loadu_si128(const char *src, unsigned idx)
16{
17 return _mm_loadu_si128((const __m128i *)src + idx);
18}
19
20static force_inline void
21mm_stream_si128(char *dest, unsigned idx, __m128i src)
22{
23 _mm_stream_si128((__m128i *)dest + idx, src);
24 barrier();
25}
26
27static force_inline void
28memmove_movnt4x64b(char *dest, const char *src)
29{
30 __m128i xmm0 = mm_loadu_si128(src, 0);
31 __m128i xmm1 = mm_loadu_si128(src, 1);
32 __m128i xmm2 = mm_loadu_si128(src, 2);
33 __m128i xmm3 = mm_loadu_si128(src, 3);
34 __m128i xmm4 = mm_loadu_si128(src, 4);
35 __m128i xmm5 = mm_loadu_si128(src, 5);
36 __m128i xmm6 = mm_loadu_si128(src, 6);
37 __m128i xmm7 = mm_loadu_si128(src, 7);
38 __m128i xmm8 = mm_loadu_si128(src, 8);
39 __m128i xmm9 = mm_loadu_si128(src, 9);
40 __m128i xmm10 = mm_loadu_si128(src, 10);
41 __m128i xmm11 = mm_loadu_si128(src, 11);
42 __m128i xmm12 = mm_loadu_si128(src, 12);
43 __m128i xmm13 = mm_loadu_si128(src, 13);
44 __m128i xmm14 = mm_loadu_si128(src, 14);
45 __m128i xmm15 = mm_loadu_si128(src, 15);
46
47 mm_stream_si128(dest, 0, xmm0);
48 mm_stream_si128(dest, 1, xmm1);
49 mm_stream_si128(dest, 2, xmm2);
50 mm_stream_si128(dest, 3, xmm3);
51 mm_stream_si128(dest, 4, xmm4);
52 mm_stream_si128(dest, 5, xmm5);
53 mm_stream_si128(dest, 6, xmm6);
54 mm_stream_si128(dest, 7, xmm7);
55 mm_stream_si128(dest, 8, xmm8);
56 mm_stream_si128(dest, 9, xmm9);
57 mm_stream_si128(dest, 10, xmm10);
58 mm_stream_si128(dest, 11, xmm11);
59 mm_stream_si128(dest, 12, xmm12);
60 mm_stream_si128(dest, 13, xmm13);
61 mm_stream_si128(dest, 14, xmm14);
62 mm_stream_si128(dest, 15, xmm15);
63}
64
65static force_inline void
66memmove_movnt2x64b(char *dest, const char *src)
67{
68 __m128i xmm0 = mm_loadu_si128(src, 0);
69 __m128i xmm1 = mm_loadu_si128(src, 1);
70 __m128i xmm2 = mm_loadu_si128(src, 2);
71 __m128i xmm3 = mm_loadu_si128(src, 3);
72 __m128i xmm4 = mm_loadu_si128(src, 4);
73 __m128i xmm5 = mm_loadu_si128(src, 5);
74 __m128i xmm6 = mm_loadu_si128(src, 6);
75 __m128i xmm7 = mm_loadu_si128(src, 7);
76
77 mm_stream_si128(dest, 0, xmm0);
78 mm_stream_si128(dest, 1, xmm1);
79 mm_stream_si128(dest, 2, xmm2);
80 mm_stream_si128(dest, 3, xmm3);
81 mm_stream_si128(dest, 4, xmm4);
82 mm_stream_si128(dest, 5, xmm5);
83 mm_stream_si128(dest, 6, xmm6);
84 mm_stream_si128(dest, 7, xmm7);
85}
86
87static force_inline void
88memmove_movnt1x64b(char *dest, const char *src)
89{
90 __m128i xmm0 = mm_loadu_si128(src, 0);
91 __m128i xmm1 = mm_loadu_si128(src, 1);
92 __m128i xmm2 = mm_loadu_si128(src, 2);
93 __m128i xmm3 = mm_loadu_si128(src, 3);
94
95 mm_stream_si128(dest, 0, xmm0);
96 mm_stream_si128(dest, 1, xmm1);
97 mm_stream_si128(dest, 2, xmm2);
98 mm_stream_si128(dest, 3, xmm3);
99}
100
101static force_inline void
102memmove_movnt1x32b(char *dest, const char *src)
103{
104 __m128i xmm0 = mm_loadu_si128(src, 0);
105 __m128i xmm1 = mm_loadu_si128(src, 1);
106
107 mm_stream_si128(dest, 0, xmm0);
108 mm_stream_si128(dest, 1, xmm1);
109}
110
111static force_inline void
112memmove_movnt1x16b(char *dest, const char *src)
113{
114 __m128i xmm0 = mm_loadu_si128(src, 0);
115
116 mm_stream_si128(dest, 0, xmm0);
117}
118
119static force_inline void
120memmove_movnt1x8b(char *dest, const char *src)
121{
122 _mm_stream_si64((long long *)dest, *(long long *)src);
123}
124
125static force_inline void
126memmove_movnt1x4b(char *dest, const char *src)
127{
128 _mm_stream_si32((int *)dest, *(int *)src);
129}
130
131static force_inline void
132memmove_movnt_sse_fw(char *dest, const char *src, size_t len, flush_fn flush,
133 perf_barrier_fn perf_barrier)
134{
135 size_t cnt = (uint64_t)dest & 63;
136 if (cnt > 0) {
137 cnt = 64 - cnt;
138
139 if (cnt > len)
140 cnt = len;
141
142 memmove_small_sse2(dest, src, cnt, flush);
143
144 dest += cnt;
145 src += cnt;
146 len -= cnt;
147 }
148
149 const char *srcend = src + len;
150 prefetch_ini_fw(src, len);
151
152 while (len >= PERF_BARRIER_SIZE) {
153 prefetch_next_fw(src, srcend);
154
155 memmove_movnt4x64b(dest, src);
156 dest += 4 * 64;
157 src += 4 * 64;
158 len -= 4 * 64;
159
160 memmove_movnt4x64b(dest, src);
161 dest += 4 * 64;
162 src += 4 * 64;
163 len -= 4 * 64;
164
165 memmove_movnt4x64b(dest, src);
166 dest += 4 * 64;
167 src += 4 * 64;
168 len -= 4 * 64;
169
170 COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (4 + 4 + 4) * 64);
171
172 if (len)
173 perf_barrier();
174 }
175
176 while (len >= 4 * 64) {
177 memmove_movnt4x64b(dest, src);
178 dest += 4 * 64;
179 src += 4 * 64;
180 len -= 4 * 64;
181 }
182
183 if (len >= 2 * 64) {
184 memmove_movnt2x64b(dest, src);
185 dest += 2 * 64;
186 src += 2 * 64;
187 len -= 2 * 64;
188 }
189
190 if (len >= 1 * 64) {
191 memmove_movnt1x64b(dest, src);
192
193 dest += 1 * 64;
194 src += 1 * 64;
195 len -= 1 * 64;
196 }
197
198 if (len == 0)
199 return;
200
201 /* There's no point in using more than 1 nt store for 1 cache line. */
202 if (util_is_pow2(len)) {
203 if (len == 32)
204 memmove_movnt1x32b(dest, src);
205 else if (len == 16)
206 memmove_movnt1x16b(dest, src);
207 else if (len == 8)
208 memmove_movnt1x8b(dest, src);
209 else if (len == 4)
210 memmove_movnt1x4b(dest, src);
211 else
212 goto nonnt;
213
214 return;
215 }
216
217nonnt:
218 memmove_small_sse2(dest, src, len, flush);
219}
220
221static force_inline void
222memmove_movnt_sse_bw(char *dest, const char *src, size_t len, flush_fn flush,
223 perf_barrier_fn perf_barrier)
224{
225 dest += len;
226 src += len;
227
228 size_t cnt = (uint64_t)dest & 63;
229 if (cnt > 0) {
230 if (cnt > len)
231 cnt = len;
232
233 dest -= cnt;
234 src -= cnt;
235 len -= cnt;
236
237 memmove_small_sse2(dest, src, cnt, flush);
238 }
239
240 const char *srcbegin = src - len;
241 prefetch_ini_bw(src, len);
242
243 while (len >= PERF_BARRIER_SIZE) {
244 prefetch_next_bw(src, srcbegin);
245
246 dest -= 4 * 64;
247 src -= 4 * 64;
248 len -= 4 * 64;
249 memmove_movnt4x64b(dest, src);
250
251 dest -= 4 * 64;
252 src -= 4 * 64;
253 len -= 4 * 64;
254 memmove_movnt4x64b(dest, src);
255
256 dest -= 4 * 64;
257 src -= 4 * 64;
258 len -= 4 * 64;
259 memmove_movnt4x64b(dest, src);
260
261 COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (4 + 4 + 4) * 64);
262
263 if (len)
264 perf_barrier();
265 }
266
267 while (len >= 4 * 64) {
268 dest -= 4 * 64;
269 src -= 4 * 64;
270 len -= 4 * 64;
271 memmove_movnt4x64b(dest, src);
272 }
273
274 if (len >= 2 * 64) {
275 dest -= 2 * 64;
276 src -= 2 * 64;
277 len -= 2 * 64;
278 memmove_movnt2x64b(dest, src);
279 }
280
281 if (len >= 1 * 64) {
282 dest -= 1 * 64;
283 src -= 1 * 64;
284 len -= 1 * 64;
285 memmove_movnt1x64b(dest, src);
286 }
287
288 if (len == 0)
289 return;
290
291 /* There's no point in using more than 1 nt store for 1 cache line. */
292 if (util_is_pow2(len)) {
293 if (len == 32) {
294 dest -= 32;
295 src -= 32;
296 memmove_movnt1x32b(dest, src);
297 } else if (len == 16) {
298 dest -= 16;
299 src -= 16;
300 memmove_movnt1x16b(dest, src);
301 } else if (len == 8) {
302 dest -= 8;
303 src -= 8;
304 memmove_movnt1x8b(dest, src);
305 } else if (len == 4) {
306 dest -= 4;
307 src -= 4;
308 memmove_movnt1x4b(dest, src);
309 } else {
310 goto nonnt;
311 }
312
313 return;
314 }
315
316nonnt:
317 dest -= len;
318 src -= len;
319 memmove_small_sse2(dest, src, len, flush);
320}
321
322static force_inline void
323memmove_movnt_sse2(char *dest, const char *src, size_t len, flush_fn flush,
324 barrier_fn barrier, perf_barrier_fn perf_barrier)
325{
326 if ((uintptr_t)dest - (uintptr_t)src >= len)
327 memmove_movnt_sse_fw(dest, src, len, flush, perf_barrier);
328 else
329 memmove_movnt_sse_bw(dest, src, len, flush, perf_barrier);
330
331 barrier();
332
333 VALGRIND_DO_FLUSH(dest, len);
334}
335
336/* variants without perf_barrier */
337
338void
339memmove_movnt_sse2_noflush_nobarrier(char *dest, const char *src, size_t len)
340{
341 LOG(15, "dest %p src %p len %zu", dest, src, len);
342
343 memmove_movnt_sse2(dest, src, len, noflush, barrier_after_ntstores,
344 no_barrier);
345}
346
347void
348memmove_movnt_sse2_empty_nobarrier(char *dest, const char *src, size_t len)
349{
350 LOG(15, "dest %p src %p len %zu", dest, src, len);
351
352 memmove_movnt_sse2(dest, src, len, flush_empty_nolog,
353 barrier_after_ntstores, no_barrier);
354}
355
356void
357memmove_movnt_sse2_clflush_nobarrier(char *dest, const char *src, size_t len)
358{
359 LOG(15, "dest %p src %p len %zu", dest, src, len);
360
361 memmove_movnt_sse2(dest, src, len, flush_clflush_nolog,
362 barrier_after_ntstores, no_barrier);
363}
364
365void
366memmove_movnt_sse2_clflushopt_nobarrier(char *dest, const char *src, size_t len)
367{
368 LOG(15, "dest %p src %p len %zu", dest, src, len);
369
370 memmove_movnt_sse2(dest, src, len, flush_clflushopt_nolog,
371 no_barrier_after_ntstores, no_barrier);
372}
373
374void
375memmove_movnt_sse2_clwb_nobarrier(char *dest, const char *src, size_t len)
376{
377 LOG(15, "dest %p src %p len %zu", dest, src, len);
378
379 memmove_movnt_sse2(dest, src, len, flush_clwb_nolog,
380 no_barrier_after_ntstores, no_barrier);
381}
382
383/* variants with perf_barrier */
384
385void
386memmove_movnt_sse2_noflush_wcbarrier(char *dest, const char *src, size_t len)
387{
388 LOG(15, "dest %p src %p len %zu", dest, src, len);
389
390 memmove_movnt_sse2(dest, src, len, noflush, barrier_after_ntstores,
391 wc_barrier);
392}
393
394void
395memmove_movnt_sse2_empty_wcbarrier(char *dest, const char *src, size_t len)
396{
397 LOG(15, "dest %p src %p len %zu", dest, src, len);
398
399 memmove_movnt_sse2(dest, src, len, flush_empty_nolog,
400 barrier_after_ntstores, wc_barrier);
401}
402
403void
404memmove_movnt_sse2_clflush_wcbarrier(char *dest, const char *src, size_t len)
405{
406 LOG(15, "dest %p src %p len %zu", dest, src, len);
407
408 memmove_movnt_sse2(dest, src, len, flush_clflush_nolog,
409 barrier_after_ntstores, wc_barrier);
410}
411
412void
413memmove_movnt_sse2_clflushopt_wcbarrier(char *dest, const char *src, size_t len)
414{
415 LOG(15, "dest %p src %p len %zu", dest, src, len);
416
417 memmove_movnt_sse2(dest, src, len, flush_clflushopt_nolog,
418 no_barrier_after_ntstores, wc_barrier);
419}
420
421void
422memmove_movnt_sse2_clwb_wcbarrier(char *dest, const char *src, size_t len)
423{
424 LOG(15, "dest %p src %p len %zu", dest, src, len);
425
426 memmove_movnt_sse2(dest, src, len, flush_clwb_nolog,
427 no_barrier_after_ntstores, wc_barrier);
428}