]> git.proxmox.com Git - ceph.git/blame - ceph/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_sse2.h
import ceph 16.2.7
[ceph.git] / ceph / src / pmdk / src / libpmem2 / x86_64 / memcpy / memcpy_sse2.h
CommitLineData
a4b75251
TL
1/* SPDX-License-Identifier: BSD-3-Clause */
2/* Copyright 2017-2020, Intel Corporation */
3
4#ifndef PMEM2_MEMCPY_SSE2_H
5#define PMEM2_MEMCPY_SSE2_H
6
7#include <xmmintrin.h>
8#include <stddef.h>
9#include <stdint.h>
10
11#include "out.h"
12
13static force_inline void
14memmove_small_sse2_noflush(char *dest, const char *src, size_t len)
15{
16 ASSERT(len <= 64);
17
18 if (len <= 8)
19 goto le8;
20 if (len <= 32)
21 goto le32;
22
23 if (len > 48) {
24 /* 49..64 */
25 __m128i xmm0 = _mm_loadu_si128((__m128i *)src);
26 __m128i xmm1 = _mm_loadu_si128((__m128i *)(src + 16));
27 __m128i xmm2 = _mm_loadu_si128((__m128i *)(src + 32));
28 __m128i xmm3 = _mm_loadu_si128((__m128i *)(src + len - 16));
29
30 _mm_storeu_si128((__m128i *)dest, xmm0);
31 _mm_storeu_si128((__m128i *)(dest + 16), xmm1);
32 _mm_storeu_si128((__m128i *)(dest + 32), xmm2);
33 _mm_storeu_si128((__m128i *)(dest + len - 16), xmm3);
34 return;
35 }
36
37 /* 33..48 */
38 __m128i xmm0 = _mm_loadu_si128((__m128i *)src);
39 __m128i xmm1 = _mm_loadu_si128((__m128i *)(src + 16));
40 __m128i xmm2 = _mm_loadu_si128((__m128i *)(src + len - 16));
41
42 _mm_storeu_si128((__m128i *)dest, xmm0);
43 _mm_storeu_si128((__m128i *)(dest + 16), xmm1);
44 _mm_storeu_si128((__m128i *)(dest + len - 16), xmm2);
45 return;
46
47le32:
48 if (len > 16) {
49 /* 17..32 */
50 __m128i xmm0 = _mm_loadu_si128((__m128i *)src);
51 __m128i xmm1 = _mm_loadu_si128((__m128i *)(src + len - 16));
52
53 _mm_storeu_si128((__m128i *)dest, xmm0);
54 _mm_storeu_si128((__m128i *)(dest + len - 16), xmm1);
55 return;
56 }
57
58 /* 9..16 */
59 uint64_t d80 = *(ua_uint64_t *)src;
60 uint64_t d81 = *(ua_uint64_t *)(src + len - 8);
61
62 *(ua_uint64_t *)dest = d80;
63 *(ua_uint64_t *)(dest + len - 8) = d81;
64 return;
65
66le8:
67 if (len <= 2)
68 goto le2;
69
70 if (len > 4) {
71 /* 5..8 */
72 uint32_t d40 = *(ua_uint32_t *)src;
73 uint32_t d41 = *(ua_uint32_t *)(src + len - 4);
74
75 *(ua_uint32_t *)dest = d40;
76 *(ua_uint32_t *)(dest + len - 4) = d41;
77 return;
78 }
79
80 /* 3..4 */
81 uint16_t d20 = *(ua_uint16_t *)src;
82 uint16_t d21 = *(ua_uint16_t *)(src + len - 2);
83
84 *(ua_uint16_t *)dest = d20;
85 *(ua_uint16_t *)(dest + len - 2) = d21;
86 return;
87
88le2:
89 if (len == 2) {
90 *(ua_uint16_t *)dest = *(ua_uint16_t *)src;
91 return;
92 }
93
94 *(uint8_t *)dest = *(uint8_t *)src;
95}
96
97static force_inline void
98memmove_small_sse2(char *dest, const char *src, size_t len, flush_fn flush)
99{
100 /*
101 * pmemcheck complains about "overwritten stores before they were made
102 * persistent" for overlapping stores (last instruction in each code
103 * path) in the optimized version.
104 * libc's memcpy also does that, so we can't use it here.
105 */
106 if (On_pmemcheck) {
107 memmove_nodrain_generic(dest, src, len, PMEM2_F_MEM_NOFLUSH,
108 NULL);
109 } else {
110 memmove_small_sse2_noflush(dest, src, len);
111 }
112
113 flush(dest, len);
114}
115
116#endif