]>
/*
 * Source: git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/atomic/src/find_address_sse41.cpp
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * http://www.boost.org/LICENSE_1_0.txt)
 *
 * Copyright (c) 2020 Andrey Semashev
 */
/*!
 * \file find_address_sse41.cpp
 *
 * This file contains the SSE4.1 implementation of the \c find_address algorithm
 */
14 #include <boost/predef/architecture/x86.h>
15 #include <boost/atomic/detail/int_sizes.hpp>
17 #if BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8)
20 #include <smmintrin.h>
22 #include <boost/cstdint.hpp>
23 #include <boost/atomic/detail/config.hpp>
24 #include <boost/atomic/detail/intptr.hpp>
25 #include "find_address.hpp"
26 #include "x86_vector_tools.hpp"
27 #include "bit_operation_tools.hpp"
29 #include <boost/atomic/detail/header.hpp>
35 //! SSE4.1 implementation of the \c find_address algorithm
36 std::size_t find_address_sse41(const volatile void* addr
, const volatile void* const* addrs
, std::size_t size
)
39 return find_address_generic(addr
, addrs
, size
);
41 const __m128i mm_addr
= mm_set1_epiptr((uintptr_t)addr
);
43 const std::size_t n
= (size
+ 1u) & ~static_cast< std::size_t >(1u);
44 for (std::size_t m
= n
& ~static_cast< std::size_t >(15u); pos
< m
; pos
+= 16u)
46 __m128i mm1
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
));
47 __m128i mm2
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 2u));
48 __m128i mm3
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 4u));
49 __m128i mm4
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 6u));
50 __m128i mm5
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 8u));
51 __m128i mm6
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 10u));
52 __m128i mm7
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 12u));
53 __m128i mm8
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 14u));
55 mm1
= _mm_cmpeq_epi64(mm1
, mm_addr
);
56 mm2
= _mm_cmpeq_epi64(mm2
, mm_addr
);
57 mm3
= _mm_cmpeq_epi64(mm3
, mm_addr
);
58 mm4
= _mm_cmpeq_epi64(mm4
, mm_addr
);
59 mm5
= _mm_cmpeq_epi64(mm5
, mm_addr
);
60 mm6
= _mm_cmpeq_epi64(mm6
, mm_addr
);
61 mm7
= _mm_cmpeq_epi64(mm7
, mm_addr
);
62 mm8
= _mm_cmpeq_epi64(mm8
, mm_addr
);
64 mm1
= _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1
), _mm_castsi128_ps(mm2
), _MM_SHUFFLE(2, 0, 2, 0)));
65 mm3
= _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3
), _mm_castsi128_ps(mm4
), _MM_SHUFFLE(2, 0, 2, 0)));
66 mm5
= _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm5
), _mm_castsi128_ps(mm6
), _MM_SHUFFLE(2, 0, 2, 0)));
67 mm7
= _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm7
), _mm_castsi128_ps(mm8
), _MM_SHUFFLE(2, 0, 2, 0)));
69 mm1
= _mm_packs_epi32(mm1
, mm3
);
70 mm5
= _mm_packs_epi32(mm5
, mm7
);
72 mm1
= _mm_packs_epi16(mm1
, mm5
);
74 uint32_t mask
= _mm_movemask_epi8(mm1
);
77 pos
+= atomics::detail::count_trailing_zeros(mask
);
84 __m128i mm1
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
));
85 __m128i mm2
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 2u));
86 __m128i mm3
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 4u));
87 __m128i mm4
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 6u));
89 mm1
= _mm_cmpeq_epi64(mm1
, mm_addr
);
90 mm2
= _mm_cmpeq_epi64(mm2
, mm_addr
);
91 mm3
= _mm_cmpeq_epi64(mm3
, mm_addr
);
92 mm4
= _mm_cmpeq_epi64(mm4
, mm_addr
);
94 mm1
= _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1
), _mm_castsi128_ps(mm2
), _MM_SHUFFLE(2, 0, 2, 0)));
95 mm3
= _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3
), _mm_castsi128_ps(mm4
), _MM_SHUFFLE(2, 0, 2, 0)));
97 mm1
= _mm_packs_epi32(mm1
, mm3
);
99 uint32_t mask
= _mm_movemask_epi8(mm1
);
102 pos
+= atomics::detail::count_trailing_zeros(mask
) / 2u;
111 __m128i mm1
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
));
112 __m128i mm2
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
+ 2u));
114 mm1
= _mm_cmpeq_epi64(mm1
, mm_addr
);
115 mm2
= _mm_cmpeq_epi64(mm2
, mm_addr
);
117 mm1
= _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1
), _mm_castsi128_ps(mm2
), _MM_SHUFFLE(2, 0, 2, 0)));
119 uint32_t mask
= _mm_movemask_ps(_mm_castsi128_ps(mm1
));
122 pos
+= atomics::detail::count_trailing_zeros(mask
);
131 __m128i mm1
= _mm_load_si128(reinterpret_cast< const __m128i
* >(addrs
+ pos
));
133 mm1
= _mm_cmpeq_epi64(mm1
, mm_addr
);
134 uint32_t mask
= _mm_movemask_pd(_mm_castsi128_pd(mm1
));
137 pos
+= atomics::detail::count_trailing_zeros(mask
);
148 } // namespace detail
149 } // namespace atomics
152 #include <boost/atomic/detail/footer.hpp>
154 #endif // BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8)