]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Copyright Andrey Semashev 2013. | |
3 | * Distributed under the Boost Software License, Version 1.0. | |
4 | * (See accompanying file LICENSE_1_0.txt or copy at | |
5 | * http://www.boost.org/LICENSE_1_0.txt) | |
6 | */ | |
7 | /*! | |
8 | * \file uuid/detail/uuid_x86.hpp | |
9 | * | |
10 | * \brief This header contains optimized SSE implementation of \c boost::uuid operations. | |
11 | */ | |
12 | ||
13 | #ifndef BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ | |
14 | #define BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ | |
15 | ||
16 | // MSVC does not always have immintrin.h (at least, not up to MSVC 10), so include the appropriate header for each instruction set | |
17 | #if defined(BOOST_UUID_USE_SSE41) | |
18 | #include <smmintrin.h> | |
19 | #elif defined(BOOST_UUID_USE_SSE3) | |
20 | #include <pmmintrin.h> | |
21 | #else | |
22 | #include <emmintrin.h> | |
23 | #endif | |
24 | ||
25 | #if defined(BOOST_MSVC) && defined(_M_X64) && !defined(BOOST_UUID_USE_SSE3) && (BOOST_MSVC < 1900 /* Fixed in Visual Studio 2015 */ ) | |
26 | // At least MSVC 9 (VS2008) and 12 (VS2013) have an optimizer bug that sometimes results in incorrect SIMD code | |
27 | // generated in Release x64 mode. In particular, it affects operator==, where the compiler sometimes generates | |
 28 | // pcmpeqd with a memory operand instead of movdqu followed by pcmpeqd. The problem is that uuid can be | |
29 | // not aligned to 16 bytes and pcmpeqd causes alignment violation in this case. We cannot be sure that other | |
30 | // MSVC versions are not affected so we apply the workaround for all versions, except VS2015 on up where | |
31 | // the bug has been fixed. | |
32 | // | |
33 | // https://svn.boost.org/trac/boost/ticket/8509#comment:3 | |
34 | // https://connect.microsoft.com/VisualStudio/feedbackdetail/view/981648#tabs | |
35 | #define BOOST_UUID_DETAIL_MSVC_BUG981648 | |
36 | #if BOOST_MSVC >= 1600 | |
37 | extern "C" void _ReadWriteBarrier(void); | |
38 | #pragma intrinsic(_ReadWriteBarrier) | |
39 | #endif | |
40 | #endif | |
41 | ||
42 | namespace boost { | |
43 | namespace uuids { | |
44 | namespace detail { | |
45 | ||
/*!
 * \brief Loads 16 bytes of a uuid from a possibly unaligned address into an SSE register.
 *
 * The implementation is chosen by the available instruction set and by the
 * BOOST_UUID_DETAIL_MSVC_BUG981648 workaround macro defined above: buggy MSVC x64
 * optimizers could fold an unaligned load into a subsequent SIMD instruction that
 * requires 16-byte alignment, so on affected compilers the load must be kept separate.
 */
BOOST_FORCEINLINE __m128i load_unaligned_si128(const uint8_t* p) BOOST_NOEXCEPT
{
#if defined(BOOST_UUID_USE_SSE3)
    // SSE3 unaligned load; note that the workaround macro is never defined when
    // BOOST_UUID_USE_SSE3 is (see the condition above), so lddqu is always safe here.
    return _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p));
#elif !defined(BOOST_UUID_DETAIL_MSVC_BUG981648)
    // Plain SSE2 unaligned load for compilers not affected by the MSVC bug.
    return _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
#elif defined(BOOST_MSVC) && BOOST_MSVC >= 1600
    __m128i mm = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
    // Make sure this load doesn't get merged with the subsequent instructions
    _ReadWriteBarrier();
    return mm;
#else
    // VS2008 x64 doesn't respect _ReadWriteBarrier above, so we have to generate this crippled code to load unaligned data
    return _mm_unpacklo_epi64(_mm_loadl_epi64(reinterpret_cast< const __m128i* >(p)), _mm_loadl_epi64(reinterpret_cast< const __m128i* >(p + 8)));
#endif
}
62 | ||
63 | } // namespace detail | |
64 | ||
65 | inline bool uuid::is_nil() const BOOST_NOEXCEPT | |
66 | { | |
67 | __m128i mm = uuids::detail::load_unaligned_si128(data); | |
68 | #if defined(BOOST_UUID_USE_SSE41) | |
69 | return _mm_test_all_zeros(mm, mm) != 0; | |
70 | #else | |
71 | mm = _mm_cmpeq_epi32(mm, _mm_setzero_si128()); | |
72 | return _mm_movemask_epi8(mm) == 0xFFFF; | |
73 | #endif | |
74 | } | |
75 | ||
76 | inline void uuid::swap(uuid& rhs) BOOST_NOEXCEPT | |
77 | { | |
78 | __m128i mm_this = uuids::detail::load_unaligned_si128(data); | |
79 | __m128i mm_rhs = uuids::detail::load_unaligned_si128(rhs.data); | |
80 | _mm_storeu_si128(reinterpret_cast< __m128i* >(rhs.data), mm_this); | |
81 | _mm_storeu_si128(reinterpret_cast< __m128i* >(data), mm_rhs); | |
82 | } | |
83 | ||
84 | inline bool operator== (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT | |
85 | { | |
86 | __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data); | |
87 | __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data); | |
88 | ||
89 | #if defined(BOOST_UUID_USE_SSE41) | |
90 | __m128i mm = _mm_xor_si128(mm_left, mm_right); | |
91 | return _mm_test_all_zeros(mm, mm) != 0; | |
92 | #else | |
93 | __m128i mm_cmp = _mm_cmpeq_epi32(mm_left, mm_right); | |
94 | return _mm_movemask_epi8(mm_cmp) == 0xFFFF; | |
95 | #endif | |
96 | } | |
97 | ||
//! Lexicographic (unsigned byte-wise) less-than comparison of two uuids.
inline bool operator< (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
{
    __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
    __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);

    // To emulate lexicographical_compare behavior we have to perform two comparisons - the forward and reverse one.
    // Then we know which bytes are equivalent and which ones are different, and for those different the comparison results
    // will be opposite. Then we'll be able to find the first differing comparison result (for both forward and reverse ways),
    // and depending on which way it is for, this will be the result of the operation. There are a few notes to consider:
    //
    // 1. Due to little endian byte order the first bytes go into the lower part of the xmm registers,
    // so the comparison results in the least significant bits will actually be the most significant for the final operation result.
    // This means we have to determine which of the comparison results have the least significant bit on, and this is achieved with
    // the "(x - 1) ^ x" trick.
    // 2. Because there is only signed comparison in SSE/AVX, we have to invert byte comparison results whenever signs of the corresponding
    // bytes are different. I.e. in signed comparison it's -1 < 1, but in unsigned it is the opposite (255 > 1). To do that we XOR left and right,
    // making the most significant bit of each byte 1 if the signs are different, and later apply this mask with another XOR to the comparison results.
    // 3. pcmpgtb compares for "greater" relation, so we swap the arguments to get what we need.

    // MSB of each byte is set iff the signs of the corresponding bytes differ (note 2)
    const __m128i mm_signs_mask = _mm_xor_si128(mm_left, mm_right);

    // Forward (right > left) and reverse (left > right) signed byte comparisons
    __m128i mm_cmp = _mm_cmpgt_epi8(mm_right, mm_left), mm_rcmp = _mm_cmpgt_epi8(mm_left, mm_right);

    // Flip the per-byte results where the signs differed, turning signed into unsigned ordering
    mm_cmp = _mm_xor_si128(mm_signs_mask, mm_cmp);
    mm_rcmp = _mm_xor_si128(mm_signs_mask, mm_rcmp);

    // Collapse each comparison to a 16-bit mask: bit i is the MSB of byte i
    uint32_t cmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_cmp)), rcmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_rcmp));

    // Isolate all bits up to and including the lowest set bit (note 1); the mask whose
    // lowest differing byte wins is the numerically smaller one after this transform
    cmp = (cmp - 1u) ^ cmp;
    rcmp = (rcmp - 1u) ^ rcmp;

    return cmp < rcmp;
}
131 | ||
132 | } // namespace uuids | |
133 | } // namespace boost | |
134 | ||
135 | #endif // BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ |