2 * Copyright Andrey Semashev 2007 - 2015.
3 * Distributed under the Boost Software License, Version 1.0.
4 * (See accompanying file LICENSE_1_0.txt or copy at
5 * http://www.boost.org/LICENSE_1_0.txt)
9 * \author Andrey Semashev
12 * \brief This header is the Boost.Log library implementation, see the library documentation
13 * at http://www.boost.org/doc/libs/release/libs/log/doc/html/index.html.
16 // NOTE: You should generally avoid including headers as much as possible here, because this file
17 // is compiled with special compiler options, and any included header may result in generation of
18 // unintended code with these options and violation of ODR.
19 #include <boost/log/detail/config.hpp>
21 #include <immintrin.h>
22 #include <boost/cstdint.hpp>
23 #include <boost/log/detail/header.hpp>
// Detects 64-bit x86 targets from the compiler's predefined macros and records the result in
// BOOST_LOG_AUX_X86_64. The macro is tested further down in this file to choose how the SIMD
// constants are supplied to dump_pack.
25 #if defined(__x86_64) || defined(__x86_64__) || \
26 defined(__amd64__) || defined(__amd64) || \
28 #define BOOST_LOG_AUX_X86_64
33 BOOST_LOG_OPEN_NAMESPACE
// Table of hexadecimal digit characters, defined outside this file. The first index is 0 or 1
// (this file passes 1 when the stream has std::ios_base::uppercase set), the second is a half-byte value.
37 extern const char g_hex_char_table
[2][16];
// Data dumping routine declared here and defined outside this file; it takes the same arguments
// as the AVX2 implementation in this file.
// NOTE(review): presumably the non-SIMD fallback implementation - confirm against its definition.
39 template< typename CharT
>
40 extern void dump_data_generic(const void* data
, std::size_t size
, std::basic_ostream
< CharT
>& strm
);
42 BOOST_LOG_ANONYMOUS_NAMESPACE
{
// Buffering granularity of the dumper:
//   packs_per_stride - number of 32-byte input packs converted before the buffer is written to the stream
//   stride           - number of input bytes in such a batch (packs_per_stride * 32)
46 packs_per_stride
= 32,
47 stride
= packs_per_stride
* 32
// Implicit conversion to the 256-bit SIMD register type; returns the as_mm member.
55 BOOST_FORCEINLINE
operator __m256i () const { return as_mm
; }
// Byte shuffle control masks used by dump_pack with _mm256_shuffle_epi8 / _mm_shuffle_epi8.
// Each entry is the index of the source byte within a 128-bit lane. An entry of 0x80 produces a
// zero byte, which dump_pack subsequently turns into a space (unsigned max with mm_char_space).
// Patterns 1, 2 and 3 repeat the same 16 entries in both lanes.
58 static const ymm_constant mm_shuffle_pattern1
= {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }};
59 static const ymm_constant mm_shuffle_pattern2
= {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }};
60 static const ymm_constant mm_shuffle_pattern3
= {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
// Combined pattern for the 128-bit dump_pack overload: the low lane holds the entries of pattern 1
// and the high lane holds the entries of pattern 3.
61 static const ymm_constant mm_shuffle_pattern13
= {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
// Selects how the four byte-broadcast constants used by dump_pack are provided:
//   mm_15 (0x0F, half-byte mask), mm_9 (0x09, threshold between digits and letters),
//   mm_char_0 (0x30) and mm_char_space (0x20).
63 #if defined(BOOST_LOG_AUX_X86_64)
65 // x86-64 architecture has more registers which we can utilize to pass constants
// Parameter list prefix for dump_pack declarations (note the trailing comma).
66 #define BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL __m256i mm_15, __m256i mm_9, __m256i mm_char_0, __m256i mm_char_space,
// Matching argument list prefix for dump_pack calls.
67 #define BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_15, mm_9, mm_char_0, mm_char_space,
// Local definitions of the constants, expanded in the caller before dump_pack is invoked.
68 #define BOOST_LOG_AUX_MM_CONSTANTS \
69 const __m256i mm_15 = _mm256_set1_epi32(0x0F0F0F0F);\
70 const __m256i mm_9 = _mm256_set1_epi32(0x09090909);\
71 const __m256i mm_char_0 = _mm256_set1_epi32(0x30303030);\
72 const __m256i mm_char_space = _mm256_set1_epi32(0x20202020);
76 // MSVC in 32-bit mode is not able to pass all constants to dump_pack, and is also not able to align them on the stack, so we have to fetch them from global constants
77 static const ymm_constant mm_15
= {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }};
78 static const ymm_constant mm_9
= {{ 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 }};
79 static const ymm_constant mm_char_0
= {{ 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30 }};
80 static const ymm_constant mm_char_space
= {{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }};
// With the global constants above the three macros expand to nothing.
81 #define BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL
82 #define BOOST_LOG_AUX_MM_CONSTANT_ARGS
83 #define BOOST_LOG_AUX_MM_CONSTANTS
88 * \brief Dumps a pack of input data into a string of 8 bit ASCII characters.
90 * The composed string is placed as follows (in Intel notation): mm_output1[127:0], mm_output2[127:0], mm_output3[127:0], mm_output1[255:128], mm_output2[255:128], mm_output3[255:128].
// Converts 32 input bytes into 96 characters: every byte becomes a space followed by two hexadecimal digits.
// Parameters:
//   BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL - expands to the mm_15, mm_9, mm_char_0 and mm_char_space parameters
//                                         or to nothing, in which case these names refer to global constants
//   mm_char_10_to_a                     - value added to half-bytes greater than 9 so that they map to letters
//   mm_input                            - the 32 bytes to convert
//   mm_output1, mm_output2, mm_output3  - receive the resulting characters
92 static BOOST_FORCEINLINE
void dump_pack
94 BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL
95 __m256i mm_char_10_to_a
, __m256i mm_input
,
96 __m256i
& mm_output1
, __m256i
& mm_output2
, __m256i
& mm_output3
// Separate the high and low 4 bits of every byte. The shift works on 16-bit elements, so the mask
// with mm_15 also discards the bits that were shifted in from the neighbouring byte.
100 __m256i mm_input_hi
= _mm256_and_si256(_mm256_srli_epi16(mm_input
, 4), mm_15
);
101 __m256i mm_input_lo
= _mm256_and_si256(mm_input
, mm_15
);
103 // Stringize each of the halves
// Values greater than 9 yield an all-ones byte mask, which selects the letter offset from mm_char_10_to_a;
// other positions get a zero addend.
104 __m256i mm_addend_hi
= _mm256_cmpgt_epi8(mm_input_hi
, mm_9
);
105 __m256i mm_addend_lo
= _mm256_cmpgt_epi8(mm_input_lo
, mm_9
);
106 mm_addend_hi
= _mm256_and_si256(mm_char_10_to_a
, mm_addend_hi
);
107 mm_addend_lo
= _mm256_and_si256(mm_char_10_to_a
, mm_addend_lo
);
// Add the 0x30 base character to every half-byte...
109 mm_input_hi
= _mm256_add_epi8(mm_input_hi
, mm_char_0
);
110 mm_input_lo
= _mm256_add_epi8(mm_input_lo
, mm_char_0
);
// ...and the letter offset where the half-byte exceeded 9
112 mm_input_hi
= _mm256_add_epi8(mm_input_hi
, mm_addend_hi
);
113 mm_input_lo
= _mm256_add_epi8(mm_input_lo
, mm_addend_lo
);
115 // Join them back together
// Interleaving puts the high-digit character of each byte directly before its low-digit character
116 __m256i mm_1
= _mm256_unpacklo_epi8(mm_input_hi
, mm_input_lo
);
117 __m256i mm_2
= _mm256_unpackhi_epi8(mm_input_hi
, mm_input_lo
);
119 // Insert spaces between stringized bytes:
120 // |0123456789abcdef|0123456789abcdef|
121 // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef|
// The shuffle patterns contain 0x80 at the positions of the spaces, which results in zero bytes there
122 __m256i mm_out1
= _mm256_shuffle_epi8(mm_1
, mm_shuffle_pattern1
.as_mm
);
123 __m256i mm_out3
= _mm256_shuffle_epi8(mm_2
, mm_shuffle_pattern3
.as_mm
);
124 __m256i mm_out2
= _mm256_shuffle_epi8(_mm256_alignr_epi8(mm_2
, mm_1
, 10), mm_shuffle_pattern2
.as_mm
);
// Unsigned max with 0x20 replaces the zero bytes with spaces; the digit characters are all greater
// than 0x20 and pass through unchanged
126 mm_output1
= _mm256_max_epu8(mm_out1
, mm_char_space
);
127 mm_output2
= _mm256_max_epu8(mm_out2
, mm_char_space
);
128 mm_output3
= _mm256_max_epu8(mm_out3
, mm_char_space
);
131 //! Dumps a pack of input data into a string of 8 bit ASCII characters
// This overload converts 16 input bytes into 48 characters (a space and two hexadecimal digits per byte),
// returned in order in mm_output1, mm_output2 and mm_output3. The constant parameters have the same
// meaning as in the 256-bit overload; mm_char_10_to_a is the value added to half-bytes greater than 9.
132 static BOOST_FORCEINLINE
void dump_pack
134 BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL
135 __m256i mm_char_10_to_a
, __m128i mm_input
,
136 __m128i
& mm_output1
, __m128i
& mm_output2
, __m128i
& mm_output3
// Interleave the shifted and the unshifted input so that, after masking with mm_15, every input byte
// is represented by its high half-byte followed by its low half-byte. The first 8 input bytes end up
// in the low 128-bit lane and the last 8 in the high lane.
140 __m128i mm_input_hi
= _mm_srli_epi16(mm_input
, 4);
141 __m256i mm
= _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi8(mm_input_hi
, mm_input
)), _mm_unpackhi_epi8(mm_input_hi
, mm_input
), 1);
142 mm
= _mm256_and_si256(mm
, mm_15
);
144 // Stringize the halves
// Half-bytes greater than 9 receive the letter offset, all of them receive the 0x30 base character
145 __m256i mm_addend
= _mm256_cmpgt_epi8(mm
, mm_9
);
146 mm_addend
= _mm256_and_si256(mm_char_10_to_a
, mm_addend
);
148 mm
= _mm256_add_epi8(mm
, mm_char_0
);
149 mm
= _mm256_add_epi8(mm
, mm_addend
);
151 // Insert spaces between stringized bytes:
// The first and the third output parts are produced by a single 256-bit shuffle (low and high lane
// respectively); the middle part is taken from both lanes and shuffled as a 128-bit value.
152 __m256i mm_out13
= _mm256_shuffle_epi8(mm
, mm_shuffle_pattern13
.as_mm
);
153 __m128i mm_out2
= _mm_shuffle_epi8(_mm_alignr_epi8(_mm256_extracti128_si256(mm
, 1), _mm256_castsi256_si128(mm
), 10), _mm256_castsi256_si128(mm_shuffle_pattern2
.as_mm
));
// Replace the zero bytes produced by the shuffles with spaces and split the result into the outputs
155 mm_out13
= _mm256_max_epu8(mm_out13
, mm_char_space
);
156 mm_output2
= _mm_max_epu8(mm_out2
, _mm256_castsi256_si128(mm_char_space
));
157 mm_output1
= _mm256_castsi256_si128(mm_out13
);
158 mm_output3
= _mm256_extracti128_si256(mm_out13
, 1);
// Stores 16 8-bit characters from mm_chars into buf, widening them to the size of CharT.
// All stores are aligned stores, so buf has to be suitably aligned for them.
// NOTE(review): the case labels of the switch are not visible in this listing; the three groups of
// stores below presumably correspond to character sizes of 1, 2 and 4 bytes - confirm.
161 template< typename CharT
>
162 BOOST_FORCEINLINE
void store_characters(__m128i mm_chars
, CharT
* buf
)
164 switch (sizeof(CharT
))
// Store the 16 bytes as they are
167 _mm_store_si128(reinterpret_cast< __m128i
* >(buf
), mm_chars
);
// Zero-extend every byte to 16 bits and store the resulting 32 bytes
171 _mm256_store_si256(reinterpret_cast< __m256i
* >(buf
), _mm256_cvtepu8_epi16(mm_chars
));
// Zero-extend every byte to 32 bits. One conversion handles the low 8 bytes of its argument, so
// the upper 8 bytes are first moved to the low half of a second register. 64 bytes are stored in total.
176 __m128i mm
= _mm_unpackhi_epi64(mm_chars
, mm_chars
);
177 _mm256_store_si256(reinterpret_cast< __m256i
* >(buf
), _mm256_cvtepu8_epi32(mm_chars
));
178 _mm256_store_si256(reinterpret_cast< __m256i
* >(buf
) + 1, _mm256_cvtepu8_epi32(mm
));
// Stores the 96 characters produced by the 256-bit dump_pack into buf in output order:
// the low 128-bit halves of the three registers first, then their high halves, 16 characters each.
184 template< typename CharT
>
185 BOOST_FORCEINLINE
void store_characters_x3(__m256i mm_chars1
, __m256i mm_chars2
, __m256i mm_chars3
, CharT
* buf
)
// Low halves: characters 0..47
187 store_characters(_mm256_castsi256_si128(mm_chars1
), buf
);
188 store_characters(_mm256_castsi256_si128(mm_chars2
), buf
+ 16);
189 store_characters(_mm256_castsi256_si128(mm_chars3
), buf
+ 32);
// High halves: characters 48..95
190 store_characters(_mm256_extracti128_si256(mm_chars1
, 1), buf
+ 48);
191 store_characters(_mm256_extracti128_si256(mm_chars2
, 1), buf
+ 64);
192 store_characters(_mm256_extracti128_si256(mm_chars3
, 1), buf
+ 80);
// Writes a hexadecimal dump of a memory region to a stream using AVX2 instructions.
// Every input byte is formatted as a space followed by two hexadecimal digits; the letter case
// follows the std::ios_base::uppercase flag of the stream.
//   data - pointer to the bytes to dump
//   size - number of bytes to dump
//   strm - stream that receives the characters
// NOTE(review): several statements of this function (some branch conditions and the declarations and
// updates of the locals b and n) are not visible in this listing; the comments describe the visible code only.
195 template< typename CharT
>
196 BOOST_FORCEINLINE
void dump_data_avx2(const void* data
, std::size_t size
, std::basic_ostream
< CharT
>& strm
)
198 typedef CharT char_type
;
// Room for one stride of output (3 characters per input byte) plus slack for aligning the buffer
200 char_type buf_storage
[stride
* 3u + 32u];
201 // Align the temporary buffer at 32 bytes
202 char_type
* const buf
= reinterpret_cast< char_type
* >((uint8_t*)buf_storage
+ (32u - (((uintptr_t)(char_type
*)buf_storage
) & 31u)));
203 char_type
* buf_begin
= buf
+ 1u; // skip the first space of the first chunk
204 char_type
* buf_end
= buf
+ stride
* 3u;
// Select the offset that turns half-bytes greater than 9 into letters of the requested case
206 __m256i mm_char_10_to_a
;
207 if (strm
.flags() & std::ios_base::uppercase
)
208 mm_char_10_to_a
= _mm256_set1_epi32(0x07070707); // '9' is 0x39 and 'A' is 0x41 in ASCII, so we have to add 0x07 to 0x3A to get uppercase letters
210 mm_char_10_to_a
= _mm256_set1_epi32(0x27272727); // ...and 'a' is 0x61, which means we have to add 0x27 to 0x3A to get lowercase letters
212 // First, check the input alignment. Also, if we can dump the whole data in one go, do it right away. It turns out to be faster than splitting
213 // the work between prealign and tail part. It is also a fairly common case since on most platforms memory is not aligned to 32 bytes (i.e. prealign is often needed).
214 const uint8_t* p
= static_cast< const uint8_t* >(data
);
// Number of bytes up to the next 32-byte boundary (0 if already aligned), or the whole input if it is exactly 32 bytes
215 const std::size_t prealign_size
= size
== 32u ? static_cast< std::size_t >(32u) : static_cast< std::size_t >((32u - ((uintptr_t)p
& 31u)) & 31u);
// Unaligned 32-byte load; all 32 bytes are converted but only prealign_size of them are written to the stream.
// NOTE(review): the load reads 32 bytes regardless of prealign_size, so callers presumably guarantee size >= 32 - confirm.
218 __m256i mm_input
= _mm256_lddqu_si256(reinterpret_cast< const __m256i
* >(p
));
219 BOOST_LOG_AUX_MM_CONSTANTS
221 __m256i mm_output1
, mm_output2
, mm_output3
;
222 dump_pack(BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_char_10_to_a
, mm_input
, mm_output1
, mm_output2
, mm_output3
);
224 store_characters_x3(mm_output1
, mm_output2
, mm_output3
, buf
);
226 _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compiler generates around the function call
// 3 characters per byte, minus the skipped leading space
227 strm
.write(buf_begin
, prealign_size
* 3u - 1u);
230 size
-= prealign_size
;
// Process the remaining input in whole strides; what is left over is handled as the tail
234 const std::size_t stride_count
= size
/ stride
;
235 std::size_t tail_size
= size
% stride
;
236 for (std::size_t i
= 0; i
< stride_count
; ++i
)
239 BOOST_LOG_AUX_MM_CONSTANTS
// Convert one stride, 32 input bytes (96 output characters) per iteration, using aligned loads
241 for (unsigned int j
= 0; j
< packs_per_stride
; ++j
, b
+= 3u * 32u, p
+= 32u)
243 __m256i mm_input
= _mm256_load_si256(reinterpret_cast< const __m256i
* >(p
));
244 __m256i mm_output1
, mm_output2
, mm_output3
;
245 dump_pack(BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_char_10_to_a
, mm_input
, mm_output1
, mm_output2
, mm_output3
);
247 store_characters_x3(mm_output1
, mm_output2
, mm_output3
, b
);
250 _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compiler generates around the function call
251 strm
.write(buf_begin
, buf_end
- buf_begin
);
255 if (BOOST_UNLIKELY(tail_size
> 0))
// Convert the tail 16 bytes at a time with the 128-bit dump_pack overload
258 while (tail_size
>= 16u)
260 __m128i mm_input
= _mm_load_si128(reinterpret_cast< const __m128i
* >(p
));
261 BOOST_LOG_AUX_MM_CONSTANTS
263 __m128i mm_output1
, mm_output2
, mm_output3
;
264 dump_pack(BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_char_10_to_a
, mm_input
, mm_output1
, mm_output2
, mm_output3
);
266 store_characters(mm_output1
, b
);
267 store_characters(mm_output2
, b
+ 16u);
268 store_characters(mm_output3
, b
+ 32u);
275 _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compiler generates around the function call
// The bytes remaining after the 16-byte loop are converted one at a time through the character table
276 const char* const char_table
= g_hex_char_table
[(strm
.flags() & std::ios_base::uppercase
) != 0];
277 for (unsigned int i
= 0; i
< tail_size
; ++i
, ++p
, b
+= 3u)
280 b
[0] = static_cast< char_type
>(' ');
281 b
[1] = static_cast< char_type
>(char_table
[n
>> 4]);
282 b
[2] = static_cast< char_type
>(char_table
[n
& 0x0F]);
// Write all tail characters accumulated in the buffer
285 strm
.write(buf_begin
, b
- buf_begin
);
// Dumps size bytes starting at data to a char stream, using either dump_data_avx2 or dump_data_generic.
// NOTE(review): the condition that selects between the two implementations is not visible in this listing.
291 void dump_data_char_avx2(const void* data
, std::size_t size
, std::basic_ostream
< char >& strm
)
295 dump_data_avx2(data
, size
, strm
);
299 dump_data_generic(data
, size
, strm
);
// Dumps size bytes starting at data to a wchar_t stream, using either dump_data_avx2 or dump_data_generic.
// NOTE(review): the condition that selects between the two implementations is not visible in this listing.
303 void dump_data_wchar_avx2(const void* data
, std::size_t size
, std::basic_ostream
< wchar_t >& strm
)
307 dump_data_avx2(data
, size
, strm
);
311 dump_data_generic(data
, size
, strm
);
315 #if !defined(BOOST_NO_CXX11_CHAR16_T)
// Dumps size bytes starting at data to a char16_t stream, using either dump_data_avx2 or dump_data_generic.
// NOTE(review): the condition that selects between the two implementations is not visible in this listing.
316 void dump_data_char16_avx2(const void* data
, std::size_t size
, std::basic_ostream
< char16_t
>& strm
)
320 dump_data_avx2(data
, size
, strm
);
324 dump_data_generic(data
, size
, strm
);
329 #if !defined(BOOST_NO_CXX11_CHAR32_T)
// Dumps size bytes starting at data to a char32_t stream, using either dump_data_avx2 or dump_data_generic.
// NOTE(review): the condition that selects between the two implementations is not visible in this listing.
330 void dump_data_char32_avx2(const void* data
, std::size_t size
, std::basic_ostream
< char32_t
>& strm
)
334 dump_data_avx2(data
, size
, strm
);
338 dump_data_generic(data
, size
, strm
);
345 BOOST_LOG_CLOSE_NAMESPACE
// namespace log
349 #include <boost/log/detail/footer.hpp>