2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
5 // Distributed under the Boost Software License, Version 1.0. (See
6 // accompanying file LICENSE or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
9 #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
10 #define BOOST_NOWIDE_UTF_HPP_INCLUDED
12 #include <boost/nowide/config.hpp>
18 /// \brief Namespace that holds basic operations on UTF encoded sequences
20 /// None of the functions defined in this namespace require linking with the Boost.Nowide library.
21 /// Extracted from Boost.Locale
26 /// \brief The integral type that can hold a Unicode code point (an alias for uint32_t)
28 using code_point = uint32_t;
31 /// \brief Special constant that marks an illegal code point; decode() is documented to return it when an illegal sequence is detected
33 static const code_point illegal = 0xFFFFFFFFu;
36 /// \brief Special constant that marks an incomplete code point; decode() is documented to return it when the sequence read is incomplete
38 static const code_point incomplete = 0xFFFFFFFEu;
41 /// \brief Checks whether \a v is a valid code point
// \param v the code point value to test
// NOTE(review): only the surrogate-range test is visible in this listing; the embedded
// line numbers skip around it, so any other checks and the return statements of this
// function are not shown here — confirm against the complete file.
43 inline bool is_valid_codepoint(code_point v)
47 if(0xD800 <= v && v <= 0xDFFF) // surrogate range (0xD800..0xDFFF), used by UTF-16 pairs
// Documentation-only declaration of the primary template: it is compiled only when
// BOOST_NOWIDE_DOXYGEN is defined and declares the members without defining them.
// NOTE(review): the embedded line numbers in this listing skip, so the struct header,
// its braces and the matching #else/#endif are not visible here.
52 #ifdef BOOST_NOWIDE_DOXYGEN
54 /// \brief UTF traits class - functions to convert UTF sequences to and from Unicode code points
56 template<typename CharType, int size = sizeof(CharType)>
60 /// The type of the character (code unit)
62 using char_type = CharType;
64 /// Read one code point from the range [p,e) and return it.
66 /// - If the sequence that was read is incomplete, returns \ref incomplete,
67 /// - If an illegal sequence is detected, returns \ref illegal
71 /// - Iterator is a valid input iterator
75 /// - p points past the last consumed code unit
77 template<typename Iterator>
78 static code_point decode(Iterator& p, Iterator e);
81 /// Maximal width of a valid sequence in code units:
87 static const int max_width;
89 /// The width of a specific code point in code units.
91 /// Requirement: value is a valid Unicode code point
92 /// Returns a value in the range [1..max_width]
94 static int width(code_point value);
97 /// Get the size of the trail part of a variable length encoded sequence.
99 /// Returns -1 if \a c is not a valid lead code unit
101 static int trail_length(char_type c);
103 /// Returns true if c is a trail code unit, always false for UTF-32
105 static bool is_trail(char_type c);
107 /// Returns true if c is a lead code unit, always true for UTF-32
109 static bool is_lead(char_type c);
112 /// Convert the valid Unicode code point \a value to a UTF sequence.
116 /// - \a value is a valid code point
117 /// - \a out is an output iterator that can accept at least width(value) code units
119 /// Returns the iterator past the last written code unit.
121 template<typename Iterator>
122 static Iterator encode(code_point value, Iterator out);
124 /// Decodes the valid UTF sequence pointed to by p into a code point.
126 /// If the sequence is invalid or p points to the end, the behavior is undefined
128 template<typename Iterator>
129 static code_point decode_valid(Iterator& p);
// Template header of the real (non-Doxygen) primary template. The second parameter
// defaults to sizeof(CharType), which is what the partial specializations on 1, 2 and 4 match.
134 template<typename CharType, int size = sizeof(CharType)>
// UTF-8 traits: the specialization of utf_traits chosen when the code unit type is
// 1 byte wide (the primary template defaults its second argument to sizeof(CharType)).
// NOTE(review): the embedded line numbers in this listing skip, so braces, several
// conditions and most return statements are not visible here. The comments below
// describe only what the visible lines establish — confirm details against the full file.
137 template<typename CharType>
138 struct utf_traits<CharType, 1>
// Code unit type handled by this specialization.
140 using char_type = CharType;
// Classifies the lead code unit \a ci. The generic utf_traits documentation describes the
// result as the size of the trail part, or -1 if the unit is not a valid lead.
142 static int trail_length(char_type ci)
// The unit is converted to unsigned char before the numeric range tests.
144 unsigned char c = ci;
// Range test against 194 (0xC2); the outcome of this branch is not shown in the listing.
147 if(BOOST_UNLIKELY(c < 194))
// Range test against 244 (0xF4); the outcome of this branch is not shown in the listing.
153 if(BOOST_LIKELY(c <= 244))
// Longest sequence this specialization produces: encode() below writes at most 4 code units.
158 static const int max_width = 4;
// Width of \a value in code units. The visible thresholds are 0x7FF and 0xFFFF;
// the returned values themselves are on lines not shown.
160 static int width(code_point value)
165 } else if(value <= 0x7FF)
168 } else if(BOOST_LIKELY(value <= 0xFFFF))
// A trail unit has its two top bits equal to 10 (bit pattern 10xxxxxx).
177 static bool is_trail(char_type ci)
179 unsigned char c = ci;
180 return (c & 0xC0) == 0x80;
// Every unit that is not a trail unit is treated as a lead unit.
183 static bool is_lead(char_type ci)
185 return !is_trail(ci);
// Reads one code point from [p, e), advancing \a p past every code unit it consumes.
// The generic documentation says \ref incomplete / \ref illegal are returned for
// truncated / illegal input; the corresponding return statements are not visible here.
188 template<typename Iterator>
189 static code_point decode(Iterator& p, Iterator e)
// Input range is already exhausted.
191 if(BOOST_UNLIKELY(p == e))
194 unsigned char lead = *p++;
196 // First byte is fully validated here
197 int trail_size = trail_length(lead);
// A negative trail size is how trail_length reports a unit that is not a valid lead.
199 if(BOOST_UNLIKELY(trail_size < 0))
202 // OK as only ASCII may be of size = 0
203 // also optimize for ASCII text
// Keep only the payload bits of the lead unit: its low (6 - trail_size) bits.
207 code_point c = lead & ((1 << (6 - trail_size)) - 1);
// Each of the three groups below first checks for end of input and then appends the
// low 6 bits of tmp to the code point. The declaration and assignment of tmp and the
// case labels are on lines not shown; the fallthrough markers chain the groups.
214 if(BOOST_UNLIKELY(p == e))
219 c = (c << 6) | (tmp & 0x3F);
220 BOOST_NOWIDE_FALLTHROUGH;
222 if(BOOST_UNLIKELY(p == e))
227 c = (c << 6) | (tmp & 0x3F);
228 BOOST_NOWIDE_FALLTHROUGH;
230 if(BOOST_UNLIKELY(p == e))
235 c = (c << 6) | (tmp & 0x3F);
238 // Check code point validity: no surrogates and
240 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
243 // make sure it is the most compact representation
// i.e. the number of units actually read (trail_size + 1) must equal width(c).
244 if(BOOST_UNLIKELY(width(c) != trail_size + 1))
// Decodes a sequence the caller guarantees to be valid (see the generic documentation
// of decode_valid): no end-of-range or validity checks appear on the visible lines.
250 template<typename Iterator>
251 static code_point decode_valid(Iterator& p)
253 unsigned char lead = *p++;
261 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
// Keep only the payload bits of the lead unit: its low (6 - trail_size) bits.
266 code_point c = lead & ((1 << (6 - trail_size)) - 1);
// Append the low 6 bits of each trail unit. The cases fall through, so a sequence
// with more trail units also executes the cases for fewer.
270 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
271 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
272 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
// Writes \a value to \a out as 1 to 4 code units. The generic documentation says the
// iterator past the last written unit is returned; the return statement is not visible here.
278 template<typename Iterator>
279 static Iterator encode(code_point value, Iterator out)
// One unit: the value is written unchanged (the guarding condition is on a line not shown).
283 *out++ = static_cast<char_type>(value);
284 } else if(value <= 0x7FF)
// Two units: 110xxxxx 10xxxxxx
286 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
287 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
288 } else if(BOOST_LIKELY(value <= 0xFFFF))
// Three units: 1110xxxx 10xxxxxx 10xxxxxx
290 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
291 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
292 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
// Four units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
295 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
296 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
297 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
298 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
// UTF-16 traits: the specialization of utf_traits chosen when the code unit type is
// 2 bytes wide (the primary template defaults its second argument to sizeof(CharType)).
// NOTE(review): the embedded line numbers in this listing skip, so braces, some
// statements and most return statements are not visible here. The comments below
// describe only what the visible lines establish — confirm details against the full file.
304 template<typename CharType>
305 struct utf_traits<CharType, 2>
// Code unit type handled by this specialization.
307 using char_type = CharType;
// True for a unit in 0xD800..0xDBFF (the first unit of a surrogate pair).
310 static bool is_first_surrogate(uint16_t x)
312 return 0xD800 <= x && x <= 0xDBFF;
// True for a unit in 0xDC00..0xDFFF (the second unit of a surrogate pair).
314 static bool is_second_surrogate(uint16_t x)
316 return 0xDC00 <= x && x <= 0xDFFF;
// Builds a code point from a surrogate pair: the low 10 bits of w1 form the high part,
// the low 10 bits of w2 the low part, and 0x10000 is added to the combined value.
318 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
320 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
// Classifies \a c by surrogate kind; the value returned for each case is not visible in this listing.
322 static int trail_length(char_type c)
324 if(is_first_surrogate(c))
326 if(is_second_surrogate(c))
331 /// Returns true if c is a trail code unit, i.e. a second surrogate
333 static bool is_trail(char_type c)
335 return is_second_surrogate(c);
338 /// Returns true if c is a lead code unit, i.e. any unit that is not a second surrogate
340 static bool is_lead(char_type c)
342 return !is_second_surrogate(c);
// Reads one code point from [current, last), advancing \a current past each unit consumed.
// The generic documentation says \ref incomplete / \ref illegal are returned for
// truncated / illegal input; the corresponding return statements are not visible here.
345 template<typename It>
346 static code_point decode(It& current, It last)
// Input range is already exhausted.
348 if(BOOST_UNLIKELY(current == last))
350 uint16_t w1 = *current++;
// Expected-common case: w1 lies outside the surrogate range 0xD800..0xDFFF.
351 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
359 uint16_t w2 = *current++;
// Tests whether w2 falls outside the second-surrogate range 0xDC00..0xDFFF.
360 if(w2 < 0xDC00 || 0xDFFF < w2)
362 return combine_surrogate(w1, w2);
// Decodes a sequence the caller guarantees to be valid (see the generic documentation
// of decode_valid): no end-of-range or surrogate-order checks appear on the visible lines.
364 template<typename It>
365 static code_point decode_valid(It& current)
367 uint16_t w1 = *current++;
// Expected-common case: w1 lies outside the surrogate range 0xD800..0xDFFF.
368 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
372 uint16_t w2 = *current++;
373 return combine_surrogate(w1, w2);
// Longest sequence this specialization produces: a surrogate pair (2 code units).
376 static const int max_width = 2;
// Code points from 0x10000 upwards take two units, everything below takes one.
377 static int width(code_point u)
379 return u >= 0x10000 ? 2 : 1;
// Writes \a u to \a out as one or two code units. The generic documentation says the
// iterator past the last written unit is returned; the return statement is not visible here.
381 template<typename It>
382 static It encode(code_point u, It out)
// Values up to 0xFFFF are written as a single unit.
384 if(BOOST_LIKELY(u <= 0xFFFF))
386 *out++ = static_cast<char_type>(u);
// Otherwise two units are written: 0xD800 combined with the bits above bit 9, then
// 0xDC00 combined with the low 10 bits.
// NOTE(review): presumably u is reduced by 0x10000 on a line not shown here
// (combine_surrogate adds 0x10000 back when decoding) — confirm against the full file.
390 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
391 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
// UTF-32 traits: the specialization of utf_traits chosen when the code unit type is
// 4 bytes wide (the primary template defaults its second argument to sizeof(CharType)).
// NOTE(review): the embedded line numbers in this listing skip, so braces and most
// return statements are not visible here. The comments below describe only what the
// visible lines establish — confirm details against the full file.
397 template<typename CharType>
398 struct utf_traits<CharType, 4>
// Code unit type handled by this specialization.
400 using char_type = CharType;
// Branches on whether the unit itself is a valid code point; the returned values are not visible here.
401 static int trail_length(char_type c)
403 if(is_valid_codepoint(c))
// The argument is unused; the generic documentation states is_trail is always false for UTF-32.
407 static bool is_trail(char_type /*c*/)
// The argument is unused; the generic documentation states is_lead is always true for UTF-32.
411 static bool is_lead(char_type /*c*/)
// Decodes a unit the caller guarantees to be valid; the body is not visible in this listing.
416 template<typename It>
417 static code_point decode_valid(It& current)
// Reads one code point from [current, last), advancing \a current past the unit consumed.
// The generic documentation says \ref incomplete / \ref illegal are returned for
// truncated / illegal input; the corresponding return statements are not visible here.
422 template<typename It>
423 static code_point decode(It& current, It last)
// Input range is already exhausted.
425 if(BOOST_UNLIKELY(current == last))
427 code_point c = *current++;
// The unit is the code point itself, so it only needs the validity check.
428 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
// Every code point occupies exactly one code unit in this specialization.
432 static const int max_width = 1;
// The argument is unused; the returned value is on a line not shown.
433 static int width(code_point /*u*/)
// Writes \a u to \a out as a single code unit. The generic documentation says the
// iterator past the last written unit is returned; the return statement is not visible here.
437 template<typename It>
438 static It encode(code_point u, It out)
440 *out++ = static_cast<char_type>(u);
449 } // namespace nowide