//
//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
8 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
9 #define BOOST_LOCALE_UTF_HPP_INCLUDED
11 #include <boost/cstdint.hpp>
namespace boost {
namespace locale {
///
/// \brief Namespace that holds basic operations on UTF encoded sequences
///
/// All functions defined in this namespace do not require linking with Boost.Locale library
///
namespace utf {
    /// \cond INTERNAL
    // Branch-prediction hints: GCC-compatible compilers get __builtin_expect,
    // every other compiler sees the bare expression.
    #ifdef __GNUC__
    #   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
    #   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
    #else
    #   define BOOST_LOCALE_LIKELY(x)   (x)
    #   define BOOST_LOCALE_UNLIKELY(x) (x)
    #endif
    /// \endcond

    ///
    /// \brief The integral type that can hold a Unicode code point
    ///
    typedef uint32_t code_point;

    ///
    /// \brief Special constant that defines illegal code point
    ///
    /// Returned by the decode() functions when the input is not a valid sequence.
    ///
    static const code_point illegal = 0xFFFFFFFFu;

    ///
    /// \brief Special constant that defines incomplete code point
    ///
    /// Returned by the decode() functions when the input range ends in the middle of a sequence.
    ///
    static const code_point incomplete = 0xFFFFFFFEu;

    ///
    /// \brief The function checks if \a v is a valid code point
    ///
    /// A value is valid when it is not above 0x10FFFF and does not lie in the
    /// surrogate range [0xD800,0xDFFF].
    ///
    inline bool is_valid_codepoint(code_point v)
    {
        if(v>0x10FFFF)
            return false;
        if(0xD800 <=v && v<= 0xDFFF) // surrogates
            return false;
        return true;
    }

    #ifdef BOOST_LOCALE_DOXYGEN
    ///
    /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
    ///
    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits {
        ///
        /// The type of the character
        ///
        typedef CharType char_type;
        ///
        /// Read one code point from the range [p,e) and return it.
        ///
        /// - If the sequence that was read is incomplete, returns \ref incomplete
        /// - If an illegal sequence is detected, returns \ref illegal
        ///
        /// Requirements
        ///
        /// - Iterator is a valid input iterator
        ///
        /// Postconditions
        ///
        /// - p points just past the last consumed code unit
        ///
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e);

        ///
        /// Maximal width of valid sequence in the code units:
        ///
        /// - UTF-8  - 4
        /// - UTF-16 - 2
        /// - UTF-32 - 1
        ///
        static const int max_width;
        ///
        /// The width of specific code point in the code units.
        ///
        /// Requirement: value is a valid Unicode code point
        /// Returns value in range [1..max_width]
        ///
        static int width(code_point value);

        ///
        /// Get the size of the trail part of variable length encoded sequence.
        ///
        /// Returns -1 if \a c is not a valid lead character
        ///
        static int trail_length(char_type c);
        ///
        /// Returns true if c is trail code unit, always false for UTF-32
        ///
        static bool is_trail(char_type c);
        ///
        /// Returns true if c is lead code unit, always true for UTF-32
        ///
        static bool is_lead(char_type c);

        ///
        /// Convert valid Unicode code point \a value to the UTF sequence.
        ///
        /// Requirements:
        ///
        /// - \a value is valid code point
        /// - \a out is an output iterator that should be able to accept at least width(value) units
        ///
        /// Returns the iterator past the last written code unit.
        ///
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out);
        ///
        /// Decodes valid UTF sequence that is pointed by p into code point.
        ///
        /// If the sequence is invalid or points to end the behavior is undefined
        ///
        template<typename Iterator>
        static code_point decode_valid(Iterator &p);
    };

    #else

    // Primary template: only the specializations for 1, 2 and 4 byte
    // character types below are defined.
    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits;

    ///
    /// UTF-8 traits: used for character types of size 1
    ///
    template<typename CharType>
    struct utf_traits<CharType,1> {

        typedef CharType char_type;

        ///
        /// Number of trail bytes that follow the lead byte \a ci:
        /// 0 for ASCII, 1..3 for multi-byte leads, -1 if \a ci can not start a sequence
        ///
        static int trail_length(char_type ci)
        {
            unsigned char c = ci;
            if(c < 128)
                return 0;
            // 128..191 are trail bytes, 192 and 193 could only start overlong sequences
            if(BOOST_LOCALE_UNLIKELY(c < 194))
                return -1;
            if(c < 224)
                return 1;
            if(c < 240)
                return 2;
            // leads above 244 would encode values beyond 0x10FFFF
            if(BOOST_LOCALE_LIKELY(c <=244))
                return 3;
            return -1;
        }

        static const int max_width = 4;

        ///
        /// Number of bytes needed to encode \a value
        ///
        static int width(code_point value)
        {
            if(value <=0x7F) {
                return 1;
            }
            else if(value <=0x7FF) {
                return 2;
            }
            else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
                return 3;
            }
            else {
                return 4;
            }
        }

        ///
        /// Returns true if \a ci is a continuation byte (bit pattern 10xxxxxx)
        ///
        static bool is_trail(char_type ci)
        {
            unsigned char c=ci;
            return (c & 0xC0)==0x80;
        }

        ///
        /// Returns true if \a ci is not a continuation byte
        ///
        static bool is_lead(char_type ci)
        {
            return !is_trail(ci);
        }

        ///
        /// Read one code point from [p,e).
        ///
        /// Returns \ref incomplete if the range ends before the sequence does and
        /// \ref illegal for an invalid lead byte, an invalid trail byte, a surrogate,
        /// an out of range value or a non-shortest encoding.
        /// Every byte that was read, including an offending one, is consumed from p.
        ///
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e)
        {
            if(BOOST_LOCALE_UNLIKELY(p==e))
                return incomplete;

            unsigned char lead = *p++;

            // First byte is fully validated here
            int trail_size = trail_length(lead);

            if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
                return illegal;

            //
            // Ok as only ASCII may be of size = 0
            // also optimize for ASCII text
            //
            if(trail_size == 0)
                return lead;

            // Keep only the payload bits of the lead byte
            code_point c = lead & ((1<<(6-trail_size))-1);

            // Read the rest; each case deliberately falls through to the next
            // so that exactly trail_size bytes are consumed
            unsigned char tmp;
            switch(trail_size) {
            case 3:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 2:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 1:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
            }

            // Check code point validity: no surrogates and
            // valid range of values
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return illegal;

            // make sure it is the most compact representation
            if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
                return illegal;

            return c;
        }

        ///
        /// Decode one code point from a sequence that is known to be valid.
        ///
        /// No validation and no bounds checks are performed.
        ///
        template<typename Iterator>
        static code_point decode_valid(Iterator &p)
        {
            unsigned char lead = *p++;
            if(lead < 192)
                return lead;

            int trail_size;

            if(lead < 224)
                trail_size = 1;
            else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
                trail_size = 2;
            else
                trail_size = 3;

            code_point c = lead & ((1<<(6-trail_size))-1);

            // Deliberate fall through: consume exactly trail_size bytes
            switch(trail_size) {
            case 3:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 2:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 1:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
            }

            return c;
        }

        ///
        /// Write \a value as 1 to 4 bytes to \a out, returns the advanced iterator.
        ///
        /// \a value is not validated.
        ///
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out)
        {
            if(value <= 0x7F) {
                *out++ = static_cast<char_type>(value);
            }
            else if(value <= 0x7FF) {
                *out++ = static_cast<char_type>((value >> 6) | 0xC0);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
                *out++ = static_cast<char_type>((value >> 12) | 0xE0);
                *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            else {
                *out++ = static_cast<char_type>((value >> 18) | 0xF0);
                *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            return out;
        }
    }; // utf8

    ///
    /// UTF-16 traits: used for character types of size 2
    ///
    template<typename CharType>
    struct utf_traits<CharType,2> {
        typedef CharType char_type;

        // Surrogate handling, see RFC 2781

        /// Returns true if \a x is a high (first) surrogate, 0xD800..0xDBFF
        static bool is_first_surrogate(uint16_t x)
        {
            return 0xD800 <=x && x<= 0xDBFF;
        }
        /// Returns true if \a x is a low (second) surrogate, 0xDC00..0xDFFF
        static bool is_second_surrogate(uint16_t x)
        {
            return 0xDC00 <=x && x<= 0xDFFF;
        }
        /// Combine the surrogate pair \a w1, \a w2 into a code point above 0xFFFF
        static code_point combine_surrogate(uint16_t w1,uint16_t w2)
        {
            return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
        }
        ///
        /// Returns 1 if \a c is a first surrogate, -1 if it is a second surrogate
        /// (not a valid lead unit) and 0 otherwise
        ///
        static int trail_length(char_type c)
        {
            if(is_first_surrogate(c))
                return 1;
            if(is_second_surrogate(c))
                return -1;
            return 0;
        }
        ///
        /// Returns true if c is trail code unit, i.e. a second surrogate
        ///
        static bool is_trail(char_type c)
        {
            return is_second_surrogate(c);
        }
        ///
        /// Returns true if c is lead code unit, i.e. anything but a second surrogate
        ///
        static bool is_lead(char_type c)
        {
            return !is_second_surrogate(c);
        }

        ///
        /// Read one code point from [current,last).
        ///
        /// Returns \ref incomplete if the range is empty or ends after a first surrogate,
        /// \ref illegal for a lone second surrogate or a first surrogate that is not
        /// followed by a second one. Every unit that was read is consumed from current.
        ///
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return incomplete;
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            // w1 is a surrogate: it must be a first one
            if(w1 > 0xDBFF)
                return illegal;
            if(current==last)
                return incomplete;
            uint16_t w2=*current++;
            if(w2 < 0xDC00 || 0xDFFF < w2)
                return illegal;
            return combine_surrogate(w1,w2);
        }
        ///
        /// Decode one code point from a sequence that is known to be valid.
        ///
        /// No validation and no bounds checks are performed.
        ///
        template<typename It>
        static code_point decode_valid(It &current)
        {
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            uint16_t w2=*current++;
            return combine_surrogate(w1,w2);
        }

        static const int max_width = 2;
        ///
        /// Number of code units needed to encode \a u: 2 above 0xFFFF, 1 otherwise
        ///
        static int width(code_point u)
        {
            return u>=0x10000 ? 2 : 1;
        }
        ///
        /// Write \a u as one unit or as a surrogate pair to \a out,
        /// returns the advanced iterator. \a u is not validated.
        ///
        template<typename It>
        static It encode(code_point u,It out)
        {
            if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
                *out++ = static_cast<char_type>(u);
            }
            else {
                u-=0x10000;
                *out++ = static_cast<char_type>(0xD800 | (u>>10));
                *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
            }
            return out;
        }
    }; // utf16

    ///
    /// UTF-32 traits: used for character types of size 4
    ///
    template<typename CharType>
    struct utf_traits<CharType,4> {
        typedef CharType char_type;
        ///
        /// Returns 0 if \a c is a valid code point and -1 otherwise
        ///
        static int trail_length(char_type c)
        {
            if(is_valid_codepoint(c))
                return 0;
            return -1;
        }
        /// Always false: UTF-32 has no trail units
        static bool is_trail(char_type /*c*/)
        {
            return false;
        }
        /// Always true: every UTF-32 unit is a lead unit
        static bool is_lead(char_type /*c*/)
        {
            return true;
        }

        ///
        /// Read one unit without validation and without bounds checks
        ///
        template<typename It>
        static code_point decode_valid(It &current)
        {
            return *current++;
        }

        ///
        /// Read one code point from [current,last).
        ///
        /// Returns \ref incomplete for an empty range and \ref illegal if the
        /// unit is not a valid code point; the unit is consumed in both non-empty cases.
        ///
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return boost::locale::utf::incomplete;
            code_point c=*current++;
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return boost::locale::utf::illegal;
            return c;
        }
        static const int max_width = 1;
        /// Always 1: every code point takes a single unit
        static int width(code_point /*u*/)
        {
            return 1;
        }
        ///
        /// Write \a u as a single unit to \a out, returns the advanced iterator
        ///
        template<typename It>
        static It encode(code_point u,It out)
        {
            *out++ = static_cast<char_type>(u);
            return out;
        }

    }; // utf32

    #endif


} // utf
} // locale
} // boost
// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4