]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/locale/include/boost/locale/utf.hpp
bump version to 12.2.2-pve1
[ceph.git] / ceph / src / boost / libs / locale / include / boost / locale / utf.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
9 #define BOOST_LOCALE_UTF_HPP_INCLUDED
10
11 #include <boost/cstdint.hpp>
12
13 namespace boost {
14 namespace locale {
15 ///
16 /// \brief Namespace that holds basic operations on UTF encoded sequences
17 ///
18 /// All functions defined in this namespace do not require linking with Boost.Locale library
19 ///
20 namespace utf {
21 /// \cond INTERNAL
//
// Branch-prediction hints.
//
// BOOST_LOCALE_LIKELY(x)   - tells the optimizer that x is usually true
// BOOST_LOCALE_UNLIKELY(x) - tells the optimizer that x is usually false
//
// With GCC-compatible compilers the condition is normalized with !!(x)
// before it is handed to __builtin_expect: the builtin takes a long, so
// without the normalization pointer operands would not compile and truthy
// values other than 1 would never match the expected value.
// On other compilers the macros expand to the plain condition.
//
// Use the macros only in a boolean context (if/while/?: conditions); the
// value they yield is 0/1 on GCC-compatible compilers and x elsewhere.
//
#ifdef __GNUC__
#   define BOOST_LOCALE_LIKELY(x)   __builtin_expect(!!(x),1)
#   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect(!!(x),0)
#else
#   define BOOST_LOCALE_LIKELY(x)   (x)
#   define BOOST_LOCALE_UNLIKELY(x) (x)
#endif
29 /// \endcond
30
///
/// \brief The integral type that can hold a Unicode code point
///
/// It is a 32 bit unsigned integer, so besides real code points it can also
/// carry the two out-of-band markers \ref illegal and \ref incomplete.
///
typedef uint32_t code_point;

///
/// \brief Special constant that defines illegal code point
///
/// Returned by the decode() functions when the input is not a well formed
/// sequence. The value is greater than 0x10FFFF, so is_valid_codepoint()
/// rejects it and it can never be confused with decoded text.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Special constant that defines incomplete code point
///
/// Returned by the decode() functions when the input range ends before the
/// sequence is finished. Like \ref illegal it lies above 0x10FFFF and is
/// therefore never a valid code point.
///
static const code_point incomplete = 0xFFFFFFFEu;
45
46 ///
47 /// \brief the function checks if \a v is a valid code point
48 ///
49 inline bool is_valid_codepoint(code_point v)
50 {
51 if(v>0x10FFFF)
52 return false;
53 if(0xD800 <=v && v<= 0xDFFF) // surragates
54 return false;
55 return true;
56 }
57
58 #ifdef BOOST_LOCALE_DOXYGEN
///
/// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
///
/// This declaration exists for the documentation build only
/// (BOOST_LOCALE_DOXYGEN); the real code is provided by the partial
/// specializations for 1, 2 and 4 byte character types.
///
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits {
    ///
    /// The type of the character
    ///
    typedef CharType char_type;
    ///
    /// Read one code point from the range [p,e) and return it.
    ///
    /// - If the sequence that was read is an incomplete sequence returns \ref incomplete,
    /// - If an illegal sequence is detected returns \ref illegal
    ///
    /// Requirements
    ///
    /// - Iterator is valid input iterator
    ///
    /// Postconditions
    ///
    /// - p points one past the last consumed code unit: the implementations
    ///   read every unit with `*p++`, so p is advanced over each unit that was
    ///   examined, including the unit that made the sequence illegal.
    ///
    template<typename Iterator>
    static code_point decode(Iterator &p,Iterator e);

    ///
    /// Maximal width of valid sequence in the code units:
    ///
    /// - UTF-8  - 4
    /// - UTF-16 - 2
    /// - UTF-32 - 1
    ///
    static const int max_width;
    ///
    /// The width of specific code point in the code units.
    ///
    /// Requirement: value is a valid Unicode code point
    /// Returns value in range [1..max_width]
    ///
    static int width(code_point value);

    ///
    /// Get the size of the trail part of variable length encoded sequence,
    /// i.e. the number of code units that must follow the lead unit \a c.
    ///
    /// Returns -1 if \a c is not a valid lead character
    ///
    static int trail_length(char_type c);
    ///
    /// Returns true if c is trail code unit, always false for UTF-32
    ///
    static bool is_trail(char_type c);
    ///
    /// Returns true if c is lead code unit, always true for UTF-32
    ///
    static bool is_lead(char_type c);

    ///
    /// Convert valid Unicode code point \a value to the UTF sequence.
    ///
    /// Requirements:
    ///
    /// - \a value is valid code point
    /// - \a out is an output iterator that should be able to accept at least width(value) units
    ///
    /// Returns the iterator past the last written code unit.
    ///
    template<typename Iterator>
    static Iterator encode(code_point value,Iterator out);
    ///
    /// Decodes valid UTF sequence that is pointed by p into code point.
    /// No validation and no end-of-range check is performed.
    ///
    /// If the sequence is invalid or points to end the behavior is undefined
    ///
    template<typename Iterator>
    static code_point decode_valid(Iterator &p);
};
136
137 #else
138
// Primary template: declared only, never defined. The implementation is
// picked by partial specialization on the second parameter, which defaults
// to sizeof(CharType) (1, 2 or 4 in the specializations of this header).
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits;
141
142 template<typename CharType>
143 struct utf_traits<CharType,1> {
144
145 typedef CharType char_type;
146
147 static int trail_length(char_type ci)
148 {
149 unsigned char c = ci;
150 if(c < 128)
151 return 0;
152 if(BOOST_LOCALE_UNLIKELY(c < 194))
153 return -1;
154 if(c < 224)
155 return 1;
156 if(c < 240)
157 return 2;
158 if(BOOST_LOCALE_LIKELY(c <=244))
159 return 3;
160 return -1;
161 }
162
163 static const int max_width = 4;
164
165 static int width(code_point value)
166 {
167 if(value <=0x7F) {
168 return 1;
169 }
170 else if(value <=0x7FF) {
171 return 2;
172 }
173 else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
174 return 3;
175 }
176 else {
177 return 4;
178 }
179 }
180
181 static bool is_trail(char_type ci)
182 {
183 unsigned char c=ci;
184 return (c & 0xC0)==0x80;
185 }
186
187 static bool is_lead(char_type ci)
188 {
189 return !is_trail(ci);
190 }
191
192 template<typename Iterator>
193 static code_point decode(Iterator &p,Iterator e)
194 {
195 if(BOOST_LOCALE_UNLIKELY(p==e))
196 return incomplete;
197
198 unsigned char lead = *p++;
199
200 // First byte is fully validated here
201 int trail_size = trail_length(lead);
202
203 if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
204 return illegal;
205
206 //
207 // Ok as only ASCII may be of size = 0
208 // also optimize for ASCII text
209 //
210 if(trail_size == 0)
211 return lead;
212
213 code_point c = lead & ((1<<(6-trail_size))-1);
214
215 // Read the rest
216 unsigned char tmp;
217 switch(trail_size) {
218 case 3:
219 if(BOOST_LOCALE_UNLIKELY(p==e))
220 return incomplete;
221 tmp = *p++;
222 if (!is_trail(tmp))
223 return illegal;
224 c = (c << 6) | ( tmp & 0x3F);
225 case 2:
226 if(BOOST_LOCALE_UNLIKELY(p==e))
227 return incomplete;
228 tmp = *p++;
229 if (!is_trail(tmp))
230 return illegal;
231 c = (c << 6) | ( tmp & 0x3F);
232 case 1:
233 if(BOOST_LOCALE_UNLIKELY(p==e))
234 return incomplete;
235 tmp = *p++;
236 if (!is_trail(tmp))
237 return illegal;
238 c = (c << 6) | ( tmp & 0x3F);
239 }
240
241 // Check code point validity: no surrogates and
242 // valid range
243 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
244 return illegal;
245
246 // make sure it is the most compact representation
247 if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
248 return illegal;
249
250 return c;
251
252 }
253
254 template<typename Iterator>
255 static code_point decode_valid(Iterator &p)
256 {
257 unsigned char lead = *p++;
258 if(lead < 192)
259 return lead;
260
261 int trail_size;
262
263 if(lead < 224)
264 trail_size = 1;
265 else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
266 trail_size = 2;
267 else
268 trail_size = 3;
269
270 code_point c = lead & ((1<<(6-trail_size))-1);
271
272 switch(trail_size) {
273 case 3:
274 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
275 case 2:
276 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
277 case 1:
278 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
279 }
280
281 return c;
282 }
283
284
285
286 template<typename Iterator>
287 static Iterator encode(code_point value,Iterator out)
288 {
289 if(value <= 0x7F) {
290 *out++ = static_cast<char_type>(value);
291 }
292 else if(value <= 0x7FF) {
293 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
294 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
295 }
296 else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
297 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
298 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
299 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
300 }
301 else {
302 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
303 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
304 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
305 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
306 }
307 return out;
308 }
309 }; // utf8
310
311 template<typename CharType>
312 struct utf_traits<CharType,2> {
313 typedef CharType char_type;
314
315 // See RFC 2781
316 static bool is_first_surrogate(uint16_t x)
317 {
318 return 0xD800 <=x && x<= 0xDBFF;
319 }
320 static bool is_second_surrogate(uint16_t x)
321 {
322 return 0xDC00 <=x && x<= 0xDFFF;
323 }
324 static code_point combine_surrogate(uint16_t w1,uint16_t w2)
325 {
326 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
327 }
328 static int trail_length(char_type c)
329 {
330 if(is_first_surrogate(c))
331 return 1;
332 if(is_second_surrogate(c))
333 return -1;
334 return 0;
335 }
336 ///
337 /// Returns true if c is trail code unit, always false for UTF-32
338 ///
339 static bool is_trail(char_type c)
340 {
341 return is_second_surrogate(c);
342 }
343 ///
344 /// Returns true if c is lead code unit, always true of UTF-32
345 ///
346 static bool is_lead(char_type c)
347 {
348 return !is_second_surrogate(c);
349 }
350
351 template<typename It>
352 static code_point decode(It &current,It last)
353 {
354 if(BOOST_LOCALE_UNLIKELY(current == last))
355 return incomplete;
356 uint16_t w1=*current++;
357 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
358 return w1;
359 }
360 if(w1 > 0xDBFF)
361 return illegal;
362 if(current==last)
363 return incomplete;
364 uint16_t w2=*current++;
365 if(w2 < 0xDC00 || 0xDFFF < w2)
366 return illegal;
367 return combine_surrogate(w1,w2);
368 }
369 template<typename It>
370 static code_point decode_valid(It &current)
371 {
372 uint16_t w1=*current++;
373 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
374 return w1;
375 }
376 uint16_t w2=*current++;
377 return combine_surrogate(w1,w2);
378 }
379
380 static const int max_width = 2;
381 static int width(code_point u)
382 {
383 return u>=0x10000 ? 2 : 1;
384 }
385 template<typename It>
386 static It encode(code_point u,It out)
387 {
388 if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
389 *out++ = static_cast<char_type>(u);
390 }
391 else {
392 u -= 0x10000;
393 *out++ = static_cast<char_type>(0xD800 | (u>>10));
394 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
395 }
396 return out;
397 }
398 }; // utf16;
399
400
401 template<typename CharType>
402 struct utf_traits<CharType,4> {
403 typedef CharType char_type;
404 static int trail_length(char_type c)
405 {
406 if(is_valid_codepoint(c))
407 return 0;
408 return -1;
409 }
410 static bool is_trail(char_type /*c*/)
411 {
412 return false;
413 }
414 static bool is_lead(char_type /*c*/)
415 {
416 return true;
417 }
418
419 template<typename It>
420 static code_point decode_valid(It &current)
421 {
422 return *current++;
423 }
424
425 template<typename It>
426 static code_point decode(It &current,It last)
427 {
428 if(BOOST_LOCALE_UNLIKELY(current == last))
429 return boost::locale::utf::incomplete;
430 code_point c=*current++;
431 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
432 return boost::locale::utf::illegal;
433 return c;
434 }
435 static const int max_width = 1;
436 static int width(code_point /*u*/)
437 {
438 return 1;
439 }
440 template<typename It>
441 static It encode(code_point u,It out)
442 {
443 *out++ = static_cast<char_type>(u);
444 return out;
445 }
446
447 }; // utf32
448
449 #endif
450
451
452 } // utf
453 } // locale
454 } // boost
455
456
457 #endif
458
459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
460