]> git.proxmox.com Git - ceph.git/blame - ceph/src/boost/boost/nowide/detail/utf.hpp
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / boost / boost / nowide / detail / utf.hpp
CommitLineData
f67539c2
TL
1//
2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3//
4// Distributed under the Boost Software License, Version 1.0. (See
5// accompanying file LICENSE_1_0.txt or copy at
6// http://www.boost.org/LICENSE_1_0.txt)
7//
8#ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
9#define BOOST_NOWIDE_UTF_HPP_INCLUDED
10
11#include <boost/nowide/config.hpp>
12#include <boost/cstdint.hpp>
13
14namespace boost {
15namespace nowide {
16 namespace detail {
17 ///
18 /// \brief Namespace that holds basic operations on UTF encoded sequences
19 ///
20 /// All functions defined in this namespace do not require linking with Boost.Nowide library
21 /// Extracted from Boost.Locale
22 ///
23 namespace utf {
24
///
/// \brief The integral type that can hold a Unicode code point
///
typedef uint32_t code_point;

///
/// \brief Sentinel reported by the decoders when the input is not a valid encoded sequence
///
/// The value lies above 0x10FFFF, so it can never be confused with a decoded code point.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Sentinel reported by the decoders when the input ends in the middle of a sequence
///
/// The value lies above 0x10FFFF, so it can never be confused with a decoded code point.
///
static const code_point incomplete = 0xFFFFFFFEu;

///
/// \brief Check whether \a v is a valid Unicode code point
///
/// \return true if \a v is not greater than 0x10FFFF and is not a surrogate
///         (0xD800..0xDFFF); false otherwise. Both \ref illegal and \ref incomplete
///         are therefore reported as invalid.
///
inline bool is_valid_codepoint(code_point v)
{
    const bool in_unicode_range = v <= 0x10FFFF;
    const bool is_surrogate = 0xD800 <= v && v <= 0xDFFF;
    return in_unicode_range && !is_surrogate;
}
51
52#ifdef BOOST_NOWIDE_DOXYGEN
53 ///
54 /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
55 ///
56 template<typename CharType, int size = sizeof(CharType)>
57 struct utf_traits
58 {
59 ///
60 /// The type of the character
61 ///
62 typedef CharType char_type;
63 ///
64 /// Read one code point from the range [p,e) and return it.
65 ///
66 /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
67 /// - If illegal sequence detected returns \ref illegal
68 ///
69 /// Requirements
70 ///
71 /// - Iterator is valid input iterator
72 ///
73 /// Postconditions
74 ///
75 /// - p points to the last consumed character
76 ///
77 template<typename Iterator>
78 static code_point decode(Iterator& p, Iterator e);
79
80 ///
81 /// Maximal width of valid sequence in the code units:
82 ///
83 /// - UTF-8 - 4
84 /// - UTF-16 - 2
85 /// - UTF-32 - 1
86 ///
87 static const int max_width;
88 ///
89 /// The width of specific code point in the code units.
90 ///
91 /// Requirement: value is a valid Unicode code point
92 /// Returns value in range [1..max_width]
93 ///
94 static int width(code_point value);
95
96 ///
97 /// Get the size of the trail part of variable length encoded sequence.
98 ///
99 /// Returns -1 if C is not valid lead character
100 ///
101 static int trail_length(char_type c);
102 ///
103 /// Returns true if c is trail code unit, always false for UTF-32
104 ///
105 static bool is_trail(char_type c);
106 ///
107 /// Returns true if c is lead code unit, always true of UTF-32
108 ///
109 static bool is_lead(char_type c);
110
111 ///
112 /// Convert valid Unicode code point \a value to the UTF sequence.
113 ///
114 /// Requirements:
115 ///
116 /// - \a value is valid code point
117 /// - \a out is an output iterator should be able to accept at least width(value) units
118 ///
119 /// Returns the iterator past the last written code unit.
120 ///
121 template<typename Iterator>
122 static Iterator encode(code_point value, Iterator out);
123 ///
124 /// Decodes valid UTF sequence that is pointed by p into code point.
125 ///
126 /// If the sequence is invalid or points to end the behavior is undefined
127 ///
128 template<typename Iterator>
129 static code_point decode_valid(Iterator& p);
130 };
131
132#else
133
134 template<typename CharType, int size = sizeof(CharType)>
135 struct utf_traits;
136
137 template<typename CharType>
138 struct utf_traits<CharType, 1>
139 {
140 typedef CharType char_type;
141
142 static int trail_length(char_type ci)
143 {
144 unsigned char c = ci;
145 if(c < 128)
146 return 0;
147 if(BOOST_UNLIKELY(c < 194))
148 return -1;
149 if(c < 224)
150 return 1;
151 if(c < 240)
152 return 2;
153 if(BOOST_LIKELY(c <= 244))
154 return 3;
155 return -1;
156 }
157
158 static const int max_width = 4;
159
160 static int width(code_point value)
161 {
162 if(value <= 0x7F)
163 {
164 return 1;
165 } else if(value <= 0x7FF)
166 {
167 return 2;
168 } else if(BOOST_LIKELY(value <= 0xFFFF))
169 {
170 return 3;
171 } else
172 {
173 return 4;
174 }
175 }
176
177 static bool is_trail(char_type ci)
178 {
179 unsigned char c = ci;
180 return (c & 0xC0) == 0x80;
181 }
182
183 static bool is_lead(char_type ci)
184 {
185 return !is_trail(ci);
186 }
187
188 template<typename Iterator>
189 static code_point decode(Iterator& p, Iterator e)
190 {
191 if(BOOST_UNLIKELY(p == e))
192 return incomplete;
193
194 unsigned char lead = *p++;
195
196 // First byte is fully validated here
197 int trail_size = trail_length(lead);
198
199 if(BOOST_UNLIKELY(trail_size < 0))
200 return illegal;
201
202 //
203 // OK as only ASCII may be of size = 0
204 // also optimize for ASCII text
205 //
206 if(trail_size == 0)
207 return lead;
208
209 code_point c = lead & ((1 << (6 - trail_size)) - 1);
210
211 // Read the rest
212 unsigned char tmp;
213 switch(trail_size)
214 {
215 case 3:
216 if(BOOST_UNLIKELY(p == e))
217 return incomplete;
218 tmp = *p++;
219 if(!is_trail(tmp))
220 return illegal;
221 c = (c << 6) | (tmp & 0x3F);
222 BOOST_NOWIDE_FALLTHROUGH;
223 case 2:
224 if(BOOST_UNLIKELY(p == e))
225 return incomplete;
226 tmp = *p++;
227 if(!is_trail(tmp))
228 return illegal;
229 c = (c << 6) | (tmp & 0x3F);
230 BOOST_NOWIDE_FALLTHROUGH;
231 case 1:
232 if(BOOST_UNLIKELY(p == e))
233 return incomplete;
234 tmp = *p++;
235 if(!is_trail(tmp))
236 return illegal;
237 c = (c << 6) | (tmp & 0x3F);
238 }
239
240 // Check code point validity: no surrogates and
241 // valid range
242 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
243 return illegal;
244
245 // make sure it is the most compact representation
246 if(BOOST_UNLIKELY(width(c) != trail_size + 1))
247 return illegal;
248
249 return c;
250 }
251
252 template<typename Iterator>
253 static code_point decode_valid(Iterator& p)
254 {
255 unsigned char lead = *p++;
256 if(lead < 192)
257 return lead;
258
259 int trail_size;
260
261 if(lead < 224)
262 trail_size = 1;
263 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
264 trail_size = 2;
265 else
266 trail_size = 3;
267
268 code_point c = lead & ((1 << (6 - trail_size)) - 1);
269
270 switch(trail_size)
271 {
272 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
273 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
274 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
275 }
276
277 return c;
278 }
279
280 template<typename Iterator>
281 static Iterator encode(code_point value, Iterator out)
282 {
283 if(value <= 0x7F)
284 {
285 *out++ = static_cast<char_type>(value);
286 } else if(value <= 0x7FF)
287 {
288 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
289 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
290 } else if(BOOST_LIKELY(value <= 0xFFFF))
291 {
292 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
293 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
294 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
295 } else
296 {
297 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
298 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
299 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
300 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
301 }
302 return out;
303 }
304 }; // utf8
305
306 template<typename CharType>
307 struct utf_traits<CharType, 2>
308 {
309 typedef CharType char_type;
310
311 // See RFC 2781
312 static bool is_first_surrogate(uint16_t x)
313 {
314 return 0xD800 <= x && x <= 0xDBFF;
315 }
316 static bool is_second_surrogate(uint16_t x)
317 {
318 return 0xDC00 <= x && x <= 0xDFFF;
319 }
320 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
321 {
322 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
323 }
324 static int trail_length(char_type c)
325 {
326 if(is_first_surrogate(c))
327 return 1;
328 if(is_second_surrogate(c))
329 return -1;
330 return 0;
331 }
332 ///
333 /// Returns true if c is trail code unit, always false for UTF-32
334 ///
335 static bool is_trail(char_type c)
336 {
337 return is_second_surrogate(c);
338 }
339 ///
340 /// Returns true if c is lead code unit, always true of UTF-32
341 ///
342 static bool is_lead(char_type c)
343 {
344 return !is_second_surrogate(c);
345 }
346
347 template<typename It>
348 static code_point decode(It& current, It last)
349 {
350 if(BOOST_UNLIKELY(current == last))
351 return incomplete;
352 uint16_t w1 = *current++;
353 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
354 {
355 return w1;
356 }
357 if(w1 > 0xDBFF)
358 return illegal;
359 if(current == last)
360 return incomplete;
361 uint16_t w2 = *current++;
362 if(w2 < 0xDC00 || 0xDFFF < w2)
363 return illegal;
364 return combine_surrogate(w1, w2);
365 }
366 template<typename It>
367 static code_point decode_valid(It& current)
368 {
369 uint16_t w1 = *current++;
370 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
371 {
372 return w1;
373 }
374 uint16_t w2 = *current++;
375 return combine_surrogate(w1, w2);
376 }
377
378 static const int max_width = 2;
379 static int width(code_point u)
380 {
381 return u >= 0x10000 ? 2 : 1;
382 }
383 template<typename It>
384 static It encode(code_point u, It out)
385 {
386 if(BOOST_LIKELY(u <= 0xFFFF))
387 {
388 *out++ = static_cast<char_type>(u);
389 } else
390 {
391 u -= 0x10000;
392 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
393 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
394 }
395 return out;
396 }
397 }; // utf16;
398
399 template<typename CharType>
400 struct utf_traits<CharType, 4>
401 {
402 typedef CharType char_type;
403 static int trail_length(char_type c)
404 {
405 if(is_valid_codepoint(c))
406 return 0;
407 return -1;
408 }
409 static bool is_trail(char_type /*c*/)
410 {
411 return false;
412 }
413 static bool is_lead(char_type /*c*/)
414 {
415 return true;
416 }
417
418 template<typename It>
419 static code_point decode_valid(It& current)
420 {
421 return *current++;
422 }
423
424 template<typename It>
425 static code_point decode(It& current, It last)
426 {
427 if(BOOST_UNLIKELY(current == last))
428 return incomplete;
429 code_point c = *current++;
430 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
431 return illegal;
432 return c;
433 }
434 static const int max_width = 1;
435 static int width(code_point /*u*/)
436 {
437 return 1;
438 }
439 template<typename It>
440 static It encode(code_point u, It out)
441 {
442 *out++ = static_cast<char_type>(u);
443 return out;
444 }
445
446 }; // utf32
447
448#endif
449
450 } // namespace utf
451 } // namespace detail
452} // namespace nowide
453} // namespace boost
454
455#endif