]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/boost/nowide/utf/utf.hpp
import quincy beta 17.1.0
[ceph.git] / ceph / src / boost / boost / nowide / utf / utf.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
4 //
5 // Distributed under the Boost Software License, Version 1.0. (See
6 // accompanying file LICENSE or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
8 //
9 #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
10 #define BOOST_NOWIDE_UTF_HPP_INCLUDED
11
12 #include <boost/nowide/config.hpp>
13 #include <cstdint>
14
15 namespace boost {
16 namespace nowide {
17 ///
18 /// \brief Namespace that holds basic operations on UTF encoded sequences
19 ///
20 /// All functions defined in this namespace do not require linking with Boost.Nowide library
21 /// Extracted from Boost.Locale
22 ///
23 namespace utf {
24
///
/// \brief Unsigned 32-bit integral type used to store a single Unicode code point
///
using code_point = uint32_t;

///
/// \brief Sentinel reported by the decoders for input that is not a valid encoded sequence
///
/// The value is above 0x10FFFF, so it can never be mistaken for a real code point.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Sentinel reported by the decoders when the input ends in the middle of a sequence
///
/// Like \ref illegal, the value is above 0x10FFFF.
///
static const code_point incomplete = 0xFFFFFFFEu;
39
40 ///
41 /// \brief the function checks if \a v is a valid code point
42 ///
43 inline bool is_valid_codepoint(code_point v)
44 {
45 if(v > 0x10FFFF)
46 return false;
47 if(0xD800 <= v && v <= 0xDFFF) // surrogates
48 return false;
49 return true;
50 }
51
52 #ifdef BOOST_NOWIDE_DOXYGEN
53 ///
54 /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
55 ///
56 template<typename CharType, int size = sizeof(CharType)>
57 struct utf_traits
58 {
59 ///
60 /// The type of the character
61 ///
62 using char_type = CharType;
63 ///
64 /// Read one code point from the range [p,e) and return it.
65 ///
66 /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
67 /// - If illegal sequence detected returns \ref illegal
68 ///
69 /// Requirements
70 ///
71 /// - Iterator is valid input iterator
72 ///
73 /// Postconditions
74 ///
75 /// - p points to the last consumed character
76 ///
77 template<typename Iterator>
78 static code_point decode(Iterator& p, Iterator e);
79
80 ///
81 /// Maximal width of valid sequence in the code units:
82 ///
83 /// - UTF-8 - 4
84 /// - UTF-16 - 2
85 /// - UTF-32 - 1
86 ///
87 static const int max_width;
88 ///
89 /// The width of specific code point in the code units.
90 ///
91 /// Requirement: value is a valid Unicode code point
92 /// Returns value in range [1..max_width]
93 ///
94 static int width(code_point value);
95
96 ///
97 /// Get the size of the trail part of variable length encoded sequence.
98 ///
99 /// Returns -1 if C is not valid lead character
100 ///
101 static int trail_length(char_type c);
102 ///
103 /// Returns true if c is trail code unit, always false for UTF-32
104 ///
105 static bool is_trail(char_type c);
106 ///
107 /// Returns true if c is lead code unit, always true of UTF-32
108 ///
109 static bool is_lead(char_type c);
110
111 ///
112 /// Convert valid Unicode code point \a value to the UTF sequence.
113 ///
114 /// Requirements:
115 ///
116 /// - \a value is valid code point
117 /// - \a out is an output iterator should be able to accept at least width(value) units
118 ///
119 /// Returns the iterator past the last written code unit.
120 ///
121 template<typename Iterator>
122 static Iterator encode(code_point value, Iterator out);
123 ///
124 /// Decodes valid UTF sequence that is pointed by p into code point.
125 ///
126 /// If the sequence is invalid or points to end the behavior is undefined
127 ///
128 template<typename Iterator>
129 static code_point decode_valid(Iterator& p);
130 };
131
132 #else
133
/// Primary template, declared only: the encoding is chosen by the partial
/// specializations on the size of the character type.
template<class CharType, int size = sizeof(CharType)>
struct utf_traits;
136
137 template<typename CharType>
138 struct utf_traits<CharType, 1>
139 {
140 using char_type = CharType;
141
142 static int trail_length(char_type ci)
143 {
144 unsigned char c = ci;
145 if(c < 128)
146 return 0;
147 if(BOOST_UNLIKELY(c < 194))
148 return -1;
149 if(c < 224)
150 return 1;
151 if(c < 240)
152 return 2;
153 if(BOOST_LIKELY(c <= 244))
154 return 3;
155 return -1;
156 }
157
158 static const int max_width = 4;
159
160 static int width(code_point value)
161 {
162 if(value <= 0x7F)
163 {
164 return 1;
165 } else if(value <= 0x7FF)
166 {
167 return 2;
168 } else if(BOOST_LIKELY(value <= 0xFFFF))
169 {
170 return 3;
171 } else
172 {
173 return 4;
174 }
175 }
176
177 static bool is_trail(char_type ci)
178 {
179 unsigned char c = ci;
180 return (c & 0xC0) == 0x80;
181 }
182
183 static bool is_lead(char_type ci)
184 {
185 return !is_trail(ci);
186 }
187
188 template<typename Iterator>
189 static code_point decode(Iterator& p, Iterator e)
190 {
191 if(BOOST_UNLIKELY(p == e))
192 return incomplete;
193
194 unsigned char lead = *p++;
195
196 // First byte is fully validated here
197 int trail_size = trail_length(lead);
198
199 if(BOOST_UNLIKELY(trail_size < 0))
200 return illegal;
201
202 // OK as only ASCII may be of size = 0
203 // also optimize for ASCII text
204 if(trail_size == 0)
205 return lead;
206
207 code_point c = lead & ((1 << (6 - trail_size)) - 1);
208
209 // Read the rest
210 unsigned char tmp;
211 switch(trail_size)
212 {
213 case 3:
214 if(BOOST_UNLIKELY(p == e))
215 return incomplete;
216 tmp = *p++;
217 if(!is_trail(tmp))
218 return illegal;
219 c = (c << 6) | (tmp & 0x3F);
220 BOOST_NOWIDE_FALLTHROUGH;
221 case 2:
222 if(BOOST_UNLIKELY(p == e))
223 return incomplete;
224 tmp = *p++;
225 if(!is_trail(tmp))
226 return illegal;
227 c = (c << 6) | (tmp & 0x3F);
228 BOOST_NOWIDE_FALLTHROUGH;
229 case 1:
230 if(BOOST_UNLIKELY(p == e))
231 return incomplete;
232 tmp = *p++;
233 if(!is_trail(tmp))
234 return illegal;
235 c = (c << 6) | (tmp & 0x3F);
236 }
237
238 // Check code point validity: no surrogates and
239 // valid range
240 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
241 return illegal;
242
243 // make sure it is the most compact representation
244 if(BOOST_UNLIKELY(width(c) != trail_size + 1))
245 return illegal;
246
247 return c;
248 }
249
250 template<typename Iterator>
251 static code_point decode_valid(Iterator& p)
252 {
253 unsigned char lead = *p++;
254 if(lead < 192)
255 return lead;
256
257 int trail_size;
258
259 if(lead < 224)
260 trail_size = 1;
261 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
262 trail_size = 2;
263 else
264 trail_size = 3;
265
266 code_point c = lead & ((1 << (6 - trail_size)) - 1);
267
268 switch(trail_size)
269 {
270 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
271 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
272 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
273 }
274
275 return c;
276 }
277
278 template<typename Iterator>
279 static Iterator encode(code_point value, Iterator out)
280 {
281 if(value <= 0x7F)
282 {
283 *out++ = static_cast<char_type>(value);
284 } else if(value <= 0x7FF)
285 {
286 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
287 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
288 } else if(BOOST_LIKELY(value <= 0xFFFF))
289 {
290 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
291 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
292 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
293 } else
294 {
295 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
296 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
297 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
298 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
299 }
300 return out;
301 }
302 }; // utf8
303
304 template<typename CharType>
305 struct utf_traits<CharType, 2>
306 {
307 using char_type = CharType;
308
309 // See RFC 2781
310 static bool is_first_surrogate(uint16_t x)
311 {
312 return 0xD800 <= x && x <= 0xDBFF;
313 }
314 static bool is_second_surrogate(uint16_t x)
315 {
316 return 0xDC00 <= x && x <= 0xDFFF;
317 }
318 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
319 {
320 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
321 }
322 static int trail_length(char_type c)
323 {
324 if(is_first_surrogate(c))
325 return 1;
326 if(is_second_surrogate(c))
327 return -1;
328 return 0;
329 }
330 ///
331 /// Returns true if c is trail code unit, always false for UTF-32
332 ///
333 static bool is_trail(char_type c)
334 {
335 return is_second_surrogate(c);
336 }
337 ///
338 /// Returns true if c is lead code unit, always true of UTF-32
339 ///
340 static bool is_lead(char_type c)
341 {
342 return !is_second_surrogate(c);
343 }
344
345 template<typename It>
346 static code_point decode(It& current, It last)
347 {
348 if(BOOST_UNLIKELY(current == last))
349 return incomplete;
350 uint16_t w1 = *current++;
351 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
352 {
353 return w1;
354 }
355 if(w1 > 0xDBFF)
356 return illegal;
357 if(current == last)
358 return incomplete;
359 uint16_t w2 = *current++;
360 if(w2 < 0xDC00 || 0xDFFF < w2)
361 return illegal;
362 return combine_surrogate(w1, w2);
363 }
364 template<typename It>
365 static code_point decode_valid(It& current)
366 {
367 uint16_t w1 = *current++;
368 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
369 {
370 return w1;
371 }
372 uint16_t w2 = *current++;
373 return combine_surrogate(w1, w2);
374 }
375
376 static const int max_width = 2;
377 static int width(code_point u)
378 {
379 return u >= 0x10000 ? 2 : 1;
380 }
381 template<typename It>
382 static It encode(code_point u, It out)
383 {
384 if(BOOST_LIKELY(u <= 0xFFFF))
385 {
386 *out++ = static_cast<char_type>(u);
387 } else
388 {
389 u -= 0x10000;
390 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
391 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
392 }
393 return out;
394 }
395 }; // utf16;
396
397 template<typename CharType>
398 struct utf_traits<CharType, 4>
399 {
400 using char_type = CharType;
401 static int trail_length(char_type c)
402 {
403 if(is_valid_codepoint(c))
404 return 0;
405 return -1;
406 }
407 static bool is_trail(char_type /*c*/)
408 {
409 return false;
410 }
411 static bool is_lead(char_type /*c*/)
412 {
413 return true;
414 }
415
416 template<typename It>
417 static code_point decode_valid(It& current)
418 {
419 return *current++;
420 }
421
422 template<typename It>
423 static code_point decode(It& current, It last)
424 {
425 if(BOOST_UNLIKELY(current == last))
426 return incomplete;
427 code_point c = *current++;
428 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
429 return illegal;
430 return c;
431 }
432 static const int max_width = 1;
433 static int width(code_point /*u*/)
434 {
435 return 1;
436 }
437 template<typename It>
438 static It encode(code_point u, It out)
439 {
440 *out++ = static_cast<char_type>(u);
441 return out;
442 }
443
444 }; // utf32
445
446 #endif
447
448 } // namespace utf
449 } // namespace nowide
450 } // namespace boost
451
452 #endif