]> git.proxmox.com Git - ceph.git/blame - ceph/src/boost/boost/nowide/utf/utf.hpp
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / boost / boost / nowide / utf / utf.hpp
CommitLineData
20effc67
TL
1//
2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3// Copyright (c) 2020 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0. (See
6// accompanying file LICENSE or copy at
7// http://www.boost.org/LICENSE_1_0.txt)
8//
9#ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
10#define BOOST_NOWIDE_UTF_HPP_INCLUDED
11
12#include <boost/nowide/config.hpp>
13#include <cstdint>
14
15namespace boost {
16namespace nowide {
17 ///
18 /// \brief Namespace that holds basic operations on UTF encoded sequences
19 ///
1e59de90 20 /// All functions defined in this namespace do not require linking with Boost.Nowide library.
20effc67
TL
21 /// Extracted from Boost.Locale
22 ///
23 namespace utf {
24
///
/// \brief Integral type wide enough to store any Unicode code point
///
using code_point = uint32_t;

///
/// \brief Sentinel returned by the decoders when an ill-formed sequence is met
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Sentinel returned by the decoders when the input ends before a sequence is complete
///
static const code_point incomplete = 0xFFFFFFFEu;

///
/// \brief Tells whether \a v is a valid code point
///
/// Everything up to U+10FFFF is accepted except the surrogate range [U+D800, U+DFFF].
/// Consequently neither \ref illegal nor \ref incomplete is a valid code point.
///
inline bool is_valid_codepoint(code_point v)
{
    const bool in_unicode_range = (v <= 0x10FFFF);
    const bool is_surrogate = (v >= 0xD800) && (v <= 0xDFFF);
    return in_unicode_range && !is_surrogate;
}
51
52#ifdef BOOST_NOWIDE_DOXYGEN
53 ///
54 /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
55 ///
56 template<typename CharType, int size = sizeof(CharType)>
57 struct utf_traits
58 {
59 ///
60 /// The type of the character
61 ///
62 using char_type = CharType;
63 ///
64 /// Read one code point from the range [p,e) and return it.
65 ///
66 /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
67 /// - If illegal sequence detected returns \ref illegal
68 ///
69 /// Requirements
70 ///
71 /// - Iterator is valid input iterator
72 ///
73 /// Postconditions
74 ///
75 /// - p points to the last consumed character
76 ///
77 template<typename Iterator>
78 static code_point decode(Iterator& p, Iterator e);
79
80 ///
81 /// Maximal width of valid sequence in the code units:
82 ///
83 /// - UTF-8 - 4
84 /// - UTF-16 - 2
85 /// - UTF-32 - 1
86 ///
87 static const int max_width;
88 ///
89 /// The width of specific code point in the code units.
90 ///
91 /// Requirement: value is a valid Unicode code point
92 /// Returns value in range [1..max_width]
93 ///
94 static int width(code_point value);
95
96 ///
97 /// Get the size of the trail part of variable length encoded sequence.
98 ///
99 /// Returns -1 if C is not valid lead character
100 ///
101 static int trail_length(char_type c);
102 ///
103 /// Returns true if c is trail code unit, always false for UTF-32
104 ///
105 static bool is_trail(char_type c);
106 ///
107 /// Returns true if c is lead code unit, always true of UTF-32
108 ///
109 static bool is_lead(char_type c);
110
111 ///
112 /// Convert valid Unicode code point \a value to the UTF sequence.
113 ///
114 /// Requirements:
115 ///
116 /// - \a value is valid code point
117 /// - \a out is an output iterator should be able to accept at least width(value) units
118 ///
119 /// Returns the iterator past the last written code unit.
120 ///
121 template<typename Iterator>
122 static Iterator encode(code_point value, Iterator out);
123 ///
124 /// Decodes valid UTF sequence that is pointed by p into code point.
125 ///
126 /// If the sequence is invalid or points to end the behavior is undefined
127 ///
128 template<typename Iterator>
129 static code_point decode_valid(Iterator& p);
130 };
131
132#else
133
134 template<typename CharType, int size = sizeof(CharType)>
135 struct utf_traits;
136
137 template<typename CharType>
138 struct utf_traits<CharType, 1>
139 {
140 using char_type = CharType;
141
142 static int trail_length(char_type ci)
143 {
144 unsigned char c = ci;
145 if(c < 128)
146 return 0;
147 if(BOOST_UNLIKELY(c < 194))
148 return -1;
149 if(c < 224)
150 return 1;
151 if(c < 240)
152 return 2;
153 if(BOOST_LIKELY(c <= 244))
154 return 3;
155 return -1;
156 }
157
158 static const int max_width = 4;
159
160 static int width(code_point value)
161 {
162 if(value <= 0x7F)
163 {
164 return 1;
165 } else if(value <= 0x7FF)
166 {
167 return 2;
168 } else if(BOOST_LIKELY(value <= 0xFFFF))
169 {
170 return 3;
171 } else
172 {
173 return 4;
174 }
175 }
176
177 static bool is_trail(char_type ci)
178 {
179 unsigned char c = ci;
180 return (c & 0xC0) == 0x80;
181 }
182
183 static bool is_lead(char_type ci)
184 {
185 return !is_trail(ci);
186 }
187
188 template<typename Iterator>
189 static code_point decode(Iterator& p, Iterator e)
190 {
191 if(BOOST_UNLIKELY(p == e))
192 return incomplete;
193
194 unsigned char lead = *p++;
195
196 // First byte is fully validated here
197 int trail_size = trail_length(lead);
198
199 if(BOOST_UNLIKELY(trail_size < 0))
200 return illegal;
201
202 // OK as only ASCII may be of size = 0
203 // also optimize for ASCII text
204 if(trail_size == 0)
205 return lead;
206
207 code_point c = lead & ((1 << (6 - trail_size)) - 1);
208
209 // Read the rest
210 unsigned char tmp;
211 switch(trail_size)
212 {
213 case 3:
214 if(BOOST_UNLIKELY(p == e))
215 return incomplete;
216 tmp = *p++;
217 if(!is_trail(tmp))
218 return illegal;
219 c = (c << 6) | (tmp & 0x3F);
220 BOOST_NOWIDE_FALLTHROUGH;
221 case 2:
222 if(BOOST_UNLIKELY(p == e))
223 return incomplete;
224 tmp = *p++;
225 if(!is_trail(tmp))
226 return illegal;
227 c = (c << 6) | (tmp & 0x3F);
228 BOOST_NOWIDE_FALLTHROUGH;
229 case 1:
230 if(BOOST_UNLIKELY(p == e))
231 return incomplete;
232 tmp = *p++;
233 if(!is_trail(tmp))
234 return illegal;
235 c = (c << 6) | (tmp & 0x3F);
236 }
237
1e59de90
TL
238 // Check code point validity:
239 // - no surrogates and valid range
240 // - most compact representation
241 if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
242 {
243 p -= trail_size;
20effc67 244 return illegal;
1e59de90 245 }
20effc67
TL
246
247 return c;
248 }
249
250 template<typename Iterator>
251 static code_point decode_valid(Iterator& p)
252 {
253 unsigned char lead = *p++;
254 if(lead < 192)
255 return lead;
256
257 int trail_size;
258
259 if(lead < 224)
260 trail_size = 1;
261 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
262 trail_size = 2;
263 else
264 trail_size = 3;
265
266 code_point c = lead & ((1 << (6 - trail_size)) - 1);
267
268 switch(trail_size)
269 {
270 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
271 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
272 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
273 }
274
275 return c;
276 }
277
278 template<typename Iterator>
279 static Iterator encode(code_point value, Iterator out)
280 {
281 if(value <= 0x7F)
282 {
283 *out++ = static_cast<char_type>(value);
284 } else if(value <= 0x7FF)
285 {
286 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
287 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
288 } else if(BOOST_LIKELY(value <= 0xFFFF))
289 {
290 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
291 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
292 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
293 } else
294 {
295 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
296 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
297 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
298 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
299 }
300 return out;
301 }
302 }; // utf8
303
304 template<typename CharType>
305 struct utf_traits<CharType, 2>
306 {
307 using char_type = CharType;
308
309 // See RFC 2781
1e59de90
TL
310 static bool is_single_codepoint(uint16_t x)
311 {
312 // Ranges [U+0000, 0+D7FF], [U+E000, U+FFFF] are numerically equal in UTF-16
313 return x <= 0xD7FF || x >= 0xE000;
314 }
20effc67
TL
315 static bool is_first_surrogate(uint16_t x)
316 {
1e59de90 317 // Range [U+D800, 0+DBFF]: High surrogate
20effc67
TL
318 return 0xD800 <= x && x <= 0xDBFF;
319 }
320 static bool is_second_surrogate(uint16_t x)
321 {
1e59de90 322 // Range [U+DC00, 0+DFFF]: Low surrogate
20effc67
TL
323 return 0xDC00 <= x && x <= 0xDFFF;
324 }
325 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
326 {
327 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
328 }
329 static int trail_length(char_type c)
330 {
331 if(is_first_surrogate(c))
332 return 1;
333 if(is_second_surrogate(c))
334 return -1;
335 return 0;
336 }
1e59de90 337 /// Return true if c is trail code unit, always false for UTF-32
20effc67
TL
338 static bool is_trail(char_type c)
339 {
340 return is_second_surrogate(c);
341 }
1e59de90 342 /// Return true if c is lead code unit, always true of UTF-32
20effc67
TL
343 static bool is_lead(char_type c)
344 {
345 return !is_second_surrogate(c);
346 }
347
348 template<typename It>
349 static code_point decode(It& current, It last)
350 {
351 if(BOOST_UNLIKELY(current == last))
352 return incomplete;
353 uint16_t w1 = *current++;
1e59de90 354 if(BOOST_LIKELY(is_single_codepoint(w1)))
20effc67
TL
355 {
356 return w1;
357 }
1e59de90
TL
358 // Now it's either a high or a low surrogate, the latter is invalid
359 if(w1 >= 0xDC00)
20effc67
TL
360 return illegal;
361 if(current == last)
362 return incomplete;
363 uint16_t w2 = *current++;
1e59de90 364 if(!is_second_surrogate(w2))
20effc67
TL
365 return illegal;
366 return combine_surrogate(w1, w2);
367 }
368 template<typename It>
369 static code_point decode_valid(It& current)
370 {
371 uint16_t w1 = *current++;
1e59de90 372 if(BOOST_LIKELY(is_single_codepoint(w1)))
20effc67
TL
373 {
374 return w1;
375 }
376 uint16_t w2 = *current++;
377 return combine_surrogate(w1, w2);
378 }
379
380 static const int max_width = 2;
1e59de90 381 static int width(code_point u) // LCOV_EXCL_LINE
20effc67
TL
382 {
383 return u >= 0x10000 ? 2 : 1;
384 }
385 template<typename It>
386 static It encode(code_point u, It out)
387 {
388 if(BOOST_LIKELY(u <= 0xFFFF))
389 {
390 *out++ = static_cast<char_type>(u);
391 } else
392 {
393 u -= 0x10000;
394 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
395 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
396 }
397 return out;
398 }
399 }; // utf16;
400
401 template<typename CharType>
402 struct utf_traits<CharType, 4>
403 {
404 using char_type = CharType;
405 static int trail_length(char_type c)
406 {
407 if(is_valid_codepoint(c))
408 return 0;
409 return -1;
410 }
411 static bool is_trail(char_type /*c*/)
412 {
413 return false;
414 }
415 static bool is_lead(char_type /*c*/)
416 {
417 return true;
418 }
419
420 template<typename It>
421 static code_point decode_valid(It& current)
422 {
423 return *current++;
424 }
425
426 template<typename It>
427 static code_point decode(It& current, It last)
428 {
429 if(BOOST_UNLIKELY(current == last))
430 return incomplete;
431 code_point c = *current++;
432 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
433 return illegal;
434 return c;
435 }
436 static const int max_width = 1;
437 static int width(code_point /*u*/)
438 {
439 return 1;
440 }
441 template<typename It>
442 static It encode(code_point u, It out)
443 {
444 *out++ = static_cast<char_type>(u);
445 return out;
446 }
20effc67
TL
447 }; // utf32
448
449#endif
450
451 } // namespace utf
452} // namespace nowide
453} // namespace boost
454
455#endif