]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
3 | // Copyright (c) 2020 Alexander Grund | |
4 | // | |
5 | // Distributed under the Boost Software License, Version 1.0. (See | |
6 | // accompanying file LICENSE or copy at | |
7 | // http://www.boost.org/LICENSE_1_0.txt) | |
8 | // | |
9 | #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED | |
10 | #define BOOST_NOWIDE_UTF_HPP_INCLUDED | |
11 | ||
12 | #include <boost/nowide/config.hpp> | |
13 | #include <cstdint> | |
14 | ||
15 | namespace boost { | |
16 | namespace nowide { | |
17 | /// | |
18 | /// \brief Namespace that holds basic operations on UTF encoded sequences | |
19 | /// | |
1e59de90 | 20 | /// None of the functions defined in this namespace require linking with the Boost.Nowide library. |
20effc67 TL |
21 | /// Extracted from Boost.Locale |
22 | /// | |
23 | namespace utf { | |
24 | ||
///
/// \brief Integral type wide enough to store any Unicode code point
///
using code_point = uint32_t;

///
/// \brief Sentinel value the decoders return when the input is not a valid sequence
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Sentinel value the decoders return when the input range ends
/// before a full sequence could be read
///
static const code_point incomplete = 0xFFFFFFFEu;

///
/// \brief Check whether \a v is a valid Unicode code point
///
/// \return false for values above U+10FFFF and for the surrogate block
///         [U+D800, U+DFFF], true for every other value
///
inline bool is_valid_codepoint(code_point v)
{
    const bool in_unicode_range = v <= 0x10FFFF;
    const bool is_surrogate = v >= 0xD800 && v <= 0xDFFF;
    return in_unicode_range && !is_surrogate;
}
51 | ||
52 | #ifdef BOOST_NOWIDE_DOXYGEN | |
53 | /// | |
54 | /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points | |
55 | /// | |
56 | template<typename CharType, int size = sizeof(CharType)> | |
57 | struct utf_traits | |
58 | { | |
59 | /// | |
60 | /// The type of the character | |
61 | /// | |
62 | using char_type = CharType; | |
63 | /// | |
64 | /// Read one code point from the range [p,e) and return it. | |
65 | /// | |
66 | /// - If the sequence that was read is incomplete sequence returns \ref incomplete, | |
67 | /// - If illegal sequence detected returns \ref illegal | |
68 | /// | |
69 | /// Requirements | |
70 | /// | |
71 | /// - Iterator is valid input iterator | |
72 | /// | |
73 | /// Postconditions | |
74 | /// | |
75 | /// - p points to the last consumed character | |
76 | /// | |
77 | template<typename Iterator> | |
78 | static code_point decode(Iterator& p, Iterator e); | |
79 | ||
80 | /// | |
81 | /// Maximal width of valid sequence in the code units: | |
82 | /// | |
83 | /// - UTF-8 - 4 | |
84 | /// - UTF-16 - 2 | |
85 | /// - UTF-32 - 1 | |
86 | /// | |
87 | static const int max_width; | |
88 | /// | |
89 | /// The width of specific code point in the code units. | |
90 | /// | |
91 | /// Requirement: value is a valid Unicode code point | |
92 | /// Returns value in range [1..max_width] | |
93 | /// | |
94 | static int width(code_point value); | |
95 | ||
96 | /// | |
97 | /// Get the size of the trail part of variable length encoded sequence. | |
98 | /// | |
99 | /// Returns -1 if C is not valid lead character | |
100 | /// | |
101 | static int trail_length(char_type c); | |
102 | /// | |
103 | /// Returns true if c is trail code unit, always false for UTF-32 | |
104 | /// | |
105 | static bool is_trail(char_type c); | |
106 | /// | |
107 | /// Returns true if c is lead code unit, always true of UTF-32 | |
108 | /// | |
109 | static bool is_lead(char_type c); | |
110 | ||
111 | /// | |
112 | /// Convert valid Unicode code point \a value to the UTF sequence. | |
113 | /// | |
114 | /// Requirements: | |
115 | /// | |
116 | /// - \a value is valid code point | |
117 | /// - \a out is an output iterator should be able to accept at least width(value) units | |
118 | /// | |
119 | /// Returns the iterator past the last written code unit. | |
120 | /// | |
121 | template<typename Iterator> | |
122 | static Iterator encode(code_point value, Iterator out); | |
123 | /// | |
124 | /// Decodes valid UTF sequence that is pointed by p into code point. | |
125 | /// | |
126 | /// If the sequence is invalid or points to end the behavior is undefined | |
127 | /// | |
128 | template<typename Iterator> | |
129 | static code_point decode_valid(Iterator& p); | |
130 | }; | |
131 | ||
132 | #else | |
133 | ||
134 | template<typename CharType, int size = sizeof(CharType)> | |
135 | struct utf_traits; | |
136 | ||
137 | template<typename CharType> | |
138 | struct utf_traits<CharType, 1> | |
139 | { | |
140 | using char_type = CharType; | |
141 | ||
142 | static int trail_length(char_type ci) | |
143 | { | |
144 | unsigned char c = ci; | |
145 | if(c < 128) | |
146 | return 0; | |
147 | if(BOOST_UNLIKELY(c < 194)) | |
148 | return -1; | |
149 | if(c < 224) | |
150 | return 1; | |
151 | if(c < 240) | |
152 | return 2; | |
153 | if(BOOST_LIKELY(c <= 244)) | |
154 | return 3; | |
155 | return -1; | |
156 | } | |
157 | ||
158 | static const int max_width = 4; | |
159 | ||
160 | static int width(code_point value) | |
161 | { | |
162 | if(value <= 0x7F) | |
163 | { | |
164 | return 1; | |
165 | } else if(value <= 0x7FF) | |
166 | { | |
167 | return 2; | |
168 | } else if(BOOST_LIKELY(value <= 0xFFFF)) | |
169 | { | |
170 | return 3; | |
171 | } else | |
172 | { | |
173 | return 4; | |
174 | } | |
175 | } | |
176 | ||
177 | static bool is_trail(char_type ci) | |
178 | { | |
179 | unsigned char c = ci; | |
180 | return (c & 0xC0) == 0x80; | |
181 | } | |
182 | ||
183 | static bool is_lead(char_type ci) | |
184 | { | |
185 | return !is_trail(ci); | |
186 | } | |
187 | ||
188 | template<typename Iterator> | |
189 | static code_point decode(Iterator& p, Iterator e) | |
190 | { | |
191 | if(BOOST_UNLIKELY(p == e)) | |
192 | return incomplete; | |
193 | ||
194 | unsigned char lead = *p++; | |
195 | ||
196 | // First byte is fully validated here | |
197 | int trail_size = trail_length(lead); | |
198 | ||
199 | if(BOOST_UNLIKELY(trail_size < 0)) | |
200 | return illegal; | |
201 | ||
202 | // OK as only ASCII may be of size = 0 | |
203 | // also optimize for ASCII text | |
204 | if(trail_size == 0) | |
205 | return lead; | |
206 | ||
207 | code_point c = lead & ((1 << (6 - trail_size)) - 1); | |
208 | ||
209 | // Read the rest | |
210 | unsigned char tmp; | |
211 | switch(trail_size) | |
212 | { | |
213 | case 3: | |
214 | if(BOOST_UNLIKELY(p == e)) | |
215 | return incomplete; | |
216 | tmp = *p++; | |
217 | if(!is_trail(tmp)) | |
218 | return illegal; | |
219 | c = (c << 6) | (tmp & 0x3F); | |
220 | BOOST_NOWIDE_FALLTHROUGH; | |
221 | case 2: | |
222 | if(BOOST_UNLIKELY(p == e)) | |
223 | return incomplete; | |
224 | tmp = *p++; | |
225 | if(!is_trail(tmp)) | |
226 | return illegal; | |
227 | c = (c << 6) | (tmp & 0x3F); | |
228 | BOOST_NOWIDE_FALLTHROUGH; | |
229 | case 1: | |
230 | if(BOOST_UNLIKELY(p == e)) | |
231 | return incomplete; | |
232 | tmp = *p++; | |
233 | if(!is_trail(tmp)) | |
234 | return illegal; | |
235 | c = (c << 6) | (tmp & 0x3F); | |
236 | } | |
237 | ||
1e59de90 TL |
238 | // Check code point validity: |
239 | // - no surrogates and valid range | |
240 | // - most compact representation | |
241 | if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1)) | |
242 | { | |
243 | p -= trail_size; | |
20effc67 | 244 | return illegal; |
1e59de90 | 245 | } |
20effc67 TL |
246 | |
247 | return c; | |
248 | } | |
249 | ||
250 | template<typename Iterator> | |
251 | static code_point decode_valid(Iterator& p) | |
252 | { | |
253 | unsigned char lead = *p++; | |
254 | if(lead < 192) | |
255 | return lead; | |
256 | ||
257 | int trail_size; | |
258 | ||
259 | if(lead < 224) | |
260 | trail_size = 1; | |
261 | else if(BOOST_LIKELY(lead < 240)) // non-BMP rare | |
262 | trail_size = 2; | |
263 | else | |
264 | trail_size = 3; | |
265 | ||
266 | code_point c = lead & ((1 << (6 - trail_size)) - 1); | |
267 | ||
268 | switch(trail_size) | |
269 | { | |
270 | case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH; | |
271 | case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH; | |
272 | case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); | |
273 | } | |
274 | ||
275 | return c; | |
276 | } | |
277 | ||
278 | template<typename Iterator> | |
279 | static Iterator encode(code_point value, Iterator out) | |
280 | { | |
281 | if(value <= 0x7F) | |
282 | { | |
283 | *out++ = static_cast<char_type>(value); | |
284 | } else if(value <= 0x7FF) | |
285 | { | |
286 | *out++ = static_cast<char_type>((value >> 6) | 0xC0); | |
287 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); | |
288 | } else if(BOOST_LIKELY(value <= 0xFFFF)) | |
289 | { | |
290 | *out++ = static_cast<char_type>((value >> 12) | 0xE0); | |
291 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); | |
292 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); | |
293 | } else | |
294 | { | |
295 | *out++ = static_cast<char_type>((value >> 18) | 0xF0); | |
296 | *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80); | |
297 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); | |
298 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); | |
299 | } | |
300 | return out; | |
301 | } | |
302 | }; // utf8 | |
303 | ||
304 | template<typename CharType> | |
305 | struct utf_traits<CharType, 2> | |
306 | { | |
307 | using char_type = CharType; | |
308 | ||
309 | // See RFC 2781 | |
1e59de90 TL |
310 | static bool is_single_codepoint(uint16_t x) |
311 | { | |
312 | // Ranges [U+0000, 0+D7FF], [U+E000, U+FFFF] are numerically equal in UTF-16 | |
313 | return x <= 0xD7FF || x >= 0xE000; | |
314 | } | |
20effc67 TL |
315 | static bool is_first_surrogate(uint16_t x) |
316 | { | |
1e59de90 | 317 | // Range [U+D800, 0+DBFF]: High surrogate |
20effc67 TL |
318 | return 0xD800 <= x && x <= 0xDBFF; |
319 | } | |
320 | static bool is_second_surrogate(uint16_t x) | |
321 | { | |
1e59de90 | 322 | // Range [U+DC00, 0+DFFF]: Low surrogate |
20effc67 TL |
323 | return 0xDC00 <= x && x <= 0xDFFF; |
324 | } | |
325 | static code_point combine_surrogate(uint16_t w1, uint16_t w2) | |
326 | { | |
327 | return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; | |
328 | } | |
329 | static int trail_length(char_type c) | |
330 | { | |
331 | if(is_first_surrogate(c)) | |
332 | return 1; | |
333 | if(is_second_surrogate(c)) | |
334 | return -1; | |
335 | return 0; | |
336 | } | |
1e59de90 | 337 | /// Return true if c is trail code unit, always false for UTF-32 |
20effc67 TL |
338 | static bool is_trail(char_type c) |
339 | { | |
340 | return is_second_surrogate(c); | |
341 | } | |
1e59de90 | 342 | /// Return true if c is lead code unit, always true of UTF-32 |
20effc67 TL |
343 | static bool is_lead(char_type c) |
344 | { | |
345 | return !is_second_surrogate(c); | |
346 | } | |
347 | ||
348 | template<typename It> | |
349 | static code_point decode(It& current, It last) | |
350 | { | |
351 | if(BOOST_UNLIKELY(current == last)) | |
352 | return incomplete; | |
353 | uint16_t w1 = *current++; | |
1e59de90 | 354 | if(BOOST_LIKELY(is_single_codepoint(w1))) |
20effc67 TL |
355 | { |
356 | return w1; | |
357 | } | |
1e59de90 TL |
358 | // Now it's either a high or a low surrogate, the latter is invalid |
359 | if(w1 >= 0xDC00) | |
20effc67 TL |
360 | return illegal; |
361 | if(current == last) | |
362 | return incomplete; | |
363 | uint16_t w2 = *current++; | |
1e59de90 | 364 | if(!is_second_surrogate(w2)) |
20effc67 TL |
365 | return illegal; |
366 | return combine_surrogate(w1, w2); | |
367 | } | |
368 | template<typename It> | |
369 | static code_point decode_valid(It& current) | |
370 | { | |
371 | uint16_t w1 = *current++; | |
1e59de90 | 372 | if(BOOST_LIKELY(is_single_codepoint(w1))) |
20effc67 TL |
373 | { |
374 | return w1; | |
375 | } | |
376 | uint16_t w2 = *current++; | |
377 | return combine_surrogate(w1, w2); | |
378 | } | |
379 | ||
380 | static const int max_width = 2; | |
1e59de90 | 381 | static int width(code_point u) // LCOV_EXCL_LINE |
20effc67 TL |
382 | { |
383 | return u >= 0x10000 ? 2 : 1; | |
384 | } | |
385 | template<typename It> | |
386 | static It encode(code_point u, It out) | |
387 | { | |
388 | if(BOOST_LIKELY(u <= 0xFFFF)) | |
389 | { | |
390 | *out++ = static_cast<char_type>(u); | |
391 | } else | |
392 | { | |
393 | u -= 0x10000; | |
394 | *out++ = static_cast<char_type>(0xD800 | (u >> 10)); | |
395 | *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF)); | |
396 | } | |
397 | return out; | |
398 | } | |
399 | }; // utf16; | |
400 | ||
401 | template<typename CharType> | |
402 | struct utf_traits<CharType, 4> | |
403 | { | |
404 | using char_type = CharType; | |
405 | static int trail_length(char_type c) | |
406 | { | |
407 | if(is_valid_codepoint(c)) | |
408 | return 0; | |
409 | return -1; | |
410 | } | |
411 | static bool is_trail(char_type /*c*/) | |
412 | { | |
413 | return false; | |
414 | } | |
415 | static bool is_lead(char_type /*c*/) | |
416 | { | |
417 | return true; | |
418 | } | |
419 | ||
420 | template<typename It> | |
421 | static code_point decode_valid(It& current) | |
422 | { | |
423 | return *current++; | |
424 | } | |
425 | ||
426 | template<typename It> | |
427 | static code_point decode(It& current, It last) | |
428 | { | |
429 | if(BOOST_UNLIKELY(current == last)) | |
430 | return incomplete; | |
431 | code_point c = *current++; | |
432 | if(BOOST_UNLIKELY(!is_valid_codepoint(c))) | |
433 | return illegal; | |
434 | return c; | |
435 | } | |
436 | static const int max_width = 1; | |
437 | static int width(code_point /*u*/) | |
438 | { | |
439 | return 1; | |
440 | } | |
441 | template<typename It> | |
442 | static It encode(code_point u, It out) | |
443 | { | |
444 | *out++ = static_cast<char_type>(u); | |
445 | return out; | |
446 | } | |
20effc67 TL |
447 | }; // utf32 |
448 | ||
449 | #endif | |
450 | ||
451 | } // namespace utf | |
452 | } // namespace nowide | |
453 | } // namespace boost | |
454 | ||
455 | #endif |