]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See | |
5 | // accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt) | |
7 | // | |
8 | #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED | |
9 | #define BOOST_NOWIDE_UTF_HPP_INCLUDED | |
10 | ||
11 | #include <boost/nowide/config.hpp> | |
12 | #include <boost/cstdint.hpp> | |
13 | ||
14 | namespace boost { | |
15 | namespace nowide { | |
16 | namespace detail { | |
17 | /// | |
18 | /// \brief Namespace that holds basic operations on UTF encoded sequences | |
19 | /// | |
20 | /// All functions defined in this namespace do not require linking with Boost.Nowide library | |
21 | /// Extracted from Boost.Locale | |
22 | /// | |
23 | namespace utf { | |
24 | ||
///
/// \brief Integral type wide enough to store any Unicode code point
///
typedef uint32_t code_point;

///
/// \brief Sentinel returned by the decoders when the input is not a valid sequence
///
/// The value lies above 0x10FFFF, so is_valid_codepoint() rejects it
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Sentinel returned by the decoders when the input ends in the middle of a sequence
///
/// The value lies above 0x10FFFF, so is_valid_codepoint() rejects it
///
static const code_point incomplete = 0xFFFFFFFEu;

///
/// \brief Tells whether \a v is a valid code point
///
/// A value is accepted when it does not exceed 0x10FFFF and is not
/// one of the surrogates 0xD800-0xDFFF
///
inline bool is_valid_codepoint(code_point v)
{
    const bool in_range = v <= 0x10FFFF;
    const bool is_surrogate = 0xD800 <= v && v <= 0xDFFF;
    return in_range && !is_surrogate;
}
51 | ||
52 | #ifdef BOOST_NOWIDE_DOXYGEN | |
///
/// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
///
/// This declaration is only seen when BOOST_NOWIDE_DOXYGEN is defined; it documents
/// the interface shared by the real specializations, which are selected by the
/// size of \a CharType (1 - UTF-8, 2 - UTF-16, 4 - UTF-32).
///
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits
{
    ///
    /// The type of the character (code unit)
    ///
    typedef CharType char_type;
    ///
    /// Read one code point from the range [p,e) and return it.
    ///
    /// - If the range ends before the sequence is complete returns \ref incomplete,
    /// - If an illegal sequence is detected returns \ref illegal
    ///
    /// Requirements
    ///
    /// - Iterator is a valid input iterator
    ///
    /// Postconditions
    ///
    /// - p points one past the last code unit that was read
    ///
    template<typename Iterator>
    static code_point decode(Iterator& p, Iterator e);

    ///
    /// Maximal width of a valid sequence in code units:
    ///
    /// - UTF-8 - 4
    /// - UTF-16 - 2
    /// - UTF-32 - 1
    ///
    static const int max_width;
    ///
    /// The width of a specific code point in code units.
    ///
    /// Requirement: value is a valid Unicode code point
    /// Returns value in range [1..max_width]
    ///
    static int width(code_point value);

    ///
    /// Get the size of the trail part of a variable length encoded sequence,
    /// i.e. the number of code units that must follow the lead unit \a c.
    ///
    /// Returns -1 if \a c is not a valid lead character
    ///
    static int trail_length(char_type c);
    ///
    /// Returns true if c is a trail code unit, always false for UTF-32
    ///
    static bool is_trail(char_type c);
    ///
    /// Returns true if c is a lead code unit, always true for UTF-32
    ///
    static bool is_lead(char_type c);

    ///
    /// Convert valid Unicode code point \a value to the UTF sequence.
    ///
    /// Requirements:
    ///
    /// - \a value is a valid code point
    /// - \a out is an output iterator that should be able to accept at least width(value) units
    ///
    /// Returns the iterator past the last written code unit.
    ///
    template<typename Iterator>
    static Iterator encode(code_point value, Iterator out);
    ///
    /// Decodes a valid UTF sequence that is pointed to by p into a code point.
    ///
    /// No validation and no end-of-range check is performed:
    /// if the sequence is invalid or p points to the end the behavior is undefined
    ///
    template<typename Iterator>
    static code_point decode_valid(Iterator& p);
};
131 | ||
132 | #else | |
133 | ||
/// Primary template: declared only, never defined. The usable implementations are
/// the partial specializations on the code unit size (1, 2 and 4 bytes), which by
/// default is deduced from sizeof(CharType).
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
136 | ||
137 | template<typename CharType> | |
138 | struct utf_traits<CharType, 1> | |
139 | { | |
140 | typedef CharType char_type; | |
141 | ||
142 | static int trail_length(char_type ci) | |
143 | { | |
144 | unsigned char c = ci; | |
145 | if(c < 128) | |
146 | return 0; | |
147 | if(BOOST_UNLIKELY(c < 194)) | |
148 | return -1; | |
149 | if(c < 224) | |
150 | return 1; | |
151 | if(c < 240) | |
152 | return 2; | |
153 | if(BOOST_LIKELY(c <= 244)) | |
154 | return 3; | |
155 | return -1; | |
156 | } | |
157 | ||
158 | static const int max_width = 4; | |
159 | ||
160 | static int width(code_point value) | |
161 | { | |
162 | if(value <= 0x7F) | |
163 | { | |
164 | return 1; | |
165 | } else if(value <= 0x7FF) | |
166 | { | |
167 | return 2; | |
168 | } else if(BOOST_LIKELY(value <= 0xFFFF)) | |
169 | { | |
170 | return 3; | |
171 | } else | |
172 | { | |
173 | return 4; | |
174 | } | |
175 | } | |
176 | ||
177 | static bool is_trail(char_type ci) | |
178 | { | |
179 | unsigned char c = ci; | |
180 | return (c & 0xC0) == 0x80; | |
181 | } | |
182 | ||
183 | static bool is_lead(char_type ci) | |
184 | { | |
185 | return !is_trail(ci); | |
186 | } | |
187 | ||
188 | template<typename Iterator> | |
189 | static code_point decode(Iterator& p, Iterator e) | |
190 | { | |
191 | if(BOOST_UNLIKELY(p == e)) | |
192 | return incomplete; | |
193 | ||
194 | unsigned char lead = *p++; | |
195 | ||
196 | // First byte is fully validated here | |
197 | int trail_size = trail_length(lead); | |
198 | ||
199 | if(BOOST_UNLIKELY(trail_size < 0)) | |
200 | return illegal; | |
201 | ||
202 | // | |
203 | // OK as only ASCII may be of size = 0 | |
204 | // also optimize for ASCII text | |
205 | // | |
206 | if(trail_size == 0) | |
207 | return lead; | |
208 | ||
209 | code_point c = lead & ((1 << (6 - trail_size)) - 1); | |
210 | ||
211 | // Read the rest | |
212 | unsigned char tmp; | |
213 | switch(trail_size) | |
214 | { | |
215 | case 3: | |
216 | if(BOOST_UNLIKELY(p == e)) | |
217 | return incomplete; | |
218 | tmp = *p++; | |
219 | if(!is_trail(tmp)) | |
220 | return illegal; | |
221 | c = (c << 6) | (tmp & 0x3F); | |
222 | BOOST_NOWIDE_FALLTHROUGH; | |
223 | case 2: | |
224 | if(BOOST_UNLIKELY(p == e)) | |
225 | return incomplete; | |
226 | tmp = *p++; | |
227 | if(!is_trail(tmp)) | |
228 | return illegal; | |
229 | c = (c << 6) | (tmp & 0x3F); | |
230 | BOOST_NOWIDE_FALLTHROUGH; | |
231 | case 1: | |
232 | if(BOOST_UNLIKELY(p == e)) | |
233 | return incomplete; | |
234 | tmp = *p++; | |
235 | if(!is_trail(tmp)) | |
236 | return illegal; | |
237 | c = (c << 6) | (tmp & 0x3F); | |
238 | } | |
239 | ||
240 | // Check code point validity: no surrogates and | |
241 | // valid range | |
242 | if(BOOST_UNLIKELY(!is_valid_codepoint(c))) | |
243 | return illegal; | |
244 | ||
245 | // make sure it is the most compact representation | |
246 | if(BOOST_UNLIKELY(width(c) != trail_size + 1)) | |
247 | return illegal; | |
248 | ||
249 | return c; | |
250 | } | |
251 | ||
252 | template<typename Iterator> | |
253 | static code_point decode_valid(Iterator& p) | |
254 | { | |
255 | unsigned char lead = *p++; | |
256 | if(lead < 192) | |
257 | return lead; | |
258 | ||
259 | int trail_size; | |
260 | ||
261 | if(lead < 224) | |
262 | trail_size = 1; | |
263 | else if(BOOST_LIKELY(lead < 240)) // non-BMP rare | |
264 | trail_size = 2; | |
265 | else | |
266 | trail_size = 3; | |
267 | ||
268 | code_point c = lead & ((1 << (6 - trail_size)) - 1); | |
269 | ||
270 | switch(trail_size) | |
271 | { | |
272 | case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH; | |
273 | case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH; | |
274 | case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); | |
275 | } | |
276 | ||
277 | return c; | |
278 | } | |
279 | ||
280 | template<typename Iterator> | |
281 | static Iterator encode(code_point value, Iterator out) | |
282 | { | |
283 | if(value <= 0x7F) | |
284 | { | |
285 | *out++ = static_cast<char_type>(value); | |
286 | } else if(value <= 0x7FF) | |
287 | { | |
288 | *out++ = static_cast<char_type>((value >> 6) | 0xC0); | |
289 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); | |
290 | } else if(BOOST_LIKELY(value <= 0xFFFF)) | |
291 | { | |
292 | *out++ = static_cast<char_type>((value >> 12) | 0xE0); | |
293 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); | |
294 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); | |
295 | } else | |
296 | { | |
297 | *out++ = static_cast<char_type>((value >> 18) | 0xF0); | |
298 | *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80); | |
299 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); | |
300 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); | |
301 | } | |
302 | return out; | |
303 | } | |
304 | }; // utf8 | |
305 | ||
306 | template<typename CharType> | |
307 | struct utf_traits<CharType, 2> | |
308 | { | |
309 | typedef CharType char_type; | |
310 | ||
311 | // See RFC 2781 | |
312 | static bool is_first_surrogate(uint16_t x) | |
313 | { | |
314 | return 0xD800 <= x && x <= 0xDBFF; | |
315 | } | |
316 | static bool is_second_surrogate(uint16_t x) | |
317 | { | |
318 | return 0xDC00 <= x && x <= 0xDFFF; | |
319 | } | |
320 | static code_point combine_surrogate(uint16_t w1, uint16_t w2) | |
321 | { | |
322 | return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; | |
323 | } | |
324 | static int trail_length(char_type c) | |
325 | { | |
326 | if(is_first_surrogate(c)) | |
327 | return 1; | |
328 | if(is_second_surrogate(c)) | |
329 | return -1; | |
330 | return 0; | |
331 | } | |
332 | /// | |
333 | /// Returns true if c is trail code unit, always false for UTF-32 | |
334 | /// | |
335 | static bool is_trail(char_type c) | |
336 | { | |
337 | return is_second_surrogate(c); | |
338 | } | |
339 | /// | |
340 | /// Returns true if c is lead code unit, always true of UTF-32 | |
341 | /// | |
342 | static bool is_lead(char_type c) | |
343 | { | |
344 | return !is_second_surrogate(c); | |
345 | } | |
346 | ||
347 | template<typename It> | |
348 | static code_point decode(It& current, It last) | |
349 | { | |
350 | if(BOOST_UNLIKELY(current == last)) | |
351 | return incomplete; | |
352 | uint16_t w1 = *current++; | |
353 | if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) | |
354 | { | |
355 | return w1; | |
356 | } | |
357 | if(w1 > 0xDBFF) | |
358 | return illegal; | |
359 | if(current == last) | |
360 | return incomplete; | |
361 | uint16_t w2 = *current++; | |
362 | if(w2 < 0xDC00 || 0xDFFF < w2) | |
363 | return illegal; | |
364 | return combine_surrogate(w1, w2); | |
365 | } | |
366 | template<typename It> | |
367 | static code_point decode_valid(It& current) | |
368 | { | |
369 | uint16_t w1 = *current++; | |
370 | if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) | |
371 | { | |
372 | return w1; | |
373 | } | |
374 | uint16_t w2 = *current++; | |
375 | return combine_surrogate(w1, w2); | |
376 | } | |
377 | ||
378 | static const int max_width = 2; | |
379 | static int width(code_point u) | |
380 | { | |
381 | return u >= 0x10000 ? 2 : 1; | |
382 | } | |
383 | template<typename It> | |
384 | static It encode(code_point u, It out) | |
385 | { | |
386 | if(BOOST_LIKELY(u <= 0xFFFF)) | |
387 | { | |
388 | *out++ = static_cast<char_type>(u); | |
389 | } else | |
390 | { | |
391 | u -= 0x10000; | |
392 | *out++ = static_cast<char_type>(0xD800 | (u >> 10)); | |
393 | *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF)); | |
394 | } | |
395 | return out; | |
396 | } | |
397 | }; // utf16; | |
398 | ||
399 | template<typename CharType> | |
400 | struct utf_traits<CharType, 4> | |
401 | { | |
402 | typedef CharType char_type; | |
403 | static int trail_length(char_type c) | |
404 | { | |
405 | if(is_valid_codepoint(c)) | |
406 | return 0; | |
407 | return -1; | |
408 | } | |
409 | static bool is_trail(char_type /*c*/) | |
410 | { | |
411 | return false; | |
412 | } | |
413 | static bool is_lead(char_type /*c*/) | |
414 | { | |
415 | return true; | |
416 | } | |
417 | ||
418 | template<typename It> | |
419 | static code_point decode_valid(It& current) | |
420 | { | |
421 | return *current++; | |
422 | } | |
423 | ||
424 | template<typename It> | |
425 | static code_point decode(It& current, It last) | |
426 | { | |
427 | if(BOOST_UNLIKELY(current == last)) | |
428 | return incomplete; | |
429 | code_point c = *current++; | |
430 | if(BOOST_UNLIKELY(!is_valid_codepoint(c))) | |
431 | return illegal; | |
432 | return c; | |
433 | } | |
434 | static const int max_width = 1; | |
435 | static int width(code_point /*u*/) | |
436 | { | |
437 | return 1; | |
438 | } | |
439 | template<typename It> | |
440 | static It encode(code_point u, It out) | |
441 | { | |
442 | *out++ = static_cast<char_type>(u); | |
443 | return out; | |
444 | } | |
445 | ||
446 | }; // utf32 | |
447 | ||
448 | #endif | |
449 | ||
450 | } // namespace utf | |
451 | } // namespace detail | |
452 | } // namespace nowide | |
453 | } // namespace boost | |
454 | ||
455 | #endif |