]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 |
2 | // utf8_codecvt_facet.cpp | |
3 | ||
4 | // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) | |
5 | // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). | |
6 | // Distributed under the Boost Software License, Version 1.0. (See accompany- | |
7 | // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
8 | ||
9 | // See http://www.boost.org/libs/iostreams for documentation. | |
10 | ||
11 | //#include <cstdlib> // for multi-byte conversion routines | |
12 | ||
13 | // Jonathan Turkanis: | |
14 | // - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for | |
15 | // BOOST_IOSTREAMS_NO_WIDE_STREAMS; | |
16 | // - Derived from codecvt_helper instead of codecvt. | |
17 | ||
18 | #include <boost/config.hpp> | |
19 | #include <boost/iostreams/detail/config/wide_streams.hpp> | |
20 | #ifdef BOOST_IOSTREAMS_NO_LOCALES | |
21 | # error "C++ locales not supported on this platform" | |
22 | #else | |
23 | ||
24 | #include <cassert> | |
25 | #include <cstddef> | |
26 | ||
27 | #include <boost/detail/workaround.hpp> | |
28 | #include "./utf8_codecvt_facet.hpp" | |
29 | ||
30 | #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600) | |
31 | # pragma warn -sig // Conversion may lose significant digits | |
32 | # pragma warn -rng // Constant is out of range in comparison | |
33 | #endif | |
34 | ||
35 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 | |
36 | // implementation for wchar_t | |
37 | ||
38 | // Translate incoming UTF-8 into UCS-4 | |
39 | std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in( | |
40 | std::mbstate_t&, | |
41 | const char * from, | |
42 | const char * from_end, | |
43 | const char * & from_next, | |
44 | wchar_t * to, | |
45 | wchar_t * to_end, | |
46 | wchar_t * & to_next | |
47 | ) const { | |
48 | // Basic algorithm: The first octet determines how many | |
49 | // octets total make up the UCS-4 character. The remaining | |
50 | // "continuing octets" all begin with "10". To convert, subtract | |
51 | // the amount that specifies the number of octets from the first | |
52 | // octet. Subtract 0x80 (1000 0000) from each continuing octet, | |
53 | // then mash the whole lot together. Note that each continuing | |
54 | // octet only uses 6 bits as unique values, so only shift by | |
55 | // multiples of 6 to combine. | |
56 | while (from != from_end && to != to_end) { | |
57 | ||
58 | // Error checking on the first octet | |
59 | if (invalid_leading_octet(*from)){ | |
60 | from_next = from; | |
61 | to_next = to; | |
62 | return std::codecvt_base::error; | |
63 | } | |
64 | ||
65 | // The first octet is adjusted by a value dependent upon | |
66 | // the number of "continuing octets" encoding the character | |
67 | const int cont_octet_count = get_cont_octet_count(*from); | |
68 | const wchar_t octet1_modifier_table[] = { | |
69 | 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc | |
70 | }; | |
71 | ||
72 | // The unsigned char conversion is necessary in case char is | |
73 | // signed (I learned this the hard way) | |
74 | wchar_t ucs_result = | |
75 | (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; | |
76 | ||
77 | // Invariants : | |
78 | // 1) At the start of the loop, 'i' continuing characters have been | |
79 | // processed | |
80 | // 2) *from points to the next continuing character to be processed. | |
81 | int i = 0; | |
82 | while(i != cont_octet_count && from != from_end) { | |
83 | ||
84 | // Error checking on continuing characters | |
85 | if (invalid_continuing_octet(*from)) { | |
86 | from_next = from; | |
87 | to_next = to; | |
88 | return std::codecvt_base::error; | |
89 | } | |
90 | ||
91 | ucs_result *= (1 << 6); | |
92 | ||
93 | // each continuing character has an extra (10xxxxxx)b attached to | |
94 | // it that must be removed. | |
95 | ucs_result += (unsigned char)(*from++) - 0x80; | |
96 | ++i; | |
97 | } | |
98 | ||
99 | // If the buffer ends with an incomplete unicode character... | |
100 | if (from == from_end && i != cont_octet_count) { | |
101 | // rewind "from" to before the current character translation | |
102 | from_next = from - (i+1); | |
103 | to_next = to; | |
104 | return std::codecvt_base::partial; | |
105 | } | |
106 | *to++ = ucs_result; | |
107 | } | |
108 | from_next = from; | |
109 | to_next = to; | |
110 | ||
111 | // Were we done converting or did we run out of destination space? | |
112 | if(from == from_end) return std::codecvt_base::ok; | |
113 | else return std::codecvt_base::partial; | |
114 | } | |
115 | ||
116 | std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out( | |
117 | std::mbstate_t &, | |
118 | const wchar_t * from, | |
119 | const wchar_t * from_end, | |
120 | const wchar_t * & from_next, | |
121 | char * to, | |
122 | char * to_end, | |
123 | char * & to_next | |
124 | ) const | |
125 | { | |
126 | // RG - consider merging this table with the other one | |
127 | const wchar_t octet1_modifier_table[] = { | |
128 | 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc | |
129 | }; | |
130 | ||
131 | while (from != from_end && to != to_end) { | |
132 | ||
133 | #define BOOST_NULL // Prevent macro expansion | |
134 | // Check for invalid UCS-4 character | |
135 | if (*from > std::numeric_limits<wchar_t>::max BOOST_NULL ()) { | |
136 | from_next = from; | |
137 | to_next = to; | |
138 | return std::codecvt_base::error; | |
139 | } | |
140 | #undef BOOST_NULL | |
141 | ||
142 | int cont_octet_count = get_cont_octet_out_count(*from); | |
143 | ||
144 | // RG - comment this formula better | |
145 | int shift_exponent = (cont_octet_count) * 6; | |
146 | ||
147 | // Process the first character | |
148 | *to++ = octet1_modifier_table[cont_octet_count] + | |
149 | (unsigned char)(*from / (1 << shift_exponent)); | |
150 | ||
151 | // Process the continuation characters | |
152 | // Invariants: At the start of the loop: | |
153 | // 1) 'i' continuing octets have been generated | |
154 | // 2) '*to' points to the next location to place an octet | |
155 | // 3) shift_exponent is 6 more than needed for the next octet | |
156 | int i = 0; | |
157 | while (i != cont_octet_count && to != to_end) { | |
158 | shift_exponent -= 6; | |
159 | *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)); | |
160 | ++i; | |
161 | } | |
162 | // If we filled up the out buffer before encoding the character | |
163 | if(to == to_end && i != cont_octet_count) { | |
164 | from_next = from; | |
165 | to_next = to - (i+1); | |
166 | return std::codecvt_base::partial; | |
167 | } | |
168 | *from++; | |
169 | } | |
170 | from_next = from; | |
171 | to_next = to; | |
172 | // Were we done or did we run out of destination space | |
173 | if(from == from_end) return std::codecvt_base::ok; | |
174 | else return std::codecvt_base::partial; | |
175 | } | |
176 | ||
177 | // How many char objects can I process to get <= max_limit | |
178 | // wchar_t objects? | |
179 | int utf8_codecvt_facet_wchar_t::do_length( | |
180 | BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &, | |
181 | const char * from, | |
182 | const char * from_end, | |
183 | std::size_t max_limit | |
184 | ) const throw() | |
185 | { | |
186 | // RG - this code is confusing! I need a better way to express it. | |
187 | // and test cases. | |
188 | ||
189 | // Invariants: | |
190 | // 1) last_octet_count has the size of the last measured character | |
191 | // 2) char_count holds the number of characters shown to fit | |
192 | // within the bounds so far (no greater than max_limit) | |
193 | // 3) from_next points to the octet 'last_octet_count' before the | |
194 | // last measured character. | |
195 | int last_octet_count=0; | |
196 | std::size_t char_count = 0; | |
197 | const char* from_next = from; | |
198 | // Use "<" because the buffer may represent incomplete characters | |
199 | while (from_next+last_octet_count <= from_end && char_count <= max_limit) { | |
200 | from_next += last_octet_count; | |
201 | last_octet_count = (get_octet_count(*from_next)); | |
202 | ++char_count; | |
203 | } | |
204 | return from_next-from_end; | |
205 | } | |
206 | ||
207 | unsigned int utf8_codecvt_facet_wchar_t::get_octet_count( | |
208 | unsigned char lead_octet | |
209 | ){ | |
210 | // if the 0-bit (MSB) is 0, then 1 character | |
211 | if (lead_octet <= 0x7f) return 1; | |
212 | ||
213 | // Otherwise the count number of consecutive 1 bits starting at MSB | |
214 | assert(0xc0 <= lead_octet && lead_octet <= 0xfd); | |
215 | ||
216 | if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; | |
217 | else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; | |
218 | else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; | |
219 | else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; | |
220 | else return 6; | |
221 | } | |
222 | ||
namespace {

// Number of continuation octets needed to encode 'word' in UTF-8.
//
// Primary template: used for every wchar_t size other than 4. It never
// reports more than two continuation octets.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    if (word >= 0x800) {
        return 2;
    }
    return (word >= 0x80) ? 1 : 0;
}

// Specialization for a 4-byte wchar_t, which can hold values that need up
// to five continuation octets.
//
// Note: this specialization is still compiled on platforms where wchar_t
// is narrower, even though it is never called there; any diagnostics it
// produces on such platforms are harmless.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word)
{
    // upper_bounds[n] is the first value that no longer fits in
    // n continuation octets.
    static const long upper_bounds[] = {
        0x80L, 0x800L, 0x10000L, 0x200000L, 0x4000000L
    };
    const int bound_count = 5;

    int cont_octets = 0;
    while (cont_octets != bound_count && !(word < upper_bounds[cont_octets])) {
        ++cont_octets;
    }
    return cont_octets;
}

} // namespace anonymous
260 | ||
261 | // How many "continuing octets" will be needed for this word | |
262 | // == total octets - 1. | |
263 | int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count( | |
264 | wchar_t word | |
265 | ) const { | |
266 | return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); | |
267 | } | |
268 | ||
269 | #if 0 // not used? | |
270 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 | |
271 | // implementation for char | |
272 | ||
// Disabled code (compiled out by the enclosing "#if 0").
//
// Converts input by calling base_class::do_in to decode one wchar_t at a
// time and then writing that wchar_t to the output with std::wctomb.
// Returns codecvt_base::ok once the loop condition fails, or whatever
// non-ok result base_class::do_in reported.
//
// NOTE(review): neither utf8_codecvt_facet_char nor base_class is declared
// in this file; what base_class refers to must be confirmed before this
// code is re-enabled.
// NOTE(review): the loop condition reads from_next before this function has
// assigned it, and every call passes the unchanged 'from' as the start of
// the input - confirm whether from_next was meant to be initialised from
// 'from' and used as the running position.
// NOTE(review): to_next is written through and advanced without first being
// set from 'to', and 'to_end' is never consulted, so nothing here bounds
// the output.
273 | std::codecvt_base::result utf8_codecvt_facet_char::do_in( | |
274 | std::mbstate_t & state, | |
275 | const char * from, | |
276 | const char * from_end, | |
277 | const char * & from_next, | |
278 | char * to, | |
279 | char * to_end, | |
280 | char * & to_next | |
281 | ) const | |
282 | { | |
283 | while(from_next < from_end){ | |
// One-element output buffer: each call decodes at most one wchar_t into w.
284 | wchar_t w; | |
285 | wchar_t *wnext = & w; | |
286 | utf8_codecvt_facet_wchar_t::result ucs4_result; | |
287 | ucs4_result = base_class::do_in( | |
288 | state, | |
289 | from, from_end, from_next, | |
290 | wnext, wnext + 1, wnext | |
291 | ); | |
// Any result other than ok is returned to the caller unchanged.
292 | if(codecvt_base::ok != ucs4_result) | |
293 | return ucs4_result; | |
294 | // if the conversion succeeds. | |
// A wctomb failure (-1) is only caught by the assert below.
295 | int length = std::wctomb(to_next, w); | |
296 | assert(-1 != length); | |
297 | to_next += length; | |
298 | } | |
299 | return codecvt_base::ok; | |
300 | } | |
301 | ||
// Disabled code (compiled out by the enclosing "#if 0").
//
// Converts input by turning one multibyte character at a time into a
// wchar_t with std::mbtowc and then passing that wchar_t to
// base_class::do_out. Returns codecvt_base::ok once the loop condition
// fails, or whatever non-ok result base_class::do_out reported.
//
// NOTE(review): neither utf8_codecvt_facet_char nor base_class is declared
// in this file; confirm what they refer to before re-enabling.
// NOTE(review): from_next and to_next are read before this function assigns
// them; 'from' and 'to' are never used - confirm whether they were meant
// to initialise from_next / to_next.
// NOTE(review): MB_LENGTH_MAX is not defined anywhere in the visible source;
// the standard <climits> macro is spelled MB_LEN_MAX - confirm.
302 | std::codecvt_base::result utf8_codecvt_facet_char::do_out( | |
303 | mbstate_t & state, | |
304 | const char * from, | |
305 | const char * from_end, | |
306 | const char * & from_next, | |
307 | char * to, | |
308 | char * to_end, | |
309 | char * & to_next | |
310 | ) const | |
311 | { | |
312 | while(from_next < from_end){ | |
313 | wchar_t w; | |
// An mbtowc failure (-1) is only caught by the assert below.
314 | int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX); | |
315 | assert(-1 != result); | |
316 | from_next += result; | |
317 | utf8_codecvt_facet_wchar_t::result ucs4_result; | |
318 | ||
// One-element input range [wptr, wptr+1) holding just w.
319 | const wchar_t *wptr = & w; | |
320 | ucs4_result = base_class::do_out( | |
321 | state, | |
322 | wptr, wptr+1, wptr, | |
323 | to_next, to_end, to_next | |
324 | ); | |
// Any result other than ok is returned to the caller unchanged.
325 | if(codecvt_base::ok != ucs4_result) | |
326 | return ucs4_result; | |
327 | } | |
328 | return codecvt_base::ok; | |
329 | } | |
330 | ||
// Disabled code (compiled out by the enclosing "#if 0").
//
// Walks the input one character at a time: each character is decoded to a
// wchar_t with base_class::do_in, its multibyte length is measured with
// wctomb, and that length is charged against max_limit. The walk stops when
// the input is exhausted, when base_class::do_in reports anything other
// than ok, or when the next character's length exceeds what is left of
// max_limit. Returns the distance from the initial from_next to the end of
// the last character that was accepted.
//
// NOTE(review): neither utf8_codecvt_facet_char nor base_class is declared
// in this file; confirm what they refer to before re-enabling.
// NOTE(review): MB_LENGTH_MAX is not defined anywhere in the visible source;
// the standard <climits> macro is spelled MB_LEN_MAX - confirm.
331 | // How many bytes objects can I process to get <= max_limit | |
332 | // char objects? | |
333 | int utf8_codecvt_facet_char::do_length( | |
334 | // it seems that the standard doesn't use const so these librarires | |
335 | // would be in error | |
336 | BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER | |
337 | utf8_codecvt_facet_wchar_t::mbstate_t & initial_state, | |
338 | const char * from_next, | |
339 | const char * from_end, | |
340 | std::size_t max_limit | |
341 | ) const | |
342 | { | |
343 | int total_length = 0; | |
// 'from' remembers the start of the input; from_next is the running position.
344 | const char *from = from_next; | |
// Work on a copy so the caller's state object is left untouched.
345 | mbstate_t state = initial_state; | |
346 | while(from_next < from_end){ | |
347 | wchar_t w; | |
348 | wchar_t *wnext = & w; | |
349 | utf8_codecvt_facet_wchar_t::result ucs4_result; | |
350 | ucs4_result = base_class::do_in( | |
351 | state, | |
352 | from_next, from_end, from_next, | |
353 | wnext, wnext + 1, wnext | |
354 | ); | |
355 | ||
356 | if(codecvt_base::ok != ucs4_result) | |
357 | break; | |
358 | ||
359 | char carray[MB_LENGTH_MAX]; | |
// NOTE(review): wctomb returns int; a -1 failure stored in a std::size_t
// becomes a very large value, so it presumably leaves through the
// "count > max_limit" break below - confirm that this is intended.
360 | std::size_t count = wctomb(carray, w); | |
361 | if(count > max_limit) | |
362 | break; | |
363 | ||
364 | max_limit -= count; | |
// Only updated after a character has been accepted.
365 | total_length = from_next - from; | |
366 | } | |
367 | return total_length; | |
368 | } | |
369 | #endif | |
370 | ||
371 | #endif // BOOST_IOSTREAMS_NO_LOCALES | |