1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2 // utf8_codecvt_facet.cpp
4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6 // Distributed under the Boost Software License, Version 1.0. (See accompany-
7 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 // See http://www.boost.org/libs/iostreams for documentation.
11 //#include <cstdlib> // for multi-byte conversion routines
14 // - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
15 // BOOST_IOSTREAMS_NO_WIDE_STREAMS;
16 // - Derived from codecvt_helper instead of codecvt.
18 #include <boost/config.hpp>
19 #include <boost/iostreams/detail/config/wide_streams.hpp>
20 #include <boost/numeric/conversion/cast.hpp>
21 #ifdef BOOST_IOSTREAMS_NO_LOCALES
22 # error "C++ locales not supported on this platform"
28 #include <boost/detail/workaround.hpp>
29 #include "./utf8_codecvt_facet.hpp"
31 #if BOOST_WORKAROUND(BOOST_BORLANDC, <= 0x600)
32 # pragma warn -sig // Conversion may lose significant digits
33 # pragma warn -rng // Constant is out of range in comparison
36 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
37 // implementation for wchar_t
39 // Translate incoming UTF-8 into UCS-4
// Decodes UTF-8 octets from the input range into wide characters.
// Result codes visible in this listing:
//   error   - invalid_leading_octet() or invalid_continuing_octet() rejected
//             an octet;
//   partial - the input ended inside a multi-octet character (from_next is
//             rewound to that character's first octet), or input remains once
//             the main loop has stopped;
//   ok      - the main loop stopped with from == from_end.
// NOTE(review): this listing has gaps (part of the parameter list, the
// declarations of ucs_result and i, and the store to the output buffer are
// not shown); only the visible statements are described here.
40 std::codecvt_base::result
utf8_codecvt_facet_wchar_t::do_in(
43 const char * from_end
,
44 const char * & from_next
,
49 // Basic algorithm: The first octet determines how many
50 // octets total make up the UCS-4 character. The remaining
51 // "continuing octets" all begin with "10". To convert, subtract
52 // the amount that specifies the number of octets from the first
53 // octet. Subtract 0x80 (1000 0000) from each continuing octet,
54 // then mash the whole lot together. Note that each continuing
55 // octet only uses 6 bits as unique values, so only shift by
56 // multiples of 6 to combine.
// Main loop: runs while there is both input left and room in the output.
57 while (from
!= from_end
&& to
!= to_end
) {
59 // Error checking on the first octet
60 if (invalid_leading_octet(*from
)){
63 return std::codecvt_base::error
;
66 // The first octet is adjusted by a value dependent upon
67 // the number of "continuing octets" encoding the character
68 const int cont_octet_count
= get_cont_octet_count(*from
);
// Indexed by cont_octet_count: the marker bits carried by a lead octet that
// introduces that many continuing octets. Subtracting the entry leaves only
// the payload bits of the lead octet.
69 const wchar_t octet1_modifier_table
[] = {
70 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
73 // The unsigned char conversion is necessary in case char is
74 // signed (I learned this the hard way)
76 (unsigned char)(*from
++) - octet1_modifier_table
[cont_octet_count
];
79 // 1) At the start of the loop, 'i' continuing characters have been
81 // 2) *from points to the next continuing character to be processed.
// Inner loop: stops when all continuing octets are consumed or the input
// runs out, whichever happens first.
83 while(i
!= cont_octet_count
&& from
!= from_end
) {
85 // Error checking on continuing characters
86 if (invalid_continuing_octet(*from
)) {
89 return std::codecvt_base::error
;
// Multiplying by 64 is a left shift by 6: it makes room for the six payload
// bits of the next continuing octet.
92 ucs_result
*= (1 << 6);
94 // each continuing character has an extra (10xxxxxx)b attached to
95 // it that must be removed.
96 ucs_result
+= (unsigned char)(*from
++) - 0x80;
100 // If the buffer ends with an incomplete unicode character...
101 if (from
== from_end
&& i
!= cont_octet_count
) {
102 // rewind "from" to before the current character translation
// One lead octet plus 'i' continuing octets have been consumed so far, hence
// the step back by i + 1.
103 from_next
= from
- (i
+1);
105 return std::codecvt_base::partial
;
112 // Were we done converting or did we run out of destination space?
113 if(from
== from_end
) return std::codecvt_base::ok
;
114 else return std::codecvt_base::partial
;
// Encodes wide characters from [from, from_end) as UTF-8 octets.
// Result codes visible in this listing:
//   error   - the range check on *from failed;
//   partial - the output buffer filled up in the middle of a character
//             (to_next is rewound past the octets already written for it),
//             or input remains once the main loop has stopped;
//   ok      - the main loop stopped with from == from_end.
// NOTE(review): this listing has gaps (the output parameters, the declaration
// and update of 'i', and the per-octet update of shift_exponent are not
// shown); only the visible statements are described here.
117 std::codecvt_base::result
utf8_codecvt_facet_wchar_t::do_out(
119 const wchar_t * from
,
120 const wchar_t * from_end
,
121 const wchar_t * & from_next
,
127 // RG - consider merging this table with the other one
// Indexed by the number of continuing octets: the marker bits that are added
// to the payload of the lead octet.
128 const wchar_t octet1_modifier_table
[] = {
129 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
// Main loop: runs while there is both input left and room in the output.
132 while (from
!= from_end
&& to
!= to_end
) {
134 #define BOOST_NULL // Prevent macro expansion
135 // Check for invalid UCS-4 character
// NOTE(review): *from is itself a wchar_t, so it can never compare greater
// than numeric_limits<wchar_t>::max(); as written this check cannot fire.
136 if (*from
> std::numeric_limits
<wchar_t>::max
BOOST_NULL ()) {
139 return std::codecvt_base::error
;
143 int cont_octet_count
= get_cont_octet_out_count(*from
);
145 // RG - comment this formula better
// Each continuing octet carries 6 payload bits, so the payload of the lead
// octet starts cont_octet_count * 6 bits up; dividing by
// (1 << shift_exponent) below extracts it.
146 int shift_exponent
= (cont_octet_count
) * 6;
148 // Process the first character
149 *to
++ = octet1_modifier_table
[cont_octet_count
] +
150 (unsigned char)(*from
/ (1 << shift_exponent
));
152 // Process the continuation characters
153 // Invariants: At the start of the loop:
154 // 1) 'i' continuing octets have been generated
155 // 2) '*to' points to the next location to place an octet
156 // 3) shift_exponent is 6 more than needed for the next octet
158 while (i
!= cont_octet_count
&& to
!= to_end
) {
// 0x80 is the "10xxxxxx" continuation marker; the "% (1 << 6)" keeps the six
// payload bits selected by the current shift.
160 *to
++ = 0x80 + ((*from
/ (1 << shift_exponent
)) % (1 << 6));
163 // If we filled up the out buffer before encoding the character
164 if(to
== to_end
&& i
!= cont_octet_count
) {
// One lead octet plus 'i' continuing octets were written for this character,
// hence the step back by i + 1.
166 to_next
= to
- (i
+1);
167 return std::codecvt_base::partial
;
173 // Were we done or did we run out of destination space
174 if(from
== from_end
) return std::codecvt_base::ok
;
175 else return std::codecvt_base::partial
;
178 // How many char objects can I process to get <= max_limit
180 int utf8_codecvt_facet_wchar_t::do_length(
181 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
std::mbstate_t &,
183 const char * from_end
,
184 std::size_t max_limit
187 // RG - this code is confusing! I need a better way to express it.
191 // 1) last_octet_count has the size of the last measured character
192 // 2) char_count holds the number of characters shown to fit
193 // within the bounds so far (no greater than max_limit)
194 // 3) from_next points to the octet 'last_octet_count' before the
195 // last measured character.
196 int last_octet_count
=0;
197 std::size_t char_count
= 0;
198 const char* from_next
= from
;
199 // Use "<" because the buffer may represent incomplete characters
200 while (from_next
+last_octet_count
<= from_end
&& char_count
<= max_limit
) {
201 from_next
+= last_octet_count
;
202 last_octet_count
= (get_octet_count(*from_next
));
205 return boost::numeric_cast
<int>(from_next
- from_end
);
// Maps the lead octet of a UTF-8 sequence to the total number of octets in
// that sequence. Visible results: 1 for lead_octet <= 0x7f, then 2, 3, 4 and 5
// for the ranges 0xc0-0xdf, 0xe0-0xef, 0xf0-0xf7 and 0xf8-0xfb.
// NOTE(review): the assert accepts lead octets up to 0xfd, but the branch for
// 0xfc-0xfd is not visible in this listing; confirm the final return.
// NOTE(review): octets 0x80-0xbf (and 0xfe/0xff) are rejected only by the
// assert, which does nothing when NDEBUG is defined.
208 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
209 unsigned char lead_octet
211 // if the 0-bit (MSB) is 0, then 1 character
212 if (lead_octet
<= 0x7f) return 1;
214 // Otherwise the count number of consecutive 1 bits starting at MSB
215 assert(0xc0 <= lead_octet
&& lead_octet
<= 0xfd);
217 if (0xc0 <= lead_octet
&& lead_octet
<= 0xdf) return 2;
218 else if (0xe0 <= lead_octet
&& lead_octet
<= 0xef) return 3;
219 else if (0xf0 <= lead_octet
&& lead_octet
<= 0xf7) return 4;
220 else if (0xf8 <= lead_octet
&& lead_octet
<= 0xfb) return 5;
// Helpers that classify a wide character by magnitude. The template
// parameter is a byte size; the caller visible in this file passes
// sizeof(wchar_t). A primary template and a specialization for size 4 are
// declared here.
// NOTE(review): most of both bodies, including every return statement, is
// missing from this listing; only the three thresholds of the <4>
// specialization are visible.
225 template<std::size_t s
>
226 int get_cont_octet_out_count_impl(wchar_t word
){
236 // note the following code will generate warnings on some platforms where
237 // wchar_t is defined as UCS2. The warnings are superfluous as
238 // the specialization is never instantiated with such compilers.
240 int get_cont_octet_out_count_impl
<4>(wchar_t word
)
// Thresholds: 0x10000 = 2^16, 0x200000 = 2^21, 0x4000000 = 2^26.
248 if (word
< 0x10000) {
251 if (word
< 0x200000) {
254 if (word
< 0x4000000) {
260 } // namespace anonymous
262 // How many "continuing octets" will be needed for this word
263 // == total octets - 1.
264 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
// Forwards to the helper instantiated for the size of wchar_t on this
// platform, so the choice of implementation is made at compile time.
267 return get_cont_octet_out_count_impl
<sizeof(wchar_t)>(word
);
271 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
272 // implementation for char
// char-to-char variant of do_in: each character is decoded from UTF-8 into a
// single wchar_t by base_class::do_in, and that wide character is then
// written to the output as a multibyte sequence with std::wctomb.
// NOTE(review): this listing has gaps (the output parameters, the declaration
// of 'w', the state argument of the base call, the statement controlled by
// the result check, and any pointer updates are not shown).
274 std::codecvt_base::result
utf8_codecvt_facet_char::do_in(
275 std::mbstate_t & state
,
277 const char * from_end
,
278 const char * & from_next
,
284 while(from_next
< from_end
){
// The base class is given a one-element output range [wnext, wnext + 1), so
// it can decode at most one wide character per call.
286 wchar_t *wnext
= & w
;
287 utf8_codecvt_facet_wchar_t::result ucs4_result
;
288 ucs4_result
= base_class::do_in(
// NOTE(review): the input start passed here is 'from', while the loop
// condition tests from_next; no update of 'from' is visible in this listing.
// Confirm against the full source.
290 from
, from_end
, from_next
,
291 wnext
, wnext
+ 1, wnext
293 if(codecvt_base::ok
!= ucs4_result
)
295 // if the conversion succeeds.
// std::wctomb returns -1 when the wide character has no multibyte
// representation in the current C locale; that case is only caught by the
// assert below.
296 int length
= std::wctomb(to_next
, w
);
297 assert(-1 != length
);
300 return codecvt_base::ok
;
// char-to-char variant of do_out: each multibyte character at from_next is
// converted to a wchar_t with std::mbtowc and then encoded as UTF-8 by
// base_class::do_out.
// NOTE(review): this listing has gaps (the leading and output parameters, the
// declaration of 'w', most arguments of the base call, the statement
// controlled by the result check, and any pointer updates are not shown).
303 std::codecvt_base::result
utf8_codecvt_facet_char::do_out(
306 const char * from_end
,
307 const char * & from_next
,
313 while(from_next
< from_end
){
// NOTE(review): the byte limit passed to mbtowc is the constant
// MB_LENGTH_MAX rather than from_end - from_next, so mbtowc may examine bytes
// beyond from_end. MB_LENGTH_MAX is not defined in the visible source (the
// standard macro is MB_LEN_MAX); presumably a project header supplies it.
// Confirm both points.
315 int result
= std::mbtowc(&w
, from_next
, MB_LENGTH_MAX
);
// mbtowc returns -1 for an invalid multibyte sequence; that case is only
// caught by this assert.
316 assert(-1 != result
);
318 utf8_codecvt_facet_wchar_t::result ucs4_result
;
320 const wchar_t *wptr
= & w
;
321 ucs4_result
= base_class::do_out(
324 to_next
, to_end
, to_next
326 if(codecvt_base::ok
!= ucs4_result
)
329 return codecvt_base::ok
;
332 // How many bytes objects can I process to get <= max_limit
// Measures input by decoding it: each character is decoded with
// base_class::do_in on a copy of the caller's state, re-encoded with wctomb
// to obtain its multibyte size, and total_length records how many input
// octets have been consumed since entry.
// NOTE(review): this listing has gaps (the declaration of 'w', the state
// argument of the base call, the statements controlled by the two 'if'
// checks, any update of max_limit, and the final return are not shown).
334 int utf8_codecvt_facet_char::do_length(
335 // it seems that the standard doesn't use const so these libraries
337 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
338 utf8_codecvt_facet_wchar_t::mbstate_t & initial_state
,
339 const char * from_next
,
340 const char * from_end
,
341 std::size_t max_limit
344 int total_length
= 0;
// Remember where the input started; from_next is advanced by the base call.
345 const char *from
= from_next
;
// Work on a copy so that the caller's state object is left untouched.
346 mbstate_t state
= initial_state
;
347 while(from_next
< from_end
){
// The base class is given a one-element output range [wnext, wnext + 1), so
// it can decode at most one wide character per call.
349 wchar_t *wnext
= & w
;
350 utf8_codecvt_facet_wchar_t::result ucs4_result
;
351 ucs4_result
= base_class::do_in(
353 from_next
, from_end
, from_next
,
354 wnext
, wnext
+ 1, wnext
357 if(codecvt_base::ok
!= ucs4_result
)
// NOTE(review): MB_LENGTH_MAX is not defined in the visible source (the
// standard macro is MB_LEN_MAX); presumably a project header supplies it.
360 char carray
[MB_LENGTH_MAX
];
// wctomb returns an int; a -1 failure converts to the largest std::size_t
// value, so the comparison below is also true when the conversion fails.
361 std::size_t count
= wctomb(carray
, w
);
362 if(count
> max_limit
)
366 total_length
= from_next
- from
;
372 #endif //BOOST_IOSTREAMS_NO_WIDE_STREAMS