1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2 // utf8_codecvt_facet.cpp
4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6 // Distributed under the Boost Software License, Version 1.0. (See accompany-
7 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 // See http://www.boost.org/libs/iostreams for documentation.
11 //#include <cstdlib> // for multi-byte conversion routines
14 // - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
15 // BOOST_IOSTREAMS_NO_WIDE_STREAMS;
16 // - Derived from codecvt_helper instead of codecvt.
18 #include <boost/config.hpp>
19 #include <boost/iostreams/detail/config/wide_streams.hpp>
20 #ifdef BOOST_IOSTREAMS_NO_LOCALES
21 # error "C++ locales not supported on this platform"
27 #include <boost/detail/workaround.hpp>
28 #include "./utf8_codecvt_facet.hpp"
30 #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
31 # pragma warn -sig // Conversion may lose significant digits
32 # pragma warn -rng // Constant is out of range in comparison
35 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
36 // implementation for wchar_t
38 // Translate incoming UTF-8 into UCS-4
//
// Decodes one wide character per UTF-8 sequence, looping while both input
// octets and output space remain.
//
// Returns:
//   error   - a leading octet or a continuing octet failed validation
//   partial - the input ended in the middle of a multi-octet character, or
//             the main loop stopped while input was still unconsumed
//   ok      - the main loop stopped with `from` equal to `from_end`
39 std::codecvt_base::result
utf8_codecvt_facet_wchar_t::do_in(
42 const char * from_end
,
43 const char * & from_next
,
48 // Basic algorithm: The first octet determines how many
49 // octets total make up the UCS-4 character. The remaining
50 // "continuing octets" all begin with "10". To convert, subtract
51 // the amount that specifies the number of octets from the first
52 // octet. Subtract 0x80 (1000 0000) from each continuing octet,
53 // then mash the whole lot together. Note that each continuing
54 // octet only uses 6 bits as unique values, so only shift by
55 // multiples of 6 to combine.
56 while (from
!= from_end
&& to
!= to_end
) {
58 // Error checking on the first octet
59 if (invalid_leading_octet(*from
)){
62 return std::codecvt_base::error
;
65 // The first octet is adjusted by a value dependent upon
66 // the number of "continuing octets" encoding the character
67 const int cont_octet_count
= get_cont_octet_count(*from
);
// Table of length-marker values, indexed by the number of continuing
// octets; the selected entry is subtracted from the leading octet below.
68 const wchar_t octet1_modifier_table
[] = {
69 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
72 // The unsigned char conversion is necessary in case char is
73 // signed (I learned this the hard way)
75 (unsigned char)(*from
++) - octet1_modifier_table
[cont_octet_count
];
// Loop invariants for the continuing-octet loop that follows:
78 // 1) At the start of the loop, 'i' continuing characters have been
80 // 2) *from points to the next continuing character to be processed.
82 while(i
!= cont_octet_count
&& from
!= from_end
) {
84 // Error checking on continuing characters
85 if (invalid_continuing_octet(*from
)) {
88 return std::codecvt_base::error
;
// Multiplying by 64 makes room for the next 6 payload bits.
91 ucs_result
*= (1 << 6);
93 // each continuing character has an extra (10xxxxxx)b attached to
94 // it that must be removed.
95 ucs_result
+= (unsigned char)(*from
++) - 0x80;
99 // If the buffer ends with an incomplete unicode character...
100 if (from
== from_end
&& i
!= cont_octet_count
) {
101 // rewind "from" to before the current character translation
// (the leading octet plus the i continuing octets already consumed,
// hence i+1)
102 from_next
= from
- (i
+1);
104 return std::codecvt_base::partial
;
111 // Were we done converting or did we run out of destination space?
112 if(from
== from_end
) return std::codecvt_base::ok
;
113 else return std::codecvt_base::partial
;
// Encodes each wide character in the input range as a leading octet
// followed by zero or more continuing octets, looping while both input and
// output space remain.
//
// Returns:
//   error   - the input character failed the validity check
//   partial - the output filled up before all continuing octets of a
//             character were written, or the main loop stopped while input
//             was still unconsumed
//   ok      - the main loop stopped with `from` equal to `from_end`
116 std::codecvt_base::result
utf8_codecvt_facet_wchar_t::do_out(
118 const wchar_t * from
,
119 const wchar_t * from_end
,
120 const wchar_t * & from_next
,
126 // RG - consider merging this table with the other one
// Leading-octet marker values, indexed by the number of continuing octets.
127 const wchar_t octet1_modifier_table
[] = {
128 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
131 while (from
!= from_end
&& to
!= to_end
) {
// BOOST_NULL expands to nothing; placing it between `max` and `(` keeps a
// function-like `max` macro, if one is defined, from being expanded.
133 #define BOOST_NULL // Prevent macro expansion
134 // Check for invalid UCS-4 character
// NOTE(review): *from is a wchar_t, so comparing it against the largest
// wchar_t value looks like it can never be true - confirm the intent.
135 if (*from
> std::numeric_limits
<wchar_t>::max
BOOST_NULL ()) {
138 return std::codecvt_base::error
;
142 int cont_octet_count
= get_cont_octet_out_count(*from
);
144 // RG - comment this formula better
// Each continuing octet carries 6 bits, so this is the number of low-order
// bits that will NOT go into the leading octet.
145 int shift_exponent
= (cont_octet_count
) * 6;
147 // Process the first character
// Dividing by (1 << shift_exponent) drops the bits reserved for the
// continuing octets; the table entry supplies the length-marker bits.
148 *to
++ = octet1_modifier_table
[cont_octet_count
] +
149 (unsigned char)(*from
/ (1 << shift_exponent
));
151 // Process the continuation characters
152 // Invariants: At the start of the loop:
153 // 1) 'i' continuing octets have been generated
154 // 2) '*to' points to the next location to place an octet
155 // 3) shift_exponent is 6 more than needed for the next octet
157 while (i
!= cont_octet_count
&& to
!= to_end
) {
// Take the 6-bit group selected by shift_exponent and add the 0x80
// continuing-octet marker.
159 *to
++ = 0x80 + ((*from
/ (1 << shift_exponent
)) % (1 << 6));
162 // If we filled up the out buffer before encoding the character
163 if(to
== to_end
&& i
!= cont_octet_count
) {
// Rewind past the leading octet plus the i continuing octets already
// written (hence i+1) so the partially written character is not reported.
165 to_next
= to
- (i
+1);
166 return std::codecvt_base::partial
;
172 // Were we done or did we run out of destination space
173 if(from
== from_end
) return std::codecvt_base::ok
;
174 else return std::codecvt_base::partial
;
177 // How many char objects can I process to get <= max_limit
//
// Walks the input one character at a time, using get_octet_count() on each
// leading octet to find the size of that character, and stops on the
// bounds given by from_end and max_limit.
179 int utf8_codecvt_facet_wchar_t::do_length(
180 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
std::mbstate_t &,
182 const char * from_end
,
183 std::size_t max_limit
186 // RG - this code is confusing! I need a better way to express it.
// Loop invariants:
190 // 1) last_octet_count has the size of the last measured character
191 // 2) char_count holds the number of characters shown to fit
192 // within the bounds so far (no greater than max_limit)
193 // 3) from_next points to the octet 'last_octet_count' before the
194 // last measured character.
195 int last_octet_count
=0;
196 std::size_t char_count
= 0;
197 const char* from_next
= from
;
198 // Use "<" because the buffer may represent incomplete characters
// NOTE(review): the comment above says "<" but both comparisons below use
// "<=" - confirm which is intended.
199 while (from_next
+last_octet_count
<= from_end
&& char_count
<= max_limit
) {
200 from_next
+= last_octet_count
;
// NOTE(review): the loop condition allows from_next to equal from_end here,
// in which case this reads the octet at from_end - confirm that is safe.
201 last_octet_count
= (get_octet_count(*from_next
));
// NOTE(review): this subtracts from_end, not the start pointer `from`, so
// the result is zero or negative whenever from_next <= from_end. A length
// measured from `from` looks like the intent - confirm.
204 return from_next
-from_end
;
// Returns the total number of octets in the character that starts with
// lead_octet, decided purely from the value range the octet falls in.
// Values above 0x7f are asserted to lie in [0xc0, 0xfd].
207 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
208 unsigned char lead_octet
210 // if the 0-bit (MSB) is 0, then 1 character
211 if (lead_octet
<= 0x7f) return 1;
213 // Otherwise count the number of consecutive 1 bits starting at MSB
214 assert(0xc0 <= lead_octet
&& lead_octet
<= 0xfd);
// 0xc0-0xdf -> 2 octets, 0xe0-0xef -> 3, 0xf0-0xf7 -> 4, 0xf8-0xfb -> 5
216 if (0xc0 <= lead_octet
&& lead_octet
<= 0xdf) return 2;
217 else if (0xe0 <= lead_octet
&& lead_octet
<= 0xef) return 3;
218 else if (0xf0 <= lead_octet
&& lead_octet
<= 0xf7) return 4;
219 else if (0xf8 <= lead_octet
&& lead_octet
<= 0xfb) return 5;
// Primary template for the helper that get_cont_octet_out_count() calls
// with s = sizeof(wchar_t); takes the wide character to be encoded.
// NOTE(review): presumably returns the number of continuing octets needed
// for `word` - confirm against the body.
224 template<std::size_t s
>
225 int get_cont_octet_out_count_impl(wchar_t word
){
235 // note the following code will generate warnings on some platforms where
236 // wchar_t is defined as UCS2. The warnings are superfluous as
237 // the specialization is never instantiated with such compilers.
// Specialization for s == 4. Classifies `word` by comparing it against
// successively larger thresholds (0x10000, 0x200000, 0x4000000).
// NOTE(review): presumably each threshold selects the continuing-octet
// count for that value range - confirm against the return statements.
239 int get_cont_octet_out_count_impl
<4>(wchar_t word
)
247 if (word
< 0x10000) {
250 if (word
< 0x200000) {
253 if (word
< 0x4000000) {
259 } // namespace anonymous
261 // How many "continuing octets" will be needed for this word
262 // == total octets - 1.
// Forwards to get_cont_octet_out_count_impl, selecting the implementation
// at compile time by sizeof(wchar_t).
263 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
266 return get_cont_octet_out_count_impl
<sizeof(wchar_t)>(word
);
270 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
271 // implementation for char
// Converts the input one character at a time in two steps: the base class
// do_in() decodes a single wide character into w, then std::wctomb()
// re-encodes w into the char output. Returns ok once the loop finishes.
273 std::codecvt_base::result
utf8_codecvt_facet_char::do_in(
274 std::mbstate_t & state
,
276 const char * from_end
,
277 const char * & from_next
,
// NOTE(review): from_next is a reference parameter and is read here before
// any assignment visible in this function - confirm callers initialise it.
283 while(from_next
< from_end
){
// Output window of exactly one wide character: [wnext, wnext + 1).
285 wchar_t *wnext
= & w
;
286 utf8_codecvt_facet_wchar_t::result ucs4_result
;
287 ucs4_result
= base_class::do_in(
289 from
, from_end
, from_next
,
290 wnext
, wnext
+ 1, wnext
292 if(codecvt_base::ok
!= ucs4_result
)
294 // if the conversion succeeds.
// NOTE(review): no check of the write against the end of the output range
// is visible here - confirm the destination is large enough.
295 int length
= std::wctomb(to_next
, w
);
// std::wctomb reports failure as -1; this is only checked by the assert.
296 assert(-1 != length
);
299 return codecvt_base::ok
;
// Converts the input one character at a time in two steps: std::mbtowc()
// decodes the next multibyte character at from_next into w, then the base
// class do_out() encodes w into the output. Returns ok once the loop
// finishes.
302 std::codecvt_base::result
utf8_codecvt_facet_char::do_out(
305 const char * from_end
,
306 const char * & from_next
,
312 while(from_next
< from_end
){
// NOTE(review): the length limit passed to std::mbtowc is MB_LENGTH_MAX and
// not the distance to from_end; also, <climits> spells its macro
// MB_LEN_MAX - confirm where MB_LENGTH_MAX is defined.
314 int result
= std::mbtowc(&w
, from_next
, MB_LENGTH_MAX
);
// std::mbtowc reports failure as -1; this is only checked by the assert.
315 assert(-1 != result
);
317 utf8_codecvt_facet_wchar_t::result ucs4_result
;
319 const wchar_t *wptr
= & w
;
320 ucs4_result
= base_class::do_out(
323 to_next
, to_end
, to_next
325 if(codecvt_base::ok
!= ucs4_result
)
328 return codecvt_base::ok
;
331 // How many bytes objects can I process to get <= max_limit
//
// Works on a copy of the conversion state. For each input character it
// decodes one wide character with the base class do_in(), measures its
// multibyte size with wctomb(), and records how far the input has advanced
// from the starting position.
333 int utf8_codecvt_facet_char::do_length(
334 // it seems that the standard doesn't use const so these libraries
336 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
337 utf8_codecvt_facet_wchar_t::mbstate_t & initial_state
,
338 const char * from_next
,
339 const char * from_end
,
340 std::size_t max_limit
343 int total_length
= 0;
// Remember the starting position; from_next itself is advanced by do_in().
344 const char *from
= from_next
;
// Local copy of the state taken from initial_state.
345 mbstate_t state
= initial_state
;
346 while(from_next
< from_end
){
// Output window of exactly one wide character: [wnext, wnext + 1).
348 wchar_t *wnext
= & w
;
349 utf8_codecvt_facet_wchar_t::result ucs4_result
;
350 ucs4_result
= base_class::do_in(
352 from_next
, from_end
, from_next
,
353 wnext
, wnext
+ 1, wnext
356 if(codecvt_base::ok
!= ucs4_result
)
// Scratch buffer that receives the multibyte form of w.
// NOTE(review): <climits> spells its macro MB_LEN_MAX - confirm where
// MB_LENGTH_MAX is defined.
359 char carray
[MB_LENGTH_MAX
];
// NOTE(review): wctomb returns an int (-1 on failure) that is stored in a
// std::size_t here, so a failure would become a very large count - confirm
// this is acceptable.
360 std::size_t count
= wctomb(carray
, w
);
361 if(count
> max_limit
)
// Number of input chars consumed so far, measured from the start.
365 total_length
= from_next
- from
;
371 #endif //BOOST_IOSTREAMS_NO_WIDE_STREAMS