]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/iostreams/test/detail/utf8_codecvt_facet.cpp
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / iostreams / test / detail / utf8_codecvt_facet.cpp
1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2 // utf8_codecvt_facet.cpp
3
4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6 // Distributed under the Boost Software License, Version 1.0. (See accompany-
7 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8
9 // See http://www.boost.org/libs/iostreams for documentation.
10
//#include <cstdlib> // for multi-byte conversion routines
12
13 // Jonathan Turkanis:
14 // - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
15 // BOOST_IOSTREAMS_NO_WIDE_STREAMS;
16 // - Derived from codecvt_helper instead of codecvt.
17
18 #include <boost/config.hpp>
19 #include <boost/iostreams/detail/config/wide_streams.hpp>
20 #ifdef BOOST_IOSTREAMS_NO_LOCALES
21 # error "C++ locales not supported on this platform"
22 #else
23
#include <cassert>
#include <climits>  // MB_LEN_MAX
#include <cstddef>
#include <cstdlib>  // std::wctomb, std::mbtowc, MB_CUR_MAX
#include <limits>   // std::numeric_limits
26
27 #include <boost/detail/workaround.hpp>
28 #include "./utf8_codecvt_facet.hpp"
29
30 #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
31 # pragma warn -sig // Conversion may lose significant digits
32 # pragma warn -rng // Constant is out of range in comparison
33 #endif
34
35 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
36 // implementation for wchar_t
37
38 // Translate incoming UTF-8 into UCS-4
39 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
40 std::mbstate_t&,
41 const char * from,
42 const char * from_end,
43 const char * & from_next,
44 wchar_t * to,
45 wchar_t * to_end,
46 wchar_t * & to_next
47 ) const {
48 // Basic algorithm: The first octet determines how many
49 // octets total make up the UCS-4 character. The remaining
50 // "continuing octets" all begin with "10". To convert, subtract
51 // the amount that specifies the number of octets from the first
52 // octet. Subtract 0x80 (1000 0000) from each continuing octet,
53 // then mash the whole lot together. Note that each continuing
54 // octet only uses 6 bits as unique values, so only shift by
55 // multiples of 6 to combine.
56 while (from != from_end && to != to_end) {
57
58 // Error checking on the first octet
59 if (invalid_leading_octet(*from)){
60 from_next = from;
61 to_next = to;
62 return std::codecvt_base::error;
63 }
64
65 // The first octet is adjusted by a value dependent upon
66 // the number of "continuing octets" encoding the character
67 const int cont_octet_count = get_cont_octet_count(*from);
68 const wchar_t octet1_modifier_table[] = {
69 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
70 };
71
72 // The unsigned char conversion is necessary in case char is
73 // signed (I learned this the hard way)
74 wchar_t ucs_result =
75 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
76
77 // Invariants :
78 // 1) At the start of the loop, 'i' continuing characters have been
79 // processed
80 // 2) *from points to the next continuing character to be processed.
81 int i = 0;
82 while(i != cont_octet_count && from != from_end) {
83
84 // Error checking on continuing characters
85 if (invalid_continuing_octet(*from)) {
86 from_next = from;
87 to_next = to;
88 return std::codecvt_base::error;
89 }
90
91 ucs_result *= (1 << 6);
92
93 // each continuing character has an extra (10xxxxxx)b attached to
94 // it that must be removed.
95 ucs_result += (unsigned char)(*from++) - 0x80;
96 ++i;
97 }
98
99 // If the buffer ends with an incomplete unicode character...
100 if (from == from_end && i != cont_octet_count) {
101 // rewind "from" to before the current character translation
102 from_next = from - (i+1);
103 to_next = to;
104 return std::codecvt_base::partial;
105 }
106 *to++ = ucs_result;
107 }
108 from_next = from;
109 to_next = to;
110
111 // Were we done converting or did we run out of destination space?
112 if(from == from_end) return std::codecvt_base::ok;
113 else return std::codecvt_base::partial;
114 }
115
116 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
117 std::mbstate_t &,
118 const wchar_t * from,
119 const wchar_t * from_end,
120 const wchar_t * & from_next,
121 char * to,
122 char * to_end,
123 char * & to_next
124 ) const
125 {
126 // RG - consider merging this table with the other one
127 const wchar_t octet1_modifier_table[] = {
128 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
129 };
130
131 while (from != from_end && to != to_end) {
132
133 #define BOOST_NULL // Prevent macro expansion
134 // Check for invalid UCS-4 character
135 if (*from > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
136 from_next = from;
137 to_next = to;
138 return std::codecvt_base::error;
139 }
140 #undef BOOST_NULL
141
142 int cont_octet_count = get_cont_octet_out_count(*from);
143
144 // RG - comment this formula better
145 int shift_exponent = (cont_octet_count) * 6;
146
147 // Process the first character
148 *to++ = octet1_modifier_table[cont_octet_count] +
149 (unsigned char)(*from / (1 << shift_exponent));
150
151 // Process the continuation characters
152 // Invariants: At the start of the loop:
153 // 1) 'i' continuing octets have been generated
154 // 2) '*to' points to the next location to place an octet
155 // 3) shift_exponent is 6 more than needed for the next octet
156 int i = 0;
157 while (i != cont_octet_count && to != to_end) {
158 shift_exponent -= 6;
159 *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
160 ++i;
161 }
162 // If we filled up the out buffer before encoding the character
163 if(to == to_end && i != cont_octet_count) {
164 from_next = from;
165 to_next = to - (i+1);
166 return std::codecvt_base::partial;
167 }
168 *from++;
169 }
170 from_next = from;
171 to_next = to;
172 // Were we done or did we run out of destination space
173 if(from == from_end) return std::codecvt_base::ok;
174 else return std::codecvt_base::partial;
175 }
176
177 // How many char objects can I process to get <= max_limit
178 // wchar_t objects?
179 int utf8_codecvt_facet_wchar_t::do_length(
180 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
181 const char * from,
182 const char * from_end,
183 std::size_t max_limit
184 ) const throw()
185 {
186 // RG - this code is confusing! I need a better way to express it.
187 // and test cases.
188
189 // Invariants:
190 // 1) last_octet_count has the size of the last measured character
191 // 2) char_count holds the number of characters shown to fit
192 // within the bounds so far (no greater than max_limit)
193 // 3) from_next points to the octet 'last_octet_count' before the
194 // last measured character.
195 int last_octet_count=0;
196 std::size_t char_count = 0;
197 const char* from_next = from;
198 // Use "<" because the buffer may represent incomplete characters
199 while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
200 from_next += last_octet_count;
201 last_octet_count = (get_octet_count(*from_next));
202 ++char_count;
203 }
204 return from_next-from_end;
205 }
206
207 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
208 unsigned char lead_octet
209 ){
210 // if the 0-bit (MSB) is 0, then 1 character
211 if (lead_octet <= 0x7f) return 1;
212
213 // Otherwise the count number of consecutive 1 bits starting at MSB
214 assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
215
216 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
217 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
218 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
219 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
220 else return 6;
221 }
222
namespace {

// Number of continuing octets needed to encode 'word' as UTF-8.
// The template argument is the size of wchar_t in bytes; this general
// version never reports more than two continuing octets.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    return (word < 0x80)  ? 0
         : (word < 0x800) ? 1
         :                  2;
}

// Note: the following code may generate warnings on some platforms where
// wchar_t is defined as UCS2.  The warnings are superfluous, as the
// specialization is never used with such compilers.
//
// Version for a four-byte wchar_t: up to five continuing octets.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word)
{
    // Exclusive upper bounds of the values encodable with 0, 1, 2, 3 and 4
    // continuing octets; anything at or above the last bound needs 5.
    const long upper_bound[] = {
        0x80L, 0x800L, 0x10000L, 0x200000L, 0x4000000L
    };
    const int bound_count = 5;

    int cont_octets = 0;
    while (cont_octets != bound_count && !(word < upper_bound[cont_octets])) {
        ++cont_octets;
    }
    return cont_octets;
}

} // namespace anonymous
260
261 // How many "continuing octets" will be needed for this word
262 // == total octets - 1.
263 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
264 wchar_t word
265 ) const {
266 return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
267 }
268
269 #if 0 // not used?
270 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
271 // implementation for char
272
273 std::codecvt_base::result utf8_codecvt_facet_char::do_in(
274 std::mbstate_t & state,
275 const char * from,
276 const char * from_end,
277 const char * & from_next,
278 char * to,
279 char * to_end,
280 char * & to_next
281 ) const
282 {
283 while(from_next < from_end){
284 wchar_t w;
285 wchar_t *wnext = & w;
286 utf8_codecvt_facet_wchar_t::result ucs4_result;
287 ucs4_result = base_class::do_in(
288 state,
289 from, from_end, from_next,
290 wnext, wnext + 1, wnext
291 );
292 if(codecvt_base::ok != ucs4_result)
293 return ucs4_result;
294 // if the conversion succeeds.
295 int length = std::wctomb(to_next, w);
296 assert(-1 != length);
297 to_next += length;
298 }
299 return codecvt_base::ok;
300 }
301
302 std::codecvt_base::result utf8_codecvt_facet_char::do_out(
303 mbstate_t & state,
304 const char * from,
305 const char * from_end,
306 const char * & from_next,
307 char * to,
308 char * to_end,
309 char * & to_next
310 ) const
311 {
312 while(from_next < from_end){
313 wchar_t w;
314 int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX);
315 assert(-1 != result);
316 from_next += result;
317 utf8_codecvt_facet_wchar_t::result ucs4_result;
318
319 const wchar_t *wptr = & w;
320 ucs4_result = base_class::do_out(
321 state,
322 wptr, wptr+1, wptr,
323 to_next, to_end, to_next
324 );
325 if(codecvt_base::ok != ucs4_result)
326 return ucs4_result;
327 }
328 return codecvt_base::ok;
329 }
330
331 // How many bytes objects can I process to get <= max_limit
332 // char objects?
333 int utf8_codecvt_facet_char::do_length(
334 // it seems that the standard doesn't use const so these librarires
335 // would be in error
336 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
337 utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
338 const char * from_next,
339 const char * from_end,
340 std::size_t max_limit
341 ) const
342 {
343 int total_length = 0;
344 const char *from = from_next;
345 mbstate_t state = initial_state;
346 while(from_next < from_end){
347 wchar_t w;
348 wchar_t *wnext = & w;
349 utf8_codecvt_facet_wchar_t::result ucs4_result;
350 ucs4_result = base_class::do_in(
351 state,
352 from_next, from_end, from_next,
353 wnext, wnext + 1, wnext
354 );
355
356 if(codecvt_base::ok != ucs4_result)
357 break;
358
359 char carray[MB_LENGTH_MAX];
360 std::size_t count = wctomb(carray, w);
361 if(count > max_limit)
362 break;
363
364 max_limit -= count;
365 total_length = from_next - from;
366 }
367 return total_length;
368 }
369 #endif
370
#endif // BOOST_IOSTREAMS_NO_LOCALES