]> git.proxmox.com Git - ceph.git/blame - ceph/src/boost/libs/iostreams/test/detail/utf8_codecvt_facet.cpp
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / iostreams / test / detail / utf8_codecvt_facet.cpp
CommitLineData
7c673cae
FG
1/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2// utf8_codecvt_facet.cpp
3
4// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6// Distributed under the Boost Software License, Version 1.0. (See accompany-
7// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8
9// See http://www.boost.org/libs/iostreams for documentation.
10
11//#include <cstdlib> // for multi-byte conversion routines
12
13// Jonathan Turkanis:
14// - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
15// BOOST_IOSTREAMS_NO_WIDE_STREAMS;
16// - Derived from codecvt_helper instead of codecvt.
17
18#include <boost/config.hpp>
19#include <boost/iostreams/detail/config/wide_streams.hpp>
20#ifdef BOOST_IOSTREAMS_NO_LOCALES
21# error "C++ locales not supported on this platform"
22#else
23
24#include <cassert>
25#include <cstddef>
26
27#include <boost/detail/workaround.hpp>
28#include "./utf8_codecvt_facet.hpp"
29
30#if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
31# pragma warn -sig // Conversion may lose significant digits
32# pragma warn -rng // Constant is out of range in comparison
33#endif
34
35/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
36// implementation for wchar_t
37
38// Translate incoming UTF-8 into UCS-4
39std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
40 std::mbstate_t&,
41 const char * from,
42 const char * from_end,
43 const char * & from_next,
44 wchar_t * to,
45 wchar_t * to_end,
46 wchar_t * & to_next
47) const {
48 // Basic algorithm: The first octet determines how many
49 // octets total make up the UCS-4 character. The remaining
50 // "continuing octets" all begin with "10". To convert, subtract
51 // the amount that specifies the number of octets from the first
52 // octet. Subtract 0x80 (1000 0000) from each continuing octet,
53 // then mash the whole lot together. Note that each continuing
54 // octet only uses 6 bits as unique values, so only shift by
55 // multiples of 6 to combine.
56 while (from != from_end && to != to_end) {
57
58 // Error checking on the first octet
59 if (invalid_leading_octet(*from)){
60 from_next = from;
61 to_next = to;
62 return std::codecvt_base::error;
63 }
64
65 // The first octet is adjusted by a value dependent upon
66 // the number of "continuing octets" encoding the character
67 const int cont_octet_count = get_cont_octet_count(*from);
68 const wchar_t octet1_modifier_table[] = {
69 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
70 };
71
72 // The unsigned char conversion is necessary in case char is
73 // signed (I learned this the hard way)
74 wchar_t ucs_result =
75 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
76
77 // Invariants :
78 // 1) At the start of the loop, 'i' continuing characters have been
79 // processed
80 // 2) *from points to the next continuing character to be processed.
81 int i = 0;
82 while(i != cont_octet_count && from != from_end) {
83
84 // Error checking on continuing characters
85 if (invalid_continuing_octet(*from)) {
86 from_next = from;
87 to_next = to;
88 return std::codecvt_base::error;
89 }
90
91 ucs_result *= (1 << 6);
92
93 // each continuing character has an extra (10xxxxxx)b attached to
94 // it that must be removed.
95 ucs_result += (unsigned char)(*from++) - 0x80;
96 ++i;
97 }
98
99 // If the buffer ends with an incomplete unicode character...
100 if (from == from_end && i != cont_octet_count) {
101 // rewind "from" to before the current character translation
102 from_next = from - (i+1);
103 to_next = to;
104 return std::codecvt_base::partial;
105 }
106 *to++ = ucs_result;
107 }
108 from_next = from;
109 to_next = to;
110
111 // Were we done converting or did we run out of destination space?
112 if(from == from_end) return std::codecvt_base::ok;
113 else return std::codecvt_base::partial;
114}
115
116std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
117 std::mbstate_t &,
118 const wchar_t * from,
119 const wchar_t * from_end,
120 const wchar_t * & from_next,
121 char * to,
122 char * to_end,
123 char * & to_next
124) const
125{
126 // RG - consider merging this table with the other one
127 const wchar_t octet1_modifier_table[] = {
128 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
129 };
130
131 while (from != from_end && to != to_end) {
132
133#define BOOST_NULL // Prevent macro expansion
134 // Check for invalid UCS-4 character
135 if (*from > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
136 from_next = from;
137 to_next = to;
138 return std::codecvt_base::error;
139 }
140#undef BOOST_NULL
141
142 int cont_octet_count = get_cont_octet_out_count(*from);
143
144 // RG - comment this formula better
145 int shift_exponent = (cont_octet_count) * 6;
146
147 // Process the first character
148 *to++ = octet1_modifier_table[cont_octet_count] +
149 (unsigned char)(*from / (1 << shift_exponent));
150
151 // Process the continuation characters
152 // Invariants: At the start of the loop:
153 // 1) 'i' continuing octets have been generated
154 // 2) '*to' points to the next location to place an octet
155 // 3) shift_exponent is 6 more than needed for the next octet
156 int i = 0;
157 while (i != cont_octet_count && to != to_end) {
158 shift_exponent -= 6;
159 *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
160 ++i;
161 }
162 // If we filled up the out buffer before encoding the character
163 if(to == to_end && i != cont_octet_count) {
164 from_next = from;
165 to_next = to - (i+1);
166 return std::codecvt_base::partial;
167 }
168 *from++;
169 }
170 from_next = from;
171 to_next = to;
172 // Were we done or did we run out of destination space
173 if(from == from_end) return std::codecvt_base::ok;
174 else return std::codecvt_base::partial;
175}
176
177// How many char objects can I process to get <= max_limit
178// wchar_t objects?
179int utf8_codecvt_facet_wchar_t::do_length(
180 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
181 const char * from,
182 const char * from_end,
183 std::size_t max_limit
184) const throw()
185{
186 // RG - this code is confusing! I need a better way to express it.
187 // and test cases.
188
189 // Invariants:
190 // 1) last_octet_count has the size of the last measured character
191 // 2) char_count holds the number of characters shown to fit
192 // within the bounds so far (no greater than max_limit)
193 // 3) from_next points to the octet 'last_octet_count' before the
194 // last measured character.
195 int last_octet_count=0;
196 std::size_t char_count = 0;
197 const char* from_next = from;
198 // Use "<" because the buffer may represent incomplete characters
199 while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
200 from_next += last_octet_count;
201 last_octet_count = (get_octet_count(*from_next));
202 ++char_count;
203 }
204 return from_next-from_end;
205}
206
207unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
208 unsigned char lead_octet
209){
210 // if the 0-bit (MSB) is 0, then 1 character
211 if (lead_octet <= 0x7f) return 1;
212
213 // Otherwise the count number of consecutive 1 bits starting at MSB
214 assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
215
216 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
217 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
218 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
219 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
220 else return 6;
221}
222
namespace {

// Number of continuing octets needed to encode 'word'. This primary
// template never reports more than two continuing octets; the <4>
// specialization below covers the larger ranges.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    return (word < 0x80)  ? 0
         : (word < 0x800) ? 1
         : 2;
}

// Note: the comparisons against the larger limits below may generate
// warnings on some platforms where wchar_t is defined as UCS2. The
// warnings are superfluous there, as this specialization is meant for
// a four-byte wchar_t.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word)
{
    // Exclusive upper bounds of the value ranges that need 0, 1, 2, 3
    // and 4 continuing octets; anything at or above the last one needs 5.
    const int exclusive_limits[] = {
        0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };

    int cont_octets = 0;
    while (cont_octets != 5 && !(word < exclusive_limits[cont_octets])) {
        ++cont_octets;
    }
    return cont_octets;
}

} // namespace anonymous
260
261// How many "continuing octets" will be needed for this word
262// == total octets - 1.
263int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
264 wchar_t word
265) const {
266 return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
267}
268
269#if 0 // not used?
270/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
271// implementation for char
272
273std::codecvt_base::result utf8_codecvt_facet_char::do_in(
274 std::mbstate_t & state,
275 const char * from,
276 const char * from_end,
277 const char * & from_next,
278 char * to,
279 char * to_end,
280 char * & to_next
281) const
282{
283 while(from_next < from_end){
284 wchar_t w;
285 wchar_t *wnext = & w;
286 utf8_codecvt_facet_wchar_t::result ucs4_result;
287 ucs4_result = base_class::do_in(
288 state,
289 from, from_end, from_next,
290 wnext, wnext + 1, wnext
291 );
292 if(codecvt_base::ok != ucs4_result)
293 return ucs4_result;
294 // if the conversion succeeds.
295 int length = std::wctomb(to_next, w);
296 assert(-1 != length);
297 to_next += length;
298 }
299 return codecvt_base::ok;
300}
301
302std::codecvt_base::result utf8_codecvt_facet_char::do_out(
303 mbstate_t & state,
304 const char * from,
305 const char * from_end,
306 const char * & from_next,
307 char * to,
308 char * to_end,
309 char * & to_next
310) const
311{
312 while(from_next < from_end){
313 wchar_t w;
314 int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX);
315 assert(-1 != result);
316 from_next += result;
317 utf8_codecvt_facet_wchar_t::result ucs4_result;
318
319 const wchar_t *wptr = & w;
320 ucs4_result = base_class::do_out(
321 state,
322 wptr, wptr+1, wptr,
323 to_next, to_end, to_next
324 );
325 if(codecvt_base::ok != ucs4_result)
326 return ucs4_result;
327 }
328 return codecvt_base::ok;
329}
330
331// How many bytes objects can I process to get <= max_limit
332// char objects?
333int utf8_codecvt_facet_char::do_length(
334 // it seems that the standard doesn't use const so these librarires
335 // would be in error
336 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
337 utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
338 const char * from_next,
339 const char * from_end,
340 std::size_t max_limit
341) const
342{
343 int total_length = 0;
344 const char *from = from_next;
345 mbstate_t state = initial_state;
346 while(from_next < from_end){
347 wchar_t w;
348 wchar_t *wnext = & w;
349 utf8_codecvt_facet_wchar_t::result ucs4_result;
350 ucs4_result = base_class::do_in(
351 state,
352 from_next, from_end, from_next,
353 wnext, wnext + 1, wnext
354 );
355
356 if(codecvt_base::ok != ucs4_result)
357 break;
358
359 char carray[MB_LENGTH_MAX];
360 std::size_t count = wctomb(carray, w);
361 if(count > max_limit)
362 break;
363
364 max_limit -= count;
365 total_length = from_next - from;
366 }
367 return total_length;
368}
369#endif
370
371#endif // BOOST_IOSTREAMS_NO_LOCALES