[ceph.git] / ceph / src / boost / libs / iostreams / test / detail / utf8_codecvt_facet.cpp

/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// utf8_codecvt_facet.cpp

// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 
// Distributed under the Boost Software License, Version 1.0. (See accompany-
// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

// See http://www.boost.org/libs/iostreams for documentation.

//#include <cstdlib> // for multi-byte conversion routines

// Jonathan Turkanis: 
//   - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for 
//     BOOST_IOSTREAMS_NO_WIDE_STREAMS;
//   - Derived from codecvt_helper instead of codecvt.

#include <boost/config.hpp>
#include <boost/iostreams/detail/config/wide_streams.hpp>
#ifdef BOOST_IOSTREAMS_NO_LOCALES
# error "C++ locales not supported on this platform"
#else

#include <cassert>
#include <cstddef>

#include <boost/detail/workaround.hpp>
#include "./utf8_codecvt_facet.hpp"

#if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
# pragma warn -sig // Conversion may lose significant digits
# pragma warn -rng // Constant is out of range in comparison
#endif

/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// implementation for wchar_t

// Translate incoming UTF-8 into UCS-4
std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
    std::mbstate_t&, 
    const char * from,
    const char * from_end, 
    const char * & from_next,
    wchar_t * to, 
    wchar_t * to_end, 
    wchar_t * & to_next
) const {
    // Basic algorithm:  The first octet determines how many
    // octets total make up the UCS-4 character.  The remaining
    // "continuing octets" all begin with "10". To convert, subtract
    // the amount that specifies the number of octets from the first
    // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
    // then mash the whole lot together.  Note that each continuing
    // octet only uses 6 bits as unique values, so only shift by
    // multiples of 6 to combine.
    while (from != from_end && to != to_end) {

        // Error checking   on the first octet
        if (invalid_leading_octet(*from)){
            from_next = from;
            to_next = to;
            return std::codecvt_base::error;
        }

        // The first octet is   adjusted by a value dependent upon 
        // the number   of "continuing octets" encoding the character
        const   int cont_octet_count = get_cont_octet_count(*from);
        const   wchar_t octet1_modifier_table[] =   {
            0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
        };

        // The unsigned char conversion is necessary in case char is
        // signed   (I learned this the hard way)
        wchar_t ucs_result = 
            (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];

        // Invariants   : 
        //   1) At the start of the loop,   'i' continuing characters have been
        //    processed 
        //   2) *from   points to the next continuing character to be processed.
        int i   = 0;
        while(i != cont_octet_count && from != from_end) {

            // Error checking on continuing characters
            if (invalid_continuing_octet(*from)) {
                from_next   = from;
                to_next =   to;
                return std::codecvt_base::error;
            }

            ucs_result *= (1 << 6); 

            // each continuing character has an extra (10xxxxxx)b attached to 
            // it that must be removed.
            ucs_result += (unsigned char)(*from++) - 0x80;
            ++i;
        }

        // If   the buffer ends with an incomplete unicode character...
        if (from == from_end && i   != cont_octet_count) {
            // rewind "from" to before the current character translation
            from_next = from - (i+1); 
            to_next = to;
            return std::codecvt_base::partial;
        }
        *to++   = ucs_result;
    }
    from_next = from;
    to_next = to;

    // Were we done converting or did we run out of destination space?
    if(from == from_end) return std::codecvt_base::ok;
    else return std::codecvt_base::partial;
}

// Encode the wide characters in [from, from_end) as UTF-8 octets written
// to [to, to_end).
//
// On return, from_next / to_next mark how far the input and output were
// advanced.  The result is:
//   ok      - the whole input range was consumed;
//   partial - the output range filled up; if that happened in the middle
//             of a character, to_next is rewound to where that character's
//             first octet was written and from_next stays on the
//             character, so the caller can retry it with more room;
//   error   - *from_next is not encodable: it is negative (signed wchar_t)
//             or needs more than the 31 bits a 6-octet sequence carries.
std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
    std::mbstate_t &, 
    const wchar_t *   from,
    const wchar_t * from_end, 
    const wchar_t * & from_next,
    char * to, 
    char * to_end, 
    char * & to_next
) const
{
    // Marker bits of the leading octet, indexed by the number of
    // continuing octets in the character.
    // RG - consider merging this table with the other one
    const wchar_t octet1_modifier_table[] = {
        0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
    };

    // Largest value a 6-octet sequence can represent (31 payload bits).
    const unsigned long max_encodable = 0x7fffffffUL;

    while (from != from_end && to != to_end) {

        // Check for an invalid UCS-4 character.  Comparing a wchar_t with
        // numeric_limits<wchar_t>::max() can never be true, so test the
        // encodable range instead.  The conversion to unsigned long maps
        // negative values onto values above max_encodable, so a single
        // comparison rejects both kinds of bad input.
        if (static_cast<unsigned long>(*from) > max_encodable) {
            from_next = from;
            to_next = to;
            return std::codecvt_base::error;
        }

        const int cont_octet_count = get_cont_octet_out_count(*from);

        // The leading octet holds the bits above those carried by the
        // continuing octets, i.e. the value shifted right by 6 bits per
        // continuing octet.
        int shift_exponent = cont_octet_count * 6;

        // Process the first character
        *to++ = static_cast<char>(
            octet1_modifier_table[cont_octet_count] +
            static_cast<unsigned char>(*from / (1 << shift_exponent)));

        // Process the continuation characters
        // Invariants: At the start of the loop:
        //   1) 'i' continuing octets have been generated
        //   2) '*to' points to the next location to place an octet
        //   3) shift_exponent is 6 more than needed for the next octet
        int i = 0;
        while (i != cont_octet_count && to != to_end) {
            shift_exponent -= 6;
            *to++ = static_cast<char>(
                0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
            ++i;
        }

        // If we filled up the out buffer before encoding the character,
        // discard the octets already written for it.
        if (to == to_end && i != cont_octet_count) {
            from_next = from;
            to_next = to - (i + 1);
            return std::codecvt_base::partial;
        }
        ++from;
    }
    from_next = from;
    to_next = to;

    // Were we done or did we run out of destination space
    if (from == from_end) return std::codecvt_base::ok;
    else return std::codecvt_base::partial;
}

// How many char objects of [from, from_end) can be consumed to produce at
// most max_limit wchar_t objects?
//
// Returns the number of octets occupied by the leading complete
// characters of the range, stopping after max_limit characters.  A
// trailing character whose octets are not all present in the range is not
// counted.  The mbstate_t argument is unused.
//
// Only the leading octet of each character is inspected (through
// get_octet_count); continuing octets are not validated here.
int utf8_codecvt_facet_wchar_t::do_length(
    BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
    const char * from,
    const char * from_end, 
    std::size_t max_limit
) const throw()
{ 
    // Invariant: [from, from_next) holds exactly char_count complete
    // characters, and char_count never exceeds max_limit.
    const char * from_next = from;
    for (std::size_t char_count = 0;
         char_count < max_limit && from_next < from_end;
         ++char_count) {
        const std::size_t octet_count = get_octet_count(*from_next);

        // The buffer may end with an incomplete character; leave it out
        // rather than stepping past from_end.
        if (octet_count > static_cast<std::size_t>(from_end - from_next))
            break;

        from_next += octet_count;
    }
    return static_cast<int>(from_next - from);
}

// Total number of octets (leading octet included) in the UTF-8 sequence
// introduced by lead_octet.
//
// An octet with a clear top bit stands alone.  Otherwise the length is
// given by the run of 1 bits at the top of the octet:
//   110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4, 111110xx -> 5,
//   1111110x -> 6.
// Octets outside 0xc0..0xfd are not leading octets; they trip the
// assertion, and yield 6 when assertions are compiled out.
unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
    unsigned char   lead_octet
){
    if (lead_octet < 0x80)
        return 1;

    assert(0xc0 <= lead_octet && lead_octet <= 0xfd);

    // Everything that is not in one of the 2..5 octet ranges.
    if (lead_octet < 0xc0 || lead_octet > 0xfb)
        return 6;

    if (lead_octet < 0xe0)
        return 2;
    if (lead_octet < 0xf0)
        return 3;
    if (lead_octet < 0xf8)
        return 4;
    return 5;
}

namespace {

// Number of continuing octets (total octets - 1) needed to encode 'word'
// as UTF-8.  The template argument is the size of wchar_t in bytes; this
// primary template serves every size other than 4 and never reports more
// than two continuing octets.
template<std::size_t>
int get_cont_octet_out_count_impl(wchar_t word){
    // One extra octet for each threshold the value has reached.
    return (word >= 0x80 ? 1 : 0) + (word >= 0x800 ? 1 : 0);
}

// Variant for a 4-byte wchar_t, covering sequences of up to six octets.
//
// Note: this code will generate warnings on some platforms where wchar_t
// is defined as UCS2.  The warnings are superfluous, as the
// specialization is never instantiated with such compilers.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word)
{
    return word < 0x80      ? 0
         : word < 0x800     ? 1
         : word < 0x10000   ? 2
         : word < 0x200000  ? 3
         : word < 0x4000000 ? 4
         :                    5;
}

} // namespace anonymous

// Number of "continuing octets" required to encode 'word' as UTF-8,
// i.e. the total octet count minus one for the leading octet.
int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
    wchar_t word
) const {
    // The width of wchar_t picks the implementation at compile time.
    const std::size_t wide_char_size = sizeof(wchar_t);
    return get_cont_octet_out_count_impl<wide_char_size>(word);
}

#if 0 // not used?
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// implementation for char

// Disabled code (inside the enclosing '#if 0'): char-to-char variant of
// do_in.  Each iteration asks base_class::do_in for one wide character
// and re-encodes it into the output with std::wctomb.
//
// Returns the first non-ok result reported by base_class::do_in,
// otherwise ok once from_next has reached from_end.
//
// NOTE(review): from_next and to_next are read before this function
// assigns them, 'from' is never advanced between iterations, and to_end
// is never consulted -- confirm the intended contract before re-enabling.
std::codecvt_base::result utf8_codecvt_facet_char::do_in(
    std::mbstate_t & state, 
    const char * from, 
    const char * from_end, 
    const char * & from_next,
    char * to, 
    char * to_end, 
    char * & to_next
) const
{
    while(from_next < from_end){
        // Room for exactly one wide character per iteration.
        wchar_t w;
        wchar_t *wnext = & w;
        utf8_codecvt_facet_wchar_t::result ucs4_result;
        ucs4_result = base_class::do_in(
            state,
            from, from_end, from_next,
            wnext, wnext + 1, wnext
        );
        if(codecvt_base::ok != ucs4_result)
            return ucs4_result;
        // The conversion succeeded: write the multibyte form of w at
        // to_next and step past it.
        int length = std::wctomb(to_next, w);
        assert(-1 != length);
        to_next += length;
    }
    return codecvt_base::ok;
}

// Disabled code (inside the enclosing '#if 0'): char-to-char variant of
// do_out.  Each iteration turns one multibyte character at from_next into
// a wide character with std::mbtowc and passes it to base_class::do_out.
//
// Returns the first non-ok result reported by base_class::do_out,
// otherwise ok once from_next has reached from_end.
//
// NOTE(review): from_next and to_next are read before this function
// assigns them; 'from' and 'to' are unused.  MB_LENGTH_MAX is not defined
// in this file -- the standard macro is MB_LEN_MAX; verify before
// re-enabling.
std::codecvt_base::result utf8_codecvt_facet_char::do_out(
    mbstate_t & state, 
    const char * from,
    const char * from_end, 
    const char * & from_next,
    char * to, 
    char * to_end, 
    char * & to_next
) const
{
    while(from_next < from_end){
        wchar_t w;
        int result = std::mbtowc(&w, from_next,  MB_LENGTH_MAX);
        assert(-1 != result);
        // NOTE(review): mbtowc yields 0 for a null character, which would
        // leave from_next where it is -- confirm this cannot loop forever.
        from_next += result;
        utf8_codecvt_facet_wchar_t::result ucs4_result;

        // Present w to the base class as a one-element input range.
        const wchar_t *wptr = & w;
        ucs4_result = base_class::do_out(
            state,
            wptr, wptr+1, wptr,
            to_next, to_end, to_next
        );
        if(codecvt_base::ok != ucs4_result)
            return ucs4_result;     
    }
    return codecvt_base::ok;
}

// Disabled code (inside the enclosing '#if 0'): char-to-char variant of
// do_length.
//
// How many input bytes can be processed to get <= max_limit char objects
// of output?  Characters are decoded one at a time with
// base_class::do_in, using a copy of initial_state; the size of each
// character's std::wctomb encoding is charged against max_limit.
//
// Returns the distance from the start of the input to the end of the last
// character that was decoded and still fit within max_limit.
int utf8_codecvt_facet_char::do_length(
    // it seems that the standard doesn't use const so these libraries
    // would be in error
    BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
    utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
    const char * from_next,
    const char * from_end, 
    std::size_t max_limit
) const
{
    int total_length = 0;
    // Start of the input; from_next itself is advanced by do_in below.
    const char *from = from_next;
    mbstate_t state = initial_state;
    while(from_next < from_end){
        // Room for exactly one wide character per iteration.
        wchar_t w;
        wchar_t *wnext = & w;
        utf8_codecvt_facet_wchar_t::result ucs4_result;
        ucs4_result = base_class::do_in(
            state,
            from_next, from_end, from_next,
            wnext, wnext + 1, wnext
        );

        // Stop at the first character that cannot be decoded.
        if(codecvt_base::ok != ucs4_result)
            break;

        // NOTE(review): MB_LENGTH_MAX is not defined in this file -- the
        // standard macro is MB_LEN_MAX; verify before re-enabling.
        char carray[MB_LENGTH_MAX];
        // NOTE(review): a -1 failure result from wctomb becomes a very
        // large size_t here, which presumably ends the loop via the check
        // below -- confirm that is intended.
        std::size_t count = wctomb(carray, w);
        if(count > max_limit)
            break;

        // The character fits: charge it and record the new input position.
        max_limit -= count;
        total_length = from_next - from;
    }
    return total_length;
}
#endif

#endif // BOOST_IOSTREAMS_NO_LOCALES
Commit	Line	Data
7c673cae FG	1	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	2	// utf8_codecvt_facet.cpp
	3
	4	// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
	5	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
	6	// Distributed under the Boost Software License, Version 1.0. (See accompany-
	7	// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
	8
	9	// See http://www.boost.org/libs/iostreams for documentation.
	10
	11	//#include <cstdlib> // for multi-byte converson routines
	12
	13	// Jonathan Turkanis:
	14	// - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
	15	// BOOST_IOSTREAMS_NO_WIDE_STREAMS;
	16	// - Derived from codecvt_helper instead of codecvt.
	17
	18	#include <boost/config.hpp>
	19	#include <boost/iostreams/detail/config/wide_streams.hpp>
	20	#ifdef BOOST_IOSTREAMS_NO_LOCALES
	21	# error "C++ locales not supported on this platform"
	22	#else
	23
	24	#include <cassert>
	25	#include <cstddef>
	26
	27	#include <boost/detail/workaround.hpp>
	28	#include "./utf8_codecvt_facet.hpp"
	29
	30	#if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
	31	# pragma warn -sig // Conversion may lose significant digits
	32	# pragma warn -rng // Constant is out of range in comparison
	33	#endif
	34
	35	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	36	// implementation for wchar_t
	37
	38	// Translate incoming UTF-8 into UCS-4
	39	std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
	40	std::mbstate_t&,
	41	const char * from,
	42	const char * from_end,
	43	const char * & from_next,
	44	wchar_t * to,
	45	wchar_t * to_end,
	46	wchar_t * & to_next
	47	) const {
	48	// Basic algorithm: The first octet determines how many
	49	// octets total make up the UCS-4 character. The remaining
	50	// "continuing octets" all begin with "10". To convert, subtract
	51	// the amount that specifies the number of octets from the first
	52	// octet. Subtract 0x80 (1000 0000) from each continuing octet,
	53	// then mash the whole lot together. Note that each continuing
	54	// octet only uses 6 bits as unique values, so only shift by
	55	// multiples of 6 to combine.
	56	while (from != from_end && to != to_end) {
	57
	58	// Error checking on the first octet
	59	if (invalid_leading_octet(*from)){
	60	from_next = from;
	61	to_next = to;
	62	return std::codecvt_base::error;
	63	}
	64
65	// The first octet is adjusted by a value dependent upon
66	// the number of "continuing octets" encoding the character
67	const int cont_octet_count = get_cont_octet_count(*from);
68	const wchar_t octet1_modifier_table[] = {
69	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
70	};
71
72	// The unsigned char conversion is necessary in case char is
73	// signed (I learned this the hard way)
74	wchar_t ucs_result =
75	(unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
76
77	// Invariants :
78	// 1) At the start of the loop, 'i' continuing characters have been
79	// processed
80	// 2) *from points to the next continuing character to be processed.
81	int i = 0;
82	while(i != cont_octet_count && from != from_end) {
83
84	// Error checking on continuing characters
85	if (invalid_continuing_octet(*from)) {
86	from_next = from;
87	to_next = to;
88	return std::codecvt_base::error;
89	}
90
91	ucs_result *= (1 << 6);
92
93	// each continuing character has an extra (10xxxxxx)b attached to
94	// it that must be removed.
95	ucs_result += (unsigned char)(*from++) - 0x80;
96	++i;
97	}
98
99	// If the buffer ends with an incomplete unicode character...
100	if (from == from_end && i != cont_octet_count) {
101	// rewind "from" to before the current character translation
102	from_next = from - (i+1);
103	to_next = to;
104	return std::codecvt_base::partial;
105	}
106	*to++ = ucs_result;
107	}
108	from_next = from;
109	to_next = to;
110
111	// Were we done converting or did we run out of destination space?
112	if(from == from_end) return std::codecvt_base::ok;
113	else return std::codecvt_base::partial;
114	}
115
116	std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
117	std::mbstate_t &,
118	const wchar_t * from,
119	const wchar_t * from_end,
120	const wchar_t * & from_next,
121	char * to,
122	char * to_end,
123	char * & to_next
124	) const
125	{
126	// RG - consider merging this table with the other one
127	const wchar_t octet1_modifier_table[] = {
128	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
129	};
130
131	while (from != from_end && to != to_end) {
132
133	#define BOOST_NULL // Prevent macro expansion
134	// Check for invalid UCS-4 character
135	if (*from > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
136	from_next = from;
137	to_next = to;
138	return std::codecvt_base::error;
139	}
140	#undef BOOST_NULL
141
142	int cont_octet_count = get_cont_octet_out_count(*from);
143
144	// RG - comment this formula better
145	int shift_exponent = (cont_octet_count) * 6;
146
147	// Process the first character
148	*to++ = octet1_modifier_table[cont_octet_count] +
149	(unsigned char)(*from / (1 << shift_exponent));
150
151	// Process the continuation characters
152	// Invariants: At the start of the loop:
153	// 1) 'i' continuing octets have been generated
154	// 2) '*to' points to the next location to place an octet
155	// 3) shift_exponent is 6 more than needed for the next octet
156	int i = 0;
157	while (i != cont_octet_count && to != to_end) {
158	shift_exponent -= 6;
	159	*to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
160	++i;
161	}
162	// If we filled up the out buffer before encoding the character
163	if(to == to_end && i != cont_octet_count) {
164	from_next = from;
165	to_next = to - (i+1);
166	return std::codecvt_base::partial;
167	}
168	*from++;
169	}
170	from_next = from;
171	to_next = to;
172	// Were we done or did we run out of destination space
173	if(from == from_end) return std::codecvt_base::ok;
174	else return std::codecvt_base::partial;
175	}
176
177	// How many char objects can I process to get <= max_limit
178	// wchar_t objects?
179	int utf8_codecvt_facet_wchar_t::do_length(
180	BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
181	const char * from,
182	const char * from_end,
183	std::size_t max_limit
184	) const throw()
185	{
186	// RG - this code is confusing! I need a better way to express it.
187	// and test cases.
188
189	// Invariants:
190	// 1) last_octet_count has the size of the last measured character
191	// 2) char_count holds the number of characters shown to fit
192	// within the bounds so far (no greater than max_limit)
193	// 3) from_next points to the octet 'last_octet_count' before the
194	// last measured character.
195	int last_octet_count=0;
196	std::size_t char_count = 0;
197	const char* from_next = from;
198	// Use "<" because the buffer may represent incomplete characters
199	while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
200	from_next += last_octet_count;
201	last_octet_count = (get_octet_count(*from_next));
202	++char_count;
203	}
204	return from_next-from_end;
205	}
206
207	unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
208	unsigned char lead_octet
209	){
210	// if the 0-bit (MSB) is 0, then 1 character
211	if (lead_octet <= 0x7f) return 1;
212
213	// Otherwise the count number of consecutive 1 bits starting at MSB
214	assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
215
216	if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
217	else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
218	else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
219	else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
220	else return 6;
221	}
222
223	namespace {
224	template<std::size_t s>
225	int get_cont_octet_out_count_impl(wchar_t word){
226	if (word < 0x80) {
227	return 0;
228	}
229	if (word < 0x800) {
230	return 1;
231	}
232	return 2;
233	}
234
235	// note the following code will generate on some platforms where
236	// wchar_t is defined as UCS2. The warnings are superfluous as
237	// the specialization is never instantitiated with such compilers.
238	template<>
239	int get_cont_octet_out_count_impl<4>(wchar_t word)
240	{
241	if (word < 0x80) {
242	return 0;
243	}
244	if (word < 0x800) {
245	return 1;
246	}
247	if (word < 0x10000) {
248	return 2;
249	}
250	if (word < 0x200000) {
251	return 3;
252	}
253	if (word < 0x4000000) {
254	return 4;
255	}
256	return 5;
257	}
258
259	} // namespace anonymous
260
261	// How many "continuing octets" will be needed for this word
262	// == total octets - 1.
263	int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
264	wchar_t word
265	) const {
266	return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
267	}
268
269	#if 0 // not used?
270	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
271	// implementation for char
272
273	std::codecvt_base::result utf8_codecvt_facet_char::do_in(
274	std::mbstate_t & state,
275	const char * from,
276	const char * from_end,
277	const char * & from_next,
278	char * to,
279	char * to_end,
280	char * & to_next
281	) const
282	{
283	while(from_next < from_end){
284	wchar_t w;
285	wchar_t *wnext = & w;
286	utf8_codecvt_facet_wchar_t::result ucs4_result;
287	ucs4_result = base_class::do_in(
288	state,
289	from, from_end, from_next,
290	wnext, wnext + 1, wnext
291	);
292	if(codecvt_base::ok != ucs4_result)
293	return ucs4_result;
294	// if the conversion succeeds.
295	int length = std::wctomb(to_next, w);
296	assert(-1 != length);
297	to_next += length;
298	}
299	return codecvt_base::ok;
300	}
301
302	std::codecvt_base::result utf8_codecvt_facet_char::do_out(
303	mbstate_t & state,
304	const char * from,
305	const char * from_end,
306	const char * & from_next,
307	char * to,
308	char * to_end,
309	char * & to_next
310	) const
311	{
312	while(from_next < from_end){
313	wchar_t w;
314	int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX);
315	assert(-1 != result);
316	from_next += result;
317	utf8_codecvt_facet_wchar_t::result ucs4_result;
318
319	const wchar_t *wptr = & w;
320	ucs4_result = base_class::do_out(
321	state,
322	wptr, wptr+1, wptr,
323	to_next, to_end, to_next
324	);
325	if(codecvt_base::ok != ucs4_result)
326	return ucs4_result;
327	}
328	return codecvt_base::ok;
329	}
330
331	// How many bytes objects can I process to get <= max_limit
332	// char objects?
333	int utf8_codecvt_facet_char::do_length(
334	// it seems that the standard doesn't use const so these librarires
335	// would be in error
336	BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
337	utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
338	const char * from_next,
339	const char * from_end,
340	std::size_t max_limit
341	) const
342	{
343	int total_length = 0;
344	const char *from = from_next;
345	mbstate_t state = initial_state;
346	while(from_next < from_end){
347	wchar_t w;
348	wchar_t *wnext = & w;
349	utf8_codecvt_facet_wchar_t::result ucs4_result;
350	ucs4_result = base_class::do_in(
351	state,
352	from_next, from_end, from_next,
353	wnext, wnext + 1, wnext
354	);
355
356	if(codecvt_base::ok != ucs4_result)
357	break;
358
359	char carray[MB_LENGTH_MAX];
360	std::size_t count = wctomb(carray, w);
361	if(count > max_limit)
362	break;
363
364	max_limit -= count;
365	total_length = from_next - from;
366	}
367	return total_length;
368	}
369	#endif
370
371	#endif //BOOST_IOSTREAMS_NO_WIDE_STREAMS