[ceph.git] / ceph / src / boost / libs / detail / include / boost / detail / utf8_codecvt_facet.ipp

/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// utf8_codecvt_facet.ipp

// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 
// Use, modification and distribution is subject to the Boost Software
// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
// learn how this file should be used.

#include <boost/detail/utf8_codecvt_facet.hpp>

#include <cstdlib> // for multi-byte conversion routines
#include <cassert>

#include <boost/limits.hpp>
#include <boost/config.hpp>

// If we don't have wstring, then Unicode support
// is not available anyway, so we don't even need to
// compile this file. This also fixes a problem
// with mingw, which can compile this file but will
// generate a link error when building a DLL.
#ifndef BOOST_NO_STD_WSTRING

BOOST_UTF8_BEGIN_NAMESPACE

/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// implementation for wchar_t

// Construct the facet.  The single argument is handed on, unchanged, to the
// std::codecvt<wchar_t, char, std::mbstate_t> base-class constructor; the
// facet itself keeps no data of its own.
BOOST_UTF8_DECL utf8_codecvt_facet::utf8_codecvt_facet(std::size_t no_locale_manage)
    : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
{
}

// Translate incoming UTF-8 into UCS-4.
//
// Octets are read from [from, from_end) and the decoded wide characters are
// written to [to, to_end).  On return from_next / to_next tell the caller how
// far each range was consumed.  The result is
//   ok      - the whole input range was converted;
//   partial - the output range filled up first, or the input stopped in the
//             middle of a multi-octet character (from_next is then left on
//             that character's leading octet so it can be retried);
//   error   - invalid_leading_octet() or invalid_continuing_octet() rejected
//             an octet; from_next is left on the rejected octet.
// The conversion-state argument is not used.
//
// Decoding scheme: the leading octet announces how many continuing octets
// follow.  Its marker bits are removed by subtraction, then every continuing
// octet (10xxxxxx)b contributes six further payload bits.
BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_in(
    std::mbstate_t& /*state*/,
    const char * from,
    const char * from_end,
    const char * & from_next,
    wchar_t * to,
    wchar_t * to_end,
    wchar_t * & to_next
) const {
    // Marker value to subtract from a leading octet, indexed by the number
    // of continuing octets that the leading octet announces.
    const wchar_t lead_marker[] = {
        0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
    };

    while (from != from_end && to != to_end) {
        // Remember where the character starts so that a truncated sequence
        // can be handed back to the caller in one piece.
        const char * const char_start = from;

        if (invalid_leading_octet(*from)) {
            from_next = from;
            to_next = to;
            return std::codecvt_base::error;
        }

        const int trailing = get_cont_octet_count(*from);

        // Go through unsigned char: plain char may be signed, and a negative
        // value here would corrupt the arithmetic.
        wchar_t code_point =
            static_cast<unsigned char>(*from++) - lead_marker[trailing];

        // 'consumed' counts the continuing octets folded into code_point so
        // far; 'from' always addresses the next octet to examine.
        int consumed = 0;
        for (; consumed != trailing && from != from_end; ++consumed) {
            if (invalid_continuing_octet(*from)) {
                from_next = from;
                to_next = to;
                return std::codecvt_base::error;
            }

            // Make room for six more payload bits, then add them after
            // stripping the (10xxxxxx)b continuation marker.
            code_point *= (1 << 6);
            code_point += static_cast<unsigned char>(*from++) - 0x80;
        }

        // The loop above can only stop early by running out of input, so a
        // short count means the buffer ends inside this character.
        if (consumed != trailing) {
            from_next = char_start;
            to_next = to;
            return std::codecvt_base::partial;
        }

        *to++ = code_point;
    }

    from_next = from;
    to_next = to;

    // Input exhausted means success; otherwise the destination ran out.
    return (from == from_end) ? std::codecvt_base::ok
                              : std::codecvt_base::partial;
}

// Translate wide characters from [from, from_end) into UTF-8 octets written
// to [to, to_end).
//
// On return from_next / to_next tell the caller how far each range was
// consumed.  The result is
//   ok      - the whole input range was converted;
//   partial - the output range filled up first; if it filled up in the
//             middle of a character, to_next is rewound to where that
//             character's first octet was written and from_next stays on
//             the character;
//   error   - the input value compared greater than the largest wchar_t.
// The conversion-state argument is not used.
BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_out(
    std::mbstate_t& /*state*/,
    const wchar_t * from,
    const wchar_t * from_end,
    const wchar_t * & from_next,
    char * to,
    char * to_end,
    char * & to_next
) const
{
    // Marker bits of a leading octet, indexed by the number of continuing
    // octets that follow it (same values as the table used when decoding).
    const wchar_t lead_marker[] = {
        0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
    };

    const wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();

    while (from != from_end && to != to_end) {
        const wchar_t word = *from;

        // Check for invalid UCS-4 character
        if (word > max_wchar) {
            from_next = from;
            to_next = to;
            return std::codecvt_base::error;
        }

        const int trailing = get_cont_octet_out_count(word);

        // Remember where this character's octets begin in case they do not
        // all fit and the output has to be rewound.
        char * const char_start = to;

        // Every continuing octet carries six payload bits, so the leading
        // octet holds whatever lies above the lowest 6 * trailing bits.
        int shift = trailing * 6;
        *to++ = static_cast<char>(
            lead_marker[trailing] +
            static_cast<unsigned char>(word / (1 << shift)));

        // 'emitted' counts the continuing octets written so far; before each
        // pass 'shift' is six more than the next octet needs.
        int emitted = 0;
        for (; emitted != trailing && to != to_end; ++emitted) {
            shift -= 6;
            *to++ = static_cast<char>(
                0x80 + ((word / (1 << shift)) % (1 << 6)));
        }

        // The loop above can only stop early by running out of room, so a
        // short count means the character did not fit.
        if (emitted != trailing) {
            from_next = from;
            to_next = char_start;
            return std::codecvt_base::partial;
        }

        ++from;
    }

    from_next = from;
    to_next = to;

    // Input exhausted means success; otherwise the destination ran out.
    return (from == from_end) ? std::codecvt_base::ok
                              : std::codecvt_base::partial;
}

// How many char objects can I process to get <= max_limit
// wchar_t objects?
//
// Walks [from, from_end) one encoded character at a time, using the leading
// octet of each character to learn its length, and returns the number of
// chars covered by the complete characters found.  Counting stops when
//   - max_limit characters have been measured,
//   - the input range is exhausted, or
//   - the next character is incomplete (its announced length runs past
//     from_end); the incomplete tail is not counted.
// The conversion-state argument is not used.
BOOST_UTF8_DECL int utf8_codecvt_facet::do_length(
    const std::mbstate_t &,
    const char * from,
    const char * from_end, 
    std::size_t max_limit
) const
#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
        throw()
#endif
{ 
    // Invariants:
    // 1) from_next points just past the last complete character measured,
    //    and never beyond from_end.
    // 2) char_count is the number of complete characters measured so far
    //    (never greater than max_limit).
    const char * from_next = from;
    std::size_t char_count = 0;

    // Both bounds are tested *before* the leading octet is read, so nothing
    // at or beyond from_end is ever dereferenced, even for an empty range or
    // a max_limit of zero.
    while (char_count < max_limit && from_next < from_end) {
        const std::size_t octet_count = get_octet_count(*from_next);

        // The buffer may end with an incomplete character; compare against
        // the remaining length rather than forming a pointer past the end.
        if (octet_count > static_cast<std::size_t>(from_end - from_next)) {
            break;
        }

        from_next += octet_count;
        ++char_count;
    }
    return static_cast<int>(from_next - from);
}

// Total number of octets (leading octet included) in the encoded character
// that starts with lead_octet.
//
// The answer follows the run of 1 bits at the top of the octet:
//   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,
//   11110xxx -> 4,  111110xx -> 5,  everything from 0xfc up -> 6.
// No validation is done here: an octet in 0x80..0xbf (the continuation
// pattern 10xxxxxx) is not a leading octet at all and also yields 6.
BOOST_UTF8_DECL unsigned int utf8_codecvt_facet::get_octet_count(
    unsigned char   lead_octet
){
    // Each test only needs an upper bound because the ranges below it have
    // already been dealt with.
    if (lead_octet < 0x80) return 1;   // top bit clear: single octet
    if (lead_octet < 0xc0) return 6;   // 0x80..0xbf: catch-all value
    if (lead_octet < 0xe0) return 2;   // 0xc0..0xdf
    if (lead_octet < 0xf0) return 3;   // 0xe0..0xef
    if (lead_octet < 0xf8) return 4;   // 0xf0..0xf7
    if (lead_octet < 0xfc) return 5;   // 0xf8..0xfb
    return 6;                          // 0xfc..0xff
}

namespace detail {

// Number of continuing octets (total octets - 1) needed to encode 'word' in
// UTF-8.  The template argument is the size of wchar_t in bytes and selects
// how far the value range has to be examined.
//
// Both functions are declared inline: an explicit specialization is an
// ordinary function definition, and this file is meant to be included into
// other sources, so without inline a second inclusion in the same program
// would define the specialization twice.

// Generic case: values above 0x7ff are always given two continuing octets.
template<std::size_t s>
inline int get_cont_octet_out_count_impl(wchar_t word){
    if (word < 0x80) {
        return 0;
    }
    if (word < 0x800) {
        return 1;
    }
    return 2;
}

// 4-byte wchar_t: values of 0x10000 and above need three, four or five
// continuing octets.
template<>
inline int get_cont_octet_out_count_impl<4>(wchar_t word){
    if (word < 0x80) {
        return 0;
    }
    if (word < 0x800) {
        return 1;
    }

    // Note that the following code will generate warnings on some platforms
    // where wchar_t is defined as UCS2.  The warnings are superfluous as the
    // specialization is never instantiated with such compilers, but this
    // can cause problems if warnings are being treated as errors, so we guard
    // against that.  Including <boost/detail/utf8_codecvt_facet.hpp> as we do
    // should be enough to get WCHAR_MAX defined.
#if !defined(WCHAR_MAX)
#   error WCHAR_MAX not defined!
#endif
    // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
    return 2;
#elif WCHAR_MAX > 0x10000

    if (word < 0x10000) {
        return 2;
    }
    if (word < 0x200000) {
        return 3;
    }
    if (word < 0x4000000) {
        return 4;
    }
    return 5;

#else
    return 2;
#endif
}

} // namespace detail

// How many "continuing octets" will be needed for this word
// ==   total octets - 1.
//
// The counting itself lives in detail::get_cont_octet_out_count_impl; this
// member only selects the variant that matches the size of wchar_t.
BOOST_UTF8_DECL int utf8_codecvt_facet::get_cont_octet_out_count(wchar_t word) const
{
    const std::size_t wchar_width = sizeof(wchar_t);
    return detail::get_cont_octet_out_count_impl<wchar_width>(word);
}
BOOST_UTF8_END_NAMESPACE

#endif
Commit	Line	Data
7c673cae FG	1	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	2	// utf8_codecvt_facet.ipp
	3
	4	// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
	5	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
	6	// Use, modification and distribution is subject to the Boost Software
	7	// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
	8	// http://www.boost.org/LICENSE_1_0.txt)
	9
	10	// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
	11	// learn how this file should be used.
	12
	13	#include <boost/detail/utf8_codecvt_facet.hpp>
	14
	15	#include <cstdlib> // for multi-byte conversion routines
	16	#include <cassert>
	17
	18	#include <boost/limits.hpp>
	19	#include <boost/config.hpp>
	20
	21	// If we don't have wstring, then Unicode support
	22	// is not available anyway, so we don't need to even
	23	// compile this file. This also fixes the problem
	24	// with mingw, which can compile this file, but will
	25	// generate link error when building DLL.
	26	#ifndef BOOST_NO_STD_WSTRING
	27
	28	BOOST_UTF8_BEGIN_NAMESPACE
	29
	30	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	31	// implementation for wchar_t
	32
	33	BOOST_UTF8_DECL utf8_codecvt_facet::utf8_codecvt_facet(
	34	std::size_t no_locale_manage
	35	) :
	36	std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
	37	{}
	38
	39	// Translate incoming UTF-8 into UCS-4
	40	BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_in(
	41	std::mbstate_t& /*state*/,
	42	const char * from,
	43	const char * from_end,
	44	const char * & from_next,
	45	wchar_t * to,
	46	wchar_t * to_end,
	47	wchar_t * & to_next
	48	) const {
	49	// Basic algorithm: The first octet determines how many
	50	// octets total make up the UCS-4 character. The remaining
	51	// "continuing octets" all begin with "10". To convert, subtract
	52	// the amount that specifies the number of octets from the first
	53	// octet. Subtract 0x80 (1000 0000) from each continuing octet,
	54	// then mash the whole lot together. Note that each continuing
	55	// octet only uses 6 bits as unique values, so only shift by
	56	// multiples of 6 to combine.
	57	while (from != from_end && to != to_end) {
	58
	59	// Error checking on the first octet
	60	if (invalid_leading_octet(*from)){
	61	from_next = from;
	62	to_next = to;
	63	return std::codecvt_base::error;
	64	}
65
66	// The first octet is adjusted by a value dependent upon
67	// the number of "continuing octets" encoding the character
68	const int cont_octet_count = get_cont_octet_count(*from);
69	const wchar_t octet1_modifier_table[] = {
70	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
71	};
72
73	// The unsigned char conversion is necessary in case char is
74	// signed (I learned this the hard way)
75	wchar_t ucs_result =
76	(unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
77
78	// Invariants :
79	// 1) At the start of the loop, 'i' continuing characters have been
80	// processed
81	// 2) *from points to the next continuing character to be processed.
82	int i = 0;
83	while(i != cont_octet_count && from != from_end) {
84
85	// Error checking on continuing characters
86	if (invalid_continuing_octet(*from)) {
87	from_next = from;
88	to_next = to;
89	return std::codecvt_base::error;
90	}
91
92	ucs_result *= (1 << 6);
93
94	// each continuing character has an extra (10xxxxxx)b attached to
95	// it that must be removed.
96	ucs_result += (unsigned char)(*from++) - 0x80;
97	++i;
98	}
99
100	// If the buffer ends with an incomplete unicode character...
101	if (from == from_end && i != cont_octet_count) {
102	// rewind "from" to before the current character translation
103	from_next = from - (i+1);
104	to_next = to;
105	return std::codecvt_base::partial;
106	}
107	*to++ = ucs_result;
108	}
109	from_next = from;
110	to_next = to;
111
112	// Were we done converting or did we run out of destination space?
113	if(from == from_end) return std::codecvt_base::ok;
114	else return std::codecvt_base::partial;
115	}
116
117	BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_out(
	118	std::mbstate_t& /*state*/,
119	const wchar_t * from,
120	const wchar_t * from_end,
121	const wchar_t * & from_next,
122	char * to,
123	char * to_end,
124	char * & to_next
125	) const
126	{
127	// RG - consider merging this table with the other one
128	const wchar_t octet1_modifier_table[] = {
129	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
130	};
131
132	wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
133	while (from != from_end && to != to_end) {
134
135	// Check for invalid UCS-4 character
136	if (*from > max_wchar) {
137	from_next = from;
138	to_next = to;
139	return std::codecvt_base::error;
140	}
141
142	int cont_octet_count = get_cont_octet_out_count(*from);
143
144	// RG - comment this formula better
145	int shift_exponent = (cont_octet_count) * 6;
146
147	// Process the first character
148	*to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
149	(unsigned char)(*from / (1 << shift_exponent)));
150
151	// Process the continuation characters
152	// Invariants: At the start of the loop:
153	// 1) 'i' continuing octets have been generated
154	// 2) '*to' points to the next location to place an octet
155	// 3) shift_exponent is 6 more than needed for the next octet
156	int i = 0;
157	while (i != cont_octet_count && to != to_end) {
158	shift_exponent -= 6;
	159	*to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
160	++i;
161	}
162	// If we filled up the out buffer before encoding the character
163	if(to == to_end && i != cont_octet_count) {
164	from_next = from;
165	to_next = to - (i+1);
166	return std::codecvt_base::partial;
167	}
168	++from;
169	}
170	from_next = from;
171	to_next = to;
172	// Were we done or did we run out of destination space
173	if(from == from_end) return std::codecvt_base::ok;
174	else return std::codecvt_base::partial;
175	}
176
177	// How many char objects can I process to get <= max_limit
178	// wchar_t objects?
179	BOOST_UTF8_DECL int utf8_codecvt_facet::do_length(
180	const std::mbstate_t &,
181	const char * from,
182	const char * from_end,
183	std::size_t max_limit
184	) const
185	#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
186	throw()
187	#endif
188	{
189	// RG - this code is confusing! I need a better way to express it.
190	// and test cases.
191
192	// Invariants:
193	// 1) last_octet_count has the size of the last measured character
194	// 2) char_count holds the number of characters shown to fit
195	// within the bounds so far (no greater than max_limit)
196	// 3) from_next points to the octet 'last_octet_count' before the
197	// last measured character.
198	int last_octet_count=0;
199	std::size_t char_count = 0;
200	const char* from_next = from;
201	// Use "<" because the buffer may represent incomplete characters
202	while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
203	from_next += last_octet_count;
204	last_octet_count = (get_octet_count(*from_next));
205	++char_count;
206	}
207	return static_cast<int>(from_next-from);
208	}
209
210	BOOST_UTF8_DECL unsigned int utf8_codecvt_facet::get_octet_count(
211	unsigned char lead_octet
212	){
213	// if the 0-bit (MSB) is 0, then 1 character
214	if (lead_octet <= 0x7f) return 1;
215
216	// Otherwise the count number of consecutive 1 bits starting at MSB
217	// assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
218
219	if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
220	else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
221	else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
222	else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
223	else return 6;
224	}
225
226	namespace detail {
227
228	template<std::size_t s>
229	int get_cont_octet_out_count_impl(wchar_t word){
230	if (word < 0x80) {
231	return 0;
232	}
233	if (word < 0x800) {
234	return 1;
235	}
236	return 2;
237	}
238
239	template<>
240	int get_cont_octet_out_count_impl<4>(wchar_t word){
241	if (word < 0x80) {
242	return 0;
243	}
244	if (word < 0x800) {
245	return 1;
246	}
247
248	// Note that the following code will generate warnings on some platforms
249	// where wchar_t is defined as UCS2. The warnings are superfluous as the
250	// specialization is never instantitiated with such compilers, but this
251	// can cause problems if warnings are being treated as errors, so we guard
252	// against that. Including <boost/detail/utf8_codecvt_facet.hpp> as we do
253	// should be enough to get WCHAR_MAX defined.
254	#if !defined(WCHAR_MAX)
255	# error WCHAR_MAX not defined!
256	#endif
257	// cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
258	#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
259	return 2;
260	#elif WCHAR_MAX > 0x10000
261
262	if (word < 0x10000) {
263	return 2;
264	}
265	if (word < 0x200000) {
266	return 3;
267	}
268	if (word < 0x4000000) {
269	return 4;
270	}
271	return 5;
272
273	#else
274	return 2;
275	#endif
276	}
277
278	} // namespace detail
279
280	// How many "continuing octets" will be needed for this word
281	// == total octets - 1.
282	BOOST_UTF8_DECL int utf8_codecvt_facet::get_cont_octet_out_count(
283	wchar_t word
284	) const {
285	return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
286	}
287	BOOST_UTF8_END_NAMESPACE
288
289	#endif