[ceph.git] / ceph / src / boost / boost / locale / util.hpp

//
//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef BOOST_LOCALE_UTIL_HPP
#define BOOST_LOCALE_UTIL_HPP
#include <locale>
#include <typeinfo>
#include <boost/cstdint.hpp>
#include <boost/locale/utf.hpp>
#include <boost/locale/generator.hpp>
#include <boost/assert.hpp>

#include <vector>
namespace boost {
namespace locale {
///
/// \brief This namespace provides various utility function useful for Boost.Locale backends
/// implementations
///
namespace util {
    
    ///
    /// \brief Return the default system locale name in POSIX format.
    ///
    /// This function tries to detect the locale using the LC_CTYPE, LC_ALL and LANG environment
    /// variables, in this order; if all of them are unset, on POSIX platforms it returns "C".
    ///
    /// On Windows, in addition to checking the above environment variables, this function
    /// tries to create the locale name from the ISO-639 language and ISO-3166 country codes
    /// defined for the user default locale.
    /// If \a use_utf8_on_windows is true the encoding is set to UTF-8; otherwise, if the system
    /// locale supports an ANSI code-page, the ANSI encoding (like windows-1252) is used, and the
    /// function falls back to UTF-8 only if no ANSI code-page is available.
    ///
    BOOST_LOCALE_DECL
    std::string get_system_locale(bool use_utf8_on_windows = false);

    ///
    /// \brief Installs an information facet into locale \a in based on the locale name \a name
    ///
    /// This function installs the boost::locale::info facet into the locale \a in and returns
    /// the newly created locale.
    ///
    /// Note: all information is based only on parsing of the string \a name.
    ///
    /// The name has the following format: language[_COUNTRY][.encoding][\@variant]
    /// where language is an ISO-639 language code like "en" or "ru", COUNTRY is an ISO-3166
    /// country identifier like "US" or "RU", encoding is a character set name
    /// like UTF-8 or ISO-8859-1, and variant is a backend specific variant like \c euro or
    /// calendar=hebrew.
    ///
    /// If some parameters are missing they are specified as blanks; the default encoding
    /// is assumed to be US-ASCII and a missing language is assumed to be "C".
    ///
    BOOST_LOCALE_DECL
    std::locale create_info(std::locale const &in,std::string const &name); 


    ///
    /// \brief Simple stateless converter that maps one code point at a time between a narrow
    ///  character encoding and UCS-4
    ///
    /// Objects of this class are used to build std::codecvt facets that convert between
    /// utf-16/utf-32 text and the encoding supported by the converter.
    ///
    /// A converter must be fully stateless, i.e. it must never assume that it is called in any
    /// particular order on the text. Be careful: even when the encoding itself looks stateless
    /// (windows-1255, shift-jis), some encoders (most notably iconv) may compose several
    /// code-points into one, or decompose composite characters, and therefore keep hidden state.
    ///
    /// The base class itself accepts only the 7-bit range 0..0x7F in both directions; anything
    /// else is reported as \a illegal.
    ///
    class base_converter {
    public:

        ///
        /// Value returned when an illegal input sequence or code-point is observed, for example
        /// a UCS-4 code-point that lies in the range reserved for UTF-16 surrogates, or an
        /// invalid UTF-8 sequence.
        ///
        static const uint32_t illegal=utf::illegal;

        ///
        /// Value returned when an incomplete input sequence was found, or when the output buffer
        /// is too small for the complete output to be written.
        ///
        static const uint32_t incomplete=utf::incomplete;

        virtual ~base_converter()
        {
        }

        ///
        /// Maximal number of bytes a single Unicode code-point can be converted to: for example
        /// 4 for UTF-8, 2 for Shift-JIS and 1 for ISO-8859-1. The base implementation returns 1.
        ///
        virtual int max_len() const
        {
            return 1;
        }

        ///
        /// Tells whether from_unicode, to_unicode and max_len may be called concurrently.
        ///
        /// Rule of thumb: an implementation that relies on unchanged lookup tables, or that is
        /// purely algorithmic like UTF-8 (no mutable data shared between independent
        /// to_unicode/from_unicode calls), may return true. An implementation that wraps a
        /// conversion object such as an iconv_t descriptor or a UConverter should return false,
        /// and the object will then be cloned for each use. The base implementation returns false.
        ///
        virtual bool is_thread_safe() const
        {
            return false;
        }

        ///
        /// Create a polymorphic copy of this object; usually called only when is_thread_safe()
        /// returns false.
        ///
        /// The base implementation can only copy a plain base_converter, which is what the
        /// assertion checks.
        ///
        virtual base_converter *clone() const
        {
            BOOST_ASSERT(typeid(*this)==typeid(base_converter));
            return new base_converter();
        }

        ///
        /// Convert the single character that starts at \a begin and ends no later than \a end
        /// to a Unicode code-point.
        ///
        /// - If a valid input sequence [\a begin,\a code_point_end) is found, with
        ///   \a begin < \a code_point_end && \a code_point_end <= \a end, its Unicode code point
        ///   is returned and \a begin is set to \a code_point_end.
        ///
        /// - If the input in [\a begin,\a end) is incomplete, i.e. there may be some
        ///   \a code_point_end > \a end for which [\a begin,\a code_point_end) would be a valid
        ///   sequence, \a incomplete is returned and \a begin stays unchanged. For UTF-8,
        ///   *begin = 0xc2 with \a begin + 1 = \a end is such a situation.
        ///
        /// - If the input is invalid, i.e. there is a sequence [\a begin,\a code_point_end) with
        ///   \a code_point_end <= \a end that is illegal for this encoding, \a illegal is returned
        ///   and \a begin stays unchanged. For UTF-8, *begin = 0xFF with begin < end is an example.
        ///
        /// The base implementation reports \a incomplete for an empty range, consumes one byte
        /// in 0..0x7F and returns it, and reports \a illegal for any other byte.
        ///
        virtual uint32_t to_unicode(char const *&begin,char const *end)
        {
            if(begin != end) {
                unsigned char const lead = static_cast<unsigned char>(*begin);
                if(lead < 0x80) {
                    ++begin;
                    return lead;
                }
                return illegal;
            }
            return incomplete;
        }

        ///
        /// Convert the single code-point \a u to the target encoding and store the result in
        /// the range [begin,end).
        ///
        /// If \a u is an invalid Unicode code-point, or cannot be represented in the target
        /// character set, \a illegal should be returned.
        ///
        /// If \a u converts to a byte sequence c1, ... , cN (1<= N <= max_len() ) then
        ///
        /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned
        /// -# If end - begin < N, incomplete is returned; it is unspecified what is stored
        ///    in the bytes of [begin,end)
        ///
        /// The base implementation checks for an empty output range first (returning
        /// \a incomplete), then writes \a u as one byte if it is below 0x80 and returns 1,
        /// and otherwise returns \a illegal.
        ///
        virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
        {
            if(begin != end) {
                if(u < 0x80) {
                    *begin = static_cast<char>(u);
                    return 1;
                }
                return illegal;
            }
            return incomplete;
        }
    };

    #if !defined(BOOST_LOCALE_HIDE_AUTO_PTR) && !defined(BOOST_NO_AUTO_PTR)
    ///
    /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
    /// Unicode code points. The converter is handed to the caller in a std::auto_ptr.
    ///
    /// Declared only when neither BOOST_LOCALE_HIDE_AUTO_PTR nor BOOST_NO_AUTO_PTR is defined.
    ///
    BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_utf8_converter();
    ///
    /// This function creates a \a base_converter that can be used for conversion between single byte
    /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
    ///
    /// If \a encoding is not supported, an empty pointer is returned. You should check that
    /// std::auto_ptr<base_converter>::get() != 0 before using the result.
    ///
    /// Declared only when neither BOOST_LOCALE_HIDE_AUTO_PTR nor BOOST_NO_AUTO_PTR is defined.
    ///
    BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding);


    ///
    /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in
    /// and uses the new facet.
    ///
    /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the
    /// \a cvt converter. If \a cvt is a null pointer, an always-failing conversion is used, one
    /// that fails on the very first input or output.
    ///
    /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings; it knows how to break
    /// and join Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should
    /// be unaware of the wide encoding type.
    ///
    /// \a type selects the character facet type for which the codecvt is installed.
    ///
    BOOST_LOCALE_DECL
    std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type);
    #endif

    #ifndef BOOST_NO_CXX11_SMART_PTR
    ///
    /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
    /// Unicode code points. The converter is handed to the caller in a std::unique_ptr.
    ///
    /// Declared only when BOOST_NO_CXX11_SMART_PTR is not defined.
    ///
    BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter_unique_ptr();
    ///
    /// This function creates a \a base_converter that can be used for conversion between single byte
    /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
    ///
    /// If \a encoding is not supported, an empty pointer is returned. You should check that
    /// std::unique_ptr<base_converter>::get() != 0 before using the result.
    ///
    /// Declared only when BOOST_NO_CXX11_SMART_PTR is not defined.
    ///
    BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter_unique_ptr(std::string const &encoding);

    ///
    /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in
    /// and uses the new facet.
    ///
    /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the
    /// \a cvt converter. If \a cvt is a null pointer, an always-failing conversion is used, one
    /// that fails on the very first input or output.
    ///
    /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings; it knows how to break
    /// and join Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should
    /// be unaware of the wide encoding type.
    ///
    /// \a type selects the character facet type for which the codecvt is installed.
    ///
    BOOST_LOCALE_DECL
    std::locale create_codecvt(std::locale const &in,std::unique_ptr<base_converter> cvt,character_facet_type type);
    #endif

    ///
    /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
    /// Unicode code points, and returns it as a raw pointer.
    ///
    /// NOTE(review): the `_new_ptr` suffix suggests the object is allocated with new and owned by
    /// the caller - confirm against the implementation.
    ///
    BOOST_LOCALE_DECL base_converter *create_utf8_converter_new_ptr();
    ///
    /// This function creates a \a base_converter that can be used for conversion between single byte
    /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
    ///
    /// If \a encoding is not supported, a null pointer is returned. You should check that the
    /// returned pointer is not 0 before using it.
    ///
    /// NOTE(review): the `_new_ptr` suffix suggests the object is allocated with new and owned by
    /// the caller - confirm against the implementation.
    ///
    BOOST_LOCALE_DECL base_converter *create_simple_converter_new_ptr(std::string const &encoding);

    ///
    /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in
    /// and uses the new facet.
    ///
    /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the
    /// \a cvt converter. If \a cvt is a null pointer, an always-failing conversion is used, one
    /// that fails on the very first input or output.
    ///
    /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings; it knows how to break
    /// and join Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should
    /// be unaware of the wide encoding type.
    ///
    /// Ownership of \a cvt is transferred to this function; the caller must not delete it.
    ///
    BOOST_LOCALE_DECL
    std::locale create_codecvt_from_pointer(std::locale const &in,base_converter *cvt,character_facet_type type);

    ///
    /// Install a UTF-8 codecvt (converting to UTF-16 or UTF-32) into locale \a in and return
    /// a new locale that is based on \a in and uses the new facet.
    ///
    /// \a type selects the character facet type for which the codecvt is installed.
    ///
    BOOST_LOCALE_DECL
    std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type);

    ///
    /// This function installs a codecvt that can be used for conversion between single byte
    /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
    ///
    /// Throws boost::locale::conv::invalid_charset_error if the character set is not supported
    /// or isn't a single byte character set.
    ///
    BOOST_LOCALE_DECL
    std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type);
} // util
} // locale 
} // boost

#endif
// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
Commit	Line	Data
7c673cae FG	1	//
	2	// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
	3	//
	4	// Distributed under the Boost Software License, Version 1.0. (See
	5	// accompanying file LICENSE_1_0.txt or copy at
	6	// http://www.boost.org/LICENSE_1_0.txt)
	7	//
	8	#ifndef BOOST_LOCALE_UTIL_HPP
	9	#define BOOST_LOCALE_UTIL_HPP
	10	#include <locale>
	11	#include <typeinfo>
	12	#include <boost/cstdint.hpp>
	13	#include <boost/locale/utf.hpp>
	14	#include <boost/locale/generator.hpp>
	15	#include <boost/assert.hpp>
	16
	17	#include <vector>
	18	namespace boost {
	19	namespace locale {
	20	///
	21	/// \brief This namespace provides various utility function useful for Boost.Locale backends
	22	/// implementations
	23	///
	24	namespace util {
	25
	26	///
	27	/// \brief Return default system locale name in POSIX format.
	28	///
	29	/// This function tries to detect the locale using, LC_CTYPE, LC_ALL and LANG environment
	30	/// variables in this order and if all of them unset, in POSIX platforms it returns "C"
	31	///
	32	/// On Windows additionally to check the above environment variables, this function
	33	/// tries to creates locale name from ISO-339 and ISO-3199 country codes defined
	34	/// for user default locale.
	35	/// If \a use_utf8_on_windows is true it sets the encoding to UTF-8, otherwise, if system
	36	/// locale supports ANSI code-page it defines the ANSI encoding like windows-1252, otherwise it fall-backs
	37	/// to UTF-8 encoding if ANSI code-page is not available.
	38	///
	39	BOOST_LOCALE_DECL
	40	std::string get_system_locale(bool use_utf8_on_windows = false);
	41
	42	///
	43	/// \brief Installs information facet to locale in based on locale name \a name
	44	///
	45	/// This function installs boost::locale::info facet into the locale \a in and returns
	46	/// newly created locale.
	47	///
	48	/// Note: all information is based only on parsing of string \a name;
	49	///
	50	/// The name has following format: language[_COUNTRY][.encoding][\@variant]
	51	/// Where language is ISO-639 language code like "en" or "ru", COUNTRY is ISO-3166
	52	/// country identifier like "US" or "RU". the Encoding is a charracter set name
	53	/// like UTF-8 or ISO-8859-1. Variant is backend specific variant like \c euro or
	54	/// calendar=hebrew.
	55	///
	56	/// If some parameters are missing they are specified as blanks, default encoding
	57	/// is assumed to be US-ASCII and missing language is assumed to be "C"
	58	///
	59	BOOST_LOCALE_DECL
	60	std::locale create_info(std::locale const &in,std::string const &name);
	61
	62
	63	///
	64	/// \brief This class represent a simple stateless converter from UCS-4 and to UCS-4 for
65	/// each single code point
66	///
67	/// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding
68	/// to encoding supported by this converter
69	///
70	/// Please note, this converter should be fully stateless. Fully stateless means it should
71	/// never assume that it is called in any specific order on the text. Even if the
72	/// encoding itself seems to be stateless like windows-1255 or shift-jis, some
73	/// encoders (most notably iconv) can actually compose several code-point into one or
74	/// decompose them in case composite characters are found. So be very careful when implementing
75	/// these converters for certain character set.
76	///
77	class base_converter {
78	public:
79
80	///
81	/// This value should be returned when an illegal input sequence or code-point is observed:
82	/// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates
83	/// or an invalid UTF-8 sequence is found
84	///
85	static const uint32_t illegal=utf::illegal;
86
87	///
88	/// This value is returned in following cases: The of incomplete input sequence was found or
89	/// insufficient output buffer was provided so complete output could not be written.
90	///
91	static const uint32_t incomplete=utf::incomplete;
92
93	virtual ~base_converter()
94	{
95	}
96	///
97	/// Return the maximal length that one Unicode code-point can be converted to, for example
98	/// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1
99	///
100	virtual int max_len() const
101	{
102	return 1;
103	}
104	///
105	/// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe.
106	///
107	/// Rule of thumb: if this class' implementation uses simple tables that are unchanged
108	/// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for
109	/// independent to_unicode, from_unicode calls, you may set it to true, otherwise,
110	/// for example if you use iconv_t descriptor or UConverter as conversion object return false,
111	/// and this object will be cloned for each use.
112	///
113	virtual bool is_thread_safe() const
114	{
115	return false;
116	}
117	///
118	/// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false
119	///
120	virtual base_converter *clone() const
121	{
122	BOOST_ASSERT(typeid(*this)==typeid(base_converter));
123	return new base_converter();
124	}
125
126	///
127	/// Convert a single character starting at begin and ending at most at end to Unicode code-point.
128	///
129	/// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a code_point_end <= \a end
130	/// it is converted to its Unicode code point equivalent, \a begin is set to \a code_point_end
131	///
132	/// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a code_point_end > \a end
133	/// and [\a begin, \a code_point_end) would be valid input sequence, then \a incomplete is returned begin stays unchanged, for example
134	/// for UTF-8 conversion a *begin = 0xc2, \a begin +1 = \a end is such situation.
135	///
136	/// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a code_point_end <= \a end
137	/// that is illegal for this encoding, \a illegal is returned and begin stays unchanged. For example if *begin = 0xFF and begin < end
138	/// for UTF-8, then \a illegal is returned.
139	///
140	///
	141	virtual uint32_t to_unicode(char const *&begin,char const *end)
142	{
143	if(begin == end)
144	return incomplete;
145	unsigned char cp = *begin;
146	if(cp <= 0x7F) {
147	begin++;
148	return cp;
149	}
150	return illegal;
151	}
152	///
153	/// Convert a single code-point \a u into encoding and store it in [begin,end) range.
154	///
155	/// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set,
156	/// \a illegal should be returned
157	///
158	/// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then
159	///
160	/// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned
161	/// -# If end - begin < N, incomplete is returned, it is unspecified what would be
162	/// stored in bytes in range [begin,end)
163
	164	virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
165	{
166	if(begin==end)
167	return incomplete;
168	if(u >= 0x80)
169	return illegal;
170	*begin = static_cast<char>(u);
171	return 1;
172	}
173	};
174
11fdf7f2	175	#if !defined(BOOST_LOCALE_HIDE_AUTO_PTR) && !defined(BOOST_NO_AUTO_PTR)
7c673cae FG	176	///
	177	/// This function creates a \a base_converter that can be used for conversion between UTF-8 and
	178	/// unicode code points
	179	///
	180	BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_utf8_converter();
	181	///
	182	/// This function creates a \a base_converter that can be used for conversion between single byte
	183	/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
	184	///
	185	/// If \a encoding is not supported, empty pointer is returned. You should check if
	186	/// std::auto_ptr<base_converter>::get() != 0
	187	///
	188	BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding);
	189
	190
	191	///
	192	/// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new
	193	/// facet.
	194	///
	195	/// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter.
	196	/// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or output.
	197	///
	198	/// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join
	199	/// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
	200	/// of wide encoding type
	201	///
	202	BOOST_LOCALE_DECL
	203	std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type);
11fdf7f2 TL	204	#endif
	205
	206	#ifndef BOOST_NO_CXX11_SMART_PTR
	207	///
	208	/// This function creates a \a base_converter that can be used for conversion between UTF-8 and
	209	/// unicode code points
	210	///
	211	BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter_unique_ptr();
	212	///
	213	/// This function creates a \a base_converter that can be used for conversion between single byte
	214	/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
	215	///
	216	/// If \a encoding is not supported, empty pointer is returned. You should check if
	217	/// std::unique_ptr<base_converter>::get() != 0
	218	///
	219	BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter_unique_ptr(std::string const &encoding);
	220
	221	///
	222	/// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new
	223	/// facet.
	224	///
	225	/// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter.
	226	/// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or output.
	227	///
	228	/// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join
	229	/// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
	230	/// of wide encoding type
	231	///
	232	BOOST_LOCALE_DECL
	233	std::locale create_codecvt(std::locale const &in,std::unique_ptr<base_converter> cvt,character_facet_type type);
	234	#endif
	235
	236	///
	237	/// This function creates a \a base_converter that can be used for conversion between UTF-8 and
	238	/// unicode code points
	239	///
	240	BOOST_LOCALE_DECL base_converter *create_utf8_converter_new_ptr();
	241	///
	242	/// This function creates a \a base_converter that can be used for conversion between single byte
	243	/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
	244	///
	245	/// If \a encoding is not supported, empty pointer is returned. You should check if
	246	/// std::unique_ptr<base_converter>::get() != 0
	247	///
	248	BOOST_LOCALE_DECL base_converter *create_simple_converter_new_ptr(std::string const &encoding);
	249
	250	///
	251	/// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new
	252	/// facet.
	253	///
	254	/// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter.
	255	/// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or output.
	256	///
	257	/// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join
	258	/// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
	259	/// of wide encoding type
	260	///
	261	/// ownership of cvt is transfered
	262	///
	263	BOOST_LOCALE_DECL
	264	std::locale create_codecvt_from_pointer(std::locale const &in,base_converter *cvt,character_facet_type type);
7c673cae FG	265
	266	///
	267	/// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return
	268	/// new locale that is based on \a in and uses new facet.
	269	///
	270	BOOST_LOCALE_DECL
	271	std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type);
	272
	273	///
	274	/// This function installs codecvt that can be used for conversion between single byte
	275	/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
	276	///
	277	/// Throws boost::locale::conv::invalid_charset_error if the chacater set is not supported or isn't single byte character
	278	/// set
	279	BOOST_LOCALE_DECL
	280	std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type);
	281	} // util
	282	} // locale
	283	} // boost
	284
	285	#endif
	286	// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4