ceph/src/boost/boost/locale/util.hpp

   1 //
   2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
   3 //
   4 //  Distributed under the Boost Software License, Version 1.0. (See
   5 //  accompanying file LICENSE_1_0.txt or copy at
   6 //  http://www.boost.org/LICENSE_1_0.txt)
   7 //
   8 #ifndef BOOST_LOCALE_UTIL_HPP
   9 #define BOOST_LOCALE_UTIL_HPP
  10 #include <locale>
  11 #include <typeinfo>
  12 #include <boost/cstdint.hpp>
  13 #include <boost/locale/utf.hpp>
  14 #include <boost/locale/generator.hpp>
  15 #include <boost/assert.hpp>
  16
  17 #include <vector>
  18 namespace boost {
  19 namespace locale {
  20 ///
  21 /// \brief This namespace provides various utility functions useful for Boost.Locale backend
  22 /// implementations
  23 ///
  24 namespace util {
  25
  26     ///
  27     /// \brief Return the default system locale name in POSIX format.
  28     ///
  29     /// This function tries to detect the locale using the LC_CTYPE, LC_ALL and LANG environment
  30     /// variables, in this order; if all of them are unset, on POSIX platforms it returns "C".
  31     ///
  32     /// On Windows, in addition to checking the above environment variables, this function
  33     /// tries to create a locale name from the ISO-639 language and ISO-3166 country codes defined
  34     /// for the user default locale.
  35     /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8; otherwise, if the system
  36     /// locale supports an ANSI code-page, it uses that ANSI encoding (like windows-1252), and it falls
  37     /// back to the UTF-8 encoding if an ANSI code-page is not available.
  38     ///
  39     BOOST_LOCALE_DECL
  40     std::string get_system_locale(bool use_utf8_on_windows = false);
  41
  42     ///
  43     /// \brief Installs an information facet into locale \a in based on the locale name \a name
  44     ///
  45     /// This function installs a boost::locale::info facet into the locale \a in and returns
  46     /// the newly created locale.
  47     ///
  48     /// Note: all information is based only on parsing of the string \a name.
  49     ///
  50     /// The name has the following format: language[_COUNTRY][.encoding][\@variant]
  51     /// where language is an ISO-639 language code like "en" or "ru", COUNTRY is an ISO-3166
  52     /// country identifier like "US" or "RU", encoding is a character set name
  53     /// like UTF-8 or ISO-8859-1, and variant is a backend specific variant like \c euro or
  54     /// calendar=hebrew.
  55     ///
  56     /// If some parameters are missing they are specified as blanks; the default encoding
  57     /// is assumed to be US-ASCII and a missing language is assumed to be "C".
  58     ///
  59     BOOST_LOCALE_DECL
  60     std::locale create_info(std::locale const &in,std::string const &name);
  61
  62
  63     ///
  64     /// \brief A simple stateless converter between a narrow character encoding and UCS-4
  65     ///  that handles one code point per call
  66     ///
  67     /// Objects of this class are used to build std::codecvt facets that convert utf-16/utf-32
  68     /// text to and from the encoding supported by the converter.
  69     ///
  70     /// Note that a converter must be fully stateless, i.e. it may never assume that it is
  71     /// called on the text in any particular order. Even when the encoding itself looks
  72     /// stateless (windows-1255, shift-jis), some encoders - most notably iconv - may compose
  73     /// several code-points into one, or decompose a composite character they come across.
  74     /// Be very careful when implementing such a converter for a given character set.
  75     ///
  76     /// As implemented here, the base class itself accepts only the 7-bit range 0x00-0x7F.
  77     class base_converter {
  78     public:
  79
  80         ///
  81         /// Returned when an illegal input sequence or code-point is encountered, for
  82         /// example a UCS-32 code-point in the range reserved for UTF-16 surrogates, or
  83         /// an invalid UTF-8 sequence.
  84         ///
  85         static const uint32_t illegal = utf::illegal;
  86
  87         ///
  88         /// Returned when the input sequence is incomplete, or when the output buffer is
  89         /// too small for the complete output to be written.
  90         ///
  91         static const uint32_t incomplete = utf::incomplete;
  92
  93         virtual ~base_converter() {}
  94
  95         ///
  96         /// The maximal number of bytes a single Unicode code-point can be converted to:
  97         /// e.g. 4 for UTF-8, 2 for Shift-JIS and 1 for ISO-8859-1.
  98         /// This base implementation reports 1.
  99         ///
 100         virtual int max_len() const
 101         {
 102             return 1;
 103         }
 104         ///
 105         /// Tells whether calling from_unicode, to_unicode and max_len is thread safe.
 106         ///
 107         /// Rule of thumb: when the implementation relies only on tables that never change,
 108         /// or is purely algorithmic like UTF-8 - so that independent to_unicode/from_unicode
 109         /// calls share no mutable bit - it may return true. Otherwise, for example when an
 110         /// iconv_t descriptor or a UConverter serves as the conversion object, return false
 111         /// and the object will be cloned for each use. This base implementation returns false.
 112         ///
 113         virtual bool is_thread_safe() const
 114         {
 115             return false;
 116         }
 117         ///
 118         /// Make a polymorphic copy of this object; usually called only when is_thread_safe() returns false.
 119         /// The assertion checks that the dynamic type is exactly base_converter, i.e. that clone() was overridden.
 120         virtual base_converter *clone() const
 121         {
 122             BOOST_ASSERT(typeid(base_converter)==typeid(*this));
 123             return new base_converter();
 124         }
 125
 126         ///
 127         /// Convert one character, starting at \a begin and ending no later than \a end, to a Unicode code-point.
 128         ///
 129         /// If a valid input sequence [\a begin,\a code_point_end) is found, with \a begin < \a code_point_end
 130         /// and \a code_point_end <= \a end, its Unicode code point is returned and \a begin is set to \a code_point_end.
 131         ///
 132         /// If the sequence in [\a begin,\a end) is incomplete, i.e. there may be a \a code_point_end > \a end for which
 133         /// [\a begin,\a code_point_end) would be a valid input sequence, \a incomplete is returned and \a begin stays
 134         /// unchanged. For UTF-8 this is the case for *begin = 0xc2 with \a begin + 1 = \a end.
 135         ///
 136         /// If the input is invalid, i.e. some sequence [\a begin,\a code_point_end) with \a code_point_end <= \a end
 137         /// is illegal for this encoding, \a illegal is returned and \a begin stays unchanged. For UTF-8 this is the
 138         /// case for *begin = 0xFF with \a begin < \a end.
 139         ///
 140         /// This base implementation decodes a single byte and accepts only values up to 0x7F.
 141         virtual uint32_t to_unicode(char const *&begin,char const *end)
 142         {
 143             if(begin == end)
 144                 return incomplete;
 145             unsigned char const lead = static_cast<unsigned char>(*begin);
 146             if(lead > 0x7F)
 147                 return illegal;
 148             // Plain 7-bit value: it is its own code-point, consume the byte
 149             ++begin;
 150             return lead;
 151         }
 152         ///
 153         /// Convert the single code-point \a u to this encoding, storing the result in the range [begin,end).
 154         ///
 155         /// \a illegal should be returned when u is not a valid Unicode code-point, or when it can not
 156         /// be mapped correctly to the represented character set.
 157         ///
 158         /// When u converts to a sequence of bytes c1, ... , cN (1<= N <= max_len() ):
 159         ///
 160         /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned
 161         /// -# If end - begin < N, incomplete is returned, and it is unspecified what is
 162         ///    stored in the bytes of the range [begin,end)
 163         ///
 164         virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
 165         {
 166             if(end == begin)
 167                 return incomplete;
 168             if(u > 0x7F)
 169                 return illegal;
 170             begin[0] = static_cast<char>(u);
 171             return 1;
 172         }
 173     };
 174
 175     ///
 176     /// \brief Create a \a base_converter that can be used for conversion between UTF-8 and
 177     /// Unicode code points. The returned std::auto_ptr owns the converter.
 178     ///
 179     BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_utf8_converter();
 180     ///
 181     /// \brief Create a \a base_converter that can be used for conversion between a single byte
 182     /// character encoding like ISO-8859-1, koi8-r or windows-1255 and Unicode code points.
 183     ///
 184     /// If \a encoding is not supported, an empty pointer is returned, so callers should check
 185     /// that std::auto_ptr<base_converter>::get() != 0 before using the result.
 186     ///
 187     BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding);
 188
 189
 190     ///
 191     /// \brief Install a codecvt facet into locale \a in and return a new locale that is based on \a in
 192     /// and uses the new facet.
 193     ///
 194     /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the \a cvt converter.
 195     /// If \a cvt is a null pointer, an always-failing conversion is used that fails on the very first input or output.
 196     ///
 197     /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings; it knows how to split and join
 198     /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
 199     /// of the wide encoding type.
 200     ///
 201     BOOST_LOCALE_DECL
 202     std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type);
 203
 204     ///
 205     /// \brief Install a UTF-8 codecvt facet (UTF-8 to/from UTF-16 or UTF-32) into locale \a in and
 206     /// return a new locale that is based on \a in and uses the new facet.
 207     ///
 208     BOOST_LOCALE_DECL
 209     std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type);
 210
 211     ///
 212     /// \brief Install a codecvt facet that can be used for conversion between a single byte
 213     /// character encoding like ISO-8859-1, koi8-r or windows-1255 and Unicode code points.
 214     ///
 215     /// Throws boost::locale::conv::invalid_charset_error if the character set is not supported or is not
 216     /// a single byte character set.
 217     BOOST_LOCALE_DECL
 218     std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type);
 219 } // util
 220 } // locale
 221 } // boost
 222
 223 #endif
 224 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4