ceph/src/boost/boost/detail/utf8_codecvt_facet.ipp

   1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
   2 // utf8_codecvt_facet.ipp
   3
   4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
   5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
   6 // Use, modification and distribution is subject to the Boost Software
   7 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
   8 // http://www.boost.org/LICENSE_1_0.txt)
   9
  10 // Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
  11 // learn how this file should be used.
  12
  13 #include <boost/detail/utf8_codecvt_facet.hpp>
  14
  15 #include <cstdlib> // for multi-byte converson routines
  16 #include <cassert>
  17
  18 #include <boost/limits.hpp>
  19 #include <boost/config.hpp>
  20
// If we don't have wstring, then Unicode support
// is not available anyway, so we don't even need to
// compile this file. This also fixes the problem
// with mingw, which can compile this file, but will
// generate a link error when building a DLL.
  26 #ifndef BOOST_NO_STD_WSTRING
  27
  28 BOOST_UTF8_BEGIN_NAMESPACE
  29
  30 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  31 // implementation for wchar_t
  32
  33 utf8_codecvt_facet::utf8_codecvt_facet(
  34     std::size_t no_locale_manage
  35 ) :
  36     std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
  37 {}
  38
  39 // Translate incoming UTF-8 into UCS-4
  40 std::codecvt_base::result utf8_codecvt_facet::do_in(
  41     std::mbstate_t& /*state*/,
  42     const char * from,
  43     const char * from_end,
  44     const char * & from_next,
  45     wchar_t * to,
  46     wchar_t * to_end,
  47     wchar_t * & to_next
  48 ) const {
  49     // Basic algorithm:  The first octet determines how many
  50     // octets total make up the UCS-4 character.  The remaining
  51     // "continuing octets" all begin with "10". To convert, subtract
  52     // the amount that specifies the number of octets from the first
  53     // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
  54     // then mash the whole lot together.  Note that each continuing
  55     // octet only uses 6 bits as unique values, so only shift by
  56     // multiples of 6 to combine.
  57     while (from != from_end && to != to_end) {
  58
  59         // Error checking   on the first octet
  60         if (invalid_leading_octet(*from)){
  61             from_next = from;
  62             to_next = to;
  63             return std::codecvt_base::error;
  64         }
  65
  66         // The first octet is   adjusted by a value dependent upon
  67         // the number   of "continuing octets" encoding the character
  68         const   int cont_octet_count = get_cont_octet_count(*from);
  69         const   wchar_t octet1_modifier_table[] =   {
  70             0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  71         };
  72
  73         // The unsigned char conversion is necessary in case char is
  74         // signed   (I learned this the hard way)
  75         wchar_t ucs_result =
  76             (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
  77
  78         // Invariants   :
  79         //   1) At the start of the loop,   'i' continuing characters have been
  80         //    processed
  81         //   2) *from   points to the next continuing character to be processed.
  82         int i   = 0;
  83         while(i != cont_octet_count && from != from_end) {
  84
  85             // Error checking on continuing characters
  86             if (invalid_continuing_octet(*from)) {
  87                 from_next   = from;
  88                 to_next =   to;
  89                 return std::codecvt_base::error;
  90             }
  91
  92             ucs_result *= (1 << 6);
  93
  94             // each continuing character has an extra (10xxxxxx)b attached to
  95             // it that must be removed.
  96             ucs_result += (unsigned char)(*from++) - 0x80;
  97             ++i;
  98         }
  99
 100         // If   the buffer ends with an incomplete unicode character...
 101         if (from == from_end && i   != cont_octet_count) {
 102             // rewind "from" to before the current character translation
 103             from_next = from - (i+1);
 104             to_next = to;
 105             return std::codecvt_base::partial;
 106         }
 107         *to++   = ucs_result;
 108     }
 109     from_next = from;
 110     to_next = to;
 111
 112     // Were we done converting or did we run out of destination space?
 113     if(from == from_end) return std::codecvt_base::ok;
 114     else return std::codecvt_base::partial;
 115 }
 116
 117 std::codecvt_base::result utf8_codecvt_facet::do_out(
 118     std::mbstate_t& /*state*/,
 119     const wchar_t *   from,
 120     const wchar_t * from_end,
 121     const wchar_t * & from_next,
 122     char * to,
 123     char * to_end,
 124     char * & to_next
 125 ) const
 126 {
 127     // RG - consider merging this table with the other one
 128     const wchar_t octet1_modifier_table[] = {
 129         0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
 130     };
 131
 132     wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
 133     while (from != from_end && to != to_end) {
 134
 135         // Check for invalid UCS-4 character
 136         if (*from  > max_wchar) {
 137             from_next = from;
 138             to_next = to;
 139             return std::codecvt_base::error;
 140         }
 141
 142         int cont_octet_count = get_cont_octet_out_count(*from);
 143
 144         // RG  - comment this formula better
 145         int shift_exponent = (cont_octet_count) *   6;
 146
 147         // Process the first character
 148         *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
 149             (unsigned char)(*from / (1 << shift_exponent)));
 150
 151         // Process the continuation characters
 152         // Invariants: At   the start of the loop:
 153         //   1) 'i' continuing octets   have been generated
 154         //   2) '*to'   points to the next location to place an octet
 155         //   3) shift_exponent is   6 more than needed for the next octet
 156         int i   = 0;
 157         while   (i != cont_octet_count && to != to_end) {
 158             shift_exponent -= 6;
 159             *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
 160             ++i;
 161         }
 162         // If   we filled up the out buffer before encoding the character
 163         if(to   == to_end && i != cont_octet_count) {
 164             from_next = from;
 165             to_next = to - (i+1);
 166             return std::codecvt_base::partial;
 167         }
 168         ++from;
 169     }
 170     from_next = from;
 171     to_next = to;
 172     // Were we done or did we run out of destination space
 173     if(from == from_end) return std::codecvt_base::ok;
 174     else return std::codecvt_base::partial;
 175 }
 176
 177 // How many char objects can I process to get <= max_limit
 178 // wchar_t objects?
 179 int utf8_codecvt_facet::do_length(
 180     std::mbstate_t &,
 181     const char * from,
 182     const char * from_end,
 183     std::size_t max_limit
 184 ) const
 185 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
 186         throw()
 187 #endif
 188 {
 189     // RG - this code is confusing!  I need a better way to express it.
 190     // and test cases.
 191
 192     // Invariants:
 193     // 1) last_octet_count has the size of the last measured character
 194     // 2) char_count holds the number of characters shown to fit
 195     // within the bounds so far (no greater than max_limit)
 196     // 3) from_next points to the octet 'last_octet_count' before the
 197     // last measured character.
 198     int last_octet_count=0;
 199     std::size_t char_count = 0;
 200     const char* from_next = from;
 201     // Use "<" because the buffer may represent incomplete characters
 202     while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
 203         from_next += last_octet_count;
 204         last_octet_count = (get_octet_count(*from_next));
 205         ++char_count;
 206     }
 207     return static_cast<int>(from_next-from);
 208 }
 209
 210 unsigned int utf8_codecvt_facet::get_octet_count(
 211     unsigned char lead_octet
 212 ){
 213     // if the 0-bit (MSB) is 0, then 1 character
 214     if (lead_octet <= 0x7f) return 1;
 215
 216     // Otherwise the count number of consecutive 1 bits starting at MSB
 217 //    assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
 218
 219     if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
 220     else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
 221     else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
 222     else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
 223     else return 6;
 224 }
 225
 226 namespace detail {
 227
 228 template<std::size_t s>
 229 int get_cont_octet_out_count_impl(wchar_t word){
 230     if (word < 0x80) {
 231         return 0;
 232     }
 233     if (word < 0x800) {
 234         return 1;
 235     }
 236     return 2;
 237 }
 238
 239 template<>
 240 int get_cont_octet_out_count_impl<4>(wchar_t word){
 241     if (word < 0x80) {
 242         return 0;
 243     }
 244     if (word < 0x800) {
 245         return 1;
 246     }
 247
 248     // Note that the following code will generate warnings on some platforms
 249     // where wchar_t is defined as UCS2.  The warnings are superfluous as the
 250     // specialization is never instantitiated with such compilers, but this
 251     // can cause problems if warnings are being treated as errors, so we guard
 252     // against that.  Including <boost/detail/utf8_codecvt_facet.hpp> as we do
 253     // should be enough to get WCHAR_MAX defined.
 254 #if !defined(WCHAR_MAX)
 255 #   error WCHAR_MAX not defined!
 256 #endif
 257     // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
 258 #if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
 259     return 2;
 260 #elif WCHAR_MAX > 0x10000
 261
 262    if (word < 0x10000) {
 263         return 2;
 264     }
 265     if (word < 0x200000) {
 266         return 3;
 267     }
 268     if (word < 0x4000000) {
 269         return 4;
 270     }
 271     return 5;
 272
 273 #else
 274     return 2;
 275 #endif
 276 }
 277
 278 } // namespace detail
 279
 280 // How many "continuing octets" will be needed for this word
 281 // ==   total octets - 1.
 282 int utf8_codecvt_facet::get_cont_octet_out_count(
 283     wchar_t word
 284 ) const {
 285     return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
 286 }
 287 BOOST_UTF8_END_NAMESPACE
 288
 289 #endif