ceph/src/boost/libs/iostreams/test/detail/utf8_codecvt_facet.cpp

   1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
   2 // utf8_codecvt_facet.cpp
   3
   4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
   5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
   6 // Distributed under the Boost Software License, Version 1.0. (See accompany-
   7 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   8
   9 // See http://www.boost.org/libs/iostreams for documentation.
  10
//#include <cstdlib> // for multi-byte conversion routines
  12
  13 // Jonathan Turkanis:
  14 //   - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
  15 //     BOOST_IOSTREAMS_NO_WIDE_STREAMS;
  16 //   - Derived from codecvt_helper instead of codecvt.
  17
  18 #include <boost/config.hpp>
  19 #include <boost/iostreams/detail/config/wide_streams.hpp>
  20 #include <boost/numeric/conversion/cast.hpp>
  21 #ifdef BOOST_IOSTREAMS_NO_LOCALES
  22 # error "C++ locales not supported on this platform"
  23 #else
  24
  25 #include <cassert>
  26 #include <cstddef>
  27
  28 #include <boost/detail/workaround.hpp>
  29 #include "./utf8_codecvt_facet.hpp"
  30
  31 #if BOOST_WORKAROUND(BOOST_BORLANDC, <= 0x600)
  32 # pragma warn -sig // Conversion may lose significant digits
  33 # pragma warn -rng // Constant is out of range in comparison
  34 #endif
  35
  36 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  37 // implementation for wchar_t
  38
  39 // Translate incoming UTF-8 into UCS-4
  40 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
  41     std::mbstate_t&,
  42     const char * from,
  43     const char * from_end,
  44     const char * & from_next,
  45     wchar_t * to,
  46     wchar_t * to_end,
  47     wchar_t * & to_next
  48 ) const {
  49     // Basic algorithm:  The first octet determines how many
  50     // octets total make up the UCS-4 character.  The remaining
  51     // "continuing octets" all begin with "10". To convert, subtract
  52     // the amount that specifies the number of octets from the first
  53     // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
  54     // then mash the whole lot together.  Note that each continuing
  55     // octet only uses 6 bits as unique values, so only shift by
  56     // multiples of 6 to combine.
  57     while (from != from_end && to != to_end) {
  58
  59         // Error checking   on the first octet
  60         if (invalid_leading_octet(*from)){
  61             from_next = from;
  62             to_next = to;
  63             return std::codecvt_base::error;
  64         }
  65
  66         // The first octet is   adjusted by a value dependent upon
  67         // the number   of "continuing octets" encoding the character
  68         const   int cont_octet_count = get_cont_octet_count(*from);
  69         const   wchar_t octet1_modifier_table[] =   {
  70             0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  71         };
  72
  73         // The unsigned char conversion is necessary in case char is
  74         // signed   (I learned this the hard way)
  75         wchar_t ucs_result =
  76             (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
  77
  78         // Invariants   :
  79         //   1) At the start of the loop,   'i' continuing characters have been
  80         //    processed
  81         //   2) *from   points to the next continuing character to be processed.
  82         int i   = 0;
  83         while(i != cont_octet_count && from != from_end) {
  84
  85             // Error checking on continuing characters
  86             if (invalid_continuing_octet(*from)) {
  87                 from_next   = from;
  88                 to_next =   to;
  89                 return std::codecvt_base::error;
  90             }
  91
  92             ucs_result *= (1 << 6);
  93
  94             // each continuing character has an extra (10xxxxxx)b attached to
  95             // it that must be removed.
  96             ucs_result += (unsigned char)(*from++) - 0x80;
  97             ++i;
  98         }
  99
 100         // If   the buffer ends with an incomplete unicode character...
 101         if (from == from_end && i   != cont_octet_count) {
 102             // rewind "from" to before the current character translation
 103             from_next = from - (i+1);
 104             to_next = to;
 105             return std::codecvt_base::partial;
 106         }
 107         *to++   = ucs_result;
 108     }
 109     from_next = from;
 110     to_next = to;
 111
 112     // Were we done converting or did we run out of destination space?
 113     if(from == from_end) return std::codecvt_base::ok;
 114     else return std::codecvt_base::partial;
 115 }
 116
 117 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
 118     std::mbstate_t &,
 119     const wchar_t *   from,
 120     const wchar_t * from_end,
 121     const wchar_t * & from_next,
 122     char * to,
 123     char * to_end,
 124     char * & to_next
 125 ) const
 126 {
 127     // RG - consider merging this table with the other one
 128     const wchar_t octet1_modifier_table[] = {
 129         0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
 130     };
 131
 132     while (from != from_end && to != to_end) {
 133
 134 #define BOOST_NULL // Prevent macro expansion
 135         // Check for invalid UCS-4 character
 136         if (*from  > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
 137             from_next = from;
 138             to_next = to;
 139             return std::codecvt_base::error;
 140         }
 141 #undef BOOST_NULL
 142
 143         int cont_octet_count = get_cont_octet_out_count(*from);
 144
 145         // RG  - comment this formula better
 146         int shift_exponent = (cont_octet_count) *   6;
 147
 148         // Process the first character
 149         *to++ = octet1_modifier_table[cont_octet_count] +
 150             (unsigned char)(*from / (1 << shift_exponent));
 151
 152         // Process the continuation characters
 153         // Invariants: At   the start of the loop:
 154         //   1) 'i' continuing octets   have been generated
 155         //   2) '*to'   points to the next location to place an octet
 156         //   3) shift_exponent is   6 more than needed for the next octet
 157         int i   = 0;
 158         while   (i != cont_octet_count && to != to_end) {
 159             shift_exponent -= 6;
 160             *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
 161             ++i;
 162         }
 163         // If   we filled up the out buffer before encoding the character
 164         if(to   == to_end && i != cont_octet_count) {
 165             from_next = from;
 166             to_next = to - (i+1);
 167             return std::codecvt_base::partial;
 168         }
 169         ++from;
 170     }
 171     from_next = from;
 172     to_next = to;
 173     // Were we done or did we run out of destination space
 174     if(from == from_end) return std::codecvt_base::ok;
 175     else return std::codecvt_base::partial;
 176 }
 177
 178 // How many char objects can I process to get <= max_limit
 179 // wchar_t objects?
 180 int utf8_codecvt_facet_wchar_t::do_length(
 181     BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
 182     const char * from,
 183     const char * from_end,
 184     std::size_t max_limit
 185 ) const throw()
 186 {
 187     // RG - this code is confusing!  I need a better way to express it.
 188     // and test cases.
 189
 190     // Invariants:
 191     // 1) last_octet_count has the size of the last measured character
 192     // 2) char_count holds the number of characters shown to fit
 193     // within the bounds so far (no greater than max_limit)
 194     // 3) from_next points to the octet 'last_octet_count' before the
 195     // last measured character.
 196     int last_octet_count=0;
 197     std::size_t char_count = 0;
 198     const char* from_next = from;
 199     // Use "<" because the buffer may represent incomplete characters
 200     while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
 201         from_next += last_octet_count;
 202         last_octet_count = (get_octet_count(*from_next));
 203         ++char_count;
 204     }
 205     return boost::numeric_cast<int>(from_next - from_end);
 206 }
 207
 208 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
 209     unsigned char   lead_octet
 210 ){
 211     // if the 0-bit (MSB) is 0, then 1 character
 212     if (lead_octet <= 0x7f) return 1;
 213
 214     // Otherwise the count number of consecutive 1 bits starting at MSB
 215     assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
 216
 217     if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
 218     else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
 219     else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
 220     else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
 221     else return 6;
 222 }
 223
 224 namespace {
 225 template<std::size_t s>
 226 int get_cont_octet_out_count_impl(wchar_t word){
 227     if (word < 0x80) {
 228         return 0;
 229     }
 230     if (word < 0x800) {
 231         return 1;
 232     }
 233     return 2;
 234 }
 235
 236 // note the following code will generate on some platforms where
 237 // wchar_t is defined as UCS2.  The warnings are superfluous as
 238 // the specialization is never instantitiated with such compilers.
 239 template<>
 240 int get_cont_octet_out_count_impl<4>(wchar_t word)
 241 {
 242     if (word < 0x80) {
 243         return 0;
 244     }
 245     if (word < 0x800) {
 246         return 1;
 247     }
 248     if (word < 0x10000) {
 249         return 2;
 250     }
 251     if (word < 0x200000) {
 252         return 3;
 253     }
 254     if (word < 0x4000000) {
 255         return 4;
 256     }
 257     return 5;
 258 }
 259
 260 } // namespace anonymous
 261
 262 // How many "continuing octets" will be needed for this word
 263 // ==   total octets - 1.
 264 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
 265     wchar_t word
 266 ) const {
 267     return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
 268 }
 269
 270 #if 0 // not used?
 271 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
 272 // implementation for char
 273
// char-to-char variant of do_in.  This definition sits inside "#if 0" and
// is therefore not compiled.
//
// Each pass asks base_class::do_in to decode into a one-element wchar_t
// buffer, then re-encodes that wide character into to_next with
// std::wctomb.  Any base-class result other than ok is returned unchanged.
//
// NOTE(review): the loop reads from_next and writes through to_next without
// this function first setting them from 'from' / 'to'; 'to' and 'to_end'
// are never used, and base_class::do_in is handed the same 'from' on every
// pass.  Confirm the intended contract before re-enabling this code.
std::codecvt_base::result utf8_codecvt_facet_char::do_in(
    std::mbstate_t & state,
    const char * from,
    const char * from_end,
    const char * & from_next,
    char * to,
    char * to_end,
    char * & to_next
) const
{
    while(from_next < from_end){
        // Single wide character used as the intermediate representation.
        wchar_t w;
        wchar_t *wnext = & w;
        utf8_codecvt_facet_wchar_t::result ucs4_result;
        ucs4_result = base_class::do_in(
            state,
            from, from_end, from_next,
            wnext, wnext + 1, wnext
        );
        // Propagate partial / error results from the wide conversion.
        if(codecvt_base::ok != ucs4_result)
            return ucs4_result;
        // if the conversion succeeds.
        int length = std::wctomb(to_next, w);
        // A failed re-encoding is only checked by this assertion.
        assert(-1 != length);
        to_next += length;
    }
    return codecvt_base::ok;
}
 302
// char-to-char variant of do_out.  This definition sits inside "#if 0" and
// is therefore not compiled.
//
// Each pass turns the multibyte character at from_next into a wide
// character with std::mbtowc, then hands that single wide character to
// base_class::do_out, which writes at to_next.  Any base-class result other
// than ok is returned unchanged.
//
// NOTE(review): the loop reads from_next and to_next without this function
// first setting them from 'from' / 'to', and MB_LENGTH_MAX is not defined
// anywhere in the visible source (the standard macro is MB_LEN_MAX) --
// confirm before re-enabling this code.
std::codecvt_base::result utf8_codecvt_facet_char::do_out(
    mbstate_t & state,
    const char * from,
    const char * from_end,
    const char * & from_next,
    char * to,
    char * to_end,
    char * & to_next
) const
{
    while(from_next < from_end){
        wchar_t w;
        int result = std::mbtowc(&w, from_next,  MB_LENGTH_MAX);
        // A failed decoding is only checked by this assertion.
        assert(-1 != result);
        from_next += result;
        utf8_codecvt_facet_wchar_t::result ucs4_result;

        // Present 'w' to the wide-character encoder as a one-element range.
        const wchar_t *wptr = & w;
        ucs4_result = base_class::do_out(
            state,
            wptr, wptr+1, wptr,
            to_next, to_end, to_next
        );
        // Propagate partial / error results from the wide conversion.
        if(codecvt_base::ok != ucs4_result)
            return ucs4_result;
    }
    return codecvt_base::ok;
}
 331
// How many input bytes can be processed to produce <= max_limit
// char objects?
//
// This definition sits inside "#if 0" and is therefore not compiled.
//
// Works on a copy of initial_state.  Each pass decodes one wide character
// with base_class::do_in, measures its multibyte form with wctomb and, if
// that still fits in the remaining max_limit budget, records the number of
// input bytes consumed so far.  The loop stops at the first character that
// does not decode with an ok result or does not fit; the last recorded
// byte count is returned.
int utf8_codecvt_facet_char::do_length(
    // it seems that the standard doesn't use const so these libraries
    // would be in error
    BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
    utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
    const char * from_next,
    const char * from_end,
    std::size_t max_limit
) const
{
    int total_length = 0;
    // Remember the start so consumed input can be measured from it.
    const char *from = from_next;
    mbstate_t state = initial_state;
    while(from_next < from_end){
        wchar_t w;
        wchar_t *wnext = & w;
        utf8_codecvt_facet_wchar_t::result ucs4_result;
        ucs4_result = base_class::do_in(
            state,
            from_next, from_end, from_next,
            wnext, wnext + 1, wnext
        );

        if(codecvt_base::ok != ucs4_result)
            break;

        char carray[MB_LENGTH_MAX];
        // NOTE(review): wctomb's int result is stored in a std::size_t, so
        // a -1 failure value would become a very large count and take the
        // "does not fit" exit below -- confirm that is intended.
        std::size_t count = wctomb(carray, w);
        if(count > max_limit)
            break;

        // Charge this character against the remaining output budget.
        max_limit -= count;
        total_length = from_next - from;
    }
    return total_length;
}
 370 #endif
 371
#endif // BOOST_IOSTREAMS_NO_LOCALES