ceph/src/boost/libs/iostreams/test/detail/utf8_codecvt_facet.cpp

   1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
   2 // utf8_codecvt_facet.cpp
   3
   4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
   5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
   6 // Distributed under the Boost Software License, Version 1.0. (See accompany-
   7 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   8
   9 // See http://www.boost.org/libs/iostreams for documentation.
  10
   11 //#include <cstdlib> // for multi-byte conversion routines
  12
  13 // Jonathan Turkanis:
  14 //   - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
  15 //     BOOST_IOSTREAMS_NO_WIDE_STREAMS;
  16 //   - Derived from codecvt_helper instead of codecvt.
  17
  18 #include <boost/config.hpp>
  19 #include <boost/iostreams/detail/config/wide_streams.hpp>
  20 #ifdef BOOST_IOSTREAMS_NO_LOCALES
  21 # error "C++ locales not supported on this platform"
  22 #else
  23
  24 #include <cassert>
  25 #include <cstddef>
  26
  27 #include <boost/detail/workaround.hpp>
  28 #include "./utf8_codecvt_facet.hpp"
  29
  30 #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
  31 # pragma warn -sig // Conversion may lose significant digits
  32 # pragma warn -rng // Constant is out of range in comparison
  33 #endif
  34
  35 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  36 // implementation for wchar_t
  37
  38 // Translate incoming UTF-8 into UCS-4
  39 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
  40     std::mbstate_t&,
  41     const char * from,
  42     const char * from_end,
  43     const char * & from_next,
  44     wchar_t * to,
  45     wchar_t * to_end,
  46     wchar_t * & to_next
  47 ) const {
  48     // Basic algorithm:  The first octet determines how many
  49     // octets total make up the UCS-4 character.  The remaining
  50     // "continuing octets" all begin with "10". To convert, subtract
  51     // the amount that specifies the number of octets from the first
  52     // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
  53     // then mash the whole lot together.  Note that each continuing
  54     // octet only uses 6 bits as unique values, so only shift by
  55     // multiples of 6 to combine.
  56     while (from != from_end && to != to_end) {
  57
  58         // Error checking   on the first octet
  59         if (invalid_leading_octet(*from)){
  60             from_next = from;
  61             to_next = to;
  62             return std::codecvt_base::error;
  63         }
  64
  65         // The first octet is   adjusted by a value dependent upon
  66         // the number   of "continuing octets" encoding the character
  67         const   int cont_octet_count = get_cont_octet_count(*from);
  68         const   wchar_t octet1_modifier_table[] =   {
  69             0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  70         };
  71
  72         // The unsigned char conversion is necessary in case char is
  73         // signed   (I learned this the hard way)
  74         wchar_t ucs_result =
  75             (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
  76
  77         // Invariants   :
  78         //   1) At the start of the loop,   'i' continuing characters have been
  79         //    processed
  80         //   2) *from   points to the next continuing character to be processed.
  81         int i   = 0;
  82         while(i != cont_octet_count && from != from_end) {
  83
  84             // Error checking on continuing characters
  85             if (invalid_continuing_octet(*from)) {
  86                 from_next   = from;
  87                 to_next =   to;
  88                 return std::codecvt_base::error;
  89             }
  90
  91             ucs_result *= (1 << 6);
  92
  93             // each continuing character has an extra (10xxxxxx)b attached to
  94             // it that must be removed.
  95             ucs_result += (unsigned char)(*from++) - 0x80;
  96             ++i;
  97         }
  98
  99         // If   the buffer ends with an incomplete unicode character...
 100         if (from == from_end && i   != cont_octet_count) {
 101             // rewind "from" to before the current character translation
 102             from_next = from - (i+1);
 103             to_next = to;
 104             return std::codecvt_base::partial;
 105         }
 106         *to++   = ucs_result;
 107     }
 108     from_next = from;
 109     to_next = to;
 110
 111     // Were we done converting or did we run out of destination space?
 112     if(from == from_end) return std::codecvt_base::ok;
 113     else return std::codecvt_base::partial;
 114 }
 115
 116 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
 117     std::mbstate_t &,
 118     const wchar_t *   from,
 119     const wchar_t * from_end,
 120     const wchar_t * & from_next,
 121     char * to,
 122     char * to_end,
 123     char * & to_next
 124 ) const
 125 {
 126     // RG - consider merging this table with the other one
 127     const wchar_t octet1_modifier_table[] = {
 128         0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
 129     };
 130
 131     while (from != from_end && to != to_end) {
 132
 133 #define BOOST_NULL // Prevent macro expansion
 134         // Check for invalid UCS-4 character
 135         if (*from  > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
 136             from_next = from;
 137             to_next = to;
 138             return std::codecvt_base::error;
 139         }
 140 #undef BOOST_NULL
 141
 142         int cont_octet_count = get_cont_octet_out_count(*from);
 143
 144         // RG  - comment this formula better
 145         int shift_exponent = (cont_octet_count) *   6;
 146
 147         // Process the first character
 148         *to++ = octet1_modifier_table[cont_octet_count] +
 149             (unsigned char)(*from / (1 << shift_exponent));
 150
 151         // Process the continuation characters
 152         // Invariants: At   the start of the loop:
 153         //   1) 'i' continuing octets   have been generated
 154         //   2) '*to'   points to the next location to place an octet
 155         //   3) shift_exponent is   6 more than needed for the next octet
 156         int i   = 0;
 157         while   (i != cont_octet_count && to != to_end) {
 158             shift_exponent -= 6;
 159             *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
 160             ++i;
 161         }
 162         // If   we filled up the out buffer before encoding the character
 163         if(to   == to_end && i != cont_octet_count) {
 164             from_next = from;
 165             to_next = to - (i+1);
 166             return std::codecvt_base::partial;
 167         }
 168         *from++;
 169     }
 170     from_next = from;
 171     to_next = to;
 172     // Were we done or did we run out of destination space
 173     if(from == from_end) return std::codecvt_base::ok;
 174     else return std::codecvt_base::partial;
 175 }
 176
 177 // How many char objects can I process to get <= max_limit
 178 // wchar_t objects?
 179 int utf8_codecvt_facet_wchar_t::do_length(
 180     BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
 181     const char * from,
 182     const char * from_end,
 183     std::size_t max_limit
 184 ) const throw()
 185 {
 186     // RG - this code is confusing!  I need a better way to express it.
 187     // and test cases.
 188
 189     // Invariants:
 190     // 1) last_octet_count has the size of the last measured character
 191     // 2) char_count holds the number of characters shown to fit
 192     // within the bounds so far (no greater than max_limit)
 193     // 3) from_next points to the octet 'last_octet_count' before the
 194     // last measured character.
 195     int last_octet_count=0;
 196     std::size_t char_count = 0;
 197     const char* from_next = from;
 198     // Use "<" because the buffer may represent incomplete characters
 199     while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
 200         from_next += last_octet_count;
 201         last_octet_count = (get_octet_count(*from_next));
 202         ++char_count;
 203     }
 204     return from_next-from_end;
 205 }
 206
 207 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
 208     unsigned char   lead_octet
 209 ){
 210     // if the 0-bit (MSB) is 0, then 1 character
 211     if (lead_octet <= 0x7f) return 1;
 212
 213     // Otherwise the count number of consecutive 1 bits starting at MSB
 214     assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
 215
 216     if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
 217     else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
 218     else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
 219     else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
 220     else return 6;
 221 }
 222
 223 namespace {
 224 template<std::size_t s>
 225 int get_cont_octet_out_count_impl(wchar_t word){
 226     if (word < 0x80) {
 227         return 0;
 228     }
 229     if (word < 0x800) {
 230         return 1;
 231     }
 232     return 2;
 233 }
 234
 235 // note the following code will generate on some platforms where
 236 // wchar_t is defined as UCS2.  The warnings are superfluous as
 237 // the specialization is never instantitiated with such compilers.
 238 template<>
 239 int get_cont_octet_out_count_impl<4>(wchar_t word)
 240 {
 241     if (word < 0x80) {
 242         return 0;
 243     }
 244     if (word < 0x800) {
 245         return 1;
 246     }
 247     if (word < 0x10000) {
 248         return 2;
 249     }
 250     if (word < 0x200000) {
 251         return 3;
 252     }
 253     if (word < 0x4000000) {
 254         return 4;
 255     }
 256     return 5;
 257 }
 258
 259 } // namespace anonymous
 260
 261 // How many "continuing octets" will be needed for this word
 262 // ==   total octets - 1.
 263 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
 264     wchar_t word
 265 ) const {
 266     return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
 267 }
 268
 269 #if 0 // not used?
 270 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
 271 // implementation for char
 272
  // Converts input to multibyte chars one character at a time: each pass
  // asks base_class::do_in for a single wchar_t, then writes it to to_next
  // with std::wctomb.  Returns the first non-ok result reported by
  // base_class::do_in, or ok once from_next reaches from_end.
  //
  // This definition sits inside an "#if 0" region and is not compiled.
  // NOTE(review): the points below look wrong but cannot be confirmed
  // from this file alone:
  //   - from_next and to_next are read (loop condition, wctomb target)
  //     without being set from 'from' / 'to' first, so they depend on
  //     whatever values the caller passed in;
  //   - every pass hands 'from' to base_class::do_in as the start of the
  //     input, and 'from' is never advanced here;
  //   - to_end is never consulted before std::wctomb writes to to_next;
  //   - if base_class is utf8_codecvt_facet_wchar_t, a one-slot
  //     destination makes its do_in report partial whenever input
  //     remains, which this loop returns immediately -- TODO confirm.
  273 std::codecvt_base::result utf8_codecvt_facet_char::do_in(
  274     std::mbstate_t & state,
  275     const char * from,
  276     const char * from_end,
  277     const char * & from_next,
  278     char * to,
  279     char * to_end,
  280     char * & to_next
  281 ) const
  282 {
  283     while(from_next < from_end){
              // Single-slot destination for the wide character.
  284         wchar_t w;
  285         wchar_t *wnext = & w;
  286         utf8_codecvt_facet_wchar_t::result ucs4_result;
  287         ucs4_result = base_class::do_in(
  288             state,
  289             from, from_end, from_next,
  290             wnext, wnext + 1, wnext
  291         );
  292         if(codecvt_base::ok != ucs4_result)
  293             return ucs4_result;
  294         // if the conversion succeeds.
              // std::wctomb returns -1 for a character it cannot represent;
              // that case is only asserted on, not reported to the caller.
  295         int length = std::wctomb(to_next, w);
  296         assert(-1 != length);
  297         to_next += length;
  298     }
  299     return codecvt_base::ok;
  300 }
 301
  // Converts multibyte input to output one character at a time: each pass
  // decodes one wchar_t from from_next with std::mbtowc, then hands it to
  // base_class::do_out, which writes at to_next.  Returns the first non-ok
  // result reported by base_class::do_out, or ok once from_next reaches
  // from_end.
  //
  // This definition sits inside an "#if 0" region and is not compiled.
  // NOTE(review): the points below look wrong but cannot be confirmed
  // from this file alone:
  //   - from_next and to_next are used without being set from 'from' /
  //     'to' first, so they depend on the values the caller passed in;
  //   - MB_LENGTH_MAX is not defined anywhere in this file; the standard
  //     macro is MB_LEN_MAX from <climits> -- TODO confirm what was meant;
  //   - std::mbtowc is told it may read MB_LENGTH_MAX bytes regardless of
  //     how many remain before from_end;
  //   - std::mbtowc returns 0 for a null character, in which case
  //     from_next does not advance.
  302 std::codecvt_base::result utf8_codecvt_facet_char::do_out(
  303     mbstate_t & state,
  304     const char * from,
  305     const char * from_end,
  306     const char * & from_next,
  307     char * to,
  308     char * to_end,
  309     char * & to_next
  310 ) const
  311 {
  312     while(from_next < from_end){
  313         wchar_t w;
              // A -1 result (invalid multibyte sequence) is only asserted on.
  314         int result = std::mbtowc(&w, from_next,  MB_LENGTH_MAX);
  315         assert(-1 != result);
  316         from_next += result;
  317         utf8_codecvt_facet_wchar_t::result ucs4_result;
  318
              // Present the single wide character as a one-element range.
  319         const wchar_t *wptr = & w;
  320         ucs4_result = base_class::do_out(
  321             state,
  322             wptr, wptr+1, wptr,
  323             to_next, to_end, to_next
  324         );
  325         if(codecvt_base::ok != ucs4_result)
  326             return ucs4_result;
  327     }
  328     return codecvt_base::ok;
  329 }
 330
  331 // How many input bytes can be consumed while producing no more than
  332 // max_limit char objects?
  //
  // Each pass converts one wide character with base_class::do_in on a copy
  // of the initial state, measures its multibyte length with wctomb, and
  // stops as soon as a conversion is not ok or the character no longer
  // fits in what is left of max_limit.  Returns the distance from the
  // original from_next to the end of the last character that fitted.
  //
  // This definition sits inside an "#if 0" region and is not compiled.
  // NOTE(review): the points below look wrong but cannot be confirmed
  // from this file alone:
  //   - MB_LENGTH_MAX is not defined anywhere in this file; the standard
  //     macro is MB_LEN_MAX from <climits> -- TODO confirm what was meant;
  //   - if base_class is utf8_codecvt_facet_wchar_t, a one-slot
  //     destination makes its do_in report partial whenever input
  //     remains, so the loop would break on the first character unless it
  //     is also the last -- TODO confirm.
  333 int utf8_codecvt_facet_char::do_length(
  334     // it seems that the standard doesn't use const so these libraries
  335     // would be in error
  336     BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
  337     utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
  338     const char * from_next,
  339     const char * from_end,
  340     std::size_t max_limit
  341 ) const
  342 {
  343     int total_length = 0;
          // Keep the starting position; from_next is advanced by do_in below.
  344     const char *from = from_next;
          // Work on a copy so the caller's state is left untouched.
  345     mbstate_t state = initial_state;
  346     while(from_next < from_end){
  347         wchar_t w;
  348         wchar_t *wnext = & w;
  349         utf8_codecvt_facet_wchar_t::result ucs4_result;
  350         ucs4_result = base_class::do_in(
  351             state,
  352             from_next, from_end, from_next,
  353             wnext, wnext + 1, wnext
  354         );
  355
  356         if(codecvt_base::ok != ucs4_result)
  357             break;
  358
              // wctomb's int result is stored in a std::size_t, so a -1
              // failure becomes a huge count and takes the break below.
  359         char carray[MB_LENGTH_MAX];
  360         std::size_t count = wctomb(carray, w);
  361         if(count > max_limit)
  362             break;
  363
              // The character fits: charge it against the budget and record
              // how much input has been covered so far.
  364         max_limit -= count;
  365         total_length = from_next - from;
  366     }
  367     return total_length;
  368 }
 369 #endif
 370
  371 #endif // BOOST_IOSTREAMS_NO_LOCALES