ceph/src/boost/boost/nowide/utf8_codecvt.hpp

   1 //
   2 //  Copyright (c) 2015 Artyom Beilis (Tonkikh)
   3 //  Copyright (c) 2020 Alexander Grund
   4 //
   5 //  Distributed under the Boost Software License, Version 1.0. (See
   6 //  accompanying file LICENSE or copy at
   7 //  http://www.boost.org/LICENSE_1_0.txt)
   8 //
   9 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
  10 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
  11
  12 #include <boost/nowide/replacement.hpp>
  13 #include <boost/nowide/utf/utf.hpp>
  14 #include <cstdint>
  15 #include <locale>
  16
  17 namespace boost {
  18 namespace nowide {
  19
  20     static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is to small to store an UTF-16 codepoint");
  21     namespace detail {
  22         // Avoid including cstring for std::memcpy
  23         inline void copy_uint16_t(void* dst, const void* src)
  24         {
  25             unsigned char* cdst = static_cast<unsigned char*>(dst);
  26             const unsigned char* csrc = static_cast<const unsigned char*>(src);
  27             cdst[0] = csrc[0];
  28             cdst[1] = csrc[1];
  29         }
  30         inline std::uint16_t read_state(const std::mbstate_t& src)
  31         {
  32             std::uint16_t dst;
  33             copy_uint16_t(&dst, &src);
  34             return dst;
  35         }
  36         inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
  37         {
  38             copy_uint16_t(&dst, &src);
  39         }
  40     } // namespace detail
  41
  42 #if defined _MSC_VER && _MSC_VER < 1700
// MSVC's do_length is non-standard: it counts wide characters instead of narrow ones and does not change the mbstate
  44 #define BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
  45 #endif
  46
  47     /// std::codecvt implementation that converts between UTF-8 and UTF-16 or UTF-32
  48     ///
  49     /// @tparam CharSize Determines the encoding: 2 for UTF-16, 4 for UTF-32
  50     ///
  51     /// Invalid sequences are replaced by #BOOST_NOWIDE_REPLACEMENT_CHARACTER
    /// A trailing incomplete sequence will result in a return value of std::codecvt_base::partial
  53     template<typename CharType, int CharSize = sizeof(CharType)>
  54     class utf8_codecvt;
  55
  56     /// Specialization for the UTF-8 <-> UTF-16 variant of the std::codecvt implementation
  57     template<typename CharType>
  58     class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
  59     {
  60     public:
  61         static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
  62
  63         utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
  64         {}
  65
  66     protected:
  67         using uchar = CharType;
  68
  69         std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
  70         {
  71             if(detail::read_state(s) != 0)
  72                 return std::codecvt_base::error;
  73             next = from;
  74             return std::codecvt_base::ok;
  75         }
  76         int do_encoding() const noexcept override
  77         {
  78             return 0;
  79         }
  80         int do_max_length() const noexcept override
  81         {
  82             return 4;
  83         }
  84         bool do_always_noconv() const noexcept override
  85         {
  86             return false;
  87         }
  88
  89         int do_length(std::mbstate_t
  90 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
  91                       const
  92 #endif
  93                         & std_state,
  94                       const char* from,
  95                       const char* from_end,
  96                       size_t max) const override
  97         {
  98             std::uint16_t state = detail::read_state(std_state);
  99 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
 100             const char* save_from = from;
 101 #else
 102             size_t save_max = max;
 103 #endif
 104             while(max > 0 && from < from_end)
 105             {
 106                 const char* prev_from = from;
 107                 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
 108                 if(ch == utf::illegal)
 109                 {
 110                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
 111                 } else if(ch == utf::incomplete)
 112                 {
 113                     from = prev_from;
 114                     break;
 115                 }
 116                 max--;
 117                 if(ch > 0xFFFF)
 118                 {
 119                     if(state == 0)
 120                     {
 121                         from = prev_from;
 122                         state = 1;
 123                     } else
 124                     {
 125                         state = 0;
 126                     }
 127                 }
 128             }
 129 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
 130             detail::write_state(std_state, state);
 131             return static_cast<int>(from - save_from);
 132 #else
 133             return static_cast<int>(save_max - max);
 134 #endif
 135         }
 136
 137         std::codecvt_base::result do_in(std::mbstate_t& std_state,
 138                                         const char* from,
 139                                         const char* from_end,
 140                                         const char*& from_next,
 141                                         uchar* to,
 142                                         uchar* to_end,
 143                                         uchar*& to_next) const override
 144         {
 145             std::codecvt_base::result r = std::codecvt_base::ok;
 146
 147             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
 148             // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
 149             //
 150             // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
 151             // and first pair is written, but no input consumed
 152             std::uint16_t state = detail::read_state(std_state);
 153             while(to < to_end && from < from_end)
 154             {
 155                 const char* from_saved = from;
 156
 157                 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
 158
 159                 if(ch == utf::illegal)
 160                 {
 161                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
 162                 } else if(ch == utf::incomplete)
 163                 {
 164                     from = from_saved;
 165                     r = std::codecvt_base::partial;
 166                     break;
 167                 }
 168                 // Normal codepoints go directly to stream
 169                 if(ch <= 0xFFFF)
 170                 {
 171                     *to++ = static_cast<CharType>(ch);
 172                 } else
 173                 {
 174                     // for  other codepoints we do following
 175                     //
 176                     // 1. We can't consume our input as we may find ourself
 177                     //    in state where all input consumed but not all output written,i.e. only
 178                     //    1st pair is written
 179                     // 2. We only write first pair and mark this in the state, we also revert back
 180                     //    the from pointer in order to make sure this codepoint would be read
 181                     //    once again and then we would consume our input together with writing
 182                     //    second surrogate pair
 183                     ch -= 0x10000;
 184                     std::uint16_t vh = static_cast<std::uint16_t>(ch >> 10);
 185                     std::uint16_t vl = ch & 0x3FF;
 186                     std::uint16_t w1 = vh + 0xD800;
 187                     std::uint16_t w2 = vl + 0xDC00;
 188                     if(state == 0)
 189                     {
 190                         from = from_saved;
 191                         *to++ = static_cast<CharType>(w1);
 192                         state = 1;
 193                     } else
 194                     {
 195                         *to++ = static_cast<CharType>(w2);
 196                         state = 0;
 197                     }
 198                 }
 199             }
 200             from_next = from;
 201             to_next = to;
 202             if(r == std::codecvt_base::ok && (from != from_end || state != 0))
 203                 r = std::codecvt_base::partial;
 204             detail::write_state(std_state, state);
 205             return r;
 206         }
 207
 208         std::codecvt_base::result do_out(std::mbstate_t& std_state,
 209                                          const uchar* from,
 210                                          const uchar* from_end,
 211                                          const uchar*& from_next,
 212                                          char* to,
 213                                          char* to_end,
 214                                          char*& to_next) const override
 215         {
 216             std::codecvt_base::result r = std::codecvt_base::ok;
 217             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
 218             // according to standard. We assume that sizeof(mbstate_t) >=2 in order
 219             // to be able to store first observed surrogate pair
 220             //
 221             // State: state!=0 - a first surrogate pair was observed (state = first pair),
 222             // we expect the second one to come and then zero the state
 223             ///
 224             std::uint16_t state = detail::read_state(std_state);
 225             while(to < to_end && from < from_end)
 226             {
 227                 std::uint32_t ch = 0;
 228                 if(state != 0)
 229                 {
 230                     // if the state indicates that 1st surrogate pair was written
 231                     // we should make sure that the second one that comes is actually
 232                     // second surrogate
 233                     std::uint16_t w1 = state;
 234                     std::uint16_t w2 = *from;
 235                     // we don't forward from as writing may fail to incomplete or
 236                     // partial conversion
 237                     if(0xDC00 <= w2 && w2 <= 0xDFFF)
 238                     {
 239                         std::uint16_t vh = w1 - 0xD800;
 240                         std::uint16_t vl = w2 - 0xDC00;
 241                         ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
 242                     } else
 243                     {
 244                         ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
 245                     }
 246                 } else
 247                 {
 248                     ch = *from;
 249                     if(0xD800 <= ch && ch <= 0xDBFF)
 250                     {
 251                         // if this is a first surrogate pair we put
 252                         // it into the state and consume it, note we don't
 253                         // go forward as it should be illegal so we increase
 254                         // the from pointer manually
 255                         state = static_cast<std::uint16_t>(ch);
 256                         from++;
 257                         continue;
 258                     } else if(0xDC00 <= ch && ch <= 0xDFFF)
 259                     {
 260                         // if we observe second surrogate pair and
 261                         // first only may be expected we should break from the loop with error
 262                         // as it is illegal input
 263                         ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
 264                     }
 265                 }
 266                 if(!utf::is_valid_codepoint(ch))
 267                 {
 268                     r = std::codecvt_base::error;
 269                     break;
 270                 }
 271                 int len = utf::utf_traits<char>::width(ch);
 272                 if(to_end - to < len)
 273                 {
 274                     r = std::codecvt_base::partial;
 275                     break;
 276                 }
 277                 to = utf::utf_traits<char>::encode(ch, to);
 278                 state = 0;
 279                 from++;
 280             }
 281             from_next = from;
 282             to_next = to;
 283             if(r == std::codecvt_base::ok && (from != from_end || state != 0))
 284                 r = std::codecvt_base::partial;
 285             detail::write_state(std_state, state);
 286             return r;
 287         }
 288     };
 289
 290     /// Specialization for the UTF-8 <-> UTF-32 variant of the std::codecvt implementation
 291     template<typename CharType>
 292     class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
 293     {
 294     public:
 295         utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
 296         {}
 297
 298     protected:
 299         using uchar = CharType;
 300
 301         std::codecvt_base::result
 302         do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
 303         {
 304             next = from;
 305             return std::codecvt_base::ok;
 306         }
 307         int do_encoding() const noexcept override
 308         {
 309             return 0;
 310         }
 311         int do_max_length() const noexcept override
 312         {
 313             return 4;
 314         }
 315         bool do_always_noconv() const noexcept override
 316         {
 317             return false;
 318         }
 319
 320         int do_length(std::mbstate_t
 321 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
 322                       const
 323 #endif
 324                         & /*state*/,
 325                       const char* from,
 326                       const char* from_end,
 327                       size_t max) const override
 328         {
 329 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
 330             const char* start_from = from;
 331 #else
 332             size_t save_max = max;
 333 #endif
 334
 335             while(max > 0 && from < from_end)
 336             {
 337                 const char* save_from = from;
 338                 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
 339                 if(ch == utf::incomplete)
 340                 {
 341                     from = save_from;
 342                     break;
 343                 } else if(ch == utf::illegal)
 344                 {
 345                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
 346                 }
 347                 max--;
 348             }
 349 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
 350             return from - start_from;
 351 #else
 352             return save_max - max;
 353 #endif
 354         }
 355
 356         std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
 357                                         const char* from,
 358                                         const char* from_end,
 359                                         const char*& from_next,
 360                                         uchar* to,
 361                                         uchar* to_end,
 362                                         uchar*& to_next) const override
 363         {
 364             std::codecvt_base::result r = std::codecvt_base::ok;
 365
 366             while(to < to_end && from < from_end)
 367             {
 368                 const char* from_saved = from;
 369
 370                 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
 371
 372                 if(ch == utf::illegal)
 373                 {
 374                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
 375                 } else if(ch == utf::incomplete)
 376                 {
 377                     r = std::codecvt_base::partial;
 378                     from = from_saved;
 379                     break;
 380                 }
 381                 *to++ = ch;
 382             }
 383             from_next = from;
 384             to_next = to;
 385             if(r == std::codecvt_base::ok && from != from_end)
 386                 r = std::codecvt_base::partial;
 387             return r;
 388         }
 389
 390         std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
 391                                          const uchar* from,
 392                                          const uchar* from_end,
 393                                          const uchar*& from_next,
 394                                          char* to,
 395                                          char* to_end,
 396                                          char*& to_next) const override
 397         {
 398             std::codecvt_base::result r = std::codecvt_base::ok;
 399             while(to < to_end && from < from_end)
 400             {
 401                 std::uint32_t ch = 0;
 402                 ch = *from;
 403                 if(!utf::is_valid_codepoint(ch))
 404                 {
 405                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
 406                 }
 407                 int len = utf::utf_traits<char>::width(ch);
 408                 if(to_end - to < len)
 409                 {
 410                     r = std::codecvt_base::partial;
 411                     break;
 412                 }
 413                 to = utf::utf_traits<char>::encode(ch, to);
 414                 from++;
 415             }
 416             from_next = from;
 417             to_next = to;
 418             if(r == std::codecvt_base::ok && from != from_end)
 419                 r = std::codecvt_base::partial;
 420             return r;
 421         }
 422     };
 423
 424 } // namespace nowide
 425 } // namespace boost
 426
 427 #endif