ceph/src/boost/boost/beast/websocket/detail/utf8_checker.hpp

   1 //
   2 // Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
   3 //
   4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
   5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   6 //
   7 // Official repository: https://github.com/boostorg/beast
   8 //
   9
  10 #ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
  11 #define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
  12
  13 #include <boost/beast/core/type_traits.hpp>
  14 #include <boost/asio/buffer.hpp>
  15 #include <boost/assert.hpp>
  16 #include <algorithm>
  17 #include <cstdint>
  18
  19 namespace boost {
  20 namespace beast {
  21 namespace websocket {
  22 namespace detail {
  23
  24 /** A UTF8 validator.
  25
  26     This validator can be used to check if a buffer containing UTF8 text is
  27     valid. The write function may be called incrementally with segmented UTF8
  28     sequences. The finish function determines if all processed text is valid.
  29 */
  30 template<class = void>
  31 class utf8_checker_t
  32 {
  33     std::size_t need_ = 0;  // chars we need to finish the code point
  34     std::uint8_t* p_ = cp_; // current position in temp buffer
  35     std::uint8_t cp_[4];    // a temp buffer for the code point
  36
  37 public:
  38     /** Prepare to process text as valid utf8
  39     */
  40     void
  41     reset();
  42
  43     /** Check that all processed text is valid utf8
  44     */
  45     bool
  46     finish();
  47
  48     /** Check if text is valid UTF8
  49
  50         @return `true` if the text is valid utf8 or false otherwise.
  51     */
  52     bool
  53     write(std::uint8_t const* in, std::size_t size);
  54
  55     /** Check if text is valid UTF8
  56
  57         @return `true` if the text is valid utf8 or false otherwise.
  58     */
  59     template<class ConstBufferSequence>
  60     bool
  61     write(ConstBufferSequence const& bs);
  62 };
  63
  64 template<class _>
  65 void
  66 utf8_checker_t<_>::
  67 reset()
  68 {
  69     need_ = 0;
  70     p_ = cp_;
  71 }
  72
  73 template<class _>
  74 bool
  75 utf8_checker_t<_>::
  76 finish()
  77 {
  78     auto const success = need_ == 0;
  79     reset();
  80     return success;
  81 }
  82
  83 template<class _>
  84 template<class ConstBufferSequence>
  85 bool
  86 utf8_checker_t<_>::
  87 write(ConstBufferSequence const& bs)
  88 {
  89     static_assert(boost::asio::is_const_buffer_sequence<ConstBufferSequence>::value,
  90         "ConstBufferSequence requirements not met");
  91     for(auto b : beast::detail::buffers_range(bs))
  92         if(! write(reinterpret_cast<
  93             std::uint8_t const*>(b.data()),
  94                 b.size()))
  95             return false;
  96     return true;
  97 }
  98
  99 template<class _>
 100 bool
 101 utf8_checker_t<_>::
 102 write(std::uint8_t const* in, std::size_t size)
 103 {
 104     auto const valid =
 105         [](std::uint8_t const*& p)
 106         {
 107             if(p[0] < 128)
 108             {
 109                 ++p;
 110                 return true;
 111             }
 112             if((p[0] & 0xe0) == 0xc0)
 113             {
 114                 if( (p[1] & 0xc0) != 0x80 ||
 115                     (p[0] & 0xfe) == 0xc0)  // overlong
 116                     return false;
 117                 p += 2;
 118                 return true;
 119             }
 120             if((p[0] & 0xf0) == 0xe0)
 121             {
 122                 if(    (p[1] & 0xc0) != 0x80
 123                     || (p[2] & 0xc0) != 0x80
 124                     || (p[0] == 0xe0 && (p[1] & 0xe0) == 0x80) // overlong
 125                     || (p[0] == 0xed && (p[1] & 0xe0) == 0xa0) // surrogate
 126                     //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
 127                     )
 128                     return false;
 129                 p += 3;
 130                 return true;
 131             }
 132             if((p[0] & 0xf8) == 0xf0)
 133             {
 134                 if(    (p[1] & 0xc0) != 0x80
 135                     || (p[2] & 0xc0) != 0x80
 136                     || (p[3] & 0xc0) != 0x80
 137                     || (p[0] == 0xf0 && (p[1] & 0xf0) == 0x80) // overlong
 138                     || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
 139                     )
 140                     return false;
 141                 p += 4;
 142                 return true;
 143             }
 144             return false;
 145         };
 146     auto const fail_fast =
 147         [&]()
 148         {
 149             auto const n = p_ - cp_;
 150             switch(n)
 151             {
 152             default:
 153                 BOOST_ASSERT(false);
 154                 BOOST_BEAST_FALLTHROUGH;
 155             case 1:
 156                 cp_[1] = 0x81;
 157                 BOOST_BEAST_FALLTHROUGH;
 158             case 2:
 159                 cp_[2] = 0x81;
 160                 BOOST_BEAST_FALLTHROUGH;
 161             case 3:
 162                 cp_[3] = 0x81;
 163                 break;
 164             }
 165             std::uint8_t const* p = cp_;
 166             return ! valid(p);
 167         };
 168     auto const needed =
 169         [](std::uint8_t const v)
 170         {
 171             if(v < 128)
 172                 return 1;
 173             if(v < 192)
 174                 return 0;
 175             if(v < 224)
 176                 return 2;
 177             if(v < 240)
 178                 return 3;
 179             if(v < 248)
 180                 return 4;
 181             return 0;
 182         };
 183
 184     auto const end = in + size;
 185
 186     // Finish up any incomplete code point
 187     if(need_ > 0)
 188     {
 189         // Calculate what we have
 190         auto n = (std::min)(size, need_);
 191         size -= n;
 192         need_ -= n;
 193
 194         // Add characters to the code point
 195         while(n--)
 196             *p_++ = *in++;
 197         BOOST_ASSERT(p_ <= cp_ + 5);
 198
 199         // Still incomplete?
 200         if(need_ > 0)
 201         {
 202             // Incomplete code point
 203             BOOST_ASSERT(in == end);
 204
 205             // Do partial validation on the incomplete
 206             // code point, this is called "Fail fast"
 207             // in Autobahn|Testsuite parlance.
 208             return ! fail_fast();
 209         }
 210
 211         // Complete code point, validate it
 212         std::uint8_t const* p = &cp_[0];
 213         if(! valid(p))
 214             return false;
 215         p_ = cp_;
 216     }
 217
 218     if(size <= sizeof(std::size_t))
 219         goto slow;
 220
 221     // Align `in` to sizeof(std::size_t) boundary
 222     {
 223         auto const in0 = in;
 224         auto last = reinterpret_cast<std::uint8_t const*>(
 225             ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
 226                 sizeof(std::size_t)) * sizeof(std::size_t));
 227
 228         // Check one character at a time for low-ASCII
 229         while(in < last)
 230         {
 231             if(*in & 0x80)
 232             {
 233                 // Not low-ASCII so switch to slow loop
 234                 size = size - (in - in0);
 235                 goto slow;
 236             }
 237             ++in;
 238         }
 239         size = size - (in - in0);
 240     }
 241
 242     // Fast loop: Process 4 or 8 low-ASCII characters at a time
 243     {
 244         auto const in0 = in;
 245         auto last = in + size - 7;
 246         auto constexpr mask = static_cast<
 247             std::size_t>(0x8080808080808080 & ~std::size_t{0});
 248         while(in < last)
 249         {
 250 #if 0
 251             std::size_t temp;
 252             std::memcpy(&temp, in, sizeof(temp));
 253             if((temp & mask) != 0)
 254 #else
 255             // Technically UB but works on all known platforms
 256             if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
 257 #endif
 258             {
 259                 size = size - (in - in0);
 260                 goto slow;
 261             }
 262             in += sizeof(std::size_t);
 263         }
 264         // There's at least one more full code point left
 265         last += 4;
 266         while(in < last)
 267             if(! valid(in))
 268                 return false;
 269         goto tail;
 270     }
 271
 272 slow:
 273     // Slow loop: Full validation on one code point at a time
 274     {
 275         auto last = in + size - 3;
 276         while(in < last)
 277             if(! valid(in))
 278                 return false;
 279     }
 280
 281 tail:
 282     // Handle the remaining bytes. The last
 283     // characters could split a code point so
 284     // we save the partial code point for later.
 285     //
 286     // On entry to the loop, `in` points to the
 287     // beginning of a code point.
 288     //
 289     for(;;)
 290     {
 291         // Number of chars left
 292         auto n = end - in;
 293         if(! n)
 294             break;
 295
 296         // Chars we need to finish this code point
 297         auto const need = needed(*in);
 298         if(need == 0)
 299             return false;
 300         if(need <= n)
 301         {
 302             // Check a whole code point
 303             if(! valid(in))
 304                 return false;
 305         }
 306         else
 307         {
 308             // Calculate how many chars we need
 309             // to finish this partial code point
 310             need_ = need - n;
 311
 312             // Save the partial code point
 313             while(n--)
 314                 *p_++ = *in++;
 315             BOOST_ASSERT(in == end);
 316             BOOST_ASSERT(p_ <= cp_ + 5);
 317
 318             // Do partial validation on the incomplete
 319             // code point, this is called "Fail fast"
 320             // in Autobahn|Testsuite parlance.
 321             return ! fail_fast();
 322         }
 323     }
 324     return true;
 325 }
 326
// The checker type used by the rest of the library; selects the
// default template argument of utf8_checker_t.
using utf8_checker = utf8_checker_t<>;
 328
 329 template<class = void>
 330 bool
 331 check_utf8(char const* p, std::size_t n)
 332 {
 333     utf8_checker c;
 334     if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
 335         return false;
 336     return c.finish();
 337 }
 338
 339 } // detail
 340 } // websocket
 341 } // beast
 342 } // boost
 343
 344 #endif