2 // Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
7 // Official repository: https://github.com/boostorg/beast
10 #ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
11 #define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
13 #include <boost/beast/core/type_traits.hpp>
14 #include <boost/asio/buffer.hpp>
15 #include <boost/assert.hpp>
26 This validator can be used to check if a buffer containing UTF8 text is
27 valid. The write function may be called incrementally with segmented UTF8
28 sequences. The finish function determines if all processed text is valid.
30 template<class = void>
33 std::size_t need_ = 0; // bytes still needed to complete the buffered code point (0 = none pending)
34 std::uint8_t* p_ = cp_; // next write position inside cp_
35 std::uint8_t cp_[4]; // holds a partial code point saved between calls to write
38 /** Prepare to process text as valid utf8
43 /** Check that all processed text is valid utf8
48 /** Check if text is valid UTF8
50 @return `true` if the text is valid UTF8, otherwise `false`.
53 write(std::uint8_t const* in, std::size_t size);
55 /** Check if the text in a buffer sequence is valid UTF8
57 @return `true` if the text is valid UTF8, otherwise `false`.
59 template<class ConstBufferSequence>
61 write(ConstBufferSequence const& bs);
78 auto const success = need_ == 0;
84 template<class ConstBufferSequence>
87 write(ConstBufferSequence const& bs)
89 static_assert(boost::asio::is_const_buffer_sequence<ConstBufferSequence>::value,
90 "ConstBufferSequence requirements not met");
91 for(auto b : beast::detail::buffers_range(bs))
92 if(! write(reinterpret_cast<
93 std::uint8_t const*>(b.data()),
102 write(std::uint8_t const* in, std::size_t size)
105 [](std::uint8_t const*& p)
112 if((p[0] & 0xe0) == 0xc0)
114 if( (p[1] & 0xc0) != 0x80 ||
115 (p[0] & 0xfe) == 0xc0) // overlong
120 if((p[0] & 0xf0) == 0xe0)
122 if( (p[1] & 0xc0) != 0x80
123 || (p[2] & 0xc0) != 0x80
124 || (p[0] == 0xe0 && (p[1] & 0xe0) == 0x80) // overlong
125 || (p[0] == 0xed && (p[1] & 0xe0) == 0xa0) // surrogate
126 //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
132 if((p[0] & 0xf8) == 0xf0)
134 if( (p[1] & 0xc0) != 0x80
135 || (p[2] & 0xc0) != 0x80
136 || (p[3] & 0xc0) != 0x80
137 || (p[0] == 0xf0 && (p[1] & 0xf0) == 0x80) // overlong
138 || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
146 auto const fail_fast =
149 auto const n = p_ - cp_;
154 BOOST_BEAST_FALLTHROUGH;
157 BOOST_BEAST_FALLTHROUGH;
160 BOOST_BEAST_FALLTHROUGH;
165 std::uint8_t const* p = cp_;
169 [](std::uint8_t const v)
184 auto const end = in + size;
186 // Finish up any incomplete code point
189 // Calculate what we have
190 auto n = (std::min)(size, need_);
194 // Add characters to the code point
197 BOOST_ASSERT(p_ <= cp_ + 4); // cp_ has 4 elements, so cp_ + 4 is the one-past-the-end limit
202 // Incomplete code point
203 BOOST_ASSERT(in == end);
205 // Do partial validation on the incomplete
206 // code point, this is called "Fail fast"
207 // in Autobahn|Testsuite parlance.
208 return ! fail_fast();
211 // Complete code point, validate it
212 std::uint8_t const* p = &cp_[0];
218 if(size <= sizeof(std::size_t))
221 // Align `in` to sizeof(std::size_t) boundary
224 auto last = reinterpret_cast<std::uint8_t const*>(
225 ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
226 sizeof(std::size_t)) * sizeof(std::size_t));
228 // Check one character at a time for low-ASCII
233 // Not low-ASCII so switch to slow loop
234 size = size - (in - in0);
239 size = size - (in - in0);
242 // Fast loop: Process 4 or 8 low-ASCII characters at a time
245 auto last = in + size - 7;
246 auto constexpr mask = static_cast<
247 std::size_t>(0x8080808080808080 & ~std::size_t{0});
252 std::memcpy(&temp, in, sizeof(temp));
253 if((temp & mask) != 0)
255 // Technically UB (the bytes are read through a `std::size_t const*`) but works on all known platforms
256 if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
259 size = size - (in - in0);
262 in += sizeof(std::size_t);
264 // There's at least one more full code point left
273 // Slow loop: Full validation on one code point at a time
275 auto last = in + size - 3;
282 // Handle the remaining bytes. The last
283 // characters could split a code point so
284 // we save the partial code point for later.
286 // On entry to the loop, `in` points to the
287 // beginning of a code point.
291 // Number of chars left
296 // Chars we need to finish this code point
297 auto const need = needed(*in);
302 // Check a whole code point
308 // Calculate how many chars we need
309 // to finish this partial code point
312 // Save the partial code point
315 BOOST_ASSERT(in == end);
316 BOOST_ASSERT(p_ <= cp_ + 4); // cp_ has 4 elements, so cp_ + 4 is the one-past-the-end limit
318 // Do partial validation on the incomplete
319 // code point, this is called "Fail fast"
320 // in Autobahn|Testsuite parlance.
321 return ! fail_fast();
327 using utf8_checker = utf8_checker_t<>;
329 template<class = void>
331 check_utf8(char const* p, std::size_t n)
334 if(! c.write(reinterpret_cast<const uint8_t*>(p), n))