2 // Copyright (c) 2013-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8 #ifndef BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
9 #define BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
11 #include <boost/asio/buffer.hpp>
12 #include <boost/assert.hpp>
13 #include <beast/core/buffer_concepts.hpp>
21 /* This is a modified work.
23 Original version and license:
24 https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
25 Permission is hereby granted, free of charge, to any person obtaining
26 a copy of this software and associated documentation files (the
27 "Software"), to deal in the Software without restriction, including
28 without limitation the rights to use, copy, modify, merge, publish,
29 distribute, sublicense, and/or sell copies of the Software, and to
30 permit persons to whom the Software is furnished to do so, subject
31 to the following conditions:
33 The above copyright notice and this permission notice shall be included
34 in all copies or substantial portions of the Software.
36 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
37 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
38 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
39 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
40 ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
41 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
42 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *
45 Optimized for predominantly 7-bit content, 2016
46 https://github.com/uWebSockets/uWebSockets/blob/755bd362649c06abff102f18e273c5792c51c1a0/src/WebSocketProtocol.h#L198
47 Copyright (c) 2016 Alex Hultman and contributors
49 This software is provided 'as-is', without any express or implied
50 warranty. In no event will the authors be held liable for any damages
51 arising from the use of this software.
53 Permission is granted to anyone to use this software for any purpose,
54 including commercial applications, and to alter it and redistribute it
55 freely, subject to the following restrictions:
57 1. The origin of this software must not be misrepresented; you must not
58 claim that you wrote the original software. If you use this software
59 in a product, an acknowledgement in the product documentation would be
60 appreciated but is not required.
61 2. Altered source versions must be plainly marked as such, and must not be
62 misrepresented as being the original software.
63 3. This notice may not be removed or altered from any source distribution.
68 This validator can be used to check if a buffer containing UTF8 text is
69 valid. The write function may be called incrementally with segmented UTF8
70 sequences. The finish function determines if all processed text is valid.
// Incremental UTF-8 validator; the utf8_checker alias names utf8_checker_t<>.
72 template<class = void>
75 std::size_t need_ = 0; // finish() succeeds only when this is zero; presumably the bytes still missing from a split sequence - TODO confirm
76 std::uint8_t* p_ = have_; // cursor into have_; p_ - have_ is read as the number of bytes held
77 std::uint8_t have_[4]; // storage that valid_have inspects (have_[0..2]) for a sequence that may be only partly present
80 /** Prepare to process text as valid utf8
85 /** Check that all processed text is valid utf8
90 /** Check if text is valid UTF8
92 @return `true` if the text is valid utf8 or false otherwise.
95 write(std::uint8_t const* in, std::size_t size); // in: first byte to check; size: number of bytes at in
97 /** Check if text is valid UTF8
99 @return `true` if the text is valid utf8 or false otherwise.
101 template<class ConstBufferSequence>
103 write(ConstBufferSequence const& bs); // bs: buffer sequence; each buffer is passed to the pointer overload
// Out-of-class definition of reset().
// NOTE(review): presumably restores the initial need_ / p_ state - confirm against the body.
108 utf8_checker_t<_>::reset()
// Determines whether all processed text is valid; the check computed here
// is that need_ has returned to zero.
116 utf8_checker_t<_>::finish()
118 auto const success = need_ == 0; // true only when need_ is zero
// Buffer-sequence overload: rejects non-conforming sequence types at compile
// time, then passes each buffer's bytes to the pointer overload of write().
124 template<class ConstBufferSequence>
126 utf8_checker_t<_>::write(ConstBufferSequence const& bs)
// Compile-time check that the argument type models ConstBufferSequence.
128 static_assert(is_ConstBufferSequence<ConstBufferSequence>::value,
129 "ConstBufferSequence requirements not met");
130 using boost::asio::buffer_cast;
131 using boost::asio::buffer_size; // presumably supplies the length argument of the write() call below - TODO confirm
132 for(auto const& b : bs) // visit the buffers in sequence order
133 if(! write(buffer_cast<std::uint8_t const*>(b), // a false result from the pointer overload takes this branch
// Pointer overload: checks the `size` bytes starting at `in`. The class is
// described as accepting segmented sequences across calls; need_, p_ and
// have_ carry that state. Pieces in order: a lambda validating a whole
// sequence at a pointer, valid_have for the bytes held in have_, a per-byte
// lambda, the carry-over handling, and a word-at-a-time skip over 7-bit bytes.
141 utf8_checker_t<_>::write(std::uint8_t const* in, std::size_t size)
144 [](std::uint8_t const*& in) // takes the pointer by reference; presumably advances it past a valid sequence - TODO confirm
151 if ((in[0] & 0x60) == 0x40) // bit 6 set, bit 5 clear; with the high bit set this is a 110xxxxx lead (2-byte sequence)
153 if ((in[1] & 0xc0) != 0x80) // second byte is not a 10xxxxxx continuation byte
// NOTE(review): no check here rejects the overlong leads 0xC0 / 0xC1 - confirm.
158 if ((in[0] & 0xf0) == 0xe0) // 1110xxxx lead: 3-byte sequence
160 if ((in[1] & 0xc0) != 0x80 || // second byte must be a continuation byte
161 (in[2] & 0xc0) != 0x80 || // third byte must be a continuation byte
162 (in[0] == 224 && in[1] < 160) || // 0xE0 followed by < 0xA0: overlong form of a code point below U+0800
163 (in[0] == 237 && in[1] > 159)) // 0xED followed by > 0x9F: surrogate range U+D800..U+DFFF
168 if ((in[0] & 0xf8) == 0xf0) // 11110xxx lead: 4-byte sequence
// NOTE(review): leads 0xF5..0xF7 also match the mask above; no check here rejects them - confirm.
171 (in[1] & 0xc0) != 0x80 || // bytes two to four must all be continuation bytes
172 (in[2] & 0xc0) != 0x80 ||
173 (in[3] & 0xc0) != 0x80 ||
174 (in[0] == 240 && in[1] < 144) || // 0xF0 followed by < 0x90: overlong form of a code point below U+10000
175 (in[0] == 244 && in[1] > 143)) // 0xF4 followed by > 0x8F: above U+10FFFF
// valid_have: applies the same rules to the bytes held in have_, testing only
// the positions that p_ - have_ says are present.
182 auto const valid_have =
185 if ((have_[0] & 0x60) == 0x40) // same 2-byte lead test as above, on the held lead byte
186 return have_[0] <= 223; // NOTE(review): every byte passing the test above is <= 0xDF, so this cannot be false
187 if ((have_[0] & 0xf0) == 0xe0) // held lead of a 3-byte sequence
189 if (p_ - have_ > 1 && // examine the second byte only once it has been stored
190 ((have_[1] & 0xc0) != 0x80 || // not a continuation byte
191 (have_[0] == 224 && have_[1] < 160) || // overlong 3-byte form
192 (have_[0] == 237 && have_[1] > 159))) // surrogate range
196 if ((have_[0] & 0xf8) == 0xf0) // held lead of a 4-byte sequence
198 auto const size = p_ - have_; // number of bytes held; same name as the write() parameter
199 if (size > 2 && (have_[2] & 0xc0) != 0x80) // third byte, when present, must be a continuation byte
202 ((have_[1] & 0xc0) != 0x80 || // second byte must be a continuation byte
203 (have_[0] == 240 && have_[1] < 144) || // overlong 4-byte form
204 (have_[0] == 244 && have_[1] > 143))) // above U+10FFFF
210 [](std::uint8_t const in) // per-byte lambda; presumably the `needed` helper called further down - TODO confirm
225 auto const end = in + size; // one past the last input byte
228 auto n = (std::min)(size, need_); // parenthesized so a function-like min macro cannot expand here
235 BOOST_ASSERT(in == end); // at this point the input is expected to be fully consumed
238 std::uint8_t const* p = &have_[0]; // read-only pointer to the first held byte
244 auto last = in + size - 7; // NOTE(review): if size can be below 7 here this forms a pointer before the buffer - confirm a guard precedes
247 #if BEAST_WEBSOCKET_NO_UNALIGNED_READ // variant that reads a whole word only from an aligned address
248 auto constexpr align = sizeof(std::size_t) - 1; // low-bit mask used to test pointer alignment
249 auto constexpr mask = static_cast<
250 std::size_t>(0x8080808080808080 & // 0x80 in every byte: the high bit of each byte of a word
254 std::uintptr_t>(in) & align) == 0) && // in is aligned to sizeof(std::size_t)
256 std::size_t const*>(in) & mask) == 0) // no byte of the word has its high bit set (all are 7-bit)
257 in += sizeof(std::size_t); // step over the whole word
261 auto constexpr mask = static_cast<
262 std::size_t>(0x8080808080808080 & // same per-byte high-bit mask as in the aligned variant
266 std::size_t const*>(in) & mask) == 0) // word read without the alignment test
267 in += sizeof(std::size_t); // step over the whole word
282 auto const need = needed(*in); // value computed from the current byte by the needed helper
// Convenience name for the checker instantiated with its default template argument.
301 using utf8_checker = utf8_checker_t<>;
// Helper that passes the n bytes at p to write() on a checker object `c`.
// NOTE(review): `c` is presumably a utf8_checker - confirm at its declaration.
303 template<class = void>
305 check_utf8(char const* p, std::size_t n)
308 if(! c.write(reinterpret_cast<const uint8_t*>(p), n)) // char bytes reinterpreted as unsigned octets; uint8_t is unqualified here, unlike std::uint8_t elsewhere in the file