]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // |
2 | // Copyright (c) 2013-2017 Vinnie Falco (vinnie dot falco at gmail dot com) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | // | |
7 | ||
8 | #ifndef BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP | |
9 | #define BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP | |
10 | ||
11 | #include <boost/asio/buffer.hpp> | |
12 | #include <boost/assert.hpp> | |
13 | #include <beast/core/buffer_concepts.hpp> | |
14 | #include <algorithm> | |
15 | #include <cstdint> | |
16 | ||
17 | namespace beast { | |
18 | namespace websocket { | |
19 | namespace detail { | |
20 | ||
21 | /* This is a modified work. | |
22 | ||
23 | Original version and license: | |
24 | https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c | |
25 | Permission is hereby granted, free of charge, to any person obtaining | |
26 | a copy of this software and associated documentation files (the | |
27 | "Software"), to deal in the Software without restriction, including | |
28 | without limitation the rights to use, copy, modify, merge, publish, | |
29 | distribute, sublicense, and/or sell copies of the Software, and to | |
30 | permit persons to whom the Software is furnished to do so, subject | |
31 | to the following conditions: | |
32 | ||
33 | The above copyright notice and this permission notice shall be included | |
34 | in all copies or substantial portions of the Software. | |
35 | ||
36 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
37 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
38 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
39 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR | |
40 | ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |
41 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
42 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * | |
43 | ||
44 | Additional changes: | |
45 | Optimized for predominantly 7-bit content, 2016 | |
46 | https://github.com/uWebSockets/uWebSockets/blob/755bd362649c06abff102f18e273c5792c51c1a0/src/WebSocketProtocol.h#L198 | |
47 | Copyright (c) 2016 Alex Hultman and contributors | |
48 | ||
49 | This software is provided 'as-is', without any express or implied | |
50 | warranty. In no event will the authors be held liable for any damages | |
51 | arising from the use of this software. | |
52 | ||
53 | Permission is granted to anyone to use this software for any purpose, | |
54 | including commercial applications, and to alter it and redistribute it | |
55 | freely, subject to the following restrictions: | |
56 | ||
57 | 1. The origin of this software must not be misrepresented; you must not | |
58 | claim that you wrote the original software. If you use this software | |
59 | in a product, an acknowledgement in the product documentation would be | |
60 | appreciated but is not required. | |
61 | 2. Altered source versions must be plainly marked as such, and must not be | |
62 | misrepresented as being the original software. | |
63 | 3. This notice may not be removed or altered from any source distribution. | |
64 | */ | |
65 | ||
66 | /** A UTF8 validator. | |
67 | ||
68 | This validator can be used to check if a buffer containing UTF8 text is | |
69 | valid. The write function may be called incrementally with segmented UTF8 | |
70 | sequences. The finish function determines if all processed text is valid. | |
71 | */ | |
72 | template<class = void> | |
73 | class utf8_checker_t | |
74 | { | |
75 | std::size_t need_ = 0; | |
76 | std::uint8_t* p_ = have_; | |
77 | std::uint8_t have_[4]; | |
78 | ||
79 | public: | |
80 | /** Prepare to process text as valid utf8 | |
81 | */ | |
82 | void | |
83 | reset(); | |
84 | ||
85 | /** Check that all processed text is valid utf8 | |
86 | */ | |
87 | bool | |
88 | finish(); | |
89 | ||
90 | /** Check if text is valid UTF8 | |
91 | ||
92 | @return `true` if the text is valid utf8 or false otherwise. | |
93 | */ | |
94 | bool | |
95 | write(std::uint8_t const* in, std::size_t size); | |
96 | ||
97 | /** Check if text is valid UTF8 | |
98 | ||
99 | @return `true` if the text is valid utf8 or false otherwise. | |
100 | */ | |
101 | template<class ConstBufferSequence> | |
102 | bool | |
103 | write(ConstBufferSequence const& bs); | |
104 | }; | |
105 | ||
106 | template<class _> | |
107 | void | |
108 | utf8_checker_t<_>::reset() | |
109 | { | |
110 | need_ = 0; | |
111 | p_ = have_; | |
112 | } | |
113 | ||
114 | template<class _> | |
115 | bool | |
116 | utf8_checker_t<_>::finish() | |
117 | { | |
118 | auto const success = need_ == 0; | |
119 | reset(); | |
120 | return success; | |
121 | } | |
122 | ||
123 | template<class _> | |
124 | template<class ConstBufferSequence> | |
125 | bool | |
126 | utf8_checker_t<_>::write(ConstBufferSequence const& bs) | |
127 | { | |
128 | static_assert(is_ConstBufferSequence<ConstBufferSequence>::value, | |
129 | "ConstBufferSequence requirements not met"); | |
130 | using boost::asio::buffer_cast; | |
131 | using boost::asio::buffer_size; | |
132 | for(auto const& b : bs) | |
133 | if(! write(buffer_cast<std::uint8_t const*>(b), | |
134 | buffer_size(b))) | |
135 | return false; | |
136 | return true; | |
137 | } | |
138 | ||
139 | template<class _> | |
140 | bool | |
141 | utf8_checker_t<_>::write(std::uint8_t const* in, std::size_t size) | |
142 | { | |
143 | auto const valid = | |
144 | [](std::uint8_t const*& in) | |
145 | { | |
146 | if (in[0] < 128) | |
147 | { | |
148 | ++in; | |
149 | return true; | |
150 | } | |
151 | if ((in[0] & 0x60) == 0x40) | |
152 | { | |
153 | if ((in[1] & 0xc0) != 0x80) | |
154 | return false; | |
155 | in += 2; | |
156 | return true; | |
157 | } | |
158 | if ((in[0] & 0xf0) == 0xe0) | |
159 | { | |
160 | if ((in[1] & 0xc0) != 0x80 || | |
161 | (in[2] & 0xc0) != 0x80 || | |
162 | (in[0] == 224 && in[1] < 160) || | |
163 | (in[0] == 237 && in[1] > 159)) | |
164 | return false; | |
165 | in += 3; | |
166 | return true; | |
167 | } | |
168 | if ((in[0] & 0xf8) == 0xf0) | |
169 | { | |
170 | if (in[0] > 244 || | |
171 | (in[1] & 0xc0) != 0x80 || | |
172 | (in[2] & 0xc0) != 0x80 || | |
173 | (in[3] & 0xc0) != 0x80 || | |
174 | (in[0] == 240 && in[1] < 144) || | |
175 | (in[0] == 244 && in[1] > 143)) | |
176 | return false; | |
177 | in += 4; | |
178 | return true; | |
179 | } | |
180 | return false; | |
181 | }; | |
182 | auto const valid_have = | |
183 | [&]() | |
184 | { | |
185 | if ((have_[0] & 0x60) == 0x40) | |
186 | return have_[0] <= 223; | |
187 | if ((have_[0] & 0xf0) == 0xe0) | |
188 | { | |
189 | if (p_ - have_ > 1 && | |
190 | ((have_[1] & 0xc0) != 0x80 || | |
191 | (have_[0] == 224 && have_[1] < 160) || | |
192 | (have_[0] == 237 && have_[1] > 159))) | |
193 | return false; | |
194 | return true; | |
195 | } | |
196 | if ((have_[0] & 0xf8) == 0xf0) | |
197 | { | |
198 | auto const size = p_ - have_; | |
199 | if (size > 2 && (have_[2] & 0xc0) != 0x80) | |
200 | return false; | |
201 | if (size > 1 && | |
202 | ((have_[1] & 0xc0) != 0x80 || | |
203 | (have_[0] == 240 && have_[1] < 144) || | |
204 | (have_[0] == 244 && have_[1] > 143))) | |
205 | return false; | |
206 | } | |
207 | return true; | |
208 | }; | |
209 | auto const needed = | |
210 | [](std::uint8_t const in) | |
211 | { | |
212 | if (in < 128) | |
213 | return 1; | |
214 | if (in < 194) | |
215 | return 0; | |
216 | if (in < 224) | |
217 | return 2; | |
218 | if (in < 240) | |
219 | return 3; | |
220 | if (in < 245) | |
221 | return 4; | |
222 | return 0; | |
223 | }; | |
224 | ||
225 | auto const end = in + size; | |
226 | if (need_ > 0) | |
227 | { | |
228 | auto n = (std::min)(size, need_); | |
229 | size -= n; | |
230 | need_ -= n; | |
231 | while(n--) | |
232 | *p_++ = *in++; | |
233 | if(need_ > 0) | |
234 | { | |
235 | BOOST_ASSERT(in == end); | |
236 | return valid_have(); | |
237 | } | |
238 | std::uint8_t const* p = &have_[0]; | |
239 | if (! valid(p)) | |
240 | return false; | |
241 | p_ = have_; | |
242 | } | |
243 | ||
244 | auto last = in + size - 7; | |
245 | while(in < last) | |
246 | { | |
247 | #if BEAST_WEBSOCKET_NO_UNALIGNED_READ | |
248 | auto constexpr align = sizeof(std::size_t) - 1; | |
249 | auto constexpr mask = static_cast< | |
250 | std::size_t>(0x8080808080808080 & | |
251 | ~std::size_t{0}); | |
252 | if( | |
253 | ((reinterpret_cast< | |
254 | std::uintptr_t>(in) & align) == 0) && | |
255 | (*reinterpret_cast< | |
256 | std::size_t const*>(in) & mask) == 0) | |
257 | in += sizeof(std::size_t); | |
258 | else if(! valid(in)) | |
259 | return false; | |
260 | #else | |
261 | auto constexpr mask = static_cast< | |
262 | std::size_t>(0x8080808080808080 & | |
263 | ~std::size_t{0}); | |
264 | if( | |
265 | (*reinterpret_cast< | |
266 | std::size_t const*>(in) & mask) == 0) | |
267 | in += sizeof(std::size_t); | |
268 | else if(! valid(in)) | |
269 | return false; | |
270 | #endif | |
271 | } | |
272 | last += 4; | |
273 | while(in < last) | |
274 | if(! valid(in)) | |
275 | return false; | |
276 | ||
277 | for(;;) | |
278 | { | |
279 | auto n = end - in; | |
280 | if(! n) | |
281 | break; | |
282 | auto const need = needed(*in); | |
283 | if (need == 0) | |
284 | return false; | |
285 | if(need <= n) | |
286 | { | |
287 | if(! valid(in)) | |
288 | return false; | |
289 | } | |
290 | else | |
291 | { | |
292 | need_ = need - n; | |
293 | while(n--) | |
294 | *p_++ = *in++; | |
295 | return valid_have(); | |
296 | } | |
297 | } | |
298 | return true; | |
299 | } | |
300 | ||
301 | using utf8_checker = utf8_checker_t<>; | |
302 | ||
303 | template<class = void> | |
304 | bool | |
305 | check_utf8(char const* p, std::size_t n) | |
306 | { | |
307 | utf8_checker c; | |
308 | if(! c.write(reinterpret_cast<const uint8_t*>(p), n)) | |
309 | return false; | |
310 | return c.finish(); | |
311 | } | |
312 | ||
313 | } // detail | |
314 | } // websocket | |
315 | } // beast | |
316 | ||
317 | #endif |