]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/boost/beast/websocket/detail/utf8_checker.hpp
update sources to v12.2.3
[ceph.git] / ceph / src / boost / boost / beast / websocket / detail / utf8_checker.hpp
1 //
2 // Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 //
7 // Official repository: https://github.com/boostorg/beast
8 //
9
10 #ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
11 #define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
12
13 #include <boost/beast/core/type_traits.hpp>
14 #include <boost/asio/buffer.hpp>
15 #include <boost/assert.hpp>
16 #include <algorithm>
17 #include <cstdint>
18
19 namespace boost {
20 namespace beast {
21 namespace websocket {
22 namespace detail {
23
/** A UTF8 validator.

    This validator can be used to check if a buffer containing UTF8 text is
    valid. The write function may be called incrementally with segmented UTF8
    sequences. The finish function determines if all processed text is valid.

    A code point that is split across two calls to write is kept in a small
    internal buffer until the remaining bytes arrive. Because the object
    stores a pointer into that buffer, copies are made with user-provided
    copy operations which re-point the copy at its own buffer.
*/
template<class = void>
class utf8_checker_t
{
    std::size_t need_ = 0;  // chars we need to finish the code point
    std::uint8_t* p_ = cp_; // current position in temp buffer
    std::uint8_t cp_[4];    // a temp buffer for the code point

public:
    /** Construct a checker with no pending partial code point
    */
    utf8_checker_t() = default;

    /** Copy constructor

        A member-wise copy would leave `p_` pointing into the buffer of
        `other`. The position is instead rebased onto this object's own
        buffer, and only the bytes stored so far are copied.
    */
    utf8_checker_t(utf8_checker_t const& other) noexcept
        : need_(other.need_)
        , p_(cp_ + (other.p_ - other.cp_))
    {
        std::uint8_t const* const first = other.cp_;
        std::uint8_t const* const last = other.p_;
        std::copy(first, last, cp_);
    }

    /** Copy assignment

        Copies the pending state of `other` and rebases the write
        position onto this object's own buffer.
    */
    utf8_checker_t&
    operator=(utf8_checker_t const& other) noexcept
    {
        // std::copy requires that the destination does not
        // lie inside the source range, so skip self-assignment.
        if(this != &other)
        {
            std::uint8_t const* const first = other.cp_;
            std::uint8_t const* const last = other.p_;
            need_ = other.need_;
            p_ = std::copy(first, last, cp_);
        }
        return *this;
    }

    /** Prepare to process text as valid utf8
    */
    void
    reset();

    /** Check that all processed text is valid utf8

        @return `true` if no code point was left incomplete.
    */
    bool
    finish();

    /** Check if text is valid UTF8

        @param in A pointer to the first byte to check.

        @param size The number of bytes pointed to by `in`.

        @return `true` if the text is valid utf8 or false otherwise.
    */
    bool
    write(std::uint8_t const* in, std::size_t size);

    /** Check if text is valid UTF8

        @param bs The sequence of buffers to check, in order.

        @return `true` if the text is valid utf8 or false otherwise.
    */
    template<class ConstBufferSequence>
    bool
    write(ConstBufferSequence const& bs);
};
63
64 template<class _>
65 void
66 utf8_checker_t<_>::
67 reset()
68 {
69 need_ = 0;
70 p_ = cp_;
71 }
72
73 template<class _>
74 bool
75 utf8_checker_t<_>::
76 finish()
77 {
78 auto const success = need_ == 0;
79 reset();
80 return success;
81 }
82
83 template<class _>
84 template<class ConstBufferSequence>
85 bool
86 utf8_checker_t<_>::
87 write(ConstBufferSequence const& bs)
88 {
89 static_assert(boost::asio::is_const_buffer_sequence<ConstBufferSequence>::value,
90 "ConstBufferSequence requirements not met");
91 for(auto b : beast::detail::buffers_range(bs))
92 if(! write(reinterpret_cast<
93 std::uint8_t const*>(b.data()),
94 b.size()))
95 return false;
96 return true;
97 }
98
99 template<class _>
100 bool
101 utf8_checker_t<_>::
102 write(std::uint8_t const* in, std::size_t size)
103 {
104 auto const valid =
105 [](std::uint8_t const*& p)
106 {
107 if(p[0] < 128)
108 {
109 ++p;
110 return true;
111 }
112 if((p[0] & 0xe0) == 0xc0)
113 {
114 if( (p[1] & 0xc0) != 0x80 ||
115 (p[0] & 0xfe) == 0xc0) // overlong
116 return false;
117 p += 2;
118 return true;
119 }
120 if((p[0] & 0xf0) == 0xe0)
121 {
122 if( (p[1] & 0xc0) != 0x80
123 || (p[2] & 0xc0) != 0x80
124 || (p[0] == 0xe0 && (p[1] & 0xe0) == 0x80) // overlong
125 || (p[0] == 0xed && (p[1] & 0xe0) == 0xa0) // surrogate
126 //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
127 )
128 return false;
129 p += 3;
130 return true;
131 }
132 if((p[0] & 0xf8) == 0xf0)
133 {
134 if( (p[1] & 0xc0) != 0x80
135 || (p[2] & 0xc0) != 0x80
136 || (p[3] & 0xc0) != 0x80
137 || (p[0] == 0xf0 && (p[1] & 0xf0) == 0x80) // overlong
138 || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
139 )
140 return false;
141 p += 4;
142 return true;
143 }
144 return false;
145 };
146 auto const fail_fast =
147 [&]()
148 {
149 auto const n = p_ - cp_;
150 switch(n)
151 {
152 default:
153 BOOST_ASSERT(false);
154 BOOST_BEAST_FALLTHROUGH;
155 case 1:
156 cp_[1] = 0x81;
157 BOOST_BEAST_FALLTHROUGH;
158 case 2:
159 cp_[2] = 0x81;
160 BOOST_BEAST_FALLTHROUGH;
161 case 3:
162 cp_[3] = 0x81;
163 break;
164 }
165 std::uint8_t const* p = cp_;
166 return ! valid(p);
167 };
168 auto const needed =
169 [](std::uint8_t const v)
170 {
171 if(v < 128)
172 return 1;
173 if(v < 192)
174 return 0;
175 if(v < 224)
176 return 2;
177 if(v < 240)
178 return 3;
179 if(v < 248)
180 return 4;
181 return 0;
182 };
183
184 auto const end = in + size;
185
186 // Finish up any incomplete code point
187 if(need_ > 0)
188 {
189 // Calculate what we have
190 auto n = (std::min)(size, need_);
191 size -= n;
192 need_ -= n;
193
194 // Add characters to the code point
195 while(n--)
196 *p_++ = *in++;
197 BOOST_ASSERT(p_ <= cp_ + 5);
198
199 // Still incomplete?
200 if(need_ > 0)
201 {
202 // Incomplete code point
203 BOOST_ASSERT(in == end);
204
205 // Do partial validation on the incomplete
206 // code point, this is called "Fail fast"
207 // in Autobahn|Testsuite parlance.
208 return ! fail_fast();
209 }
210
211 // Complete code point, validate it
212 std::uint8_t const* p = &cp_[0];
213 if(! valid(p))
214 return false;
215 p_ = cp_;
216 }
217
218 if(size <= sizeof(std::size_t))
219 goto slow;
220
221 // Align `in` to sizeof(std::size_t) boundary
222 {
223 auto const in0 = in;
224 auto last = reinterpret_cast<std::uint8_t const*>(
225 ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
226 sizeof(std::size_t)) * sizeof(std::size_t));
227
228 // Check one character at a time for low-ASCII
229 while(in < last)
230 {
231 if(*in & 0x80)
232 {
233 // Not low-ASCII so switch to slow loop
234 size = size - (in - in0);
235 goto slow;
236 }
237 ++in;
238 }
239 size = size - (in - in0);
240 }
241
242 // Fast loop: Process 4 or 8 low-ASCII characters at a time
243 {
244 auto const in0 = in;
245 auto last = in + size - 7;
246 auto constexpr mask = static_cast<
247 std::size_t>(0x8080808080808080 & ~std::size_t{0});
248 while(in < last)
249 {
250 #if 0
251 std::size_t temp;
252 std::memcpy(&temp, in, sizeof(temp));
253 if((temp & mask) != 0)
254 #else
255 // Technically UB but works on all known platforms
256 if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
257 #endif
258 {
259 size = size - (in - in0);
260 goto slow;
261 }
262 in += sizeof(std::size_t);
263 }
264 // There's at least one more full code point left
265 last += 4;
266 while(in < last)
267 if(! valid(in))
268 return false;
269 goto tail;
270 }
271
272 slow:
273 // Slow loop: Full validation on one code point at a time
274 {
275 auto last = in + size - 3;
276 while(in < last)
277 if(! valid(in))
278 return false;
279 }
280
281 tail:
282 // Handle the remaining bytes. The last
283 // characters could split a code point so
284 // we save the partial code point for later.
285 //
286 // On entry to the loop, `in` points to the
287 // beginning of a code point.
288 //
289 for(;;)
290 {
291 // Number of chars left
292 auto n = end - in;
293 if(! n)
294 break;
295
296 // Chars we need to finish this code point
297 auto const need = needed(*in);
298 if(need == 0)
299 return false;
300 if(need <= n)
301 {
302 // Check a whole code point
303 if(! valid(in))
304 return false;
305 }
306 else
307 {
308 // Calculate how many chars we need
309 // to finish this partial code point
310 need_ = need - n;
311
312 // Save the partial code point
313 while(n--)
314 *p_++ = *in++;
315 BOOST_ASSERT(in == end);
316 BOOST_ASSERT(p_ <= cp_ + 5);
317
318 // Do partial validation on the incomplete
319 // code point, this is called "Fail fast"
320 // in Autobahn|Testsuite parlance.
321 return ! fail_fast();
322 }
323 }
324 return true;
325 }
326
327 using utf8_checker = utf8_checker_t<>;
328
329 template<class = void>
330 bool
331 check_utf8(char const* p, std::size_t n)
332 {
333 utf8_checker c;
334 if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
335 return false;
336 return c.finish();
337 }
338
339 } // detail
340 } // websocket
341 } // beast
342 } // boost
343
344 #endif