]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // |
2 | // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | // | |
7 | // Official repository: https://github.com/boostorg/json | |
8 | // | |
9 | ||
10 | #ifndef BOOST_JSON_DETAIL_UTF8_HPP | |
11 | #define BOOST_JSON_DETAIL_UTF8_HPP | |
12 | ||
13 | #include <cstddef> | |
14 | #include <cstring> | |
15 | #include <cstdint> | |
16 | ||
17 | BOOST_JSON_NS_BEGIN | |
18 | namespace detail { | |
19 | ||
// Load the first N bytes at `p` as a little-endian unsigned integer.
//
// N must be in [1, 4]. Exactly N bytes are read from `p`; the bytes of
// the result above the N-th are always zero.
template<int N>
std::uint32_t
load_little_endian(void const* p)
{
    static_assert(
        N >= 1 && N <= 4,
        "load_little_endian: N must be in the range [1, 4]");

    // Zero-initialize so that the bytes not written by memcpy are
    // well defined when N < 4, instead of indeterminate.
    std::uint32_t v = 0;
    std::memcpy(&v, p, N);
#ifdef BOOST_JSON_BIG_ENDIAN
    // memcpy filled the N most significant bytes; reversing the byte
    // order moves them to the least significant positions.
    v = ((v & 0xFF000000) >> 24) |
        ((v & 0x00FF0000) >>  8) |
        ((v & 0x0000FF00) <<  8) |
        ((v & 0x000000FF) << 24);
#endif
    return v;
}
35 | ||
// Classify the lead byte of a multi-byte UTF-8 sequence.
//
// Only the low 7 bits of `c` select the table entry, so the lead byte
// may be passed as-is or already masked with 0x7F. Table entry `i`
// describes lead byte `0x80 + i`; the result is therefore only
// meaningful for lead bytes that have the high bit set.
//
// Returns a packed value: the low byte is the total sequence length in
// bytes and the high byte is the validation class consumed by the
// switch in is_valid_utf8(). Zero means the byte cannot start a
// sequence.
inline
std::uint16_t
classify_utf8(char c)
{
    // 0x000 = invalid
    // 0x102 = 2 bytes, second byte [80, BF]
    // 0x203 = 3 bytes, second byte [A0, BF]
    // 0x303 = 3 bytes, second byte [80, BF]
    // 0x403 = 3 bytes, second byte [80, 9F]
    // 0x504 = 4 bytes, second byte [90, BF]
    // 0x604 = 4 bytes, second byte [80, BF]
    // 0x704 = 4 bytes, second byte [80, 8F]
    static constexpr std::uint16_t first[128]
    {
        // 0x80 - 0xBF
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

        // 0xC0 - 0xFF
        0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
        0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
        0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    };
    // The table has 128 entries, but an unsigned char ranges over
    // 0..255: mask the index so an unmasked lead byte cannot read
    // past the end of the table. Already-masked input is unaffected.
    return first[static_cast<unsigned char>(c) & 0x7F];
}
70 | ||
71 | inline | |
72 | bool | |
73 | is_valid_utf8(const char* p, uint16_t first) | |
74 | { | |
75 | uint32_t v; | |
76 | switch(first >> 8) | |
77 | { | |
78 | default: | |
79 | return false; | |
80 | ||
81 | // 2 bytes, second byte [80, BF] | |
82 | case 1: | |
83 | v = load_little_endian<2>(p); | |
84 | return (v & 0xC000) == 0x8000; | |
85 | ||
86 | // 3 bytes, second byte [A0, BF] | |
87 | case 2: | |
88 | v = load_little_endian<3>(p); | |
89 | std::memcpy(&v, p, 3); | |
90 | return (v & 0xC0E000) == 0x80A000; | |
91 | ||
92 | // 3 bytes, second byte [80, BF] | |
93 | case 3: | |
94 | v = load_little_endian<3>(p); | |
95 | return (v & 0xC0C000) == 0x808000; | |
96 | ||
97 | // 3 bytes, second byte [80, 9F] | |
98 | case 4: | |
99 | v = load_little_endian<3>(p); | |
100 | return (v & 0xC0E000) == 0x808000; | |
101 | ||
102 | // 4 bytes, second byte [90, BF] | |
103 | case 5: | |
104 | v = load_little_endian<4>(p); | |
105 | return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00; | |
106 | ||
107 | // 4 bytes, second byte [80, BF] | |
108 | case 6: | |
109 | v = load_little_endian<4>(p); | |
110 | return (v & 0xC0C0C000) == 0x80808000; | |
111 | ||
112 | // 4 bytes, second byte [80, 8F] | |
113 | case 7: | |
114 | v = load_little_endian<4>(p); | |
115 | return (v & 0xC0C0F000) == 0x80808000; | |
116 | } | |
117 | } | |
118 | ||
119 | class utf8_sequence | |
120 | { | |
121 | char seq_[4]; | |
122 | uint16_t first_; | |
123 | uint8_t size_; | |
124 | ||
125 | public: | |
126 | void | |
127 | save( | |
128 | const char* p, | |
129 | std::size_t remain) noexcept | |
130 | { | |
131 | first_ = classify_utf8(*p & 0x7F); | |
132 | if(remain >= length()) | |
133 | size_ = length(); | |
134 | else | |
135 | size_ = static_cast<uint8_t>(remain); | |
136 | std::memcpy(seq_, p, size_); | |
137 | } | |
138 | ||
139 | uint8_t | |
140 | length() const noexcept | |
141 | { | |
142 | return first_ & 0xFF; | |
143 | } | |
144 | ||
145 | bool | |
146 | complete() const noexcept | |
147 | { | |
148 | return size_ >= length(); | |
149 | } | |
150 | ||
151 | // returns true if complete | |
152 | bool | |
153 | append( | |
154 | const char* p, | |
155 | std::size_t remain) noexcept | |
156 | { | |
157 | if(BOOST_JSON_UNLIKELY(needed() == 0)) | |
158 | return true; | |
159 | if(BOOST_JSON_LIKELY(remain >= needed())) | |
160 | { | |
161 | std::memcpy( | |
162 | seq_ + size_, p, needed()); | |
163 | size_ = length(); | |
164 | return true; | |
165 | } | |
166 | if(BOOST_JSON_LIKELY(remain > 0)) | |
167 | { | |
168 | std::memcpy(seq_ + size_, p, remain); | |
169 | size_ += static_cast<uint8_t>(remain); | |
170 | } | |
171 | return false; | |
172 | } | |
173 | ||
174 | const char* | |
175 | data() const noexcept | |
176 | { | |
177 | return seq_; | |
178 | } | |
179 | ||
180 | uint8_t | |
181 | needed() const noexcept | |
182 | { | |
183 | return length() - size_; | |
184 | } | |
185 | ||
186 | bool | |
187 | valid() const noexcept | |
188 | { | |
189 | BOOST_ASSERT(size_ >= length()); | |
190 | return is_valid_utf8(seq_, first_); | |
191 | } | |
192 | }; | |
193 | ||
194 | } // detail | |
195 | BOOST_JSON_NS_END | |
196 | ||
197 | #endif |