]>
Commit | Line | Data |
---|---|---|
31f18b77 FG |
1 | // Tencent is pleased to support the open source community by making RapidJSON available. |
2 | // | |
3 | // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. | |
4 | // | |
5 | // Licensed under the MIT License (the "License"); you may not use this file except | |
6 | // in compliance with the License. You may obtain a copy of the License at | |
7 | // | |
8 | // http://opensource.org/licenses/MIT | |
9 | // | |
10 | // Unless required by applicable law or agreed to in writing, software distributed | |
11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |
12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
13 | // specific language governing permissions and limitations under the License. | |
14 | ||
15 | #ifndef RAPIDJSON_ENCODEDSTREAM_H_ | |
16 | #define RAPIDJSON_ENCODEDSTREAM_H_ | |
17 | ||
18 | #include "stream.h" | |
19 | #include "memorystream.h" | |
20 | ||
21 | #ifdef __GNUC__ | |
22 | RAPIDJSON_DIAG_PUSH | |
23 | RAPIDJSON_DIAG_OFF(effc++) | |
24 | #endif | |
25 | ||
26 | #ifdef __clang__ | |
27 | RAPIDJSON_DIAG_PUSH | |
28 | RAPIDJSON_DIAG_OFF(padded) | |
29 | #endif | |
30 | ||
31 | RAPIDJSON_NAMESPACE_BEGIN | |
32 | ||
33 | //! Input byte stream wrapper with a statically bound encoding. | |
34 | /*! | |
35 | \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. | |
36 | \tparam InputByteStream Type of input byte stream. For example, FileReadStream. | |
37 | */ | |
38 | template <typename Encoding, typename InputByteStream> | |
39 | class EncodedInputStream { | |
40 | RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); | |
41 | public: | |
42 | typedef typename Encoding::Ch Ch; | |
43 | ||
44 | EncodedInputStream(InputByteStream& is) : is_(is) { | |
45 | current_ = Encoding::TakeBOM(is_); | |
46 | } | |
47 | ||
48 | Ch Peek() const { return current_; } | |
49 | Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; } | |
50 | size_t Tell() const { return is_.Tell(); } | |
51 | ||
52 | // Not implemented | |
53 | void Put(Ch) { RAPIDJSON_ASSERT(false); } | |
54 | void Flush() { RAPIDJSON_ASSERT(false); } | |
55 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } | |
56 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } | |
57 | ||
58 | private: | |
59 | EncodedInputStream(const EncodedInputStream&); | |
60 | EncodedInputStream& operator=(const EncodedInputStream&); | |
61 | ||
62 | InputByteStream& is_; | |
63 | Ch current_; | |
64 | }; | |
65 | ||
66 | //! Specialized for UTF8 MemoryStream. | |
67 | template <> | |
68 | class EncodedInputStream<UTF8<>, MemoryStream> { | |
69 | public: | |
70 | typedef UTF8<>::Ch Ch; | |
71 | ||
72 | EncodedInputStream(MemoryStream& is) : is_(is) { | |
73 | if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take(); | |
74 | if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take(); | |
75 | if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take(); | |
76 | } | |
77 | Ch Peek() const { return is_.Peek(); } | |
78 | Ch Take() { return is_.Take(); } | |
79 | size_t Tell() const { return is_.Tell(); } | |
80 | ||
81 | // Not implemented | |
82 | void Put(Ch) {} | |
83 | void Flush() {} | |
84 | Ch* PutBegin() { return 0; } | |
85 | size_t PutEnd(Ch*) { return 0; } | |
86 | ||
87 | MemoryStream& is_; | |
88 | ||
89 | private: | |
90 | EncodedInputStream(const EncodedInputStream&); | |
91 | EncodedInputStream& operator=(const EncodedInputStream&); | |
92 | }; | |
93 | ||
94 | //! Output byte stream wrapper with statically bound encoding. | |
95 | /*! | |
96 | \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. | |
97 | \tparam OutputByteStream Type of input byte stream. For example, FileWriteStream. | |
98 | */ | |
99 | template <typename Encoding, typename OutputByteStream> | |
100 | class EncodedOutputStream { | |
101 | RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); | |
102 | public: | |
103 | typedef typename Encoding::Ch Ch; | |
104 | ||
105 | EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) { | |
106 | if (putBOM) | |
107 | Encoding::PutBOM(os_); | |
108 | } | |
109 | ||
110 | void Put(Ch c) { Encoding::Put(os_, c); } | |
111 | void Flush() { os_.Flush(); } | |
112 | ||
113 | // Not implemented | |
114 | Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;} | |
115 | Ch Take() { RAPIDJSON_ASSERT(false); return 0;} | |
116 | size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } | |
117 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } | |
118 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } | |
119 | ||
120 | private: | |
121 | EncodedOutputStream(const EncodedOutputStream&); | |
122 | EncodedOutputStream& operator=(const EncodedOutputStream&); | |
123 | ||
124 | OutputByteStream& os_; | |
125 | }; | |
126 | ||
// Expands to the static member x of every supported UTF encoding, as a comma-separated
// list. The order matters: the resulting tables are indexed by a UTFType value.
#define RAPIDJSON_ENCODINGS_FUNC(x) \
    UTF8<Ch>::x, \
    UTF16LE<Ch>::x, \
    UTF16BE<Ch>::x, \
    UTF32LE<Ch>::x, \
    UTF32BE<Ch>::x
128 | ||
129 | //! Input stream wrapper with dynamically bound encoding and automatic encoding detection. | |
130 | /*! | |
131 | \tparam CharType Type of character for reading. | |
132 | \tparam InputByteStream type of input byte stream to be wrapped. | |
133 | */ | |
134 | template <typename CharType, typename InputByteStream> | |
135 | class AutoUTFInputStream { | |
136 | RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); | |
137 | public: | |
138 | typedef CharType Ch; | |
139 | ||
140 | //! Constructor. | |
141 | /*! | |
142 | \param is input stream to be wrapped. | |
143 | \param type UTF encoding type if it is not detected from the stream. | |
144 | */ | |
145 | AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) { | |
146 | RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); | |
147 | DetectType(); | |
148 | static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) }; | |
149 | takeFunc_ = f[type_]; | |
150 | current_ = takeFunc_(*is_); | |
151 | } | |
152 | ||
153 | UTFType GetType() const { return type_; } | |
154 | bool HasBOM() const { return hasBOM_; } | |
155 | ||
156 | Ch Peek() const { return current_; } | |
157 | Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; } | |
158 | size_t Tell() const { return is_->Tell(); } | |
159 | ||
160 | // Not implemented | |
161 | void Put(Ch) { RAPIDJSON_ASSERT(false); } | |
162 | void Flush() { RAPIDJSON_ASSERT(false); } | |
163 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } | |
164 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } | |
165 | ||
166 | private: | |
167 | AutoUTFInputStream(const AutoUTFInputStream&); | |
168 | AutoUTFInputStream& operator=(const AutoUTFInputStream&); | |
169 | ||
170 | // Detect encoding type with BOM or RFC 4627 | |
171 | void DetectType() { | |
172 | // BOM (Byte Order Mark): | |
173 | // 00 00 FE FF UTF-32BE | |
174 | // FF FE 00 00 UTF-32LE | |
175 | // FE FF UTF-16BE | |
176 | // FF FE UTF-16LE | |
177 | // EF BB BF UTF-8 | |
178 | ||
179 | const unsigned char* c = reinterpret_cast<const unsigned char *>(is_->Peek4()); | |
180 | if (!c) | |
181 | return; | |
182 | ||
183 | unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24)); | |
184 | hasBOM_ = false; | |
185 | if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } | |
186 | else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } | |
187 | else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); } | |
188 | else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); } | |
189 | else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); } | |
190 | ||
191 | // RFC 4627: Section 3 | |
192 | // "Since the first two characters of a JSON text will always be ASCII | |
193 | // characters [RFC0020], it is possible to determine whether an octet | |
194 | // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking | |
195 | // at the pattern of nulls in the first four octets." | |
196 | // 00 00 00 xx UTF-32BE | |
197 | // 00 xx 00 xx UTF-16BE | |
198 | // xx 00 00 00 UTF-32LE | |
199 | // xx 00 xx 00 UTF-16LE | |
200 | // xx xx xx xx UTF-8 | |
201 | ||
202 | if (!hasBOM_) { | |
203 | unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0); | |
204 | switch (pattern) { | |
205 | case 0x08: type_ = kUTF32BE; break; | |
206 | case 0x0A: type_ = kUTF16BE; break; | |
207 | case 0x01: type_ = kUTF32LE; break; | |
208 | case 0x05: type_ = kUTF16LE; break; | |
209 | case 0x0F: type_ = kUTF8; break; | |
210 | default: break; // Use type defined by user. | |
211 | } | |
212 | } | |
213 | ||
214 | // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. | |
215 | if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); | |
216 | if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); | |
217 | } | |
218 | ||
219 | typedef Ch (*TakeFunc)(InputByteStream& is); | |
220 | InputByteStream* is_; | |
221 | UTFType type_; | |
222 | Ch current_; | |
223 | TakeFunc takeFunc_; | |
224 | bool hasBOM_; | |
225 | }; | |
226 | ||
227 | //! Output stream wrapper with dynamically bound encoding and automatic encoding detection. | |
228 | /*! | |
229 | \tparam CharType Type of character for writing. | |
230 | \tparam OutputByteStream type of output byte stream to be wrapped. | |
231 | */ | |
232 | template <typename CharType, typename OutputByteStream> | |
233 | class AutoUTFOutputStream { | |
234 | RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); | |
235 | public: | |
236 | typedef CharType Ch; | |
237 | ||
238 | //! Constructor. | |
239 | /*! | |
240 | \param os output stream to be wrapped. | |
241 | \param type UTF encoding type. | |
242 | \param putBOM Whether to write BOM at the beginning of the stream. | |
243 | */ | |
244 | AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) { | |
245 | RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); | |
246 | ||
247 | // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. | |
248 | if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); | |
249 | if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); | |
250 | ||
251 | static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) }; | |
252 | putFunc_ = f[type_]; | |
253 | ||
254 | if (putBOM) | |
255 | PutBOM(); | |
256 | } | |
257 | ||
258 | UTFType GetType() const { return type_; } | |
259 | ||
260 | void Put(Ch c) { putFunc_(*os_, c); } | |
261 | void Flush() { os_->Flush(); } | |
262 | ||
263 | // Not implemented | |
264 | Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;} | |
265 | Ch Take() { RAPIDJSON_ASSERT(false); return 0;} | |
266 | size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } | |
267 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } | |
268 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } | |
269 | ||
270 | private: | |
271 | AutoUTFOutputStream(const AutoUTFOutputStream&); | |
272 | AutoUTFOutputStream& operator=(const AutoUTFOutputStream&); | |
273 | ||
274 | void PutBOM() { | |
275 | typedef void (*PutBOMFunc)(OutputByteStream&); | |
276 | static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) }; | |
277 | f[type_](*os_); | |
278 | } | |
279 | ||
280 | typedef void (*PutFunc)(OutputByteStream&, Ch); | |
281 | ||
282 | OutputByteStream* os_; | |
283 | UTFType type_; | |
284 | PutFunc putFunc_; | |
285 | }; | |
286 | ||
287 | #undef RAPIDJSON_ENCODINGS_FUNC | |
288 | ||
289 | RAPIDJSON_NAMESPACE_END | |
290 | ||
291 | #ifdef __clang__ | |
292 | RAPIDJSON_DIAG_POP | |
293 | #endif | |
294 | ||
295 | #ifdef __GNUC__ | |
296 | RAPIDJSON_DIAG_POP | |
297 | #endif | |
298 | ||
299 | #endif // RAPIDJSON_ENCODEDSTREAM_H_ |