]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | #ifndef BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_PARSER_HPP |
2 | #define BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_PARSER_HPP | |
3 | ||
4 | #include <boost/property_tree/json_parser/error.hpp> | |
5 | ||
6 | #include <boost/ref.hpp> | |
7 | #include <boost/bind.hpp> | |
8 | #include <boost/format.hpp> | |
9 | ||
10 | #include <iterator> | |
11 | #include <sstream> | |
12 | #include <string> | |
13 | ||
14 | namespace boost { namespace property_tree { | |
15 | namespace json_parser { namespace detail | |
16 | { | |
17 | ||
18 | template <typename Encoding, typename Iterator, typename Sentinel> | |
19 | class source | |
20 | { | |
21 | public: | |
22 | typedef typename std::iterator_traits<Iterator>::value_type | |
23 | code_unit; | |
24 | typedef bool (Encoding::*encoding_predicate)(code_unit c) const; | |
25 | ||
26 | explicit source(Encoding& encoding) : encoding(encoding) {} | |
27 | ||
28 | template <typename Range> | |
29 | void set_input(const std::string& filename, const Range& r) | |
30 | { | |
31 | this->filename = filename; | |
32 | cur = r.begin(); | |
33 | end = r.end(); | |
34 | // Note that there is no backtracking, so if e.g. a UTF-8 file | |
35 | // starts with something that initially looks like a BOM but isn't, | |
36 | // there's trouble. | |
37 | // However, no valid JSON file can start with a UTF-8 EF byte. | |
38 | encoding.skip_introduction(cur, end); | |
39 | line = 1; | |
40 | offset = 0; | |
41 | } | |
42 | ||
43 | bool done() const { return cur == end; } | |
44 | ||
45 | void parse_error(const char* msg) { | |
46 | BOOST_PROPERTY_TREE_THROW( | |
47 | json_parser_error(msg, filename, line)); | |
48 | } | |
49 | ||
50 | void next() { | |
51 | if (encoding.is_nl(*cur)) { | |
52 | ++line; | |
53 | offset = 0; | |
54 | } else { | |
55 | ++offset; | |
56 | } | |
57 | ++cur; | |
58 | } | |
59 | ||
60 | template <typename Action> | |
61 | bool have(encoding_predicate p, Action& a) { | |
62 | bool found = cur != end && (encoding.*p)(*cur); | |
63 | if (found) { | |
64 | a(*cur); | |
65 | next(); | |
66 | } | |
67 | return found; | |
68 | } | |
69 | ||
70 | bool have(encoding_predicate p) { | |
71 | DoNothing n; | |
72 | return have(p, n); | |
73 | } | |
74 | ||
75 | template <typename Action> | |
76 | void expect(encoding_predicate p, const char* msg, Action& a) { | |
77 | if (!have(p, a)) { | |
78 | parse_error(msg); | |
79 | } | |
80 | } | |
81 | ||
82 | void expect(encoding_predicate p, const char* msg) { | |
83 | DoNothing n; | |
84 | expect(p, msg, n); | |
85 | } | |
86 | ||
87 | code_unit need_cur(const char* msg) { | |
88 | if (cur == end) { | |
89 | parse_error(msg); | |
90 | } | |
91 | return *cur; | |
92 | } | |
93 | ||
94 | Iterator& raw_cur() { return cur; } | |
95 | Sentinel raw_end() { return end; } | |
96 | ||
97 | private: | |
98 | struct DoNothing { | |
99 | void operator ()(code_unit) const {} | |
100 | }; | |
101 | ||
102 | Encoding& encoding; | |
103 | Iterator cur; | |
104 | Sentinel end; | |
105 | std::string filename; | |
106 | int line; | |
107 | int offset; | |
108 | }; | |
109 | ||
/// Number action for iterators that can be read more than once.
///
/// This primary template is selected for every iterator category that has
/// no dedicated specialization. It remembers where the number started and,
/// on finish(), converts the whole [start, current) range in one go.
template <typename Callbacks, typename Encoding, typename Iterator,
          typename = typename std::iterator_traits<Iterator>
              ::iterator_category>
class number_callback_adapter
{
    // Not copyable: declared but intentionally never defined.
    number_callback_adapter(const number_callback_adapter&);

    Callbacks& sink_;
    Encoding& enc_;
    Iterator start_;   // snapshot of the cursor taken at construction
    Iterator& pos_;    // live cursor owned by the caller

public:
    /// @param callbacks Receiver of on_number().
    /// @param encoding  Provides to_internal() for the final conversion.
    /// @param cur       Cursor; its current value marks the number start.
    number_callback_adapter(Callbacks& callbacks, Encoding& encoding,
                            Iterator& cur)
        : sink_(callbacks),
          enc_(encoding),
          start_(cur),
          pos_(cur)
    {}

    /// Per-character hook; nothing to do because the range is re-read
    /// as a whole in finish().
    void operator ()(typename Encoding::external_char)
    {
    }

    /// Reports everything between the start snapshot and the cursor's
    /// present position as one number.
    void finish() const
    {
        sink_.on_number(enc_.to_internal(start_, pos_));
    }
};
135 | ||
136 | template <typename Callbacks, typename Encoding, typename Iterator> | |
137 | class number_callback_adapter<Callbacks, Encoding, Iterator, | |
138 | std::input_iterator_tag> | |
139 | { | |
140 | public: | |
141 | number_callback_adapter(Callbacks& callbacks, Encoding& encoding, | |
142 | Iterator&) | |
143 | : callbacks(callbacks), encoding(encoding), first(true) | |
144 | {} | |
145 | ||
146 | void operator ()(typename Encoding::external_char c) { | |
147 | if (first) { | |
148 | callbacks.on_begin_number(); | |
149 | first = false; | |
150 | } | |
151 | callbacks.on_digit(encoding.to_internal_trivial(c)); | |
152 | } | |
153 | ||
154 | void finish() const { | |
155 | callbacks.on_end_number(); | |
156 | } | |
157 | private: | |
158 | number_callback_adapter(const number_callback_adapter&); | |
159 | ||
160 | Callbacks& callbacks; | |
161 | Encoding& encoding; | |
162 | bool first; | |
163 | }; | |
164 | ||
/// String action for iterators that can be read more than once.
///
/// This primary template is selected for every iterator category that has
/// no dedicated specialization. Unescaped text is handled in "runs": the
/// adapter remembers where a run began and hands the whole
/// [run start, cursor) range to the callbacks when the run is finished.
template <typename Callbacks, typename Encoding, typename Iterator,
          typename = typename std::iterator_traits<Iterator>
              ::iterator_category>
class string_callback_adapter
{
    // Not copyable: declared but intentionally never defined.
    string_callback_adapter(const string_callback_adapter&);

    Callbacks& sink_;
    Encoding& enc_;
    Iterator& pos_;        // live cursor owned by the caller
    Iterator run_start_;   // where the current run began

public:
    /// @param callbacks Receiver of on_code_units().
    /// @param encoding  Provides to_internal() and skip_codepoint().
    /// @param cur       Cursor; the first run starts at its current value.
    string_callback_adapter(Callbacks& callbacks, Encoding& encoding,
                            Iterator& cur)
        : sink_(callbacks),
          enc_(encoding),
          pos_(cur),
          run_start_(cur)
    {}

    /// Marks the cursor's present position as the start of a new run.
    void start_run()
    {
        run_start_ = pos_;
    }

    /// Emits the text from the run start up to the cursor.
    void finish_run()
    {
        sink_.on_code_units(enc_.to_internal(run_start_, pos_));
    }

    /// Lets the encoding step the cursor over one codepoint; nothing is
    /// emitted here because the run is converted as a whole later.
    template <typename Sentinel, typename EncodingErrorFn>
    void process_codepoint(Sentinel end, EncodingErrorFn error_fn)
    {
        enc_.skip_codepoint(pos_, end, error_fn);
    }
};
198 | ||
199 | template <typename Callbacks, typename Encoding, typename Iterator> | |
200 | class string_callback_adapter<Callbacks, Encoding, Iterator, | |
201 | std::input_iterator_tag> | |
202 | { | |
203 | public: | |
204 | string_callback_adapter(Callbacks& callbacks, Encoding& encoding, | |
205 | Iterator& cur) | |
206 | : callbacks(callbacks), encoding(encoding), cur(cur) | |
207 | {} | |
208 | ||
209 | void start_run() {} | |
210 | ||
211 | void finish_run() {} | |
212 | ||
213 | template <typename Sentinel, typename EncodingErrorFn> | |
214 | void process_codepoint(Sentinel end, EncodingErrorFn error_fn) { | |
215 | encoding.transcode_codepoint(cur, end, | |
216 | boost::bind(&Callbacks::on_code_unit, | |
217 | boost::ref(callbacks), _1), | |
218 | error_fn); | |
219 | } | |
220 | ||
221 | private: | |
222 | string_callback_adapter(const string_callback_adapter&); | |
223 | ||
224 | Callbacks& callbacks; | |
225 | Encoding& encoding; | |
226 | Iterator& cur; | |
227 | }; | |
228 | ||
229 | template <typename Callbacks, typename Encoding, typename Iterator, | |
230 | typename Sentinel> | |
231 | class parser | |
232 | { | |
233 | typedef detail::number_callback_adapter<Callbacks, Encoding, Iterator> | |
234 | number_adapter; | |
235 | typedef detail::string_callback_adapter<Callbacks, Encoding, Iterator> | |
236 | string_adapter; | |
237 | typedef detail::source<Encoding, Iterator, Sentinel> source; | |
238 | typedef typename source::code_unit code_unit; | |
239 | ||
240 | public: | |
241 | parser(Callbacks& callbacks, Encoding& encoding) | |
242 | : callbacks(callbacks), encoding(encoding), src(encoding) | |
243 | {} | |
244 | ||
245 | template <typename Range> | |
246 | void set_input(const std::string& filename, const Range& r) { | |
247 | src.set_input(filename, r); | |
248 | } | |
249 | ||
250 | void finish() { | |
251 | skip_ws(); | |
252 | if (!src.done()) { | |
253 | parse_error("garbage after data"); | |
254 | } | |
255 | } | |
256 | ||
257 | void parse_value() { | |
258 | if (parse_object()) return; | |
259 | if (parse_array()) return; | |
260 | if (parse_string()) return; | |
261 | if (parse_boolean()) return; | |
262 | if (parse_null()) return; | |
263 | if (parse_number()) return; | |
264 | parse_error("expected value"); | |
265 | } | |
266 | ||
267 | bool parse_null() { | |
268 | skip_ws(); | |
269 | if (!have(&Encoding::is_n)) { | |
270 | return false; | |
271 | } | |
272 | expect(&Encoding::is_u, "expected 'null'"); | |
273 | expect(&Encoding::is_l, "expected 'null'"); | |
274 | expect(&Encoding::is_l, "expected 'null'"); | |
275 | callbacks.on_null(); | |
276 | return true; | |
277 | } | |
278 | ||
279 | bool parse_boolean() { | |
280 | skip_ws(); | |
281 | if (have(&Encoding::is_t)) { | |
282 | expect(&Encoding::is_r, "expected 'true'"); | |
283 | expect(&Encoding::is_u, "expected 'true'"); | |
284 | expect(&Encoding::is_e, "expected 'true'"); | |
285 | callbacks.on_boolean(true); | |
286 | return true; | |
287 | } | |
288 | if (have(&Encoding::is_f)) { | |
289 | expect(&Encoding::is_a, "expected 'false'"); | |
290 | expect(&Encoding::is_l, "expected 'false'"); | |
291 | expect(&Encoding::is_s, "expected 'false'"); | |
292 | expect(&Encoding::is_e, "expected 'false'"); | |
293 | callbacks.on_boolean(false); | |
294 | return true; | |
295 | } | |
296 | return false; | |
297 | } | |
298 | ||
299 | bool parse_number() { | |
300 | skip_ws(); | |
301 | ||
302 | number_adapter adapter(callbacks, encoding, src.raw_cur()); | |
303 | bool started = false; | |
304 | if (have(&Encoding::is_minus, adapter)) { | |
305 | started = true; | |
306 | } | |
307 | if (!have(&Encoding::is_0, adapter) && !parse_int_part(adapter)) { | |
308 | if (started) { | |
309 | parse_error("expected digits after -"); | |
310 | } | |
311 | return false; | |
312 | } | |
313 | parse_frac_part(adapter); | |
314 | parse_exp_part(adapter); | |
315 | adapter.finish(); | |
316 | return true; | |
317 | } | |
318 | ||
319 | bool parse_string() { | |
320 | skip_ws(); | |
321 | ||
322 | if (!have(&Encoding::is_quote)) { | |
323 | return false; | |
324 | } | |
325 | ||
326 | callbacks.on_begin_string(); | |
327 | string_adapter adapter(callbacks, encoding, src.raw_cur()); | |
328 | while (!encoding.is_quote(need_cur("unterminated string"))) { | |
329 | if (encoding.is_backslash(*src.raw_cur())) { | |
330 | adapter.finish_run(); | |
331 | next(); | |
332 | parse_escape(); | |
333 | adapter.start_run(); | |
334 | } else { | |
335 | adapter.process_codepoint(src.raw_end(), | |
336 | boost::bind(&parser::parse_error, | |
337 | this, "invalid code sequence")); | |
338 | } | |
339 | } | |
340 | adapter.finish_run(); | |
341 | callbacks.on_end_string(); | |
342 | next(); | |
343 | return true; | |
344 | } | |
345 | ||
346 | bool parse_array() { | |
347 | skip_ws(); | |
348 | ||
349 | if (!have(&Encoding::is_open_bracket)) { | |
350 | return false; | |
351 | } | |
352 | ||
353 | callbacks.on_begin_array(); | |
354 | skip_ws(); | |
355 | if (have(&Encoding::is_close_bracket)) { | |
356 | callbacks.on_end_array(); | |
357 | return true; | |
358 | } | |
359 | do { | |
360 | parse_value(); | |
361 | skip_ws(); | |
362 | } while (have(&Encoding::is_comma)); | |
363 | expect(&Encoding::is_close_bracket, "expected ']' or ','"); | |
364 | callbacks.on_end_array(); | |
365 | return true; | |
366 | } | |
367 | ||
368 | bool parse_object() { | |
369 | skip_ws(); | |
370 | ||
371 | if (!have(&Encoding::is_open_brace)) { | |
372 | return false; | |
373 | } | |
374 | ||
375 | callbacks.on_begin_object(); | |
376 | skip_ws(); | |
377 | if (have(&Encoding::is_close_brace)) { | |
378 | callbacks.on_end_object(); | |
379 | return true; | |
380 | } | |
381 | do { | |
382 | if (!parse_string()) { | |
383 | parse_error("expected key string"); | |
384 | } | |
385 | skip_ws(); | |
386 | expect(&Encoding::is_colon, "expected ':'"); | |
387 | parse_value(); | |
388 | skip_ws(); | |
389 | } while (have(&Encoding::is_comma)); | |
390 | expect(&Encoding::is_close_brace, "expected '}' or ','"); | |
391 | callbacks.on_end_object(); | |
392 | return true; | |
393 | } | |
394 | ||
395 | private: | |
396 | typedef typename source::encoding_predicate encoding_predicate; | |
397 | ||
398 | void parse_error(const char* msg) { src.parse_error(msg); } | |
399 | void next() { src.next(); } | |
400 | template <typename Action> | |
401 | bool have(encoding_predicate p, Action& a) { return src.have(p, a); } | |
402 | bool have(encoding_predicate p) { return src.have(p); } | |
403 | template <typename Action> | |
404 | void expect(encoding_predicate p, const char* msg, Action& a) { | |
405 | src.expect(p, msg, a); | |
406 | } | |
407 | void expect(encoding_predicate p, const char* msg) { | |
408 | src.expect(p, msg); | |
409 | } | |
410 | code_unit need_cur(const char* msg) { return src.need_cur(msg); } | |
411 | ||
412 | void skip_ws() { | |
413 | while (have(&Encoding::is_ws)) { | |
414 | } | |
415 | } | |
416 | ||
417 | bool parse_int_part(number_adapter& action) { | |
418 | if (!have(&Encoding::is_digit0, action)) { | |
419 | return false; | |
420 | } | |
421 | parse_digits(action); | |
422 | return true; | |
423 | } | |
424 | ||
425 | void parse_frac_part(number_adapter& action) { | |
426 | if (!have(&Encoding::is_dot, action)) { | |
427 | return; | |
428 | } | |
429 | expect(&Encoding::is_digit, "need at least one digit after '.'", | |
430 | action); | |
431 | parse_digits(action); | |
432 | } | |
433 | ||
434 | void parse_exp_part(number_adapter& action) { | |
435 | if (!have(&Encoding::is_eE, action)) { | |
436 | return; | |
437 | } | |
438 | have(&Encoding::is_plusminus, action); | |
439 | expect(&Encoding::is_digit, "need at least one digit in exponent", | |
440 | action); | |
441 | parse_digits(action); | |
442 | } | |
443 | ||
444 | void parse_digits(number_adapter& action) { | |
445 | while (have(&Encoding::is_digit, action)) { | |
446 | } | |
447 | } | |
448 | ||
449 | void parse_escape() { | |
450 | if (have(&Encoding::is_quote)) { | |
451 | feed(0x22); | |
452 | } else if (have(&Encoding::is_backslash)) { | |
453 | feed(0x5c); | |
454 | } else if (have(&Encoding::is_slash)) { | |
455 | feed(0x2f); | |
456 | } else if (have(&Encoding::is_b)) { | |
457 | feed(0x08); // backspace | |
458 | } else if (have(&Encoding::is_f)) { | |
459 | feed(0x0c); // formfeed | |
460 | } else if (have(&Encoding::is_n)) { | |
461 | feed(0x0a); // line feed | |
462 | } else if (have(&Encoding::is_r)) { | |
463 | feed(0x0d); // carriage return | |
464 | } else if (have(&Encoding::is_t)) { | |
465 | feed(0x09); // horizontal tab | |
466 | } else if (have(&Encoding::is_u)) { | |
467 | parse_codepoint_ref(); | |
468 | } else { | |
469 | parse_error("invalid escape sequence"); | |
470 | } | |
471 | } | |
472 | ||
473 | unsigned parse_hex_quad() { | |
474 | unsigned codepoint = 0; | |
475 | for (int i = 0; i < 4; ++i) { | |
476 | int value = encoding.decode_hexdigit( | |
477 | need_cur("invalid escape sequence")); | |
478 | if (value < 0) { | |
479 | parse_error("invalid escape sequence"); | |
480 | } | |
481 | codepoint *= 16; | |
482 | codepoint += value; | |
483 | next(); | |
484 | } | |
485 | return codepoint; | |
486 | } | |
487 | ||
488 | static bool is_surrogate_high(unsigned codepoint) { | |
489 | return (codepoint & 0xfc00) == 0xd800; | |
490 | } | |
491 | static bool is_surrogate_low(unsigned codepoint) { | |
492 | return (codepoint & 0xfc00) == 0xdc00; | |
493 | } | |
494 | static unsigned combine_surrogates(unsigned high, unsigned low) { | |
495 | return 0x010000 + (((high & 0x3ff) << 10) | (low & 0x3ff)); | |
496 | } | |
497 | ||
498 | void parse_codepoint_ref() { | |
499 | unsigned codepoint = parse_hex_quad(); | |
500 | if (is_surrogate_low(codepoint)) { | |
501 | parse_error("invalid codepoint, stray low surrogate"); | |
502 | } | |
503 | if (is_surrogate_high(codepoint)) { | |
504 | expect(&Encoding::is_backslash, | |
505 | "invalid codepoint, stray high surrogate"); | |
506 | expect(&Encoding::is_u, | |
507 | "expected codepoint reference after high surrogate"); | |
508 | int low = parse_hex_quad(); | |
509 | if (!is_surrogate_low(low)) { | |
510 | parse_error("expected low surrogate after high surrogate"); | |
511 | } | |
512 | codepoint = combine_surrogates(codepoint, low); | |
513 | } | |
514 | feed(codepoint); | |
515 | } | |
516 | ||
517 | void feed(unsigned codepoint) { | |
518 | encoding.feed_codepoint(codepoint, | |
519 | boost::bind(&Callbacks::on_code_unit, | |
520 | boost::ref(callbacks), _1)); | |
521 | } | |
522 | ||
523 | Callbacks& callbacks; | |
524 | Encoding& encoding; | |
525 | source src; | |
526 | }; | |
527 | ||
528 | }}}} | |
529 | ||
530 | #endif |