]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/third-party/fbson/FbsonJsonParser.h
build: use dgit for download target
[ceph.git] / ceph / src / rocksdb / third-party / fbson / FbsonJsonParser.h
CommitLineData
11fdf7f2
TL
1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6/*
7 * This file defines FbsonJsonParserT (template) and FbsonJsonParser.
8 *
9 * FbsonJsonParserT is a template class which implements a JSON parser.
10 * FbsonJsonParserT parses JSON text, and serializes it to FBSON binary format
11 * by using FbsonWriterT object. By default, FbsonJsonParserT creates a new
12 * FbsonWriterT object with an output stream object. However, you can also
13 * pass in your FbsonWriterT or any stream object that implements some basic
14 * interface of std::ostream (see FbsonStream.h).
15 *
16 * FbsonJsonParser specializes FbsonJsonParserT with FbsonOutStream type (see
17 * FbsonStream.h). So unless you want to provide your own output stream
18 * type, use the FbsonJsonParser object.
19 *
20 * ** Parsing JSON **
21 * FbsonJsonParserT parses JSON string, and directly serializes into FBSON
22 * packed bytes. There are three ways to parse a JSON string: (1) using
23 * c-string, (2) using string with len, (3) using std::istream object. You can
24 * use a custom streambuf to redirect the input. FbsonInBuffer is the streambuf
25 * used internally when the input is a raw character buffer.
26 *
27 * You can reuse an FbsonJsonParserT object to parse/serialize multiple JSON
28 * strings, and the previous FBSON will be overwritten.
29 *
30 * If parsing fails (returned false), the error code will be set to one of
31 * FbsonErrType, and can be retrieved by calling getErrorCode().
32 *
33 * ** External dictionary **
11fdf7f2 34 * During parsing a JSON string, you can pass a callback function to map a key
7c673cae
FG
35 * string to an id, and store the dictionary id in FBSON to save space. The
36 * purpose of using an external dictionary is more towards a collection of
37 * documents (which has common keys) rather than a single document, so that
38 * space saving will be significant.
39 *
40 * ** Endianness **
41 * Note: FBSON serialization doesn't assume endianness of the server. However
42 * you will need to ensure that the endianness at the reader side is the same
43 * as that at the writer side (if they are on different machines). Otherwise,
44 * proper conversion is needed when a number value is returned to the
45 * caller/writer.
46 *
47 * @author Tian Xia <tianx@fb.com>
48 */
49
11fdf7f2 50#pragma once
7c673cae
FG
51
52#include <cmath>
53#include <limits>
54#include "FbsonDocument.h"
55#include "FbsonWriter.h"
56
57namespace fbson {
58
// Characters that terminate a numeric token: space, comma, the closing
// bracket/brace of the enclosing container, and tab / CR / LF.
constexpr const char* kJsonDelim = " ,]}\t\r\n";

// Characters skipped as insignificant whitespace between tokens.
constexpr const char* kWhiteSpace = " \t\n\r";
61
62/*
63 * Error codes
64 */
enum class FbsonErrType {
  // No error recorded.
  E_NONE = 0,
  // Not set anywhere in this file; presumably a version mismatch reported
  // by other FBSON components -- TODO confirm.
  E_INVALID_VER,
  // Input pointer was null or the input length was zero.
  E_EMPTY_STR,
  // A call into the writer reported failure.
  E_OUTPUT_FAIL,
  // Top-level value is neither an object nor an array, or extra text
  // follows the document.
  E_INVALID_DOCU,
  // Malformed null / true / false literal.
  E_INVALID_VALUE,
  // Key is not quoted, is empty, is too long, or is unterminated.
  E_INVALID_KEY,
  // String value is missing its closing quote.
  E_INVALID_STR,
  // Object is missing ':' after a key or has a bad member separator.
  E_INVALID_OBJ,
  // Array has a bad element separator or is unterminated.
  E_INVALID_ARR,
  // Non-hex character inside a hex literal.
  E_INVALID_HEX,
  // Non-octal character inside an octal literal.
  E_INVALID_OCTAL,
  // Unexpected character inside a decimal number.
  E_INVALID_DECIMAL,
  // Non-digit character inside an exponent.
  E_INVALID_EXPONENT,
  // Hex literal has more than 16 digits.
  E_HEX_OVERFLOW,
  // Octal literal does not fit in int64_t.
  E_OCTAL_OVERFLOW,
  // Integer does not fit in int64_t and has no fraction/exponent part.
  E_DECIMAL_OVERFLOW,
  // Floating point result is NaN or infinite.
  E_DOUBLE_OVERFLOW,
  // Exponent magnitude exceeds 308.
  E_EXPONENT_OVERFLOW,
};
86
87/*
88 * Template FbsonJsonParserT
89 */
90template <class OS_TYPE>
91class FbsonJsonParserT {
92 public:
93 FbsonJsonParserT() : err_(FbsonErrType::E_NONE) {}
94
95 explicit FbsonJsonParserT(OS_TYPE& os)
96 : writer_(os), err_(FbsonErrType::E_NONE) {}
97
98 // parse a UTF-8 JSON string
99 bool parse(const std::string& str, hDictInsert handler = nullptr) {
100 return parse(str.c_str(), (unsigned int)str.size(), handler);
101 }
102
103 // parse a UTF-8 JSON c-style string (NULL terminated)
104 bool parse(const char* c_str, hDictInsert handler = nullptr) {
105 return parse(c_str, (unsigned int)strlen(c_str), handler);
106 }
107
108 // parse a UTF-8 JSON string with length
109 bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) {
110 if (!pch || len == 0) {
111 err_ = FbsonErrType::E_EMPTY_STR;
112 return false;
113 }
114
115 FbsonInBuffer sb(pch, len);
116 std::istream in(&sb);
117 return parse(in, handler);
118 }
119
120 // parse UTF-8 JSON text from an input stream
121 bool parse(std::istream& in, hDictInsert handler = nullptr) {
122 bool res = false;
123
124 // reset output stream
125 writer_.reset();
126
127 trim(in);
128
129 if (in.peek() == '{') {
130 in.ignore();
131 res = parseObject(in, handler);
132 } else if (in.peek() == '[') {
133 in.ignore();
134 res = parseArray(in, handler);
135 } else {
136 err_ = FbsonErrType::E_INVALID_DOCU;
137 }
138
139 trim(in);
140 if (res && !in.eof()) {
141 err_ = FbsonErrType::E_INVALID_DOCU;
142 return false;
143 }
144
145 return res;
146 }
147
148 FbsonWriterT<OS_TYPE>& getWriter() { return writer_; }
149
150 FbsonErrType getErrorCode() { return err_; }
151
152 // clear error code
153 void clearErr() { err_ = FbsonErrType::E_NONE; }
154
155 private:
156 // parse a JSON object (comma-separated list of key-value pairs)
157 bool parseObject(std::istream& in, hDictInsert handler) {
158 if (!writer_.writeStartObject()) {
159 err_ = FbsonErrType::E_OUTPUT_FAIL;
160 return false;
161 }
162
163 trim(in);
164
165 if (in.peek() == '}') {
166 in.ignore();
167 // empty object
168 if (!writer_.writeEndObject()) {
169 err_ = FbsonErrType::E_OUTPUT_FAIL;
170 return false;
171 }
172 return true;
173 }
174
175 while (in.good()) {
176 if (in.get() != '"') {
177 err_ = FbsonErrType::E_INVALID_KEY;
178 return false;
179 }
180
181 if (!parseKVPair(in, handler)) {
182 return false;
183 }
184
185 trim(in);
186
187 char ch = in.get();
188 if (ch == '}') {
189 // end of the object
190 if (!writer_.writeEndObject()) {
191 err_ = FbsonErrType::E_OUTPUT_FAIL;
192 return false;
193 }
194 return true;
195 } else if (ch != ',') {
196 err_ = FbsonErrType::E_INVALID_OBJ;
197 return false;
198 }
199
200 trim(in);
201 }
202
203 err_ = FbsonErrType::E_INVALID_OBJ;
204 return false;
205 }
206
207 // parse a JSON array (comma-separated list of values)
208 bool parseArray(std::istream& in, hDictInsert handler) {
209 if (!writer_.writeStartArray()) {
210 err_ = FbsonErrType::E_OUTPUT_FAIL;
211 return false;
212 }
213
214 trim(in);
215
216 if (in.peek() == ']') {
217 in.ignore();
218 // empty array
219 if (!writer_.writeEndArray()) {
220 err_ = FbsonErrType::E_OUTPUT_FAIL;
221 return false;
222 }
223 return true;
224 }
225
226 while (in.good()) {
227 if (!parseValue(in, handler)) {
228 return false;
229 }
230
231 trim(in);
232
233 char ch = in.get();
234 if (ch == ']') {
235 // end of the array
236 if (!writer_.writeEndArray()) {
237 err_ = FbsonErrType::E_OUTPUT_FAIL;
238 return false;
239 }
240 return true;
241 } else if (ch != ',') {
242 err_ = FbsonErrType::E_INVALID_ARR;
243 return false;
244 }
245
246 trim(in);
247 }
248
249 err_ = FbsonErrType::E_INVALID_ARR;
250 return false;
251 }
252
253 // parse a key-value pair, separated by ":"
254 bool parseKVPair(std::istream& in, hDictInsert handler) {
255 if (parseKey(in, handler) && parseValue(in, handler)) {
256 return true;
257 }
258
259 return false;
260 }
261
262 // parse a key (must be string)
263 bool parseKey(std::istream& in, hDictInsert handler) {
264 char key[FbsonKeyValue::sMaxKeyLen];
265 int i = 0;
266 while (in.good() && in.peek() != '"' && i < FbsonKeyValue::sMaxKeyLen) {
267 key[i++] = in.get();
268 }
269
270 if (!in.good() || in.peek() != '"' || i == 0) {
271 err_ = FbsonErrType::E_INVALID_KEY;
272 return false;
273 }
274
275 in.ignore(); // discard '"'
276
277 int key_id = -1;
278 if (handler) {
279 key_id = handler(key, i);
280 }
281
282 if (key_id < 0) {
283 writer_.writeKey(key, i);
284 } else {
285 writer_.writeKey(key_id);
286 }
287
288 trim(in);
289
290 if (in.get() != ':') {
291 err_ = FbsonErrType::E_INVALID_OBJ;
292 return false;
293 }
294
295 return true;
296 }
297
298 // parse a value
299 bool parseValue(std::istream& in, hDictInsert handler) {
300 bool res = false;
301
302 trim(in);
303
304 switch (in.peek()) {
305 case 'N':
306 case 'n': {
307 in.ignore();
308 res = parseNull(in);
309 break;
310 }
311 case 'T':
312 case 't': {
313 in.ignore();
314 res = parseTrue(in);
315 break;
316 }
317 case 'F':
318 case 'f': {
319 in.ignore();
320 res = parseFalse(in);
321 break;
322 }
323 case '"': {
324 in.ignore();
325 res = parseString(in);
326 break;
327 }
328 case '{': {
329 in.ignore();
330 res = parseObject(in, handler);
331 break;
332 }
333 case '[': {
334 in.ignore();
335 res = parseArray(in, handler);
336 break;
337 }
338 default: {
339 res = parseNumber(in);
340 break;
341 }
342 }
343
344 return res;
345 }
346
347 // parse NULL value
348 bool parseNull(std::istream& in) {
349 if (tolower(in.get()) == 'u' && tolower(in.get()) == 'l' &&
350 tolower(in.get()) == 'l') {
351 writer_.writeNull();
352 return true;
353 }
354
355 err_ = FbsonErrType::E_INVALID_VALUE;
356 return false;
357 }
358
359 // parse TRUE value
360 bool parseTrue(std::istream& in) {
361 if (tolower(in.get()) == 'r' && tolower(in.get()) == 'u' &&
362 tolower(in.get()) == 'e') {
363 writer_.writeBool(true);
364 return true;
365 }
366
367 err_ = FbsonErrType::E_INVALID_VALUE;
368 return false;
369 }
370
371 // parse FALSE value
372 bool parseFalse(std::istream& in) {
373 if (tolower(in.get()) == 'a' && tolower(in.get()) == 'l' &&
374 tolower(in.get()) == 's' && tolower(in.get()) == 'e') {
375 writer_.writeBool(false);
376 return true;
377 }
378
379 err_ = FbsonErrType::E_INVALID_VALUE;
380 return false;
381 }
382
383 // parse a string
384 bool parseString(std::istream& in) {
385 if (!writer_.writeStartString()) {
386 err_ = FbsonErrType::E_OUTPUT_FAIL;
387 return false;
388 }
389
390 bool escaped = false;
391 char buffer[4096]; // write 4KB at a time
392 int nread = 0;
393 while (in.good()) {
394 char ch = in.get();
395 if (ch != '"' || escaped) {
396 buffer[nread++] = ch;
397 if (nread == 4096) {
398 // flush buffer
399 if (!writer_.writeString(buffer, nread)) {
400 err_ = FbsonErrType::E_OUTPUT_FAIL;
401 return false;
402 }
403 nread = 0;
404 }
405 // set/reset escape
406 if (ch == '\\' || escaped) {
407 escaped = !escaped;
408 }
409 } else {
410 // write all remaining bytes in the buffer
411 if (nread > 0) {
412 if (!writer_.writeString(buffer, nread)) {
413 err_ = FbsonErrType::E_OUTPUT_FAIL;
414 return false;
415 }
416 }
417 // end writing string
418 if (!writer_.writeEndString()) {
419 err_ = FbsonErrType::E_OUTPUT_FAIL;
420 return false;
421 }
422 return true;
423 }
424 }
425
426 err_ = FbsonErrType::E_INVALID_STR;
427 return false;
428 }
429
430 // parse a number
431 // Number format can be hex, octal, or decimal (including float).
432 // Only decimal can have (+/-) sign prefix.
433 bool parseNumber(std::istream& in) {
434 bool ret = false;
435 switch (in.peek()) {
436 case '0': {
437 in.ignore();
438
439 if (in.peek() == 'x' || in.peek() == 'X') {
440 in.ignore();
441 ret = parseHex(in);
442 } else if (in.peek() == '.') {
443 in.ignore();
444 ret = parseDouble(in, 0, 0, 1);
445 } else {
446 ret = parseOctal(in);
447 }
448
449 break;
450 }
451 case '-': {
452 in.ignore();
453 ret = parseDecimal(in, -1);
454 break;
455 }
456 case '+':
457 in.ignore();
11fdf7f2
TL
458#if defined(__clang__)
459 [[clang::fallthrough]];
460#elif defined(__GNUC__) && __GNUC__ >= 7
461 [[gnu::fallthrough]];
462#endif
7c673cae
FG
463 default:
464 ret = parseDecimal(in, 1);
465 break;
466 }
467
468 return ret;
469 }
470
471 // parse a number in hex format
472 bool parseHex(std::istream& in) {
473 uint64_t val = 0;
474 int num_digits = 0;
475 char ch = tolower(in.peek());
476 while (in.good() && !strchr(kJsonDelim, ch) && (++num_digits) <= 16) {
477 if (ch >= '0' && ch <= '9') {
478 val = (val << 4) + (ch - '0');
479 } else if (ch >= 'a' && ch <= 'f') {
480 val = (val << 4) + (ch - 'a' + 10);
481 } else { // unrecognized hex digit
482 err_ = FbsonErrType::E_INVALID_HEX;
483 return false;
484 }
485
486 in.ignore();
487 ch = tolower(in.peek());
488 }
489
490 int size = 0;
491 if (num_digits <= 2) {
492 size = writer_.writeInt8((int8_t)val);
493 } else if (num_digits <= 4) {
494 size = writer_.writeInt16((int16_t)val);
495 } else if (num_digits <= 8) {
496 size = writer_.writeInt32((int32_t)val);
497 } else if (num_digits <= 16) {
498 size = writer_.writeInt64(val);
499 } else {
500 err_ = FbsonErrType::E_HEX_OVERFLOW;
501 return false;
502 }
503
504 if (size == 0) {
505 err_ = FbsonErrType::E_OUTPUT_FAIL;
506 return false;
507 }
508
509 return true;
510 }
511
512 // parse a number in octal format
513 bool parseOctal(std::istream& in) {
514 int64_t val = 0;
515 char ch = in.peek();
516 while (in.good() && !strchr(kJsonDelim, ch)) {
517 if (ch >= '0' && ch <= '7') {
518 val = val * 8 + (ch - '0');
519 } else {
520 err_ = FbsonErrType::E_INVALID_OCTAL;
521 return false;
522 }
523
524 // check if the number overflows
525 if (val < 0) {
526 err_ = FbsonErrType::E_OCTAL_OVERFLOW;
527 return false;
528 }
529
530 in.ignore();
531 ch = in.peek();
532 }
533
534 int size = 0;
535 if (val <= std::numeric_limits<int8_t>::max()) {
536 size = writer_.writeInt8((int8_t)val);
537 } else if (val <= std::numeric_limits<int16_t>::max()) {
538 size = writer_.writeInt16((int16_t)val);
539 } else if (val <= std::numeric_limits<int32_t>::max()) {
540 size = writer_.writeInt32((int32_t)val);
541 } else { // val <= INT64_MAX
542 size = writer_.writeInt64(val);
543 }
544
545 if (size == 0) {
546 err_ = FbsonErrType::E_OUTPUT_FAIL;
547 return false;
548 }
549
550 return true;
551 }
552
553 // parse a number in decimal (including float)
554 bool parseDecimal(std::istream& in, int sign) {
555 int64_t val = 0;
556 int precision = 0;
557
558 char ch = 0;
559 while (in.good() && (ch = in.peek()) == '0')
560 in.ignore();
561
562 while (in.good() && !strchr(kJsonDelim, ch)) {
563 if (ch >= '0' && ch <= '9') {
564 val = val * 10 + (ch - '0');
565 ++precision;
566 } else if (ch == '.') {
567 // note we don't pop out '.'
568 return parseDouble(in, static_cast<double>(val), precision, sign);
569 } else {
570 err_ = FbsonErrType::E_INVALID_DECIMAL;
571 return false;
572 }
573
574 in.ignore();
575
576 // if the number overflows int64_t, first parse it as double iff we see a
577 // decimal point later. Otherwise, will treat it as overflow
578 if (val < 0 && val > std::numeric_limits<int64_t>::min()) {
579 return parseDouble(in, static_cast<double>(val), precision, sign);
580 }
581
582 ch = in.peek();
583 }
584
585 if (sign < 0) {
586 val = -val;
587 }
588
589 int size = 0;
590 if (val >= std::numeric_limits<int8_t>::min() &&
591 val <= std::numeric_limits<int8_t>::max()) {
592 size = writer_.writeInt8((int8_t)val);
593 } else if (val >= std::numeric_limits<int16_t>::min() &&
594 val <= std::numeric_limits<int16_t>::max()) {
595 size = writer_.writeInt16((int16_t)val);
596 } else if (val >= std::numeric_limits<int32_t>::min() &&
597 val <= std::numeric_limits<int32_t>::max()) {
598 size = writer_.writeInt32((int32_t)val);
599 } else { // val <= INT64_MAX
600 size = writer_.writeInt64(val);
601 }
602
603 if (size == 0) {
604 err_ = FbsonErrType::E_OUTPUT_FAIL;
605 return false;
606 }
607
608 return true;
609 }
610
611 // parse IEEE 754 double precision:
612 // Significand precision length - 15
613 // Maximum exponent value - 308
614 //
615 // "If a decimal string with at most 15 significant digits is converted to
616 // IEEE 754 double precision representation and then converted back to a
617 // string with the same number of significant digits, then the final string
618 // should match the original"
619 bool parseDouble(std::istream& in, double val, int precision, int sign) {
620 int integ = precision;
621 int frac = 0;
622 bool is_frac = false;
623
624 char ch = in.peek();
625 if (ch == '.') {
626 is_frac = true;
627 in.ignore();
628 ch = in.peek();
629 }
630
631 int exp = 0;
632 while (in.good() && !strchr(kJsonDelim, ch)) {
633 if (ch >= '0' && ch <= '9') {
634 if (precision < 15) {
635 val = val * 10 + (ch - '0');
636 if (is_frac) {
637 ++frac;
638 } else {
639 ++integ;
640 }
641 ++precision;
642 } else if (!is_frac) {
643 ++exp;
644 }
645 } else if (ch == 'e' || ch == 'E') {
646 in.ignore();
647 int exp2;
648 if (!parseExponent(in, exp2)) {
649 return false;
650 }
651
652 exp += exp2;
653 // check if exponent overflows
654 if (exp > 308 || exp < -308) {
655 err_ = FbsonErrType::E_EXPONENT_OVERFLOW;
656 return false;
657 }
658
659 is_frac = true;
660 break;
661 }
662
663 in.ignore();
664 ch = in.peek();
665 }
666
667 if (!is_frac) {
668 err_ = FbsonErrType::E_DECIMAL_OVERFLOW;
669 return false;
670 }
671
672 val *= std::pow(10, exp - frac);
673 if (std::isnan(val) || std::isinf(val)) {
674 err_ = FbsonErrType::E_DOUBLE_OVERFLOW;
675 return false;
676 }
677
678 if (sign < 0) {
679 val = -val;
680 }
681
682 if (writer_.writeDouble(val) == 0) {
683 err_ = FbsonErrType::E_OUTPUT_FAIL;
684 return false;
685 }
686
687 return true;
688 }
689
690 // parse the exponent part of a double number
691 bool parseExponent(std::istream& in, int& exp) {
692 bool neg = false;
693
694 char ch = in.peek();
695 if (ch == '+') {
696 in.ignore();
697 ch = in.peek();
698 } else if (ch == '-') {
699 neg = true;
700 in.ignore();
701 ch = in.peek();
702 }
703
704 exp = 0;
705 while (in.good() && !strchr(kJsonDelim, ch)) {
706 if (ch >= '0' && ch <= '9') {
707 exp = exp * 10 + (ch - '0');
708 } else {
709 err_ = FbsonErrType::E_INVALID_EXPONENT;
710 return false;
711 }
712
713 if (exp > 308) {
714 err_ = FbsonErrType::E_EXPONENT_OVERFLOW;
715 return false;
716 }
717
718 in.ignore();
719 ch = in.peek();
720 }
721
722 if (neg) {
723 exp = -exp;
724 }
725
726 return true;
727 }
728
729 void trim(std::istream& in) {
730 while (in.good() && strchr(kWhiteSpace, in.peek())) {
731 in.ignore();
732 }
733 }
734
735 private:
736 FbsonWriterT<OS_TYPE> writer_;
737 FbsonErrType err_;
738};
739
740typedef FbsonJsonParserT<FbsonOutStream> FbsonJsonParser;
741
742} // namespace fbson