]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/third-party/fbson/FbsonJsonParser.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / rocksdb / third-party / fbson / FbsonJsonParser.h
1 /*
2 * Copyright (c) 2011-present, Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
8 *
9 */
10
11 /*
12 * This file defines FbsonJsonParserT (template) and FbsonJsonParser.
13 *
14 * FbsonJsonParserT is a template class which implements a JSON parser.
15 * FbsonJsonParserT parses JSON text, and serializes it to FBSON binary format
16 * by using FbsonWriterT object. By default, FbsonJsonParserT creates a new
17 * FbsonWriterT object with an output stream object. However, you can also
18 * pass in your FbsonWriterT or any stream object that implements some basic
19 * interface of std::ostream (see FbsonStream.h).
20 *
21 * FbsonJsonParser specializes FbsonJsonParserT with FbsonOutStream type (see
22 * FbsonStream.h). So unless you want to provide your own output stream
23 * type, use FbsonJsonParser object.
24 *
25 * ** Parsing JSON **
26 * FbsonJsonParserT parses JSON string, and directly serializes into FBSON
27 * packed bytes. There are three ways to parse a JSON string: (1) using
28 * c-string, (2) using string with len, (3) using std::istream object. You can
29 * use a custom streambuf to redirect input. FbsonInBuffer is a streambuf used
30 * internally if the input is a raw character buffer.
31 *
32 * You can reuse an FbsonJsonParserT object to parse/serialize multiple JSON
33 * strings, and the previous FBSON will be overwritten.
34 *
35 * If parsing fails (returned false), the error code will be set to one of
36 * FbsonErrType, and can be retrieved by calling getErrorCode().
37 *
38 * ** External dictionary **
39 * During parsing a JSON string, you can pass a call-back function to map a key
40 * string to an id, and store the dictionary id in FBSON to save space. The
41 * purpose of using an external dictionary is more towards a collection of
42 * documents (which has common keys) rather than a single document, so that
43 * space saving will be significant.
44 *
45 * ** Endianness **
46 * Note: FBSON serialization doesn't assume endianness of the server. However
47 * you will need to ensure that the endianness at the reader side is the same
48 * as that at the writer side (if they are on different machines). Otherwise,
49 * proper conversion is needed when a number value is returned to the
50 * caller/writer.
51 *
52 * @author Tian Xia <tianx@fb.com>
53 */
54
55 #ifndef FBSON_FBSONPARSER_H
56 #define FBSON_FBSONPARSER_H
57
58 #include <cmath>
59 #include <limits>
60 #include "FbsonDocument.h"
61 #include "FbsonWriter.h"
62
63 namespace fbson {
64
// Characters that end a numeric token: space, comma, closing bracket, closing
// brace, tab, carriage return and line feed.
constexpr const char* kJsonDelim = " ,]}\t\r\n";

// Characters skipped between tokens: space, tab, line feed, carriage return.
constexpr const char* kWhiteSpace = " \t\n\r";
67
/*
 * Error codes reported by FbsonJsonParserT::getErrorCode().
 *
 * The numeric values are spelled out so they can be matched against logged
 * codes; they are the same values the implicit numbering produces.
 */
enum class FbsonErrType {
  E_NONE = 0,                // no error recorded
  E_INVALID_VER = 1,         // not set by the JSON parser in this file
  E_EMPTY_STR = 2,           // null or zero-length input buffer
  E_OUTPUT_FAIL = 3,         // the FBSON writer rejected a write
  E_INVALID_DOCU = 4,        // document is not an object/array, or has trailing data
  E_INVALID_VALUE = 5,       // malformed null/true/false literal
  E_INVALID_KEY = 6,         // missing, empty, unterminated or over-long key
  E_INVALID_STR = 7,         // string value without a closing quote
  E_INVALID_OBJ = 8,         // bad separator inside an object
  E_INVALID_ARR = 9,         // bad separator inside an array
  E_INVALID_HEX = 10,        // non-hex character in a hex number
  E_INVALID_OCTAL = 11,      // non-octal character in an octal number
  E_INVALID_DECIMAL = 12,    // unexpected character in a decimal number
  E_INVALID_EXPONENT = 13,   // non-digit character in an exponent
  E_HEX_OVERFLOW = 14,       // hex number longer than 16 digits
  E_OCTAL_OVERFLOW = 15,     // octal number does not fit in int64_t
  E_DECIMAL_OVERFLOW = 16,   // integer too large and not written as a double
  E_DOUBLE_OVERFLOW = 17,    // double result is NaN or infinite
  E_EXPONENT_OVERFLOW = 18,  // exponent magnitude above 308
};
92
93 /*
94 * Template FbsonJsonParserT
95 */
96 template <class OS_TYPE>
97 class FbsonJsonParserT {
98 public:
99 FbsonJsonParserT() : err_(FbsonErrType::E_NONE) {}
100
101 explicit FbsonJsonParserT(OS_TYPE& os)
102 : writer_(os), err_(FbsonErrType::E_NONE) {}
103
104 // parse a UTF-8 JSON string
105 bool parse(const std::string& str, hDictInsert handler = nullptr) {
106 return parse(str.c_str(), (unsigned int)str.size(), handler);
107 }
108
109 // parse a UTF-8 JSON c-style string (NULL terminated)
110 bool parse(const char* c_str, hDictInsert handler = nullptr) {
111 return parse(c_str, (unsigned int)strlen(c_str), handler);
112 }
113
114 // parse a UTF-8 JSON string with length
115 bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) {
116 if (!pch || len == 0) {
117 err_ = FbsonErrType::E_EMPTY_STR;
118 return false;
119 }
120
121 FbsonInBuffer sb(pch, len);
122 std::istream in(&sb);
123 return parse(in, handler);
124 }
125
126 // parse UTF-8 JSON text from an input stream
127 bool parse(std::istream& in, hDictInsert handler = nullptr) {
128 bool res = false;
129
130 // reset output stream
131 writer_.reset();
132
133 trim(in);
134
135 if (in.peek() == '{') {
136 in.ignore();
137 res = parseObject(in, handler);
138 } else if (in.peek() == '[') {
139 in.ignore();
140 res = parseArray(in, handler);
141 } else {
142 err_ = FbsonErrType::E_INVALID_DOCU;
143 }
144
145 trim(in);
146 if (res && !in.eof()) {
147 err_ = FbsonErrType::E_INVALID_DOCU;
148 return false;
149 }
150
151 return res;
152 }
153
154 FbsonWriterT<OS_TYPE>& getWriter() { return writer_; }
155
156 FbsonErrType getErrorCode() { return err_; }
157
158 // clear error code
159 void clearErr() { err_ = FbsonErrType::E_NONE; }
160
161 private:
162 // parse a JSON object (comma-separated list of key-value pairs)
163 bool parseObject(std::istream& in, hDictInsert handler) {
164 if (!writer_.writeStartObject()) {
165 err_ = FbsonErrType::E_OUTPUT_FAIL;
166 return false;
167 }
168
169 trim(in);
170
171 if (in.peek() == '}') {
172 in.ignore();
173 // empty object
174 if (!writer_.writeEndObject()) {
175 err_ = FbsonErrType::E_OUTPUT_FAIL;
176 return false;
177 }
178 return true;
179 }
180
181 while (in.good()) {
182 if (in.get() != '"') {
183 err_ = FbsonErrType::E_INVALID_KEY;
184 return false;
185 }
186
187 if (!parseKVPair(in, handler)) {
188 return false;
189 }
190
191 trim(in);
192
193 char ch = in.get();
194 if (ch == '}') {
195 // end of the object
196 if (!writer_.writeEndObject()) {
197 err_ = FbsonErrType::E_OUTPUT_FAIL;
198 return false;
199 }
200 return true;
201 } else if (ch != ',') {
202 err_ = FbsonErrType::E_INVALID_OBJ;
203 return false;
204 }
205
206 trim(in);
207 }
208
209 err_ = FbsonErrType::E_INVALID_OBJ;
210 return false;
211 }
212
213 // parse a JSON array (comma-separated list of values)
214 bool parseArray(std::istream& in, hDictInsert handler) {
215 if (!writer_.writeStartArray()) {
216 err_ = FbsonErrType::E_OUTPUT_FAIL;
217 return false;
218 }
219
220 trim(in);
221
222 if (in.peek() == ']') {
223 in.ignore();
224 // empty array
225 if (!writer_.writeEndArray()) {
226 err_ = FbsonErrType::E_OUTPUT_FAIL;
227 return false;
228 }
229 return true;
230 }
231
232 while (in.good()) {
233 if (!parseValue(in, handler)) {
234 return false;
235 }
236
237 trim(in);
238
239 char ch = in.get();
240 if (ch == ']') {
241 // end of the array
242 if (!writer_.writeEndArray()) {
243 err_ = FbsonErrType::E_OUTPUT_FAIL;
244 return false;
245 }
246 return true;
247 } else if (ch != ',') {
248 err_ = FbsonErrType::E_INVALID_ARR;
249 return false;
250 }
251
252 trim(in);
253 }
254
255 err_ = FbsonErrType::E_INVALID_ARR;
256 return false;
257 }
258
259 // parse a key-value pair, separated by ":"
260 bool parseKVPair(std::istream& in, hDictInsert handler) {
261 if (parseKey(in, handler) && parseValue(in, handler)) {
262 return true;
263 }
264
265 return false;
266 }
267
268 // parse a key (must be string)
269 bool parseKey(std::istream& in, hDictInsert handler) {
270 char key[FbsonKeyValue::sMaxKeyLen];
271 int i = 0;
272 while (in.good() && in.peek() != '"' && i < FbsonKeyValue::sMaxKeyLen) {
273 key[i++] = in.get();
274 }
275
276 if (!in.good() || in.peek() != '"' || i == 0) {
277 err_ = FbsonErrType::E_INVALID_KEY;
278 return false;
279 }
280
281 in.ignore(); // discard '"'
282
283 int key_id = -1;
284 if (handler) {
285 key_id = handler(key, i);
286 }
287
288 if (key_id < 0) {
289 writer_.writeKey(key, i);
290 } else {
291 writer_.writeKey(key_id);
292 }
293
294 trim(in);
295
296 if (in.get() != ':') {
297 err_ = FbsonErrType::E_INVALID_OBJ;
298 return false;
299 }
300
301 return true;
302 }
303
304 // parse a value
305 bool parseValue(std::istream& in, hDictInsert handler) {
306 bool res = false;
307
308 trim(in);
309
310 switch (in.peek()) {
311 case 'N':
312 case 'n': {
313 in.ignore();
314 res = parseNull(in);
315 break;
316 }
317 case 'T':
318 case 't': {
319 in.ignore();
320 res = parseTrue(in);
321 break;
322 }
323 case 'F':
324 case 'f': {
325 in.ignore();
326 res = parseFalse(in);
327 break;
328 }
329 case '"': {
330 in.ignore();
331 res = parseString(in);
332 break;
333 }
334 case '{': {
335 in.ignore();
336 res = parseObject(in, handler);
337 break;
338 }
339 case '[': {
340 in.ignore();
341 res = parseArray(in, handler);
342 break;
343 }
344 default: {
345 res = parseNumber(in);
346 break;
347 }
348 }
349
350 return res;
351 }
352
353 // parse NULL value
354 bool parseNull(std::istream& in) {
355 if (tolower(in.get()) == 'u' && tolower(in.get()) == 'l' &&
356 tolower(in.get()) == 'l') {
357 writer_.writeNull();
358 return true;
359 }
360
361 err_ = FbsonErrType::E_INVALID_VALUE;
362 return false;
363 }
364
365 // parse TRUE value
366 bool parseTrue(std::istream& in) {
367 if (tolower(in.get()) == 'r' && tolower(in.get()) == 'u' &&
368 tolower(in.get()) == 'e') {
369 writer_.writeBool(true);
370 return true;
371 }
372
373 err_ = FbsonErrType::E_INVALID_VALUE;
374 return false;
375 }
376
377 // parse FALSE value
378 bool parseFalse(std::istream& in) {
379 if (tolower(in.get()) == 'a' && tolower(in.get()) == 'l' &&
380 tolower(in.get()) == 's' && tolower(in.get()) == 'e') {
381 writer_.writeBool(false);
382 return true;
383 }
384
385 err_ = FbsonErrType::E_INVALID_VALUE;
386 return false;
387 }
388
389 // parse a string
390 bool parseString(std::istream& in) {
391 if (!writer_.writeStartString()) {
392 err_ = FbsonErrType::E_OUTPUT_FAIL;
393 return false;
394 }
395
396 bool escaped = false;
397 char buffer[4096]; // write 4KB at a time
398 int nread = 0;
399 while (in.good()) {
400 char ch = in.get();
401 if (ch != '"' || escaped) {
402 buffer[nread++] = ch;
403 if (nread == 4096) {
404 // flush buffer
405 if (!writer_.writeString(buffer, nread)) {
406 err_ = FbsonErrType::E_OUTPUT_FAIL;
407 return false;
408 }
409 nread = 0;
410 }
411 // set/reset escape
412 if (ch == '\\' || escaped) {
413 escaped = !escaped;
414 }
415 } else {
416 // write all remaining bytes in the buffer
417 if (nread > 0) {
418 if (!writer_.writeString(buffer, nread)) {
419 err_ = FbsonErrType::E_OUTPUT_FAIL;
420 return false;
421 }
422 }
423 // end writing string
424 if (!writer_.writeEndString()) {
425 err_ = FbsonErrType::E_OUTPUT_FAIL;
426 return false;
427 }
428 return true;
429 }
430 }
431
432 err_ = FbsonErrType::E_INVALID_STR;
433 return false;
434 }
435
436 // parse a number
437 // Number format can be hex, octal, or decimal (including float).
438 // Only decimal can have (+/-) sign prefix.
439 bool parseNumber(std::istream& in) {
440 bool ret = false;
441 switch (in.peek()) {
442 case '0': {
443 in.ignore();
444
445 if (in.peek() == 'x' || in.peek() == 'X') {
446 in.ignore();
447 ret = parseHex(in);
448 } else if (in.peek() == '.') {
449 in.ignore();
450 ret = parseDouble(in, 0, 0, 1);
451 } else {
452 ret = parseOctal(in);
453 }
454
455 break;
456 }
457 case '-': {
458 in.ignore();
459 ret = parseDecimal(in, -1);
460 break;
461 }
462 case '+':
463 in.ignore();
464 // fall through
465 default:
466 ret = parseDecimal(in, 1);
467 break;
468 }
469
470 return ret;
471 }
472
473 // parse a number in hex format
474 bool parseHex(std::istream& in) {
475 uint64_t val = 0;
476 int num_digits = 0;
477 char ch = tolower(in.peek());
478 while (in.good() && !strchr(kJsonDelim, ch) && (++num_digits) <= 16) {
479 if (ch >= '0' && ch <= '9') {
480 val = (val << 4) + (ch - '0');
481 } else if (ch >= 'a' && ch <= 'f') {
482 val = (val << 4) + (ch - 'a' + 10);
483 } else { // unrecognized hex digit
484 err_ = FbsonErrType::E_INVALID_HEX;
485 return false;
486 }
487
488 in.ignore();
489 ch = tolower(in.peek());
490 }
491
492 int size = 0;
493 if (num_digits <= 2) {
494 size = writer_.writeInt8((int8_t)val);
495 } else if (num_digits <= 4) {
496 size = writer_.writeInt16((int16_t)val);
497 } else if (num_digits <= 8) {
498 size = writer_.writeInt32((int32_t)val);
499 } else if (num_digits <= 16) {
500 size = writer_.writeInt64(val);
501 } else {
502 err_ = FbsonErrType::E_HEX_OVERFLOW;
503 return false;
504 }
505
506 if (size == 0) {
507 err_ = FbsonErrType::E_OUTPUT_FAIL;
508 return false;
509 }
510
511 return true;
512 }
513
514 // parse a number in octal format
515 bool parseOctal(std::istream& in) {
516 int64_t val = 0;
517 char ch = in.peek();
518 while (in.good() && !strchr(kJsonDelim, ch)) {
519 if (ch >= '0' && ch <= '7') {
520 val = val * 8 + (ch - '0');
521 } else {
522 err_ = FbsonErrType::E_INVALID_OCTAL;
523 return false;
524 }
525
526 // check if the number overflows
527 if (val < 0) {
528 err_ = FbsonErrType::E_OCTAL_OVERFLOW;
529 return false;
530 }
531
532 in.ignore();
533 ch = in.peek();
534 }
535
536 int size = 0;
537 if (val <= std::numeric_limits<int8_t>::max()) {
538 size = writer_.writeInt8((int8_t)val);
539 } else if (val <= std::numeric_limits<int16_t>::max()) {
540 size = writer_.writeInt16((int16_t)val);
541 } else if (val <= std::numeric_limits<int32_t>::max()) {
542 size = writer_.writeInt32((int32_t)val);
543 } else { // val <= INT64_MAX
544 size = writer_.writeInt64(val);
545 }
546
547 if (size == 0) {
548 err_ = FbsonErrType::E_OUTPUT_FAIL;
549 return false;
550 }
551
552 return true;
553 }
554
555 // parse a number in decimal (including float)
556 bool parseDecimal(std::istream& in, int sign) {
557 int64_t val = 0;
558 int precision = 0;
559
560 char ch = 0;
561 while (in.good() && (ch = in.peek()) == '0')
562 in.ignore();
563
564 while (in.good() && !strchr(kJsonDelim, ch)) {
565 if (ch >= '0' && ch <= '9') {
566 val = val * 10 + (ch - '0');
567 ++precision;
568 } else if (ch == '.') {
569 // note we don't pop out '.'
570 return parseDouble(in, static_cast<double>(val), precision, sign);
571 } else {
572 err_ = FbsonErrType::E_INVALID_DECIMAL;
573 return false;
574 }
575
576 in.ignore();
577
578 // if the number overflows int64_t, first parse it as double iff we see a
579 // decimal point later. Otherwise, will treat it as overflow
580 if (val < 0 && val > std::numeric_limits<int64_t>::min()) {
581 return parseDouble(in, static_cast<double>(val), precision, sign);
582 }
583
584 ch = in.peek();
585 }
586
587 if (sign < 0) {
588 val = -val;
589 }
590
591 int size = 0;
592 if (val >= std::numeric_limits<int8_t>::min() &&
593 val <= std::numeric_limits<int8_t>::max()) {
594 size = writer_.writeInt8((int8_t)val);
595 } else if (val >= std::numeric_limits<int16_t>::min() &&
596 val <= std::numeric_limits<int16_t>::max()) {
597 size = writer_.writeInt16((int16_t)val);
598 } else if (val >= std::numeric_limits<int32_t>::min() &&
599 val <= std::numeric_limits<int32_t>::max()) {
600 size = writer_.writeInt32((int32_t)val);
601 } else { // val <= INT64_MAX
602 size = writer_.writeInt64(val);
603 }
604
605 if (size == 0) {
606 err_ = FbsonErrType::E_OUTPUT_FAIL;
607 return false;
608 }
609
610 return true;
611 }
612
613 // parse IEEE745 double precision:
614 // Significand precision length - 15
615 // Maximum exponent value - 308
616 //
617 // "If a decimal string with at most 15 significant digits is converted to
618 // IEEE 754 double precision representation and then converted back to a
619 // string with the same number of significant digits, then the final string
620 // should match the original"
621 bool parseDouble(std::istream& in, double val, int precision, int sign) {
622 int integ = precision;
623 int frac = 0;
624 bool is_frac = false;
625
626 char ch = in.peek();
627 if (ch == '.') {
628 is_frac = true;
629 in.ignore();
630 ch = in.peek();
631 }
632
633 int exp = 0;
634 while (in.good() && !strchr(kJsonDelim, ch)) {
635 if (ch >= '0' && ch <= '9') {
636 if (precision < 15) {
637 val = val * 10 + (ch - '0');
638 if (is_frac) {
639 ++frac;
640 } else {
641 ++integ;
642 }
643 ++precision;
644 } else if (!is_frac) {
645 ++exp;
646 }
647 } else if (ch == 'e' || ch == 'E') {
648 in.ignore();
649 int exp2;
650 if (!parseExponent(in, exp2)) {
651 return false;
652 }
653
654 exp += exp2;
655 // check if exponent overflows
656 if (exp > 308 || exp < -308) {
657 err_ = FbsonErrType::E_EXPONENT_OVERFLOW;
658 return false;
659 }
660
661 is_frac = true;
662 break;
663 }
664
665 in.ignore();
666 ch = in.peek();
667 }
668
669 if (!is_frac) {
670 err_ = FbsonErrType::E_DECIMAL_OVERFLOW;
671 return false;
672 }
673
674 val *= std::pow(10, exp - frac);
675 if (std::isnan(val) || std::isinf(val)) {
676 err_ = FbsonErrType::E_DOUBLE_OVERFLOW;
677 return false;
678 }
679
680 if (sign < 0) {
681 val = -val;
682 }
683
684 if (writer_.writeDouble(val) == 0) {
685 err_ = FbsonErrType::E_OUTPUT_FAIL;
686 return false;
687 }
688
689 return true;
690 }
691
692 // parse the exponent part of a double number
693 bool parseExponent(std::istream& in, int& exp) {
694 bool neg = false;
695
696 char ch = in.peek();
697 if (ch == '+') {
698 in.ignore();
699 ch = in.peek();
700 } else if (ch == '-') {
701 neg = true;
702 in.ignore();
703 ch = in.peek();
704 }
705
706 exp = 0;
707 while (in.good() && !strchr(kJsonDelim, ch)) {
708 if (ch >= '0' && ch <= '9') {
709 exp = exp * 10 + (ch - '0');
710 } else {
711 err_ = FbsonErrType::E_INVALID_EXPONENT;
712 return false;
713 }
714
715 if (exp > 308) {
716 err_ = FbsonErrType::E_EXPONENT_OVERFLOW;
717 return false;
718 }
719
720 in.ignore();
721 ch = in.peek();
722 }
723
724 if (neg) {
725 exp = -exp;
726 }
727
728 return true;
729 }
730
731 void trim(std::istream& in) {
732 while (in.good() && strchr(kWhiteSpace, in.peek())) {
733 in.ignore();
734 }
735 }
736
737 private:
738 FbsonWriterT<OS_TYPE> writer_;
739 FbsonErrType err_;
740 };
741
742 typedef FbsonJsonParserT<FbsonOutStream> FbsonJsonParser;
743
744 } // namespace fbson
745
746 #endif // FBSON_FBSONPARSER_H