]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <memory> | |
21 | #include <string> | |
22 | ||
23 | #include "arrow/json/options.h" | |
24 | #include "arrow/status.h" | |
25 | #include "arrow/util/key_value_metadata.h" | |
26 | #include "arrow/util/macros.h" | |
27 | #include "arrow/util/visibility.h" | |
28 | ||
29 | namespace arrow { | |
30 | ||
31 | class Array; | |
32 | class Buffer; | |
33 | class MemoryPool; | |
34 | class KeyValueMetadata; | |
35 | class ResizableBuffer; | |
36 | ||
37 | namespace json { | |
38 | ||
39 | struct Kind { | |
40 | enum type : uint8_t { kNull, kBoolean, kNumber, kString, kArray, kObject }; | |
41 | ||
42 | static const std::string& Name(Kind::type); | |
43 | ||
44 | static const std::shared_ptr<const KeyValueMetadata>& Tag(Kind::type); | |
45 | ||
46 | static Kind::type FromTag(const std::shared_ptr<const KeyValueMetadata>& tag); | |
47 | ||
48 | static Status ForType(const DataType& type, Kind::type* kind); | |
49 | }; | |
50 | ||
/// Row-count limit used by the JSON parser.
/// NOTE(review): presumably the maximum number of rows a single BlockParser
/// accepts - confirm against the parser implementation.
constexpr int32_t kMaxParserNumRows{100000};
52 | ||
53 | /// \class BlockParser | |
54 | /// \brief A reusable block-based parser for JSON data | |
55 | /// | |
56 | /// The parser takes a block of newline delimited JSON data and extracts Arrays | |
57 | /// of unconverted strings which can be fed to a Converter to obtain a usable Array. | |
58 | /// | |
59 | /// Note that in addition to parse errors (such as malformed JSON) some conversion | |
60 | /// errors are caught at parse time: | |
61 | /// - A null value in non-nullable column | |
62 | /// - Change in the JSON kind of a column. For example, if an explicit schema is provided | |
63 | /// which stipulates that field "a" is integral, a row of {"a": "not a number"} will | |
64 | /// result in an error. This also applies to fields outside an explicit schema. | |
65 | class ARROW_EXPORT BlockParser { | |
66 | public: | |
67 | virtual ~BlockParser() = default; | |
68 | ||
69 | /// \brief Reserve storage for scalars parsed from a block of json | |
70 | virtual Status ReserveScalarStorage(int64_t nbytes) = 0; | |
71 | ||
72 | /// \brief Parse a block of data | |
73 | virtual Status Parse(const std::shared_ptr<Buffer>& json) = 0; | |
74 | ||
75 | /// \brief Extract parsed data | |
76 | virtual Status Finish(std::shared_ptr<Array>* parsed) = 0; | |
77 | ||
78 | /// \brief Return the number of parsed rows | |
79 | int32_t num_rows() const { return num_rows_; } | |
80 | ||
81 | /// \brief Construct a BlockParser | |
82 | /// | |
83 | /// \param[in] pool MemoryPool to use when constructing parsed array | |
84 | /// \param[in] options ParseOptions to use when parsing JSON | |
85 | /// \param[out] out constructed BlockParser | |
86 | static Status Make(MemoryPool* pool, const ParseOptions& options, | |
87 | std::unique_ptr<BlockParser>* out); | |
88 | ||
89 | static Status Make(const ParseOptions& options, std::unique_ptr<BlockParser>* out); | |
90 | ||
91 | protected: | |
92 | ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); | |
93 | ||
94 | explicit BlockParser(MemoryPool* pool) : pool_(pool) {} | |
95 | ||
96 | MemoryPool* pool_; | |
97 | int32_t num_rows_ = 0; | |
98 | }; | |
99 | ||
100 | } // namespace json | |
101 | } // namespace arrow |