]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/docs/source/java/ipc.rst
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / docs / source / java / ipc.rst
1 .. Licensed to the Apache Software Foundation (ASF) under one
2 .. or more contributor license agreements. See the NOTICE file
3 .. distributed with this work for additional information
4 .. regarding copyright ownership. The ASF licenses this file
5 .. to you under the Apache License, Version 2.0 (the
6 .. "License"); you may not use this file except in compliance
7 .. with the License. You may obtain a copy of the License at
8
9 .. http://www.apache.org/licenses/LICENSE-2.0
10
11 .. Unless required by applicable law or agreed to in writing,
12 .. software distributed under the License is distributed on an
13 .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 .. KIND, either express or implied. See the License for the
15 .. specific language governing permissions and limitations
16 .. under the License.
17
18 ===========================
19 Reading/Writing IPC formats
20 ===========================
21 Arrow defines two types of binary formats for serializing record batches:
22
23 * **Streaming format**: for sending an arbitrary number of record
24 batches. The format must be processed from start to end, and does not support
25 random access
26
27 * **File or Random Access format**: for serializing a fixed number of record
28 batches. It supports random access, and thus is very useful when used with
29 memory maps
30
31 Writing and Reading Streaming Format
32 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
33 First, let's populate a :class:`VectorSchemaRoot` with a small batch of records
34
35 .. code-block:: Java
36
37 BitVector bitVector = new BitVector("boolean", allocator);
38 VarCharVector varCharVector = new VarCharVector("varchar", allocator);
39 for (int i = 0; i < 10; i++) {
40 bitVector.setSafe(i, i % 2 == 0 ? 0 : 1);
41 varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8));
42 }
43 bitVector.setValueCount(10);
44 varCharVector.setValueCount(10);
45
46 List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField());
47 List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector);
48 VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
49
50 Now, we can begin writing a stream containing some number of these batches. For this we use :class:`ArrowStreamWriter`
51 (DictionaryProvider used for any vectors that are dictionary encoded is optional and can be null)::
52
53 ByteArrayOutputStream out = new ByteArrayOutputStream();
54 ArrowStreamWriter writer = new ArrowStreamWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out));
55
56
57 Here we used an in-memory stream, but this could have been a socket or some other IO stream. Then we can do
58
59 .. code-block:: Java
60
61 writer.start();
62 // write the first batch
63 writer.writeBatch();
64
65 // write another four batches.
66 for (int i = 0; i < 4; i++) {
67 // populate VectorSchemaRoot data and write the next batch
68 BitVector childVector1 = (BitVector)root.getVector(0);
69 VarCharVector childVector2 = (VarCharVector)root.getVector(1);
70 childVector1.reset();
71 childVector2.reset();
72 ... do some populate work here, could be different for each batch
73 writer.writeBatch();
74 }
75
76 // end
77 writer.end();
78
79 Note that since the :class:`VectorSchemaRoot` in the writer is a container that can hold batches, batches flow through
80 :class:`VectorSchemaRoot` as part of a pipeline, so we need to populate new data before each call to ``writeBatch``,
81 because each batch overwrites the data of the previous one.
82
83 Now the :class:`ByteArrayOutputStream` contains the complete stream which contains 5 record batches.
84 We can read such a stream with :class:`ArrowStreamReader`; note that the :class:`VectorSchemaRoot` within the
85 reader will be loaded with new values on every call to ``loadNextBatch()``
86
87 .. code-block:: Java
88
89 try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
90 Schema schema = reader.getVectorSchemaRoot().getSchema();
91 for (int i = 0; i < 5; i++) {
92 // This will be loaded with new values on every call to loadNextBatch
93 VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
94 reader.loadNextBatch();
95 ... do something with readBatch
96 }
97
98 }
99
100 Here we also give a simple example with dictionary-encoded vectors
101
102 .. code-block:: Java
103
104 DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
105 // create dictionary and provider
106 final VarCharVector dictVector = new VarCharVector("dict", allocator);
107 dictVector.allocateNewSafe();
108 dictVector.setSafe(0, "aa".getBytes());
109 dictVector.setSafe(1, "bb".getBytes());
110 dictVector.setSafe(2, "cc".getBytes());
111 dictVector.setValueCount(3);
112
113 Dictionary dictionary =
114 new Dictionary(dictVector, new DictionaryEncoding(1L, false, /*indexType=*/null));
115 provider.put(dictionary);
116
117 // create vector and encode it
118 final VarCharVector vector = new VarCharVector("vector", allocator);
119 vector.allocateNewSafe();
120 vector.setSafe(0, "bb".getBytes());
121 vector.setSafe(1, "bb".getBytes());
122 vector.setSafe(2, "cc".getBytes());
123 vector.setSafe(3, "aa".getBytes());
124 vector.setValueCount(4);
125
126 // get the encoded vector
127 IntVector encodedVector = (IntVector) DictionaryEncoder.encode(vector, dictionary);
128
129 // create VectorSchemaRoot
130 List<Field> fields = Arrays.asList(encodedVector.getField());
131 List<FieldVector> vectors = Arrays.asList(encodedVector);
132 VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
133
134 // write data
135 ByteArrayOutputStream out = new ByteArrayOutputStream();
136 ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, Channels.newChannel(out));
137 writer.start();
138 writer.writeBatch();
139 writer.end();
140
141 // read data
142 try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
143 reader.loadNextBatch();
144 VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
145 // get the encoded vector
146 IntVector intVector = (IntVector) readRoot.getVector(0);
147
148 // get dictionaries and decode the vector
149 Map<Long, Dictionary> dictionaryMap = reader.getDictionaryVectors();
150 long dictionaryId = intVector.getField().getDictionary().getId();
151 VarCharVector varCharVector =
152 (VarCharVector) DictionaryEncoder.decode(intVector, dictionaryMap.get(dictionaryId));
153
154 }
155
156 Writing and Reading Random Access Files
157 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
158 The :class:`ArrowFileWriter` has the same API as :class:`ArrowStreamWriter`
159
160 .. code-block:: Java
161
162 ByteArrayOutputStream out = new ByteArrayOutputStream();
163 ArrowFileWriter writer = new ArrowFileWriter(root, null, Channels.newChannel(out));
164 writer.start();
165 // write the first batch
166 writer.writeBatch();
167 // write another four batches.
168 for (int i = 0; i < 4; i++) {
169 ... do populate work
170 writer.writeBatch();
171 }
172 writer.end();
173
174 The difference between :class:`ArrowFileReader` and :class:`ArrowStreamReader` is that the input source
175 must have a ``seek`` method for random access. Because we have access to the entire payload, we know the
176 number of record batches in the file, and can read any at random
177
178 .. code-block:: Java
179
180 try (ArrowFileReader reader = new ArrowFileReader(
181 new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator)) {
182
183 // read the 4th batch
184 ArrowBlock block = reader.getRecordBlocks().get(3);
185 reader.loadRecordBatch(block);
186 VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
187 }