]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/docs/source/java/ipc.rst
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / docs / source / java / ipc.rst
1 .. Licensed to the Apache Software Foundation (ASF) under one
2 .. or more contributor license agreements. See the NOTICE file
3 .. distributed with this work for additional information
4 .. regarding copyright ownership. The ASF licenses this file
5 .. to you under the Apache License, Version 2.0 (the
6 .. "License"); you may not use this file except in compliance
7 .. with the License. You may obtain a copy of the License at
8
9 .. http://www.apache.org/licenses/LICENSE-2.0
10
11 .. Unless required by applicable law or agreed to in writing,
12 .. software distributed under the License is distributed on an
13 .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 .. KIND, either express or implied. See the License for the
15 .. specific language governing permissions and limitations
16 .. under the License.
17
18 ===========================
19 Reading/Writing IPC formats
20 ===========================
21 Arrow defines two types of binary formats for serializing record batches:
22
23 * **Streaming format**: for sending an arbitrary number of record
24 batches. The format must be processed from start to end, and does not support
25 random access
26
27 * **File or Random Access format**: for serializing a fixed number of record
28 batches. It supports random access, and thus is very useful when used with
29 memory maps
30
31 Writing and Reading Streaming Format
32 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
33 First, let's populate a :class:`VectorSchemaRoot` with a small batch of records
34
35 .. code-block:: Java
36
37 BitVector bitVector = new BitVector("boolean", allocator);
38 VarCharVector varCharVector = new VarCharVector("varchar", allocator);
39 for (int i = 0; i < 10; i++) {
40 bitVector.setSafe(i, i % 2 == 0 ? 0 : 1);
41 varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8));
42 }
43 bitVector.setValueCount(10);
44 varCharVector.setValueCount(10);
45
46 List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField());
47 List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector);
48 VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
49
50 Now, we can begin writing a stream containing some number of these batches. For this we use :class:`ArrowStreamWriter`
51 (DictionaryProvider used for any vectors that are dictionary encoded is optional and can be null)::
52
53 ByteArrayOutputStream out = new ByteArrayOutputStream();
54 ArrowStreamWriter writer = new ArrowStreamWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out));
55
56
57 Here we used an in-memory stream, but this could have been a socket or some other IO stream. Then we can do
58
59 .. code-block:: Java
60
61 writer.start();
62 // write the first batch
63 writer.writeBatch();
64
65 // write another four batches.
66 for (int i = 0; i < 4; i++) {
67 // populate VectorSchemaRoot data and write the next batch
68 BitVector childVector1 = (BitVector)root.getVector(0);
69 VarCharVector childVector2 = (VarCharVector)root.getVector(1);
70 childVector1.reset();
71 childVector2.reset();
72 ... do some populate work here, could be different for each batch
73 writer.writeBatch();
74 }
75
76 // end
77 writer.end();
78
79 Note that since the :class:`VectorSchemaRoot` in the writer is a container that can hold batches, batches flow through
80 :class:`VectorSchemaRoot` as part of a pipeline, so we need to populate new data before each call to ``writeBatch``,
81 because each batch overwrites the data of the previous one.
82
83 Now the :class:`ByteArrayOutputStream` contains the complete stream which contains 5 record batches.
84 We can read such a stream with :class:`ArrowStreamReader`; note that the :class:`VectorSchemaRoot` within the
85 reader will be loaded with new values on every call to ``loadNextBatch()``
86
87 .. code-block:: Java
88
89 try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
90 Schema schema = reader.getVectorSchemaRoot().getSchema();
91 for (int i = 0; i < 5; i++) {
92 // This will be loaded with new values on every call to loadNextBatch
93 VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
94 reader.loadNextBatch();
95 ... do something with readBatch
96 }
97
98 }
99
100 Here we also give a simple example with dictionary-encoded vectors
101
102 .. code-block:: Java
103
104 DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
105 // create dictionary and provider
106 final VarCharVector dictVector = new VarCharVector("dict", allocator);
107 dictVector.allocateNewSafe();
108 dictVector.setSafe(0, "aa".getBytes());
109 dictVector.setSafe(1, "bb".getBytes());
110 dictVector.setSafe(2, "cc".getBytes());
111 dictVector.setValueCount(3);
112
113 Dictionary dictionary =
114 new Dictionary(dictVector, new DictionaryEncoding(1L, false, /*indexType=*/null));
115 provider.put(dictionary);
116
117 // create vector and encode it
118 final VarCharVector vector = new VarCharVector("vector", allocator);
119 vector.allocateNewSafe();
120 vector.setSafe(0, "bb".getBytes());
121 vector.setSafe(1, "bb".getBytes());
122 vector.setSafe(2, "cc".getBytes());
123 vector.setSafe(3, "aa".getBytes());
124 vector.setValueCount(4);
125
126 // get the encoded vector
127 IntVector encodedVector = (IntVector) DictionaryEncoder.encode(vector, dictionary);
128
129 // create VectorSchemaRoot
130 List<Field> fields = Arrays.asList(encodedVector.getField());
131 List<FieldVector> vectors = Arrays.asList(encodedVector);
132 VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
133
134 // write data
135 ByteArrayOutputStream out = new ByteArrayOutputStream();
136 ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, Channels.newChannel(out));
137 writer.start();
138 writer.writeBatch();
139 writer.end();
140
141 // read data
142 try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
143 reader.loadNextBatch();
144 VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
145 // get the encoded vector
146 IntVector intVector = (IntVector) readRoot.getVector(0);
147
148 // get dictionaries and decode the vector
149 Map<Long, Dictionary> dictionaryMap = reader.getDictionaryVectors();
150 long dictionaryId = intVector.getField().getDictionary().getId();
151 VarCharVector varCharVector =
152 (VarCharVector) DictionaryEncoder.decode(intVector, dictionaryMap.get(dictionaryId));
153
154 }
155
156 Writing and Reading Random Access Files
157 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
158 The :class:`ArrowFileWriter` has the same API as :class:`ArrowStreamWriter`
159
160 .. code-block:: Java
161
162 ByteArrayOutputStream out = new ByteArrayOutputStream();
163 ArrowFileWriter writer = new ArrowFileWriter(root, null, Channels.newChannel(out));
164 writer.start();
165 // write the first batch
166 writer.writeBatch();
167 // write another four batches.
168 for (int i = 0; i < 4; i++) {
169 ... do populate work
170 writer.writeBatch();
171 }
172 writer.end();
173
174 The difference between :class:`ArrowFileReader` and :class:`ArrowStreamReader` is that the input source
175 must have a ``seek`` method for random access. Because we have access to the entire payload, we know the
176 number of record batches in the file, and can read any at random
177
178 .. code-block:: Java
179
180 try (ArrowFileReader reader = new ArrowFileReader(
181 new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator)) {
182
183 // read the 4th batch
184 ArrowBlock block = reader.getRecordBlocks().get(3);
185 reader.loadRecordBatch(block);
186 VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
187 }