]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.arrow.vector.dictionary; | |
19 | ||
20 | import java.util.Collections; | |
21 | ||
22 | import org.apache.arrow.memory.BufferAllocator; | |
23 | import org.apache.arrow.memory.util.hash.ArrowBufHasher; | |
24 | import org.apache.arrow.memory.util.hash.SimpleHasher; | |
25 | import org.apache.arrow.vector.BaseIntVector; | |
26 | import org.apache.arrow.vector.FieldVector; | |
27 | import org.apache.arrow.vector.ValueVector; | |
28 | import org.apache.arrow.vector.complex.BaseListVector; | |
29 | import org.apache.arrow.vector.ipc.message.ArrowFieldNode; | |
30 | import org.apache.arrow.vector.types.pojo.Field; | |
31 | import org.apache.arrow.vector.types.pojo.FieldType; | |
32 | import org.apache.arrow.vector.util.TransferPair; | |
33 | ||
34 | /** | |
35 | * Sub fields encoder/decoder for Dictionary encoded {@link BaseListVector}. | |
36 | */ | |
37 | public class ListSubfieldEncoder { | |
38 | ||
39 | private final DictionaryHashTable hashTable; | |
40 | private final Dictionary dictionary; | |
41 | private final BufferAllocator allocator; | |
42 | ||
43 | public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator) { | |
44 | this (dictionary, allocator, SimpleHasher.INSTANCE); | |
45 | } | |
46 | ||
47 | /** | |
48 | * Construct an instance. | |
49 | */ | |
50 | public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator, ArrowBufHasher hasher) { | |
51 | this.dictionary = dictionary; | |
52 | this.allocator = allocator; | |
53 | BaseListVector dictVector = (BaseListVector) dictionary.getVector(); | |
54 | hashTable = new DictionaryHashTable(getDataVector(dictVector), hasher); | |
55 | } | |
56 | ||
57 | private FieldVector getDataVector(BaseListVector vector) { | |
58 | return vector.getChildrenFromFields().get(0); | |
59 | } | |
60 | ||
61 | private BaseListVector cloneVector(BaseListVector vector) { | |
62 | ||
63 | final FieldType fieldType = vector.getField().getFieldType(); | |
64 | BaseListVector cloned = (BaseListVector) fieldType.createNewSingleVector(vector.getField().getName(), | |
65 | allocator, /*schemaCallBack=*/null); | |
66 | ||
67 | final ArrowFieldNode fieldNode = new ArrowFieldNode(vector.getValueCount(), vector.getNullCount()); | |
68 | cloned.loadFieldBuffers(fieldNode, vector.getFieldBuffers()); | |
69 | ||
70 | return cloned; | |
71 | } | |
72 | ||
73 | /** | |
74 | * Dictionary encodes subfields for complex vector with a provided dictionary. | |
75 | * The dictionary must contain all values in the sub fields vector. | |
76 | * @param vector vector to encode | |
77 | * @return dictionary encoded vector | |
78 | */ | |
79 | public BaseListVector encodeListSubField(BaseListVector vector) { | |
80 | final int valueCount = vector.getValueCount(); | |
81 | ||
82 | FieldType indexFieldType = new FieldType(vector.getField().isNullable(), | |
83 | dictionary.getEncoding().getIndexType(), dictionary.getEncoding(), vector.getField().getMetadata()); | |
84 | Field valueField = new Field(vector.getField().getName(), indexFieldType, null); | |
85 | ||
86 | // clone list vector and initialize data vector | |
87 | BaseListVector encoded = cloneVector(vector); | |
88 | encoded.initializeChildrenFromFields(Collections.singletonList(valueField)); | |
89 | BaseIntVector indices = (BaseIntVector) getDataVector(encoded); | |
90 | ||
91 | ValueVector dataVector = getDataVector(vector); | |
92 | for (int i = 0; i < valueCount; i++) { | |
93 | if (!vector.isNull(i)) { | |
94 | int start = vector.getElementStartIndex(i); | |
95 | int end = vector.getElementEndIndex(i); | |
96 | ||
97 | DictionaryEncoder.buildIndexVector(dataVector, indices, hashTable, start, end); | |
98 | } | |
99 | } | |
100 | ||
101 | return encoded; | |
102 | } | |
103 | ||
104 | /** | |
105 | * Decodes a dictionary subfields encoded vector using the provided dictionary. | |
106 | * @param vector dictionary encoded vector, its data vector must be int type | |
107 | * @return vector with values restored from dictionary | |
108 | */ | |
109 | public BaseListVector decodeListSubField(BaseListVector vector) { | |
110 | ||
111 | int valueCount = vector.getValueCount(); | |
112 | BaseListVector dictionaryVector = (BaseListVector) dictionary.getVector(); | |
113 | int dictionaryValueCount = getDataVector(dictionaryVector).getValueCount(); | |
114 | ||
115 | // clone list vector and initialize data vector | |
116 | BaseListVector decoded = cloneVector(vector); | |
117 | Field dataVectorField = getDataVector(dictionaryVector).getField(); | |
118 | decoded.initializeChildrenFromFields(Collections.singletonList(dataVectorField)); | |
119 | ||
120 | // get data vector | |
121 | ValueVector dataVector = getDataVector(decoded); | |
122 | ||
123 | TransferPair transfer = getDataVector(dictionaryVector).makeTransferPair(dataVector); | |
124 | BaseIntVector indices = (BaseIntVector) getDataVector(vector); | |
125 | ||
126 | for (int i = 0; i < valueCount; i++) { | |
127 | ||
128 | if (!vector.isNull(i)) { | |
129 | int start = vector.getElementStartIndex(i); | |
130 | int end = vector.getElementEndIndex(i); | |
131 | ||
132 | DictionaryEncoder.retrieveIndexVector(indices, transfer, dictionaryValueCount, start, end); | |
133 | } | |
134 | } | |
135 | return decoded; | |
136 | } | |
137 | } |