]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.arrow.vector.util; | |
19 | ||
20 | import java.util.ArrayList; | |
21 | import java.util.List; | |
22 | import java.util.Map; | |
23 | import java.util.Set; | |
24 | ||
25 | import org.apache.arrow.memory.BufferAllocator; | |
26 | import org.apache.arrow.vector.FieldVector; | |
27 | import org.apache.arrow.vector.dictionary.Dictionary; | |
28 | import org.apache.arrow.vector.dictionary.DictionaryProvider; | |
29 | import org.apache.arrow.vector.types.pojo.ArrowType; | |
30 | import org.apache.arrow.vector.types.pojo.DictionaryEncoding; | |
31 | import org.apache.arrow.vector.types.pojo.Field; | |
32 | import org.apache.arrow.vector.types.pojo.FieldType; | |
33 | ||
34 | /** | |
35 | * Utility methods for working with Dictionaries used in Dictionary encodings. | |
36 | */ | |
37 | public class DictionaryUtility { | |
38 | private DictionaryUtility() {} | |
39 | ||
40 | /** | |
41 | * Convert field and child fields that have a dictionary encoding to message format, so fields | |
42 | * have the dictionary type. | |
43 | * | |
44 | * <p>NOTE: in the message format, fields have the dictionary type | |
45 | * in the memory format, they have the index type | |
46 | */ | |
47 | public static Field toMessageFormat(Field field, DictionaryProvider provider, Set<Long> dictionaryIdsUsed) { | |
48 | if (!needConvertToMessageFormat(field)) { | |
49 | return field; | |
50 | } | |
51 | DictionaryEncoding encoding = field.getDictionary(); | |
52 | List<Field> children; | |
53 | ||
54 | ||
55 | ArrowType type; | |
56 | if (encoding == null) { | |
57 | type = field.getType(); | |
58 | children = field.getChildren(); | |
59 | } else { | |
60 | long id = encoding.getId(); | |
61 | Dictionary dictionary = provider.lookup(id); | |
62 | if (dictionary == null) { | |
63 | throw new IllegalArgumentException("Could not find dictionary with ID " + id); | |
64 | } | |
65 | type = dictionary.getVectorType(); | |
66 | children = dictionary.getVector().getField().getChildren(); | |
67 | ||
68 | dictionaryIdsUsed.add(id); | |
69 | } | |
70 | ||
71 | final List<Field> updatedChildren = new ArrayList<>(children.size()); | |
72 | for (Field child : children) { | |
73 | updatedChildren.add(toMessageFormat(child, provider, dictionaryIdsUsed)); | |
74 | } | |
75 | ||
76 | return new Field(field.getName(), new FieldType(field.isNullable(), type, encoding, field.getMetadata()), | |
77 | updatedChildren); | |
78 | } | |
79 | ||
80 | /** | |
81 | * Checks if it is required to convert the field to message format. | |
82 | * @param field the field to check. | |
83 | * @return true if a conversion is required, and false otherwise. | |
84 | */ | |
85 | public static boolean needConvertToMessageFormat(Field field) { | |
86 | DictionaryEncoding encoding = field.getDictionary(); | |
87 | ||
88 | if (encoding != null) { | |
89 | // when encoding is not null, the type must be determined from the | |
90 | // dictionary, so conversion must be performed. | |
91 | return true; | |
92 | } | |
93 | ||
94 | List<Field> children = field.getChildren(); | |
95 | for (Field child : children) { | |
96 | if (needConvertToMessageFormat(child)) { | |
97 | return true; | |
98 | } | |
99 | } | |
100 | return false; | |
101 | } | |
102 | ||
103 | /** | |
104 | * Convert field and child fields that have a dictionary encoding to memory format, so fields | |
105 | * have the index type. | |
106 | */ | |
107 | public static Field toMemoryFormat(Field field, BufferAllocator allocator, Map<Long, Dictionary> dictionaries) { | |
108 | DictionaryEncoding encoding = field.getDictionary(); | |
109 | List<Field> children = field.getChildren(); | |
110 | ||
111 | if (encoding == null && children.isEmpty()) { | |
112 | return field; | |
113 | } | |
114 | ||
115 | List<Field> updatedChildren = new ArrayList<>(children.size()); | |
116 | for (Field child : children) { | |
117 | updatedChildren.add(toMemoryFormat(child, allocator, dictionaries)); | |
118 | } | |
119 | ||
120 | ArrowType type; | |
121 | List<Field> fieldChildren = null; | |
122 | if (encoding == null) { | |
123 | type = field.getType(); | |
124 | fieldChildren = updatedChildren; | |
125 | } else { | |
126 | // re-type the field for in-memory format | |
127 | type = encoding.getIndexType(); | |
128 | if (type == null) { | |
129 | type = new ArrowType.Int(32, true); | |
130 | } | |
131 | // get existing or create dictionary vector | |
132 | if (!dictionaries.containsKey(encoding.getId())) { | |
133 | // create a new dictionary vector for the values | |
134 | String dictName = "DICT" + encoding.getId(); | |
135 | Field dictionaryField = new Field(dictName, | |
136 | new FieldType(field.isNullable(), field.getType(), null, null), updatedChildren); | |
137 | FieldVector dictionaryVector = dictionaryField.createVector(allocator); | |
138 | dictionaries.put(encoding.getId(), new Dictionary(dictionaryVector, encoding)); | |
139 | } | |
140 | } | |
141 | ||
142 | return new Field(field.getName(), new FieldType(field.isNullable(), type, encoding, field.getMetadata()), | |
143 | fieldChildren); | |
144 | } | |
145 | } |