]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / java / vector / src / main / java / org / apache / arrow / vector / util / DictionaryUtility.java
CommitLineData
1d09f67e
TL
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18package org.apache.arrow.vector.util;
19
20import java.util.ArrayList;
21import java.util.List;
22import java.util.Map;
23import java.util.Set;
24
25import org.apache.arrow.memory.BufferAllocator;
26import org.apache.arrow.vector.FieldVector;
27import org.apache.arrow.vector.dictionary.Dictionary;
28import org.apache.arrow.vector.dictionary.DictionaryProvider;
29import org.apache.arrow.vector.types.pojo.ArrowType;
30import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
31import org.apache.arrow.vector.types.pojo.Field;
32import org.apache.arrow.vector.types.pojo.FieldType;
33
34/**
35 * Utility methods for working with Dictionaries used in Dictionary encodings.
36 */
37public class DictionaryUtility {
38 private DictionaryUtility() {}
39
40 /**
41 * Convert field and child fields that have a dictionary encoding to message format, so fields
42 * have the dictionary type.
43 *
44 * <p>NOTE: in the message format, fields have the dictionary type
45 * in the memory format, they have the index type
46 */
47 public static Field toMessageFormat(Field field, DictionaryProvider provider, Set<Long> dictionaryIdsUsed) {
48 if (!needConvertToMessageFormat(field)) {
49 return field;
50 }
51 DictionaryEncoding encoding = field.getDictionary();
52 List<Field> children;
53
54
55 ArrowType type;
56 if (encoding == null) {
57 type = field.getType();
58 children = field.getChildren();
59 } else {
60 long id = encoding.getId();
61 Dictionary dictionary = provider.lookup(id);
62 if (dictionary == null) {
63 throw new IllegalArgumentException("Could not find dictionary with ID " + id);
64 }
65 type = dictionary.getVectorType();
66 children = dictionary.getVector().getField().getChildren();
67
68 dictionaryIdsUsed.add(id);
69 }
70
71 final List<Field> updatedChildren = new ArrayList<>(children.size());
72 for (Field child : children) {
73 updatedChildren.add(toMessageFormat(child, provider, dictionaryIdsUsed));
74 }
75
76 return new Field(field.getName(), new FieldType(field.isNullable(), type, encoding, field.getMetadata()),
77 updatedChildren);
78 }
79
80 /**
81 * Checks if it is required to convert the field to message format.
82 * @param field the field to check.
83 * @return true if a conversion is required, and false otherwise.
84 */
85 public static boolean needConvertToMessageFormat(Field field) {
86 DictionaryEncoding encoding = field.getDictionary();
87
88 if (encoding != null) {
89 // when encoding is not null, the type must be determined from the
90 // dictionary, so conversion must be performed.
91 return true;
92 }
93
94 List<Field> children = field.getChildren();
95 for (Field child : children) {
96 if (needConvertToMessageFormat(child)) {
97 return true;
98 }
99 }
100 return false;
101 }
102
103 /**
104 * Convert field and child fields that have a dictionary encoding to memory format, so fields
105 * have the index type.
106 */
107 public static Field toMemoryFormat(Field field, BufferAllocator allocator, Map<Long, Dictionary> dictionaries) {
108 DictionaryEncoding encoding = field.getDictionary();
109 List<Field> children = field.getChildren();
110
111 if (encoding == null && children.isEmpty()) {
112 return field;
113 }
114
115 List<Field> updatedChildren = new ArrayList<>(children.size());
116 for (Field child : children) {
117 updatedChildren.add(toMemoryFormat(child, allocator, dictionaries));
118 }
119
120 ArrowType type;
121 List<Field> fieldChildren = null;
122 if (encoding == null) {
123 type = field.getType();
124 fieldChildren = updatedChildren;
125 } else {
126 // re-type the field for in-memory format
127 type = encoding.getIndexType();
128 if (type == null) {
129 type = new ArrowType.Int(32, true);
130 }
131 // get existing or create dictionary vector
132 if (!dictionaries.containsKey(encoding.getId())) {
133 // create a new dictionary vector for the values
134 String dictName = "DICT" + encoding.getId();
135 Field dictionaryField = new Field(dictName,
136 new FieldType(field.isNullable(), field.getType(), null, null), updatedChildren);
137 FieldVector dictionaryVector = dictionaryField.createVector(allocator);
138 dictionaries.put(encoding.getId(), new Dictionary(dictionaryVector, encoding));
139 }
140 }
141
142 return new Field(field.getName(), new FieldType(field.isNullable(), type, encoding, field.getMetadata()),
143 fieldChildren);
144 }
145}