]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/java/algorithm/src/main/java/org/apache/arrow/algorithm/deduplicate/VectorRunDeduplicator.java
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / java / algorithm / src / main / java / org / apache / arrow / algorithm / deduplicate / VectorRunDeduplicator.java
CommitLineData
1d09f67e
TL
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18package org.apache.arrow.algorithm.deduplicate;
19
20import org.apache.arrow.memory.ArrowBuf;
21import org.apache.arrow.memory.BufferAllocator;
22import org.apache.arrow.util.Preconditions;
23import org.apache.arrow.vector.BitVectorHelper;
24import org.apache.arrow.vector.IntVector;
25import org.apache.arrow.vector.ValueVector;
26import org.apache.arrow.vector.util.DataSizeRoundingUtil;
27
28/**
29 * Remove adjacent equal elements from a vector.
30 * If the vector is sorted, it removes all duplicated values in the vector.
31 * @param <V> vector type.
32 */
33public class VectorRunDeduplicator<V extends ValueVector> implements AutoCloseable {
34
35 /**
36 * Bit set for distinct values.
37 * If the value at some index is not equal to the previous value,
38 * its bit is set to 1, otherwise its bit is set to 0.
39 */
40 private ArrowBuf distinctValueBuffer;
41
42 /**
43 * The vector to deduplicate.
44 */
45 private final V vector;
46
47 private final BufferAllocator allocator;
48
49 /**
50 * Constructs a vector run deduplicator for a given vector.
51 * @param vector the vector to deduplicate. Ownership is NOT taken.
52 * @param allocator the allocator used for allocating buffers for start indices.
53 */
54 public VectorRunDeduplicator(V vector, BufferAllocator allocator) {
55 this.vector = vector;
56 this.allocator = allocator;
57 }
58
59 private void createDistinctValueBuffer() {
60 Preconditions.checkArgument(distinctValueBuffer == null);
61 int bufSize = DataSizeRoundingUtil.divideBy8Ceil(vector.getValueCount());
62 distinctValueBuffer = allocator.buffer(bufSize);
63 DeduplicationUtils.populateRunStartIndicators(vector, distinctValueBuffer);
64 }
65
66 /**
67 * Gets the number of values which are different from their predecessor.
68 * @return the run count.
69 */
70 public int getRunCount() {
71 if (distinctValueBuffer == null) {
72 createDistinctValueBuffer();
73 }
74 return vector.getValueCount() - BitVectorHelper.getNullCount(distinctValueBuffer, vector.getValueCount());
75 }
76
77 /**
78 * Gets the vector with deduplicated adjacent values removed.
79 * @param outVector the output vector.
80 */
81 public void populateDeduplicatedValues(V outVector) {
82 if (distinctValueBuffer == null) {
83 createDistinctValueBuffer();
84 }
85
86 DeduplicationUtils.populateDeduplicatedValues(distinctValueBuffer, vector, outVector);
87 }
88
89 /**
90 * Gets the length of each distinct value.
91 * @param lengthVector the vector for holding length values.
92 */
93 public void populateRunLengths(IntVector lengthVector) {
94 if (distinctValueBuffer == null) {
95 createDistinctValueBuffer();
96 }
97
98 DeduplicationUtils.populateRunLengths(distinctValueBuffer, lengthVector, vector.getValueCount());
99 }
100
101 @Override
102 public void close() {
103 if (distinctValueBuffer != null) {
104 distinctValueBuffer.close();
105 distinctValueBuffer = null;
106 }
107 }
108}