]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.arrow.algorithm.deduplicate; | |
19 | ||
20 | import org.apache.arrow.memory.ArrowBuf; | |
21 | import org.apache.arrow.memory.BufferAllocator; | |
22 | import org.apache.arrow.util.Preconditions; | |
23 | import org.apache.arrow.vector.BitVectorHelper; | |
24 | import org.apache.arrow.vector.IntVector; | |
25 | import org.apache.arrow.vector.ValueVector; | |
26 | import org.apache.arrow.vector.util.DataSizeRoundingUtil; | |
27 | ||
28 | /** | |
29 | * Remove adjacent equal elements from a vector. | |
30 | * If the vector is sorted, it removes all duplicated values in the vector. | |
31 | * @param <V> vector type. | |
32 | */ | |
33 | public class VectorRunDeduplicator<V extends ValueVector> implements AutoCloseable { | |
34 | ||
35 | /** | |
36 | * Bit set for distinct values. | |
37 | * If the value at some index is not equal to the previous value, | |
38 | * its bit is set to 1, otherwise its bit is set to 0. | |
39 | */ | |
40 | private ArrowBuf distinctValueBuffer; | |
41 | ||
42 | /** | |
43 | * The vector to deduplicate. | |
44 | */ | |
45 | private final V vector; | |
46 | ||
47 | private final BufferAllocator allocator; | |
48 | ||
49 | /** | |
50 | * Constructs a vector run deduplicator for a given vector. | |
51 | * @param vector the vector to deduplicate. Ownership is NOT taken. | |
52 | * @param allocator the allocator used for allocating buffers for start indices. | |
53 | */ | |
54 | public VectorRunDeduplicator(V vector, BufferAllocator allocator) { | |
55 | this.vector = vector; | |
56 | this.allocator = allocator; | |
57 | } | |
58 | ||
59 | private void createDistinctValueBuffer() { | |
60 | Preconditions.checkArgument(distinctValueBuffer == null); | |
61 | int bufSize = DataSizeRoundingUtil.divideBy8Ceil(vector.getValueCount()); | |
62 | distinctValueBuffer = allocator.buffer(bufSize); | |
63 | DeduplicationUtils.populateRunStartIndicators(vector, distinctValueBuffer); | |
64 | } | |
65 | ||
66 | /** | |
67 | * Gets the number of values which are different from their predecessor. | |
68 | * @return the run count. | |
69 | */ | |
70 | public int getRunCount() { | |
71 | if (distinctValueBuffer == null) { | |
72 | createDistinctValueBuffer(); | |
73 | } | |
74 | return vector.getValueCount() - BitVectorHelper.getNullCount(distinctValueBuffer, vector.getValueCount()); | |
75 | } | |
76 | ||
77 | /** | |
78 | * Gets the vector with deduplicated adjacent values removed. | |
79 | * @param outVector the output vector. | |
80 | */ | |
81 | public void populateDeduplicatedValues(V outVector) { | |
82 | if (distinctValueBuffer == null) { | |
83 | createDistinctValueBuffer(); | |
84 | } | |
85 | ||
86 | DeduplicationUtils.populateDeduplicatedValues(distinctValueBuffer, vector, outVector); | |
87 | } | |
88 | ||
89 | /** | |
90 | * Gets the length of each distinct value. | |
91 | * @param lengthVector the vector for holding length values. | |
92 | */ | |
93 | public void populateRunLengths(IntVector lengthVector) { | |
94 | if (distinctValueBuffer == null) { | |
95 | createDistinctValueBuffer(); | |
96 | } | |
97 | ||
98 | DeduplicationUtils.populateRunLengths(distinctValueBuffer, lengthVector, vector.getValueCount()); | |
99 | } | |
100 | ||
101 | @Override | |
102 | public void close() { | |
103 | if (distinctValueBuffer != null) { | |
104 | distinctValueBuffer.close(); | |
105 | distinctValueBuffer = null; | |
106 | } | |
107 | } | |
108 | } |