]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.arrow.algorithm.deduplicate; | |
19 | ||
20 | import org.apache.arrow.memory.ArrowBuf; | |
21 | import org.apache.arrow.util.Preconditions; | |
22 | import org.apache.arrow.vector.BitVectorHelper; | |
23 | import org.apache.arrow.vector.IntVector; | |
24 | import org.apache.arrow.vector.ValueVector; | |
25 | import org.apache.arrow.vector.compare.Range; | |
26 | import org.apache.arrow.vector.compare.RangeEqualsVisitor; | |
27 | import org.apache.arrow.vector.util.DataSizeRoundingUtil; | |
28 | ||
29 | /** | |
30 | * Utilities for vector deduplication. | |
31 | */ | |
32 | class DeduplicationUtils { | |
33 | ||
34 | /** | |
35 | * Gets the start positions of the first distinct values in a vector. | |
36 | * @param vector the target vector. | |
37 | * @param runStarts the bit set to hold the start positions. | |
38 | * @param <V> vector type. | |
39 | */ | |
40 | public static <V extends ValueVector> void populateRunStartIndicators(V vector, ArrowBuf runStarts) { | |
41 | int bufSize = DataSizeRoundingUtil.divideBy8Ceil(vector.getValueCount()); | |
42 | Preconditions.checkArgument(runStarts.capacity() >= bufSize); | |
43 | runStarts.setZero(0, bufSize); | |
44 | ||
45 | BitVectorHelper.setBit(runStarts, 0); | |
46 | RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector, vector, null); | |
47 | Range range = new Range(0, 0, 1); | |
48 | for (int i = 1; i < vector.getValueCount(); i++) { | |
49 | range.setLeftStart(i).setRightStart(i - 1); | |
50 | if (!visitor.rangeEquals(range)) { | |
51 | BitVectorHelper.setBit(runStarts, i); | |
52 | } | |
53 | } | |
54 | } | |
55 | ||
56 | /** | |
57 | * Gets the run lengths, given the start positions. | |
58 | * @param runStarts the bit set for start positions. | |
59 | * @param runLengths the run length vector to populate. | |
60 | * @param valueCount the number of values in the bit set. | |
61 | */ | |
62 | public static void populateRunLengths(ArrowBuf runStarts, IntVector runLengths, int valueCount) { | |
63 | int curStart = 0; | |
64 | int lengthIndex = 0; | |
65 | for (int i = 1; i < valueCount; i++) { | |
66 | if (BitVectorHelper.get(runStarts, i) != 0) { | |
67 | // we get a new distinct value | |
68 | runLengths.setSafe(lengthIndex++, i - curStart); | |
69 | curStart = i; | |
70 | } | |
71 | } | |
72 | ||
73 | // process the last value | |
74 | runLengths.setSafe(lengthIndex++, valueCount - curStart); | |
75 | runLengths.setValueCount(lengthIndex); | |
76 | } | |
77 | ||
78 | /** | |
79 | * Gets distinct values from the input vector by removing adjacent | |
80 | * duplicated values. | |
81 | * @param indicators the bit set containing the start positions of distinct values. | |
82 | * @param inputVector the input vector. | |
83 | * @param outputVector the output vector. | |
84 | * @param <V> vector type. | |
85 | */ | |
86 | public static <V extends ValueVector> void populateDeduplicatedValues( | |
87 | ArrowBuf indicators, V inputVector, V outputVector) { | |
88 | int dstIdx = 0; | |
89 | for (int srcIdx = 0; srcIdx < inputVector.getValueCount(); srcIdx++) { | |
90 | if (BitVectorHelper.get(indicators, srcIdx) != 0) { | |
91 | outputVector.copyFromSafe(srcIdx, dstIdx++, inputVector); | |
92 | } | |
93 | } | |
94 | outputVector.setValueCount(dstIdx); | |
95 | } | |
96 | } |