]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/java/algorithm/src/main/java/org/apache/arrow/algorithm/deduplicate/DeduplicationUtils.java
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / java / algorithm / src / main / java / org / apache / arrow / algorithm / deduplicate / DeduplicationUtils.java
CommitLineData
1d09f67e
TL
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18package org.apache.arrow.algorithm.deduplicate;
19
20import org.apache.arrow.memory.ArrowBuf;
21import org.apache.arrow.util.Preconditions;
22import org.apache.arrow.vector.BitVectorHelper;
23import org.apache.arrow.vector.IntVector;
24import org.apache.arrow.vector.ValueVector;
25import org.apache.arrow.vector.compare.Range;
26import org.apache.arrow.vector.compare.RangeEqualsVisitor;
27import org.apache.arrow.vector.util.DataSizeRoundingUtil;
28
29/**
30 * Utilities for vector deduplication.
31 */
32class DeduplicationUtils {
33
34 /**
35 * Gets the start positions of the first distinct values in a vector.
36 * @param vector the target vector.
37 * @param runStarts the bit set to hold the start positions.
38 * @param <V> vector type.
39 */
40 public static <V extends ValueVector> void populateRunStartIndicators(V vector, ArrowBuf runStarts) {
41 int bufSize = DataSizeRoundingUtil.divideBy8Ceil(vector.getValueCount());
42 Preconditions.checkArgument(runStarts.capacity() >= bufSize);
43 runStarts.setZero(0, bufSize);
44
45 BitVectorHelper.setBit(runStarts, 0);
46 RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector, vector, null);
47 Range range = new Range(0, 0, 1);
48 for (int i = 1; i < vector.getValueCount(); i++) {
49 range.setLeftStart(i).setRightStart(i - 1);
50 if (!visitor.rangeEquals(range)) {
51 BitVectorHelper.setBit(runStarts, i);
52 }
53 }
54 }
55
56 /**
57 * Gets the run lengths, given the start positions.
58 * @param runStarts the bit set for start positions.
59 * @param runLengths the run length vector to populate.
60 * @param valueCount the number of values in the bit set.
61 */
62 public static void populateRunLengths(ArrowBuf runStarts, IntVector runLengths, int valueCount) {
63 int curStart = 0;
64 int lengthIndex = 0;
65 for (int i = 1; i < valueCount; i++) {
66 if (BitVectorHelper.get(runStarts, i) != 0) {
67 // we get a new distinct value
68 runLengths.setSafe(lengthIndex++, i - curStart);
69 curStart = i;
70 }
71 }
72
73 // process the last value
74 runLengths.setSafe(lengthIndex++, valueCount - curStart);
75 runLengths.setValueCount(lengthIndex);
76 }
77
78 /**
79 * Gets distinct values from the input vector by removing adjacent
80 * duplicated values.
81 * @param indicators the bit set containing the start positions of distinct values.
82 * @param inputVector the input vector.
83 * @param outputVector the output vector.
84 * @param <V> vector type.
85 */
86 public static <V extends ValueVector> void populateDeduplicatedValues(
87 ArrowBuf indicators, V inputVector, V outputVector) {
88 int dstIdx = 0;
89 for (int srcIdx = 0; srcIdx < inputVector.getValueCount(); srcIdx++) {
90 if (BitVectorHelper.get(indicators, srcIdx) != 0) {
91 outputVector.copyFromSafe(srcIdx, dstIdx++, inputVector);
92 }
93 }
94 outputVector.setValueCount(dstIdx);
95 }
96}