]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/go/parquet/internal/utils/bit_run_reader.go
97349f1b27a122c63f18653033a47fcc13db00ea
[ceph.git] / ceph / src / arrow / go / parquet / internal / utils / bit_run_reader.go
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16
17 package utils
18
19 import (
20 "encoding/binary"
21 "fmt"
22 "math/bits"
23 "unsafe"
24
25 "github.com/apache/arrow/go/v6/arrow"
26 "github.com/apache/arrow/go/v6/arrow/bitutil"
27 )
28
29 // BitRun represents a run of bits with the same value of length Len
30 // with Set representing if the group of bits were 1 or 0.
31 type BitRun struct {
32 Len int64
33 Set bool
34 }
35
36 // BitRunReader is an interface that is usable by multiple callers to provide
37 // multiple types of bit run readers such as a reverse reader and so on.
38 //
39 // It's a convenience interface for counting contiguous set/unset bits in a bitmap.
40 // In places where BitBlockCounter can be used, then it would be preferred to use that
41 // as it would be faster than using BitRunReader.
42 type BitRunReader interface {
43 NextRun() BitRun
44 }
45
46 func (b BitRun) String() string {
47 return fmt.Sprintf("{Length: %d, set=%t}", b.Len, b.Set)
48 }
49
50 type bitRunReader struct {
51 bitmap []byte
52 pos int64
53 length int64
54 word uint64
55 curRunBitSet bool
56 }
57
58 // NewBitRunReader returns a reader for the given bitmap, offset and length that
59 // grabs runs of the same value bit at a time for easy iteration.
60 func NewBitRunReader(bitmap []byte, offset int64, length int64) BitRunReader {
61 ret := &bitRunReader{
62 bitmap: bitmap[offset/8:],
63 pos: offset % 8,
64 length: (offset % 8) + length,
65 }
66
67 if length == 0 {
68 return ret
69 }
70
71 ret.curRunBitSet = bitutil.BitIsNotSet(bitmap, int(offset))
72 bitsRemaining := length + ret.pos
73 ret.loadWord(bitsRemaining)
74 ret.word = ret.word &^ LeastSignificantBitMask(ret.pos)
75 return ret
76 }
77
78 // NextRun returns a new BitRun containing the number of contiguous bits with the
79 // same value. Len == 0 indicates the end of the bitmap.
80 func (b *bitRunReader) NextRun() BitRun {
81 if b.pos >= b.length {
82 return BitRun{0, false}
83 }
84
85 // This implementation relies on a efficient implementations of
86 // CountTrailingZeros and assumes that runs are more often then
87 // not. The logic is to incrementally find the next bit change
88 // from the current position. This is done by zeroing all
89 // bits in word_ up to position_ and using the TrailingZeroCount
90 // to find the index of the next set bit.
91
92 // The runs alternate on each call, so flip the bit.
93 b.curRunBitSet = !b.curRunBitSet
94
95 start := b.pos
96 startOffset := start & 63
97
98 // Invert the word for proper use of CountTrailingZeros and
99 // clear bits so CountTrailingZeros can do it magic.
100 b.word = ^b.word &^ LeastSignificantBitMask(startOffset)
101
102 // Go forward until the next change from unset to set.
103 newbits := int64(bits.TrailingZeros64(b.word)) - startOffset
104 b.pos += newbits
105
106 if IsMultipleOf64(b.pos) && b.pos < b.length {
107 b.advanceUntilChange()
108 }
109 return BitRun{b.pos - start, b.curRunBitSet}
110 }
111
112 func (b *bitRunReader) advanceUntilChange() {
113 newbits := int64(0)
114 for {
115 b.bitmap = b.bitmap[arrow.Uint64SizeBytes:]
116 b.loadNextWord()
117 newbits = int64(bits.TrailingZeros64(b.word))
118 b.pos += newbits
119 if !IsMultipleOf64(b.pos) || b.pos >= b.length || newbits <= 0 {
120 break
121 }
122 }
123 }
124
125 func (b *bitRunReader) loadNextWord() {
126 b.loadWord(b.length - b.pos)
127 }
128
129 func (b *bitRunReader) loadWord(bitsRemaining int64) {
130 b.word = 0
131 if bitsRemaining >= 64 {
132 b.word = binary.LittleEndian.Uint64(b.bitmap)
133 } else {
134 nbytes := bitutil.BytesForBits(bitsRemaining)
135 wordptr := (*(*[8]byte)(unsafe.Pointer(&b.word)))[:]
136 copy(wordptr, b.bitmap[:nbytes])
137
138 bitutil.SetBitTo(wordptr, int(bitsRemaining), bitutil.BitIsNotSet(wordptr, int(bitsRemaining-1)))
139 // reset the value to little endian for big endian architectures
140 b.word = ToLEUint64(b.word)
141 }
142
143 // Two cases:
144 // 1. For unset, CountTrailingZeros works naturally so we don't
145 // invert the word.
146 // 2. Otherwise invert so we can use CountTrailingZeros.
147 if b.curRunBitSet {
148 b.word = ^b.word
149 }
150 }