git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/go/parquet/internal/encoding/byte_array_encoder.go
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / go / parquet / internal / encoding / byte_array_encoder.go
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16
17 package encoding
18
19 import (
20 "encoding/binary"
21 "unsafe"
22
23 "github.com/apache/arrow/go/v6/arrow"
24 "github.com/apache/arrow/go/v6/parquet"
25 "github.com/apache/arrow/go/v6/parquet/internal/utils"
26 )
27
28 // PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding
29 // by encoding the length as a int32 followed by the bytes of the value.
30 type PlainByteArrayEncoder struct {
31 encoder
32
33 bitSetReader utils.SetBitRunReader
34 }
35
36 // PutByteArray writes out the 4 bytes for the length followed by the data
37 func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) {
38 inc := val.Len() + arrow.Uint32SizeBytes
39 enc.sink.Reserve(inc)
40 vlen := utils.ToLEUint32(uint32(val.Len()))
41 enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:])
42 enc.sink.UnsafeWrite(val)
43 }
44
45 // Put writes out all of the values in this slice to the encoding sink
46 func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) {
47 for _, val := range in {
48 enc.PutByteArray(val)
49 }
50 }
51
52 // PutSpaced uses the bitmap of validBits to leave out anything that is null according
53 // to the bitmap.
54 //
55 // If validBits is nil, this is equivalent to calling Put
56 func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
57 if validBits != nil {
58 if enc.bitSetReader == nil {
59 enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in)))
60 } else {
61 enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in)))
62 }
63
64 for {
65 run := enc.bitSetReader.NextRun()
66 if run.Length == 0 {
67 break
68 }
69 enc.Put(in[int(run.Pos):int(run.Pos+run.Length)])
70 }
71 } else {
72 enc.Put(in)
73 }
74 }
75
76 // Type returns parquet.Types.ByteArray for the bytearray encoder
77 func (PlainByteArrayEncoder) Type() parquet.Type {
78 return parquet.Types.ByteArray
79 }
80
81 // WriteDict writes the dictionary out to the provided slice, out should be
82 // at least DictEncodedSize() bytes
83 func (enc *DictByteArrayEncoder) WriteDict(out []byte) {
84 enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) {
85 binary.LittleEndian.PutUint32(out, uint32(len(v)))
86 out = out[arrow.Uint32SizeBytes:]
87 copy(out, v)
88 out = out[len(v):]
89 })
90 }
91
92 // PutByteArray adds a single byte array to buffer, updating the dictionary
93 // and encoded size if it's a new value
94 func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) {
95 if in == nil {
96 in = empty[:]
97 }
98 memoIdx, found, err := enc.memo.GetOrInsert(in)
99 if err != nil {
100 panic(err)
101 }
102 if !found {
103 enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes
104 }
105 enc.addIndex(memoIdx)
106 }
107
108 // Put takes a slice of ByteArrays to add and encode.
109 func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) {
110 for _, val := range in {
111 enc.PutByteArray(val)
112 }
113 }
114
115 // PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0
116 func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
117 utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error {
118 for i := int64(0); i < length; i++ {
119 enc.PutByteArray(in[i+pos])
120 }
121 return nil
122 })
123 }