]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/go/parquet/internal/encoding/encoding_benchmarks_test.go
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / go / parquet / internal / encoding / encoding_benchmarks_test.go
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package encoding_test
18
19import (
20 "fmt"
21 "math"
22 "testing"
23
24 "github.com/apache/arrow/go/v6/arrow"
25 "github.com/apache/arrow/go/v6/arrow/array"
26 "github.com/apache/arrow/go/v6/arrow/memory"
27 "github.com/apache/arrow/go/v6/parquet"
28 "github.com/apache/arrow/go/v6/parquet/internal/encoding"
29 "github.com/apache/arrow/go/v6/parquet/internal/hashing"
30 "github.com/apache/arrow/go/v6/parquet/internal/testutils"
31 "github.com/apache/arrow/go/v6/parquet/schema"
32)
33
const (
	// MINSIZE and MAXSIZE bound the value-slice sizes exercised by the
	// plain encoding/decoding benchmarks below; each benchmark doubles
	// the size from MINSIZE up to and including MAXSIZE.
	MINSIZE = 1024
	MAXSIZE = 65536
)
38
39func BenchmarkPlainEncodingBoolean(b *testing.B) {
40 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
41 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
42 values := make([]bool, sz)
43 for idx := range values {
44 values[idx] = true
45 }
46 encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain,
47 false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder)
48 b.ResetTimer()
49 b.SetBytes(int64(len(values)))
50 for n := 0; n < b.N; n++ {
51 encoder.Put(values)
52 buf, _ := encoder.FlushValues()
53 buf.Release()
54 }
55 })
56 }
57}
58
59func BenchmarkPlainEncodingInt32(b *testing.B) {
60 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
61 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
62 values := make([]int32, sz)
63 for idx := range values {
64 values[idx] = 64
65 }
66 encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain,
67 false, nil, memory.DefaultAllocator).(encoding.Int32Encoder)
68 b.ResetTimer()
69 b.SetBytes(int64(len(values) * arrow.Int32SizeBytes))
70 for n := 0; n < b.N; n++ {
71 encoder.Put(values)
72 buf, _ := encoder.FlushValues()
73 buf.Release()
74 }
75 })
76 }
77}
78
79func BenchmarkPlainEncodingInt64(b *testing.B) {
80 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
81 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
82 values := make([]int64, sz)
83 for idx := range values {
84 values[idx] = 64
85 }
86 encoder := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.Plain,
87 false, nil, memory.DefaultAllocator).(encoding.Int64Encoder)
88 b.ResetTimer()
89 b.SetBytes(int64(len(values) * arrow.Int64SizeBytes))
90 for n := 0; n < b.N; n++ {
91 encoder.Put(values)
92 buf, _ := encoder.FlushValues()
93 buf.Release()
94 }
95 })
96 }
97}
98
99func BenchmarkPlainEncodingFloat32(b *testing.B) {
100 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
101 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
102 values := make([]float32, sz)
103 for idx := range values {
104 values[idx] = 64.0
105 }
106 encoder := encoding.NewEncoder(parquet.Types.Float, parquet.Encodings.Plain,
107 false, nil, memory.DefaultAllocator).(encoding.Float32Encoder)
108 b.ResetTimer()
109 b.SetBytes(int64(len(values) * arrow.Float32SizeBytes))
110 for n := 0; n < b.N; n++ {
111 encoder.Put(values)
112 buf, _ := encoder.FlushValues()
113 buf.Release()
114 }
115 })
116 }
117}
118
119func BenchmarkPlainEncodingFloat64(b *testing.B) {
120 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
121 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
122 values := make([]float64, sz)
123 for idx := range values {
124 values[idx] = 64
125 }
126 encoder := encoding.NewEncoder(parquet.Types.Double, parquet.Encodings.Plain,
127 false, nil, memory.DefaultAllocator).(encoding.Float64Encoder)
128 b.ResetTimer()
129 b.SetBytes(int64(len(values) * arrow.Float64SizeBytes))
130 for n := 0; n < b.N; n++ {
131 encoder.Put(values)
132 buf, _ := encoder.FlushValues()
133 buf.Release()
134 }
135 })
136 }
137}
138
139func BenchmarkPlainDecodingBoolean(b *testing.B) {
140 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
141 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
142 output := make([]bool, sz)
143 values := make([]bool, sz)
144 for idx := range values {
145 values[idx] = true
146 }
147 encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain,
148 false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder)
149 encoder.Put(values)
150 buf, _ := encoder.FlushValues()
151 defer buf.Release()
152
153 decoder := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, nil, memory.DefaultAllocator)
154 b.ResetTimer()
155 b.SetBytes(int64(len(values)))
156 for n := 0; n < b.N; n++ {
157 decoder.SetData(sz, buf.Bytes())
158 decoder.(encoding.BooleanDecoder).Decode(output)
159 }
160 })
161 }
162}
163
164func BenchmarkPlainDecodingInt32(b *testing.B) {
165 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
166 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
167 output := make([]int32, sz)
168 values := make([]int32, sz)
169 for idx := range values {
170 values[idx] = 64
171 }
172 encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain,
173 false, nil, memory.DefaultAllocator).(encoding.Int32Encoder)
174 encoder.Put(values)
175 buf, _ := encoder.FlushValues()
176 defer buf.Release()
177
178 decoder := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.Plain, nil, memory.DefaultAllocator)
179 b.ResetTimer()
180 b.SetBytes(int64(len(values)))
181 for n := 0; n < b.N; n++ {
182 decoder.SetData(sz, buf.Bytes())
183 decoder.(encoding.Int32Decoder).Decode(output)
184 }
185 })
186 }
187}
188
189func BenchmarkMemoTableFloat64(b *testing.B) {
190 tests := []struct {
191 nunique int32
192 nvalues int64
193 }{
194 {100, 65535},
195 {1000, 65535},
196 {5000, 65535},
197 }
198
199 for _, tt := range tests {
200 b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) {
201 rag := testutils.NewRandomArrayGenerator(0)
202 dict := rag.Float64(int64(tt.nunique), 0)
203 indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0)
204
205 values := make([]float64, tt.nvalues)
206 for idx := range values {
207 values[idx] = dict.Value(int(indices.Value(idx)))
208 }
209
210 b.ResetTimer()
211 b.Run("go map", func(b *testing.B) {
212 for i := 0; i < b.N; i++ {
213 tbl := encoding.NewFloat64MemoTable(memory.DefaultAllocator)
214 for _, v := range values {
215 tbl.GetOrInsert(v)
216 }
217 if tbl.Size() != int(tt.nunique) {
218 b.Fatal(tbl.Size(), tt.nunique)
219 }
220 }
221 })
222 b.ResetTimer()
223 b.Run("xxh3", func(b *testing.B) {
224 for i := 0; i < b.N; i++ {
225 tbl := hashing.NewFloat64MemoTable(0)
226 for _, v := range values {
227 tbl.GetOrInsert(v)
228 }
229 if tbl.Size() != int(tt.nunique) {
230 b.Fatal(tbl.Size(), tt.nunique)
231 }
232 }
233 })
234 })
235 }
236}
237
238func BenchmarkMemoTableInt32(b *testing.B) {
239 tests := []struct {
240 nunique int32
241 nvalues int64
242 }{
243 {100, 65535},
244 {1000, 65535},
245 {5000, 65535},
246 }
247
248 for _, tt := range tests {
249 b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) {
250 rag := testutils.NewRandomArrayGenerator(0)
251 dict := rag.Int32(int64(tt.nunique), 0, math.MaxInt32-1, 0)
252 indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0)
253
254 values := make([]int32, tt.nvalues)
255 for idx := range values {
256 values[idx] = dict.Value(int(indices.Value(idx)))
257 }
258 b.ResetTimer()
259 b.Run("xxh3", func(b *testing.B) {
260 for i := 0; i < b.N; i++ {
261 tbl := hashing.NewInt32MemoTable(0)
262 for _, v := range values {
263 tbl.GetOrInsert(v)
264 }
265 if tbl.Size() != int(tt.nunique) {
266 b.Fatal(tbl.Size(), tt.nunique)
267 }
268 }
269 })
270
271 b.Run("go map", func(b *testing.B) {
272 for i := 0; i < b.N; i++ {
273 tbl := encoding.NewInt32MemoTable(memory.DefaultAllocator)
274 for _, v := range values {
275 tbl.GetOrInsert(v)
276 }
277 if tbl.Size() != int(tt.nunique) {
278 b.Fatal(tbl.Size(), tt.nunique)
279 }
280 }
281 })
282 })
283 }
284}
285
286func BenchmarkMemoTable(b *testing.B) {
287 tests := []struct {
288 nunique int32
289 minLen int32
290 maxLen int32
291 nvalues int64
292 }{
293 {100, 32, 32, 65535},
294 {100, 8, 32, 65535},
295 {1000, 32, 32, 65535},
296 {1000, 8, 32, 65535},
297 {5000, 32, 32, 65535},
298 {5000, 8, 32, 65535},
299 }
300
301 for _, tt := range tests {
302 b.Run(fmt.Sprintf("%d unique len %d-%d n %d", tt.nunique, tt.minLen, tt.maxLen, tt.nvalues), func(b *testing.B) {
303
304 rag := testutils.NewRandomArrayGenerator(0)
305 dict := rag.ByteArray(int64(tt.nunique), tt.minLen, tt.maxLen, 0).(*array.String)
306 indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0)
307
308 values := make([]parquet.ByteArray, tt.nvalues)
309 for idx := range values {
310 values[idx] = []byte(dict.Value(int(indices.Value(idx))))
311 }
312
313 b.ResetTimer()
314
315 b.Run("xxh3", func(b *testing.B) {
316 for i := 0; i < b.N; i++ {
317 tbl := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1)
318 for _, v := range values {
319 tbl.GetOrInsert(v)
320 }
321 if tbl.Size() != int(tt.nunique) {
322 b.Fatal(tbl.Size(), tt.nunique)
323 }
324 tbl.Release()
325 }
326 })
327 b.ResetTimer()
328 b.Run("go map", func(b *testing.B) {
329 for i := 0; i < b.N; i++ {
330 tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator)
331 for _, v := range values {
332 tbl.GetOrInsert(v)
333 }
334 if tbl.Size() != int(tt.nunique) {
335 b.Fatal(tbl.Size(), tt.nunique)
336 }
337 tbl.Release()
338 }
339 })
340 })
341 }
342}
343
344func BenchmarkMemoTableAllUnique(b *testing.B) {
345 tests := []struct {
346 minLen int32
347 maxLen int32
348 nvalues int64
349 }{
350 {32, 32, 1024},
351 {8, 32, 1024},
352 {32, 32, 32767},
353 {8, 32, 32767},
354 {32, 32, 65535},
355 {8, 32, 65535},
356 }
357 for _, tt := range tests {
358 b.Run(fmt.Sprintf("values %d len %d-%d", tt.nvalues, tt.minLen, tt.maxLen), func(b *testing.B) {
359
360 rag := testutils.NewRandomArrayGenerator(0)
361 dict := rag.ByteArray(tt.nvalues, tt.minLen, tt.maxLen, 0).(*array.String)
362
363 values := make([]parquet.ByteArray, tt.nvalues)
364 for idx := range values {
365 values[idx] = []byte(dict.Value(idx))
366 }
367
368 b.ResetTimer()
369 b.Run("go map", func(b *testing.B) {
370 for i := 0; i < b.N; i++ {
371 tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator)
372 for _, v := range values {
373 tbl.GetOrInsert(v)
374 }
375 if tbl.Size() != int(tt.nvalues) {
376 b.Fatal(tbl.Size(), tt.nvalues)
377 }
378 tbl.Release()
379 }
380 })
381
382 b.Run("xxh3", func(b *testing.B) {
383 for i := 0; i < b.N; i++ {
384 tbl := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1)
385 for _, v := range values {
386 tbl.GetOrInsert(v)
387 }
388 if tbl.Size() != int(tt.nvalues) {
389 b.Fatal(tbl.Size(), tt.nvalues)
390 }
391 tbl.Release()
392 }
393 })
394 })
395 }
396
397}
398
399func BenchmarkEncodeDictByteArray(b *testing.B) {
400 const (
401 nunique = 100
402 minLen = 8
403 maxLen = 32
404 nvalues = 65535
405 )
406
407 rag := testutils.NewRandomArrayGenerator(0)
408 dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String)
409 indices := rag.Int32(nvalues, 0, nunique-1, 0)
410
411 values := make([]parquet.ByteArray, nvalues)
412 for idx := range values {
413 values[idx] = []byte(dict.Value(int(indices.Value(idx))))
414 }
415 col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0)
416
417 out := make([]byte, nunique*(maxLen+arrow.Uint32SizeBytes))
418 b.ResetTimer()
419 for i := 0; i < b.N; i++ {
420 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder)
421 enc.Put(values)
422 enc.WriteDict(out)
423 }
424}
425
426func BenchmarkDecodeDictByteArray(b *testing.B) {
427 const (
428 nunique = 100
429 minLen = 32
430 maxLen = 32
431 nvalues = 65535
432 )
433
434 rag := testutils.NewRandomArrayGenerator(0)
435 dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String)
436 indices := rag.Int32(nvalues, 0, nunique-1, 0)
437
438 values := make([]parquet.ByteArray, nvalues)
439 for idx := range values {
440 values[idx] = []byte(dict.Value(int(indices.Value(idx))))
441 }
442
443 col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0)
444 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder)
445 enc.Put(values)
446
447 dictBuf := make([]byte, enc.DictEncodedSize())
448 enc.WriteDict(dictBuf)
449
450 idxBuf := make([]byte, enc.EstimatedDataEncodedSize())
451 enc.WriteIndices(idxBuf)
452
453 out := make([]parquet.ByteArray, nvalues)
454
455 b.ResetTimer()
456
457 for i := 0; i < b.N; i++ {
458 dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.Plain, col, memory.DefaultAllocator)
459 dec.SetData(nunique, dictBuf)
460 dictDec := encoding.NewDictDecoder(parquet.Types.ByteArray, col, memory.DefaultAllocator).(*encoding.DictByteArrayDecoder)
461 dictDec.SetDict(dec)
462 dictDec.SetData(nvalues, idxBuf)
463
464 dictDec.Decode(out)
465 }
466}