]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, software | |
12 | // distributed under the License is distributed on an "AS IS" BASIS, | |
13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | // See the License for the specific language governing permissions and | |
15 | // limitations under the License. | |
16 | ||
17 | package encoding_test | |
18 | ||
19 | import ( | |
20 | "fmt" | |
21 | "math" | |
22 | "testing" | |
23 | ||
24 | "github.com/apache/arrow/go/v6/arrow" | |
25 | "github.com/apache/arrow/go/v6/arrow/array" | |
26 | "github.com/apache/arrow/go/v6/arrow/memory" | |
27 | "github.com/apache/arrow/go/v6/parquet" | |
28 | "github.com/apache/arrow/go/v6/parquet/internal/encoding" | |
29 | "github.com/apache/arrow/go/v6/parquet/internal/hashing" | |
30 | "github.com/apache/arrow/go/v6/parquet/internal/testutils" | |
31 | "github.com/apache/arrow/go/v6/parquet/schema" | |
32 | ) | |
33 | ||
// Bounds, in number of values, of the input sizes swept by the plain
// encoding/decoding benchmarks in this file. Each sweep starts at MINSIZE and
// doubles until MAXSIZE (inclusive).
const (
	MINSIZE = 1 << 10 // 1024
	MAXSIZE = 1 << 16 // 65536
)
38 | ||
39 | func BenchmarkPlainEncodingBoolean(b *testing.B) { | |
40 | for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { | |
41 | b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { | |
42 | values := make([]bool, sz) | |
43 | for idx := range values { | |
44 | values[idx] = true | |
45 | } | |
46 | encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, | |
47 | false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) | |
48 | b.ResetTimer() | |
49 | b.SetBytes(int64(len(values))) | |
50 | for n := 0; n < b.N; n++ { | |
51 | encoder.Put(values) | |
52 | buf, _ := encoder.FlushValues() | |
53 | buf.Release() | |
54 | } | |
55 | }) | |
56 | } | |
57 | } | |
58 | ||
59 | func BenchmarkPlainEncodingInt32(b *testing.B) { | |
60 | for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { | |
61 | b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { | |
62 | values := make([]int32, sz) | |
63 | for idx := range values { | |
64 | values[idx] = 64 | |
65 | } | |
66 | encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, | |
67 | false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) | |
68 | b.ResetTimer() | |
69 | b.SetBytes(int64(len(values) * arrow.Int32SizeBytes)) | |
70 | for n := 0; n < b.N; n++ { | |
71 | encoder.Put(values) | |
72 | buf, _ := encoder.FlushValues() | |
73 | buf.Release() | |
74 | } | |
75 | }) | |
76 | } | |
77 | } | |
78 | ||
79 | func BenchmarkPlainEncodingInt64(b *testing.B) { | |
80 | for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { | |
81 | b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { | |
82 | values := make([]int64, sz) | |
83 | for idx := range values { | |
84 | values[idx] = 64 | |
85 | } | |
86 | encoder := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.Plain, | |
87 | false, nil, memory.DefaultAllocator).(encoding.Int64Encoder) | |
88 | b.ResetTimer() | |
89 | b.SetBytes(int64(len(values) * arrow.Int64SizeBytes)) | |
90 | for n := 0; n < b.N; n++ { | |
91 | encoder.Put(values) | |
92 | buf, _ := encoder.FlushValues() | |
93 | buf.Release() | |
94 | } | |
95 | }) | |
96 | } | |
97 | } | |
98 | ||
99 | func BenchmarkPlainEncodingFloat32(b *testing.B) { | |
100 | for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { | |
101 | b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { | |
102 | values := make([]float32, sz) | |
103 | for idx := range values { | |
104 | values[idx] = 64.0 | |
105 | } | |
106 | encoder := encoding.NewEncoder(parquet.Types.Float, parquet.Encodings.Plain, | |
107 | false, nil, memory.DefaultAllocator).(encoding.Float32Encoder) | |
108 | b.ResetTimer() | |
109 | b.SetBytes(int64(len(values) * arrow.Float32SizeBytes)) | |
110 | for n := 0; n < b.N; n++ { | |
111 | encoder.Put(values) | |
112 | buf, _ := encoder.FlushValues() | |
113 | buf.Release() | |
114 | } | |
115 | }) | |
116 | } | |
117 | } | |
118 | ||
119 | func BenchmarkPlainEncodingFloat64(b *testing.B) { | |
120 | for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { | |
121 | b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { | |
122 | values := make([]float64, sz) | |
123 | for idx := range values { | |
124 | values[idx] = 64 | |
125 | } | |
126 | encoder := encoding.NewEncoder(parquet.Types.Double, parquet.Encodings.Plain, | |
127 | false, nil, memory.DefaultAllocator).(encoding.Float64Encoder) | |
128 | b.ResetTimer() | |
129 | b.SetBytes(int64(len(values) * arrow.Float64SizeBytes)) | |
130 | for n := 0; n < b.N; n++ { | |
131 | encoder.Put(values) | |
132 | buf, _ := encoder.FlushValues() | |
133 | buf.Release() | |
134 | } | |
135 | }) | |
136 | } | |
137 | } | |
138 | ||
139 | func BenchmarkPlainDecodingBoolean(b *testing.B) { | |
140 | for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { | |
141 | b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { | |
142 | output := make([]bool, sz) | |
143 | values := make([]bool, sz) | |
144 | for idx := range values { | |
145 | values[idx] = true | |
146 | } | |
147 | encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, | |
148 | false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) | |
149 | encoder.Put(values) | |
150 | buf, _ := encoder.FlushValues() | |
151 | defer buf.Release() | |
152 | ||
153 | decoder := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, nil, memory.DefaultAllocator) | |
154 | b.ResetTimer() | |
155 | b.SetBytes(int64(len(values))) | |
156 | for n := 0; n < b.N; n++ { | |
157 | decoder.SetData(sz, buf.Bytes()) | |
158 | decoder.(encoding.BooleanDecoder).Decode(output) | |
159 | } | |
160 | }) | |
161 | } | |
162 | } | |
163 | ||
164 | func BenchmarkPlainDecodingInt32(b *testing.B) { | |
165 | for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { | |
166 | b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { | |
167 | output := make([]int32, sz) | |
168 | values := make([]int32, sz) | |
169 | for idx := range values { | |
170 | values[idx] = 64 | |
171 | } | |
172 | encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, | |
173 | false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) | |
174 | encoder.Put(values) | |
175 | buf, _ := encoder.FlushValues() | |
176 | defer buf.Release() | |
177 | ||
178 | decoder := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.Plain, nil, memory.DefaultAllocator) | |
179 | b.ResetTimer() | |
180 | b.SetBytes(int64(len(values))) | |
181 | for n := 0; n < b.N; n++ { | |
182 | decoder.SetData(sz, buf.Bytes()) | |
183 | decoder.(encoding.Int32Decoder).Decode(output) | |
184 | } | |
185 | }) | |
186 | } | |
187 | } | |
188 | ||
189 | func BenchmarkMemoTableFloat64(b *testing.B) { | |
190 | tests := []struct { | |
191 | nunique int32 | |
192 | nvalues int64 | |
193 | }{ | |
194 | {100, 65535}, | |
195 | {1000, 65535}, | |
196 | {5000, 65535}, | |
197 | } | |
198 | ||
199 | for _, tt := range tests { | |
200 | b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) { | |
201 | rag := testutils.NewRandomArrayGenerator(0) | |
202 | dict := rag.Float64(int64(tt.nunique), 0) | |
203 | indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) | |
204 | ||
205 | values := make([]float64, tt.nvalues) | |
206 | for idx := range values { | |
207 | values[idx] = dict.Value(int(indices.Value(idx))) | |
208 | } | |
209 | ||
210 | b.ResetTimer() | |
211 | b.Run("go map", func(b *testing.B) { | |
212 | for i := 0; i < b.N; i++ { | |
213 | tbl := encoding.NewFloat64MemoTable(memory.DefaultAllocator) | |
214 | for _, v := range values { | |
215 | tbl.GetOrInsert(v) | |
216 | } | |
217 | if tbl.Size() != int(tt.nunique) { | |
218 | b.Fatal(tbl.Size(), tt.nunique) | |
219 | } | |
220 | } | |
221 | }) | |
222 | b.ResetTimer() | |
223 | b.Run("xxh3", func(b *testing.B) { | |
224 | for i := 0; i < b.N; i++ { | |
225 | tbl := hashing.NewFloat64MemoTable(0) | |
226 | for _, v := range values { | |
227 | tbl.GetOrInsert(v) | |
228 | } | |
229 | if tbl.Size() != int(tt.nunique) { | |
230 | b.Fatal(tbl.Size(), tt.nunique) | |
231 | } | |
232 | } | |
233 | }) | |
234 | }) | |
235 | } | |
236 | } | |
237 | ||
238 | func BenchmarkMemoTableInt32(b *testing.B) { | |
239 | tests := []struct { | |
240 | nunique int32 | |
241 | nvalues int64 | |
242 | }{ | |
243 | {100, 65535}, | |
244 | {1000, 65535}, | |
245 | {5000, 65535}, | |
246 | } | |
247 | ||
248 | for _, tt := range tests { | |
249 | b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) { | |
250 | rag := testutils.NewRandomArrayGenerator(0) | |
251 | dict := rag.Int32(int64(tt.nunique), 0, math.MaxInt32-1, 0) | |
252 | indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) | |
253 | ||
254 | values := make([]int32, tt.nvalues) | |
255 | for idx := range values { | |
256 | values[idx] = dict.Value(int(indices.Value(idx))) | |
257 | } | |
258 | b.ResetTimer() | |
259 | b.Run("xxh3", func(b *testing.B) { | |
260 | for i := 0; i < b.N; i++ { | |
261 | tbl := hashing.NewInt32MemoTable(0) | |
262 | for _, v := range values { | |
263 | tbl.GetOrInsert(v) | |
264 | } | |
265 | if tbl.Size() != int(tt.nunique) { | |
266 | b.Fatal(tbl.Size(), tt.nunique) | |
267 | } | |
268 | } | |
269 | }) | |
270 | ||
271 | b.Run("go map", func(b *testing.B) { | |
272 | for i := 0; i < b.N; i++ { | |
273 | tbl := encoding.NewInt32MemoTable(memory.DefaultAllocator) | |
274 | for _, v := range values { | |
275 | tbl.GetOrInsert(v) | |
276 | } | |
277 | if tbl.Size() != int(tt.nunique) { | |
278 | b.Fatal(tbl.Size(), tt.nunique) | |
279 | } | |
280 | } | |
281 | }) | |
282 | }) | |
283 | } | |
284 | } | |
285 | ||
286 | func BenchmarkMemoTable(b *testing.B) { | |
287 | tests := []struct { | |
288 | nunique int32 | |
289 | minLen int32 | |
290 | maxLen int32 | |
291 | nvalues int64 | |
292 | }{ | |
293 | {100, 32, 32, 65535}, | |
294 | {100, 8, 32, 65535}, | |
295 | {1000, 32, 32, 65535}, | |
296 | {1000, 8, 32, 65535}, | |
297 | {5000, 32, 32, 65535}, | |
298 | {5000, 8, 32, 65535}, | |
299 | } | |
300 | ||
301 | for _, tt := range tests { | |
302 | b.Run(fmt.Sprintf("%d unique len %d-%d n %d", tt.nunique, tt.minLen, tt.maxLen, tt.nvalues), func(b *testing.B) { | |
303 | ||
304 | rag := testutils.NewRandomArrayGenerator(0) | |
305 | dict := rag.ByteArray(int64(tt.nunique), tt.minLen, tt.maxLen, 0).(*array.String) | |
306 | indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) | |
307 | ||
308 | values := make([]parquet.ByteArray, tt.nvalues) | |
309 | for idx := range values { | |
310 | values[idx] = []byte(dict.Value(int(indices.Value(idx)))) | |
311 | } | |
312 | ||
313 | b.ResetTimer() | |
314 | ||
315 | b.Run("xxh3", func(b *testing.B) { | |
316 | for i := 0; i < b.N; i++ { | |
317 | tbl := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1) | |
318 | for _, v := range values { | |
319 | tbl.GetOrInsert(v) | |
320 | } | |
321 | if tbl.Size() != int(tt.nunique) { | |
322 | b.Fatal(tbl.Size(), tt.nunique) | |
323 | } | |
324 | tbl.Release() | |
325 | } | |
326 | }) | |
327 | b.ResetTimer() | |
328 | b.Run("go map", func(b *testing.B) { | |
329 | for i := 0; i < b.N; i++ { | |
330 | tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator) | |
331 | for _, v := range values { | |
332 | tbl.GetOrInsert(v) | |
333 | } | |
334 | if tbl.Size() != int(tt.nunique) { | |
335 | b.Fatal(tbl.Size(), tt.nunique) | |
336 | } | |
337 | tbl.Release() | |
338 | } | |
339 | }) | |
340 | }) | |
341 | } | |
342 | } | |
343 | ||
344 | func BenchmarkMemoTableAllUnique(b *testing.B) { | |
345 | tests := []struct { | |
346 | minLen int32 | |
347 | maxLen int32 | |
348 | nvalues int64 | |
349 | }{ | |
350 | {32, 32, 1024}, | |
351 | {8, 32, 1024}, | |
352 | {32, 32, 32767}, | |
353 | {8, 32, 32767}, | |
354 | {32, 32, 65535}, | |
355 | {8, 32, 65535}, | |
356 | } | |
357 | for _, tt := range tests { | |
358 | b.Run(fmt.Sprintf("values %d len %d-%d", tt.nvalues, tt.minLen, tt.maxLen), func(b *testing.B) { | |
359 | ||
360 | rag := testutils.NewRandomArrayGenerator(0) | |
361 | dict := rag.ByteArray(tt.nvalues, tt.minLen, tt.maxLen, 0).(*array.String) | |
362 | ||
363 | values := make([]parquet.ByteArray, tt.nvalues) | |
364 | for idx := range values { | |
365 | values[idx] = []byte(dict.Value(idx)) | |
366 | } | |
367 | ||
368 | b.ResetTimer() | |
369 | b.Run("go map", func(b *testing.B) { | |
370 | for i := 0; i < b.N; i++ { | |
371 | tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator) | |
372 | for _, v := range values { | |
373 | tbl.GetOrInsert(v) | |
374 | } | |
375 | if tbl.Size() != int(tt.nvalues) { | |
376 | b.Fatal(tbl.Size(), tt.nvalues) | |
377 | } | |
378 | tbl.Release() | |
379 | } | |
380 | }) | |
381 | ||
382 | b.Run("xxh3", func(b *testing.B) { | |
383 | for i := 0; i < b.N; i++ { | |
384 | tbl := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1) | |
385 | for _, v := range values { | |
386 | tbl.GetOrInsert(v) | |
387 | } | |
388 | if tbl.Size() != int(tt.nvalues) { | |
389 | b.Fatal(tbl.Size(), tt.nvalues) | |
390 | } | |
391 | tbl.Release() | |
392 | } | |
393 | }) | |
394 | }) | |
395 | } | |
396 | ||
397 | } | |
398 | ||
399 | func BenchmarkEncodeDictByteArray(b *testing.B) { | |
400 | const ( | |
401 | nunique = 100 | |
402 | minLen = 8 | |
403 | maxLen = 32 | |
404 | nvalues = 65535 | |
405 | ) | |
406 | ||
407 | rag := testutils.NewRandomArrayGenerator(0) | |
408 | dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String) | |
409 | indices := rag.Int32(nvalues, 0, nunique-1, 0) | |
410 | ||
411 | values := make([]parquet.ByteArray, nvalues) | |
412 | for idx := range values { | |
413 | values[idx] = []byte(dict.Value(int(indices.Value(idx)))) | |
414 | } | |
415 | col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) | |
416 | ||
417 | out := make([]byte, nunique*(maxLen+arrow.Uint32SizeBytes)) | |
418 | b.ResetTimer() | |
419 | for i := 0; i < b.N; i++ { | |
420 | enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder) | |
421 | enc.Put(values) | |
422 | enc.WriteDict(out) | |
423 | } | |
424 | } | |
425 | ||
426 | func BenchmarkDecodeDictByteArray(b *testing.B) { | |
427 | const ( | |
428 | nunique = 100 | |
429 | minLen = 32 | |
430 | maxLen = 32 | |
431 | nvalues = 65535 | |
432 | ) | |
433 | ||
434 | rag := testutils.NewRandomArrayGenerator(0) | |
435 | dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String) | |
436 | indices := rag.Int32(nvalues, 0, nunique-1, 0) | |
437 | ||
438 | values := make([]parquet.ByteArray, nvalues) | |
439 | for idx := range values { | |
440 | values[idx] = []byte(dict.Value(int(indices.Value(idx)))) | |
441 | } | |
442 | ||
443 | col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) | |
444 | enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder) | |
445 | enc.Put(values) | |
446 | ||
447 | dictBuf := make([]byte, enc.DictEncodedSize()) | |
448 | enc.WriteDict(dictBuf) | |
449 | ||
450 | idxBuf := make([]byte, enc.EstimatedDataEncodedSize()) | |
451 | enc.WriteIndices(idxBuf) | |
452 | ||
453 | out := make([]parquet.ByteArray, nvalues) | |
454 | ||
455 | b.ResetTimer() | |
456 | ||
457 | for i := 0; i < b.N; i++ { | |
458 | dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.Plain, col, memory.DefaultAllocator) | |
459 | dec.SetData(nunique, dictBuf) | |
460 | dictDec := encoding.NewDictDecoder(parquet.Types.ByteArray, col, memory.DefaultAllocator).(*encoding.DictByteArrayDecoder) | |
461 | dictDec.SetDict(dec) | |
462 | dictDec.SetData(nvalues, idxBuf) | |
463 | ||
464 | dictDec.Decode(out) | |
465 | } | |
466 | } |