]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
// This module implements helper functions for the Parquet tests: locating the
// test data directories through the PARQUET_TEST_DATA environment variable and
// filling caller-provided buffers with seeded pseudo-random values.
21 | ||
22 | #include "parquet/test_util.h" | |
23 | ||
24 | #include <algorithm> | |
25 | #include <chrono> | |
26 | #include <limits> | |
27 | #include <memory> | |
28 | #include <random> | |
29 | #include <string> | |
30 | #include <utility> | |
31 | #include <vector> | |
32 | ||
33 | #include "parquet/column_page.h" | |
34 | #include "parquet/column_reader.h" | |
35 | #include "parquet/column_writer.h" | |
36 | #include "parquet/encoding.h" | |
37 | #include "parquet/platform.h" | |
38 | ||
39 | namespace parquet { | |
40 | namespace test { | |
41 | ||
42 | const char* get_data_dir() { | |
43 | const auto result = std::getenv("PARQUET_TEST_DATA"); | |
44 | if (!result || !result[0]) { | |
45 | throw ParquetTestException( | |
46 | "Please point the PARQUET_TEST_DATA environment " | |
47 | "variable to the test data directory"); | |
48 | } | |
49 | return result; | |
50 | } | |
51 | ||
52 | std::string get_bad_data_dir() { | |
53 | // PARQUET_TEST_DATA should point to ARROW_HOME/cpp/submodules/parquet-testing/data | |
54 | // so need to reach one folder up to access the "bad_data" folder. | |
55 | std::string data_dir(get_data_dir()); | |
56 | std::stringstream ss; | |
57 | ss << data_dir << "/../bad_data"; | |
58 | return ss.str(); | |
59 | } | |
60 | ||
61 | std::string get_data_file(const std::string& filename, bool is_good) { | |
62 | std::stringstream ss; | |
63 | ||
64 | if (is_good) { | |
65 | ss << get_data_dir(); | |
66 | } else { | |
67 | ss << get_bad_data_dir(); | |
68 | } | |
69 | ||
70 | ss << "/" << filename; | |
71 | return ss.str(); | |
72 | } | |
73 | ||
// Resizes `*out` to `n` elements and fills it with pseudo-random bytes.
//
// The values are drawn in order from a std::default_random_engine seeded with
// `seed`, using a uniform integer distribution over [0, 255].
void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out) {
  std::default_random_engine engine(seed);
  std::uniform_int_distribution<int> byte_dist(0, 255);

  out->resize(n);
  std::generate(out->begin(), out->end(),
                [&]() { return static_cast<uint8_t>(byte_dist(engine)); });
}
83 | ||
// Writes `n` pseudo-random booleans to out[0..n-1].
//
// Each value comes from a std::bernoulli_distribution constructed with `p`,
// driven by a std::default_random_engine seeded with `seed`. Nothing is
// written when `n` is not positive.
void random_bools(int n, double p, uint32_t seed, bool* out) {
  std::default_random_engine engine(seed);
  std::bernoulli_distribution coin(p);
  std::generate_n(out, n, [&]() { return coin(engine); });
}
91 | ||
92 | void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value, | |
93 | Int96* out) { | |
94 | std::default_random_engine gen(seed); | |
95 | std::uniform_int_distribution<int32_t> d(min_value, max_value); | |
96 | for (int i = 0; i < n; ++i) { | |
97 | out[i].value[0] = d(gen); | |
98 | out[i].value[1] = d(gen); | |
99 | out[i].value[2] = d(gen); | |
100 | } | |
101 | } | |
102 | ||
103 | void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) { | |
104 | std::default_random_engine gen(seed); | |
105 | std::uniform_int_distribution<int> d(0, 255); | |
106 | for (int i = 0; i < n; ++i) { | |
107 | out[i].ptr = buf; | |
108 | for (int j = 0; j < len; ++j) { | |
109 | buf[j] = static_cast<uint8_t>(d(gen)); | |
110 | } | |
111 | buf += len; | |
112 | } | |
113 | } | |
114 | ||
115 | void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size, | |
116 | int max_size) { | |
117 | std::default_random_engine gen(seed); | |
118 | std::uniform_int_distribution<int> d1(min_size, max_size); | |
119 | std::uniform_int_distribution<int> d2(0, 255); | |
120 | for (int i = 0; i < n; ++i) { | |
121 | int len = d1(gen); | |
122 | out[i].len = len; | |
123 | out[i].ptr = buf; | |
124 | for (int j = 0; j < len; ++j) { | |
125 | buf[j] = static_cast<uint8_t>(d2(gen)); | |
126 | } | |
127 | buf += len; | |
128 | } | |
129 | } | |
130 | ||
131 | void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size) { | |
132 | random_byte_array(n, seed, buf, out, 0, max_size); | |
133 | } | |
134 | ||
135 | } // namespace test | |
136 | } // namespace parquet |