]> git.proxmox.com Git - ceph.git/blob - ceph/src/s3select/include/s3select_csv_parser.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / s3select / include / s3select_csv_parser.h
1 #include "csvparser/csv.h"
2
3 namespace io{
4
5 namespace error{
6 struct escaped_char_missing :
7 base,
8 with_file_name,
9 with_file_line{
10 void format_error_message()const override{
11 std::snprintf(error_message_buffer, sizeof(error_message_buffer),
12 "Escaped character missing in line %d in file \"%s\"."
13 , file_line, file_name);
14 }
15 };
16 }
17
18 namespace detail{
/// Removes quoting and escaping from the column [col_begin, col_end) in place.
///
/// - Leading pairs of adjacent quote characters are skipped.
/// - A quote character opens a quoted run; the run is copied verbatim up to
///   the next quote character and both quote characters are dropped.
/// - An escape character is dropped and the character following it is copied
///   verbatim.
///
/// Columns shorter than two characters are left untouched.
///
/// @param col_begin   in: first character of the column; out: first character
///                    of the unescaped column (may move forward).
/// @param col_end     in: one past the last character; out: one past the last
///                    character of the unescaped column. '\0' is written there.
/// @param quote       quote character.
/// @param escape_char escape character.
///
/// Every scan is bounded by col_end. A column that ends in a lone escape
/// character (for example because the escaped character was trimmed away
/// before this call) or that has an unterminated quoted run is cut at
/// col_end instead of scanning into the memory that follows the column.
static void unescape(char*&col_begin, char*&col_end, char& quote, char& escape_char)
{
  if(col_end - col_begin < 2)
    return;

  // Skip leading "" pairs without ever stepping past the end of the column.
  while(col_end - col_begin >= 2 && *col_begin == quote && *(col_begin + 1) == quote)
    col_begin += 2;

  char* out = col_begin;
  char* in = col_begin;
  // True until the first character of the result is known. While it is true
  // the result can still start later in the buffer, so col_begin is moved
  // instead of copying characters.
  bool init = true;

  while(in != col_end)
  {
    if(*in == escape_char)
    {
      ++in; // drop the escape character itself
      if(in == col_end)
        break; // lone escape at the end of the column: nothing left to copy
      if(init)
      {
        ++col_begin;
        ++out;
        init = false;
      }
      else
      {
        *out = *in;
      }
      ++in;
      ++out;
    }
    else if(*in == quote)
    {
      ++in; // drop the opening quote
      while(in != col_end && *in != quote)
      {
        if(init)
        {
          ++col_begin;
          ++out;
          init = false;
        }
        else
        {
          *out = *in;
        }
        ++in;
        ++out;
      }
      if(in != col_end)
        ++in; // drop the closing quote
    }
    else
    {
      if(init)
      {
        // in == out here, the character is already in place
        init = false;
      }
      else
      {
        *out = *in;
      }
      ++in;
      ++out;
    }
  }
  *out = '\0';
  col_end = out;
}
91
/// Shrinks [str_begin, str_end) from both sides while the outermost character
/// is one of trim_chars, then writes '\0' at the new str_end.
static void trim(char*&str_begin, char*&str_end, std::vector<char>& trim_chars)
{
  const auto is_trimmable = [&trim_chars](char c)
  {
    return std::find(trim_chars.begin(), trim_chars.end(), c) != trim_chars.end();
  };

  // advance the front past trimmable characters
  for(; str_begin != str_end && is_trimmable(*str_begin); ++str_begin)
  {
  }
  // pull the back in over trimmable characters
  for(; str_begin != str_end && is_trimmable(str_end[-1]); --str_end)
  {
  }
  *str_end = '\0';
}
100
101 static const char*find_next_column_end(const char*col_begin, char& sep, char& quote, char& escape_char)
102 {
103 while(*col_begin != sep && *col_begin != '\0')
104 {
105 if(*col_begin != quote && *col_begin != escape_char)
106 ++col_begin;
107 else
108 {
109 if(*col_begin == escape_char)
110 {
111 if(*(col_begin+1) == '\0')
112 throw error::escaped_char_missing();
113 col_begin += 2;
114 }
115 else
116 {
117 do
118 {
119 ++col_begin;
120 while(*col_begin != quote)
121 {
122 if(*col_begin == '\0')
123 throw error::escaped_string_not_closed();
124 ++col_begin;
125 }
126 ++col_begin;
127 }while(*col_begin == quote);
128 }
129 }
130 }
131 return col_begin;
132 }
133
134 void chop_next_column(char*&line, char*&col_begin, char*&col_end, char& col_delimiter, char& quote, char& escape_char)
135 {
136 assert(line != nullptr);
137
138 col_begin = line;
139 // the col_begin + (... - col_begin) removes the constness
140 col_end = col_begin + (find_next_column_end(col_begin, col_delimiter, quote, escape_char) - col_begin);
141
142 if(*col_end == '\0')
143 {
144 line = nullptr;
145 }
146 else
147 {
148 *col_end = '\0';
149 line = col_end + 1;
150 }
151 }
152
153 void parse_line(char*line, std::vector<char*>& sorted_col, char& col_delimiter, char& quote, char& escape_char, std::vector<char>& trim_chars)
154 {
155 while (line != nullptr)
156 {
157 char*col_begin, *col_end;
158 chop_next_column(line, col_begin, col_end, col_delimiter, quote, escape_char);
159 if (!trim_chars.empty())
160 trim(col_begin, col_end, trim_chars);
161 if (!(quote == '\0' && escape_char == '\0'))
162 unescape(col_begin, col_end, quote, escape_char);
163 sorted_col.push_back(col_begin);
164 }
165 }
166
167
/// Returns true when `line` is empty or consists only of spaces and tabs.
///
/// Declared inline: this function is defined in a header, and a non-inline
/// definition would be a multiple definition as soon as the header is
/// included from more than one translation unit.
inline bool empty_comment_line(char* line)
{
  // skip leading blanks; the line is "empty" when nothing else follows
  while(*line == ' ' || *line == '\t')
    ++line;
  return *line == '\0';
}
180
/// Returns true when start_char is one of comment_chars.
///
/// Declared inline: this function is defined in a header, and a non-inline
/// definition would be a multiple definition as soon as the header is
/// included from more than one translation unit.
inline bool single_line_comment(char start_char, std::vector<char>& comment_chars)
{
  return std::find(comment_chars.begin(), comment_chars.end(), start_char) != comment_chars.end();
}
188
189 bool is_comment(char*&line, bool& comment_empty_line, std::vector<char>& comment_chars)
190 {
191 if(!comment_empty_line && comment_chars.empty())
192 return false;
193 else if(comment_empty_line && comment_chars.empty())
194 return empty_comment_line(line);
195 else if(!comment_empty_line && !comment_chars.empty())
196 return single_line_comment(*line, comment_chars);
197 else
198 return empty_comment_line(line) || single_line_comment(*line, comment_chars);
199 }
200
201 }
202 }
203
204
205 class CSVParser
206 {
207 private:
208 char row_delimiter;
209 char col_delimiter;
210 char quote;
211 char escape_char;
212 bool comment_empty_line;
213 std::vector<char> comment_characters;
214 std::vector<char> trim_characters;
215
216 static const int block_len = 1<<20;
217 std::unique_ptr<char[]>buffer; // must be constructed before (and thus destructed after) the reader!
218 #ifdef CSV_IO_NO_THREAD
219 io::detail::SynchronousReader reader;
220 #else
221 io::detail::AsynchronousReader reader;
222 #endif
223 int data_begin;
224 int data_end;
225
226 char file_name[io::error::max_file_name_length+1];
227 unsigned file_line;
228
229 void init(std::unique_ptr<io::ByteSourceBase>byte_source)
230 {
231 file_line = 0;
232
233 buffer = std::unique_ptr<char[]>(new char[3*block_len]);
234 data_begin = 0;
235 data_end = byte_source->read(buffer.get(), 2*block_len);
236
237 // Ignore UTF-8 BOM
238 if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF')
239 data_begin = 3;
240
241 if(data_end == 2*block_len){
242 reader.init(std::move(byte_source));
243 reader.start_read(buffer.get() + 2*block_len, block_len);
244 }
245 }
246
247 public:
248 CSVParser() = delete;
249 CSVParser(const CSVParser&) = delete;
250 CSVParser&operator=(const CSVParser&);
251
252 CSVParser(const char*file_name, const char*data_begin, const char*data_end)
253 {
254 set_file_name(file_name);
255 init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin)));
256 }
257
258 CSVParser(const std::string&file_name, const char*data_begin, const char*data_end)
259 {
260 set_file_name(file_name.c_str());
261 init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin)));
262 }
263
264 void set_file_name(const std::string&file_name)
265 {
266 set_file_name(file_name.c_str());
267 }
268
269 void set_file_name(const char*file_name)
270 {
271 if(file_name != nullptr)
272 {
273 strncpy(this->file_name, file_name, sizeof(this->file_name));
274 this->file_name[sizeof(this->file_name)-1] = '\0';
275 }
276 else
277 {
278 this->file_name[0] = '\0';
279 }
280 }
281
282 const char*get_truncated_file_name()const
283 {
284 return file_name;
285 }
286
287 void set_file_line(unsigned file_line)
288 {
289 this->file_line = file_line;
290 }
291
292 unsigned get_file_line()const
293 {
294 return file_line;
295 }
296
297 void set_csv_def(char& row_delimit, char& col_delimit, char& quote_char, char& escp_char, bool& cmnt_empty_line, std::vector<char>& comment_chars , std::vector<char>& trim_chars)
298 {
299 row_delimiter = row_delimit;
300 col_delimiter = col_delimit;
301 quote = quote_char;
302 escape_char = escp_char;
303 comment_empty_line = cmnt_empty_line;
304 comment_characters.assign(comment_chars.begin(), comment_chars.end());
305 trim_characters.assign(trim_chars.begin(), trim_chars.end());
306 }
307
308 char*next_line()
309 {
310 if(data_begin == data_end)
311 return nullptr;
312
313 ++file_line;
314
315 assert(data_begin < data_end);
316 assert(data_end <= block_len*2);
317
318 if(data_begin >= block_len)
319 {
320 std::memcpy(buffer.get(), buffer.get()+block_len, block_len);
321 data_begin -= block_len;
322 data_end -= block_len;
323 if(reader.is_valid())
324 {
325 data_end += reader.finish_read();
326 std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len);
327 reader.start_read(buffer.get() + 2*block_len, block_len);
328 }
329 }
330
331 int line_end = data_begin;
332 while(line_end != data_end && buffer[line_end] != row_delimiter)
333 {
334 if(buffer[line_end] == quote || buffer[line_end] == escape_char)
335 {
336 if(buffer[line_end] == escape_char)
337 {
338 ++line_end;
339 if(line_end == data_end)
340 {
341 throw io::error::escaped_char_missing();
342 }
343 else if(buffer[line_end] == '\r' && buffer[line_end + 1] == '\n') // handle windows \r\n-line breaks
344 {
345 ++line_end;
346 }
347 }
348 else
349 {
350 ++line_end;
351 while(buffer[line_end] != quote)
352 {
353 if(line_end == data_end)
354 throw io::error::escaped_string_not_closed();
355 ++line_end;
356 }
357 }
358 }
359 ++line_end;
360 }
361
362 if(line_end - data_begin + 1 > block_len)
363 {
364 io::error::line_length_limit_exceeded err;
365 err.set_file_name(file_name);
366 err.set_file_line(file_line);
367 throw err;
368 }
369
370 if(line_end != data_end && buffer[line_end] == row_delimiter)
371 {
372 buffer[line_end] = '\0';
373 }
374 else
375 {
376 // some files are missing the newline at the end of the
377 // last line
378 ++data_end;
379 buffer[line_end] = '\0';
380 }
381
382 // handle windows \r\n-line breaks
383 if(row_delimiter == '\n')
384 {
385 if(line_end != data_begin && buffer[line_end-1] == '\r')
386 buffer[line_end-1] = '\0';
387 }
388
389 char*ret = buffer.get() + data_begin;
390 data_begin = line_end+1;
391 return ret;
392 }
393
394 bool read_row(std::vector<char*>& cols)
395 {
396 try{
397 try{
398 char*line;
399 do{
400 line = next_line();
401 if(!line)
402 return false;
403 }while(io::detail::is_comment(line, comment_empty_line, comment_characters));
404
405 io::detail::parse_line(line, cols, col_delimiter, quote, escape_char, trim_characters);
406
407 }catch(io::error::with_file_name&err){
408 err.set_file_name(get_truncated_file_name());
409 throw;
410 }
411 }catch(io::error::with_file_line&err){
412 err.set_file_line(get_file_line());
413 throw;
414 }
415
416 return true;
417 }
418 };