]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | #include "csvparser/csv.h" |
2 | ||
3 | namespace io{ | |
4 | ||
5 | namespace error{ | |
6 | struct escaped_char_missing : | |
7 | base, | |
8 | with_file_name, | |
9 | with_file_line{ | |
10 | void format_error_message()const override{ | |
11 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | |
12 | "Escaped character missing in line %d in file \"%s\"." | |
13 | , file_line, file_name); | |
14 | } | |
15 | }; | |
20effc67 | 16 | } |
f67539c2 | 17 | |
1e59de90 TL |
18 | namespace detail{ |
/// Removes quoting and escaping from the column [col_begin, col_end) in place.
///
/// Rules, applied left to right:
///  - leading empty quote pairs (two adjacent quote characters) are skipped;
///  - an escape character is dropped and the character after it is kept
///    verbatim;
///  - a quote character opens a quoted section that runs to the next quote
///    character; both quotes are dropped and everything between them is kept
///    verbatim (escape characters inside a quoted section are NOT interpreted);
///  - every other character is kept.
/// If quote and escape_char are the same character, it is treated as an escape.
///
/// Columns shorter than two characters are left untouched, since they cannot
/// contain a complete quoted or escaped sequence.
///
/// Malformed input is tolerated instead of being read out of bounds: a trailing
/// escape character with nothing after it is dropped, and a quoted section
/// without a closing quote extends to the end of the column.
///
/// @param col_begin   in/out: first character of the column; on return points
///                    at the first character of the unescaped text.
/// @param col_end     in/out: one past the last character; on return points at
///                    the terminating '\0' written after the unescaped text.
/// @param quote       quote character ('\0' when quoting is disabled).
/// @param escape_char escape character ('\0' when escaping is disabled).
static void unescape(char*& col_begin, char*& col_end, char& quote, char& escape_char)
{
    if(col_end - col_begin < 2)
        return;

    // Skip leading empty quote pairs, never looking past the column itself.
    while(col_end - col_begin >= 2 && *col_begin == quote && *(col_begin + 1) == quote)
        col_begin += 2;

    // Compact the column in place: `out` never overtakes `in`, so every
    // character is read before its slot can be overwritten.
    char* out = col_begin;
    char* in = col_begin;

    while(in != col_end)
    {
        if(*in == escape_char)
        {
            ++in; // drop the escape character itself
            if(in == col_end)
                break; // dangling escape: nothing left to keep
            *out++ = *in++;
        }
        else if(*in == quote)
        {
            ++in; // drop the opening quote
            while(in != col_end && *in != quote)
                *out++ = *in++;
            if(in != col_end)
                ++in; // drop the closing quote
        }
        else
        {
            *out++ = *in++;
        }
    }

    *out = '\0';
    col_end = out;
}
f67539c2 | 91 | |
1e59de90 TL |
/// Strips every leading and trailing character that appears in trim_chars
/// from the range [str_begin, str_end), in place.
///
/// @param str_begin  in/out: advanced past the leading trimmable characters.
/// @param str_end    in/out: moved back before the trailing trimmable
///                   characters; a '\0' is written at its final position.
/// @param trim_chars set of characters to strip.
static void trim(char*& str_begin, char*& str_end, std::vector<char>& trim_chars)
{
    const auto strippable = [&trim_chars](char c)
    {
        return std::find(trim_chars.begin(), trim_chars.end(), c) != trim_chars.end();
    };

    for(; str_begin != str_end; ++str_begin)
    {
        if(!strippable(*str_begin))
            break;
    }

    for(; str_begin != str_end; --str_end)
    {
        if(!strippable(str_end[-1]))
            break;
    }

    // Terminate the (possibly shortened) string.
    *str_end = '\0';
}
f67539c2 | 100 | |
1e59de90 TL |
101 | static const char*find_next_column_end(const char*col_begin, char& sep, char& quote, char& escape_char) |
102 | { | |
103 | while(*col_begin != sep && *col_begin != '\0') | |
104 | { | |
105 | if(*col_begin != quote && *col_begin != escape_char) | |
106 | ++col_begin; | |
107 | else | |
108 | { | |
109 | if(*col_begin == escape_char) | |
110 | { | |
111 | if(*(col_begin+1) == '\0') | |
112 | throw error::escaped_char_missing(); | |
113 | col_begin += 2; | |
114 | } | |
115 | else | |
116 | { | |
117 | do | |
118 | { | |
119 | ++col_begin; | |
120 | while(*col_begin != quote) | |
121 | { | |
122 | if(*col_begin == '\0') | |
123 | throw error::escaped_string_not_closed(); | |
124 | ++col_begin; | |
125 | } | |
126 | ++col_begin; | |
127 | }while(*col_begin == quote); | |
128 | } | |
129 | } | |
130 | } | |
131 | return col_begin; | |
132 | } | |
f67539c2 | 133 | |
1e59de90 TL |
134 | void chop_next_column(char*&line, char*&col_begin, char*&col_end, char& col_delimiter, char& quote, char& escape_char) |
135 | { | |
136 | assert(line != nullptr); | |
137 | ||
138 | col_begin = line; | |
139 | // the col_begin + (... - col_begin) removes the constness | |
140 | col_end = col_begin + (find_next_column_end(col_begin, col_delimiter, quote, escape_char) - col_begin); | |
141 | ||
142 | if(*col_end == '\0') | |
143 | { | |
144 | line = nullptr; | |
145 | } | |
146 | else | |
147 | { | |
148 | *col_end = '\0'; | |
149 | line = col_end + 1; | |
150 | } | |
151 | } | |
f67539c2 | 152 | |
1e59de90 TL |
153 | void parse_line(char*line, std::vector<char*>& sorted_col, char& col_delimiter, char& quote, char& escape_char, std::vector<char>& trim_chars) |
154 | { | |
155 | while (line != nullptr) | |
156 | { | |
157 | char*col_begin, *col_end; | |
158 | chop_next_column(line, col_begin, col_end, col_delimiter, quote, escape_char); | |
159 | if (!trim_chars.empty()) | |
160 | trim(col_begin, col_end, trim_chars); | |
161 | if (!(quote == '\0' && escape_char == '\0')) | |
162 | unescape(col_begin, col_end, quote, escape_char); | |
163 | sorted_col.push_back(col_begin); | |
164 | } | |
165 | } | |
f67539c2 | 166 | |
f67539c2 | 167 | |
1e59de90 TL |
/// Returns true when the '\0'-terminated line is empty or consists only of
/// spaces and tabs.
bool empty_comment_line(char* line)
{
    const char* cursor = line;

    // Skip the leading run of blanks; the line is "empty" exactly when that
    // run reaches the terminator.
    while(*cursor == ' ' || *cursor == '\t')
        ++cursor;

    return *cursor == '\0';
}
f67539c2 | 180 | |
1e59de90 TL |
/// Returns true when start_char is one of the characters in comment_chars.
bool single_line_comment(char start_char, std::vector<char>& comment_chars)
{
    for(const char marker : comment_chars)
    {
        if(marker == start_char)
            return true;
    }
    return false;
}
f67539c2 | 188 | |
1e59de90 TL |
189 | bool is_comment(char*&line, bool& comment_empty_line, std::vector<char>& comment_chars) |
190 | { | |
191 | if(!comment_empty_line && comment_chars.empty()) | |
192 | return false; | |
193 | else if(comment_empty_line && comment_chars.empty()) | |
194 | return empty_comment_line(line); | |
195 | else if(!comment_empty_line && !comment_chars.empty()) | |
196 | return single_line_comment(*line, comment_chars); | |
197 | else | |
198 | return empty_comment_line(line) || single_line_comment(*line, comment_chars); | |
199 | } | |
f67539c2 | 200 | |
f67539c2 | 201 | } |
1e59de90 | 202 | } |
f67539c2 | 203 | |
f67539c2 | 204 | |
1e59de90 | 205 | class CSVParser |
f67539c2 | 206 | { |
1e59de90 TL |
207 | private: |
208 | char row_delimiter; | |
209 | char col_delimiter; | |
210 | char quote; | |
211 | char escape_char; | |
212 | bool comment_empty_line; | |
213 | std::vector<char> comment_characters; | |
214 | std::vector<char> trim_characters; | |
215 | ||
216 | static const int block_len = 1<<20; | |
217 | std::unique_ptr<char[]>buffer; // must be constructed before (and thus destructed after) the reader! | |
218 | #ifdef CSV_IO_NO_THREAD | |
219 | io::detail::SynchronousReader reader; | |
220 | #else | |
221 | io::detail::AsynchronousReader reader; | |
222 | #endif | |
223 | int data_begin; | |
224 | int data_end; | |
225 | ||
226 | char file_name[io::error::max_file_name_length+1]; | |
227 | unsigned file_line; | |
228 | ||
229 | void init(std::unique_ptr<io::ByteSourceBase>byte_source) | |
230 | { | |
231 | file_line = 0; | |
f67539c2 | 232 | |
1e59de90 TL |
233 | buffer = std::unique_ptr<char[]>(new char[3*block_len]); |
234 | data_begin = 0; | |
235 | data_end = byte_source->read(buffer.get(), 2*block_len); | |
f67539c2 | 236 | |
1e59de90 TL |
237 | // Ignore UTF-8 BOM |
238 | if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') | |
239 | data_begin = 3; | |
f67539c2 | 240 | |
1e59de90 TL |
241 | if(data_end == 2*block_len){ |
242 | reader.init(std::move(byte_source)); | |
243 | reader.start_read(buffer.get() + 2*block_len, block_len); | |
244 | } | |
245 | } | |
20effc67 | 246 | |
1e59de90 TL |
247 | public: |
248 | CSVParser() = delete; | |
249 | CSVParser(const CSVParser&) = delete; | |
250 | CSVParser&operator=(const CSVParser&); | |
f67539c2 | 251 | |
1e59de90 TL |
252 | CSVParser(const char*file_name, const char*data_begin, const char*data_end) |
253 | { | |
254 | set_file_name(file_name); | |
255 | init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); | |
256 | } | |
f67539c2 | 257 | |
1e59de90 TL |
258 | CSVParser(const std::string&file_name, const char*data_begin, const char*data_end) |
259 | { | |
260 | set_file_name(file_name.c_str()); | |
261 | init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); | |
262 | } | |
f67539c2 | 263 | |
1e59de90 TL |
264 | void set_file_name(const std::string&file_name) |
265 | { | |
266 | set_file_name(file_name.c_str()); | |
267 | } | |
f67539c2 | 268 | |
1e59de90 TL |
269 | void set_file_name(const char*file_name) |
270 | { | |
271 | if(file_name != nullptr) | |
272 | { | |
273 | strncpy(this->file_name, file_name, sizeof(this->file_name)); | |
274 | this->file_name[sizeof(this->file_name)-1] = '\0'; | |
275 | } | |
276 | else | |
277 | { | |
278 | this->file_name[0] = '\0'; | |
279 | } | |
280 | } | |
f67539c2 | 281 | |
1e59de90 TL |
282 | const char*get_truncated_file_name()const |
283 | { | |
284 | return file_name; | |
285 | } | |
f67539c2 | 286 | |
1e59de90 TL |
287 | void set_file_line(unsigned file_line) |
288 | { | |
289 | this->file_line = file_line; | |
290 | } | |
f67539c2 | 291 | |
1e59de90 TL |
292 | unsigned get_file_line()const |
293 | { | |
294 | return file_line; | |
295 | } | |
296 | ||
297 | void set_csv_def(char& row_delimit, char& col_delimit, char& quote_char, char& escp_char, bool& cmnt_empty_line, std::vector<char>& comment_chars , std::vector<char>& trim_chars) | |
298 | { | |
299 | row_delimiter = row_delimit; | |
300 | col_delimiter = col_delimit; | |
301 | quote = quote_char; | |
302 | escape_char = escp_char; | |
303 | comment_empty_line = cmnt_empty_line; | |
304 | comment_characters.assign(comment_chars.begin(), comment_chars.end()); | |
305 | trim_characters.assign(trim_chars.begin(), trim_chars.end()); | |
306 | } | |
f67539c2 | 307 | |
1e59de90 TL |
308 | char*next_line() |
309 | { | |
310 | if(data_begin == data_end) | |
311 | return nullptr; | |
312 | ||
313 | ++file_line; | |
314 | ||
315 | assert(data_begin < data_end); | |
316 | assert(data_end <= block_len*2); | |
317 | ||
318 | if(data_begin >= block_len) | |
319 | { | |
320 | std::memcpy(buffer.get(), buffer.get()+block_len, block_len); | |
321 | data_begin -= block_len; | |
322 | data_end -= block_len; | |
323 | if(reader.is_valid()) | |
324 | { | |
325 | data_end += reader.finish_read(); | |
326 | std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len); | |
327 | reader.start_read(buffer.get() + 2*block_len, block_len); | |
328 | } | |
329 | } | |
330 | ||
331 | int line_end = data_begin; | |
332 | while(line_end != data_end && buffer[line_end] != row_delimiter) | |
333 | { | |
334 | if(buffer[line_end] == quote || buffer[line_end] == escape_char) | |
335 | { | |
336 | if(buffer[line_end] == escape_char) | |
337 | { | |
338 | ++line_end; | |
339 | if(line_end == data_end) | |
340 | { | |
341 | throw io::error::escaped_char_missing(); | |
342 | } | |
343 | else if(buffer[line_end] == '\r' && buffer[line_end + 1] == '\n') // handle windows \r\n-line breaks | |
344 | { | |
345 | ++line_end; | |
346 | } | |
347 | } | |
348 | else | |
349 | { | |
350 | ++line_end; | |
351 | while(buffer[line_end] != quote) | |
352 | { | |
353 | if(line_end == data_end) | |
354 | throw io::error::escaped_string_not_closed(); | |
355 | ++line_end; | |
356 | } | |
357 | } | |
358 | } | |
359 | ++line_end; | |
360 | } | |
361 | ||
362 | if(line_end - data_begin + 1 > block_len) | |
363 | { | |
364 | io::error::line_length_limit_exceeded err; | |
365 | err.set_file_name(file_name); | |
366 | err.set_file_line(file_line); | |
367 | throw err; | |
368 | } | |
369 | ||
370 | if(line_end != data_end && buffer[line_end] == row_delimiter) | |
371 | { | |
372 | buffer[line_end] = '\0'; | |
373 | } | |
374 | else | |
375 | { | |
376 | // some files are missing the newline at the end of the | |
377 | // last line | |
378 | ++data_end; | |
379 | buffer[line_end] = '\0'; | |
380 | } | |
381 | ||
382 | // handle windows \r\n-line breaks | |
383 | if(row_delimiter == '\n') | |
384 | { | |
385 | if(line_end != data_begin && buffer[line_end-1] == '\r') | |
386 | buffer[line_end-1] = '\0'; | |
387 | } | |
388 | ||
389 | char*ret = buffer.get() + data_begin; | |
390 | data_begin = line_end+1; | |
391 | return ret; | |
392 | } | |
f67539c2 | 393 | |
1e59de90 TL |
394 | bool read_row(std::vector<char*>& cols) |
395 | { | |
396 | try{ | |
397 | try{ | |
398 | char*line; | |
399 | do{ | |
400 | line = next_line(); | |
401 | if(!line) | |
402 | return false; | |
403 | }while(io::detail::is_comment(line, comment_empty_line, comment_characters)); | |
404 | ||
405 | io::detail::parse_line(line, cols, col_delimiter, quote, escape_char, trim_characters); | |
406 | ||
407 | }catch(io::error::with_file_name&err){ | |
408 | err.set_file_name(get_truncated_file_name()); | |
409 | throw; | |
410 | } | |
411 | }catch(io::error::with_file_line&err){ | |
412 | err.set_file_line(get_file_line()); | |
413 | throw; | |
414 | } | |
415 | ||
416 | return true; | |
417 | } | |
418 | }; |