1 #include "csvparser/csv.h"
6 struct escaped_char_missing
:
10 void format_error_message()const override
{
11 std::snprintf(error_message_buffer
, sizeof(error_message_buffer
),
12 "Escaped character missing in line %d in file \"%s\"."
13 , file_line
, file_name
);
19 static void unescape(char*&col_begin
, char*&col_end
, char& quote
, char& escape_char
)
21 if(col_end
- col_begin
>= 2)
23 while(*col_begin
== quote
&& *(col_begin
+ 1) == quote
)
34 if(*in
!= quote
&& *in
!= escape_char
)
49 if(*in
== escape_char
)
92 static void trim(char*&str_begin
, char*&str_end
, std::vector
<char>& trim_chars
)
94 while(str_begin
!= str_end
&& std::find(trim_chars
.begin(), trim_chars
.end(), *str_begin
) != trim_chars
.end())
96 while(str_begin
!= str_end
&& std::find(trim_chars
.begin(), trim_chars
.end(), *(str_end
-1)) != trim_chars
.end())
101 static const char*find_next_column_end(const char*col_begin
, char& sep
, char& quote
, char& escape_char
)
103 while(*col_begin
!= sep
&& *col_begin
!= '\0')
105 if(*col_begin
!= quote
&& *col_begin
!= escape_char
)
109 if(*col_begin
== escape_char
)
111 if(*(col_begin
+1) == '\0')
112 throw error::escaped_char_missing();
120 while(*col_begin
!= quote
)
122 if(*col_begin
== '\0')
123 throw error::escaped_string_not_closed();
127 }while(*col_begin
== quote
);
134 void chop_next_column(char*&line
, char*&col_begin
, char*&col_end
, char& col_delimiter
, char& quote
, char& escape_char
)
136 assert(line
!= nullptr);
139 // the col_begin + (... - col_begin) removes the constness
140 col_end
= col_begin
+ (find_next_column_end(col_begin
, col_delimiter
, quote
, escape_char
) - col_begin
);
153 void parse_line(char*line
, std::vector
<char*>& sorted_col
, char& col_delimiter
, char& quote
, char& escape_char
, std::vector
<char>& trim_chars
)
155 while (line
!= nullptr)
157 char*col_begin
, *col_end
;
158 chop_next_column(line
, col_begin
, col_end
, col_delimiter
, quote
, escape_char
);
159 if (!trim_chars
.empty())
160 trim(col_begin
, col_end
, trim_chars
);
161 if (!(quote
== '\0' && escape_char
== '\0'))
162 unescape(col_begin
, col_end
, quote
, escape_char
);
163 sorted_col
.push_back(col_begin
);
168 bool empty_comment_line(char* line
)
172 while(*line
== ' ' || *line
== '\t')
181 bool single_line_comment(char start_char
, std::vector
<char>& comment_chars
)
183 if(std::find(comment_chars
.begin(), comment_chars
.end(), start_char
) != comment_chars
.end())
189 bool is_comment(char*&line
, bool& comment_empty_line
, std::vector
<char>& comment_chars
)
191 if(!comment_empty_line
&& comment_chars
.empty())
193 else if(comment_empty_line
&& comment_chars
.empty())
194 return empty_comment_line(line
);
195 else if(!comment_empty_line
&& !comment_chars
.empty())
196 return single_line_comment(*line
, comment_chars
);
198 return empty_comment_line(line
) || single_line_comment(*line
, comment_chars
);
212 bool comment_empty_line
;
213 std::vector
<char> comment_characters
;
214 std::vector
<char> trim_characters
;
216 static const int block_len
= 1<<20;
217 std::unique_ptr
<char[]>buffer
; // must be constructed before (and thus destructed after) the reader!
218 #ifdef CSV_IO_NO_THREAD
219 io::detail::SynchronousReader reader
;
221 io::detail::AsynchronousReader reader
;
226 char file_name
[io::error::max_file_name_length
+1];
229 void init(std::unique_ptr
<io::ByteSourceBase
>byte_source
)
233 buffer
= std::unique_ptr
<char[]>(new char[3*block_len
]);
235 data_end
= byte_source
->read(buffer
.get(), 2*block_len
);
238 if(data_end
>= 3 && buffer
[0] == '\xEF' && buffer
[1] == '\xBB' && buffer
[2] == '\xBF')
241 if(data_end
== 2*block_len
){
242 reader
.init(std::move(byte_source
));
243 reader
.start_read(buffer
.get() + 2*block_len
, block_len
);
248 CSVParser() = delete;
249 CSVParser(const CSVParser
&) = delete;
250 CSVParser
&operator=(const CSVParser
&);
252 CSVParser(const char*file_name
, const char*data_begin
, const char*data_end
)
254 set_file_name(file_name
);
255 init(std::unique_ptr
<io::ByteSourceBase
>(new io::detail::NonOwningStringByteSource(data_begin
, data_end
-data_begin
)));
258 CSVParser(const std::string
&file_name
, const char*data_begin
, const char*data_end
)
260 set_file_name(file_name
.c_str());
261 init(std::unique_ptr
<io::ByteSourceBase
>(new io::detail::NonOwningStringByteSource(data_begin
, data_end
-data_begin
)));
264 void set_file_name(const std::string
&file_name
)
266 set_file_name(file_name
.c_str());
269 void set_file_name(const char*file_name
)
271 if(file_name
!= nullptr)
273 strncpy(this->file_name
, file_name
, sizeof(this->file_name
));
274 this->file_name
[sizeof(this->file_name
)-1] = '\0';
278 this->file_name
[0] = '\0';
282 const char*get_truncated_file_name()const
287 void set_file_line(unsigned file_line
)
289 this->file_line
= file_line
;
292 unsigned get_file_line()const
297 void set_csv_def(char& row_delimit
, char& col_delimit
, char& quote_char
, char& escp_char
, bool& cmnt_empty_line
, std::vector
<char>& comment_chars
, std::vector
<char>& trim_chars
)
299 row_delimiter
= row_delimit
;
300 col_delimiter
= col_delimit
;
302 escape_char
= escp_char
;
303 comment_empty_line
= cmnt_empty_line
;
304 comment_characters
.assign(comment_chars
.begin(), comment_chars
.end());
305 trim_characters
.assign(trim_chars
.begin(), trim_chars
.end());
310 if(data_begin
== data_end
)
315 assert(data_begin
< data_end
);
316 assert(data_end
<= block_len
*2);
318 if(data_begin
>= block_len
)
320 std::memcpy(buffer
.get(), buffer
.get()+block_len
, block_len
);
321 data_begin
-= block_len
;
322 data_end
-= block_len
;
323 if(reader
.is_valid())
325 data_end
+= reader
.finish_read();
326 std::memcpy(buffer
.get()+block_len
, buffer
.get()+2*block_len
, block_len
);
327 reader
.start_read(buffer
.get() + 2*block_len
, block_len
);
331 int line_end
= data_begin
;
332 while(line_end
!= data_end
&& buffer
[line_end
] != row_delimiter
)
334 if(buffer
[line_end
] == quote
|| buffer
[line_end
] == escape_char
)
336 if(buffer
[line_end
] == escape_char
)
339 if(line_end
== data_end
)
341 throw io::error::escaped_char_missing();
343 else if(buffer
[line_end
] == '\r' && buffer
[line_end
+ 1] == '\n') // handle windows \r\n-line breaks
351 while(buffer
[line_end
] != quote
)
353 if(line_end
== data_end
)
354 throw io::error::escaped_string_not_closed();
362 if(line_end
- data_begin
+ 1 > block_len
)
364 io::error::line_length_limit_exceeded err
;
365 err
.set_file_name(file_name
);
366 err
.set_file_line(file_line
);
370 if(line_end
!= data_end
&& buffer
[line_end
] == row_delimiter
)
372 buffer
[line_end
] = '\0';
376 // some files are missing the newline at the end of the
379 buffer
[line_end
] = '\0';
382 // handle windows \r\n-line breaks
383 if(row_delimiter
== '\n')
385 if(line_end
!= data_begin
&& buffer
[line_end
-1] == '\r')
386 buffer
[line_end
-1] = '\0';
389 char*ret
= buffer
.get() + data_begin
;
390 data_begin
= line_end
+1;
394 bool read_row(std::vector
<char*>& cols
)
403 }while(io::detail::is_comment(line
, comment_empty_line
, comment_characters
));
405 io::detail::parse_line(line
, cols
, col_delimiter
, quote
, escape_char
, trim_characters
);
407 }catch(io::error::with_file_name
&err
){
408 err
.set_file_name(get_truncated_file_name());
411 }catch(io::error::with_file_line
&err
){
412 err
.set_file_line(get_file_line());