ceph/src/s3select/include/s3select_csv_parser.h

   1 #include "csvparser/csv.h"
   2
   3 namespace io{
   4
   5     namespace error{
   6         struct escaped_char_missing :
   7                 base,
   8                 with_file_name,
   9                 with_file_line{
  10                 void format_error_message()const override{
  11                         std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  12                                 "Escaped character missing in line %d in file \"%s\"."
  13                                 , file_line, file_name);
  14                 }
  15         };
  16     }
  17
  18     namespace detail{
  /// Removes quoting and escaping from the column [col_begin, col_end) in place.
  ///
  /// - Leading empty quoted strings (two consecutive quote characters) are
  ///   skipped.
  /// - The character following escape_char is kept verbatim; the escape
  ///   character itself is dropped.
  /// - Quote characters are dropped and everything between an opening and a
  ///   closing quote is kept verbatim (escape_char is not special in there).
  ///   When both characters are equal, the escape rule wins.
  ///
  /// On return col_begin points at the first kept character, col_end points one
  /// past the last kept character and *col_end is '\0'. Columns shorter than
  /// two characters are left untouched.
  ///
  /// The scan never leaves [col_begin, col_end): an escape character with
  /// nothing after it is dropped, and a quote that is never closed simply
  /// extends to col_end. Such columns can show up when the tail of an otherwise
  /// well-formed column was trimmed away before this function is called.
  static void unescape(char*&col_begin, char*&col_end, char& quote, char& escape_char)
  {
      if(col_end - col_begin < 2)
          return;

      // Skip leading "" pairs, but never step over the end of the column.
      while(col_end - col_begin >= 2 && *col_begin == quote && *(col_begin + 1) == quote)
          col_begin += 2;

      char* out = col_begin;
      char* in = col_begin;
      bool first = true;

      // Keeps the character at `in`. The first kept character stays where it
      // is and col_begin is moved onto it; every later one is compacted
      // towards the front of the column.
      auto keep = [&]()
      {
          if(first)
          {
              col_begin = in;
              out = in;
              first = false;
          }
          *out = *in;
          ++out;
          ++in;
      };

      while(in != col_end)
      {
          if(*in == escape_char)
          {
              ++in; // drop the escape character itself
              if(in == col_end)
                  break; // dangling escape: nothing left to keep
              keep();
          }
          else if(*in == quote)
          {
              ++in; // drop the opening quote
              while(in != col_end && *in != quote)
                  keep();
              if(in != col_end)
                  ++in; // drop the closing quote
          }
          else
          {
              keep();
          }
      }

      *out = '\0';
      col_end = out;
  }
  91
  /// Shrinks [str_begin, str_end) from both sides for as long as the outermost
  /// character is listed in trim_chars, then writes '\0' at the new end.
  static void trim(char*&str_begin, char*&str_end, std::vector<char>& trim_chars)
  {
      const auto is_trim_char = [&trim_chars](char c)
      {
          return std::find(trim_chars.begin(), trim_chars.end(), c) != trim_chars.end();
      };

      // Leading side.
      for(; str_begin != str_end; ++str_begin)
      {
          if(!is_trim_char(*str_begin))
              break;
      }

      // Trailing side; str_end is one past the last character.
      for(; str_begin != str_end; --str_end)
      {
          if(!is_trim_char(*(str_end - 1)))
              break;
      }

      *str_end = '\0';
  }
 100
 101         static const char*find_next_column_end(const char*col_begin, char& sep, char& quote, char& escape_char)
 102         {
 103             while(*col_begin != sep && *col_begin != '\0')
 104             {
 105                 if(*col_begin != quote && *col_begin != escape_char)
 106                     ++col_begin;
 107                 else
 108                 {
 109                     if(*col_begin == escape_char)
 110                     {
 111                         if(*(col_begin+1) == '\0')
 112                             throw error::escaped_char_missing();
 113                         col_begin += 2;
 114                     }
 115                     else
 116                     {
 117                         do
 118                         {
 119                             ++col_begin;
 120                             while(*col_begin != quote)
 121                             {
 122                                 if(*col_begin == '\0')
 123                                     throw error::escaped_string_not_closed();
 124                                 ++col_begin;
 125                             }
 126                             ++col_begin;
 127                         }while(*col_begin == quote);
 128                     }
 129                 }
 130             }
 131             return col_begin;
 132         }
 133
 134         void chop_next_column(char*&line, char*&col_begin, char*&col_end, char& col_delimiter, char& quote, char& escape_char)
 135         {
 136             assert(line != nullptr);
 137
 138             col_begin = line;
 139             // the col_begin + (... - col_begin) removes the constness
 140             col_end = col_begin + (find_next_column_end(col_begin, col_delimiter, quote, escape_char) - col_begin);
 141
 142             if(*col_end == '\0')
 143             {
 144                 line = nullptr;
 145             }
 146             else
 147             {
 148                 *col_end = '\0';
 149                  line = col_end + 1;
 150             }
 151         }
 152
 153         void parse_line(char*line, std::vector<char*>& sorted_col, char& col_delimiter, char& quote, char& escape_char, std::vector<char>& trim_chars)
 154         {
 155             while (line != nullptr)
 156             {
 157                 char*col_begin, *col_end;
 158                 chop_next_column(line, col_begin, col_end, col_delimiter, quote, escape_char);
 159                 if (!trim_chars.empty())
 160                     trim(col_begin, col_end, trim_chars);
 161                 if (!(quote == '\0' && escape_char == '\0'))
 162                     unescape(col_begin, col_end, quote, escape_char);
 163                 sorted_col.push_back(col_begin);
 164             }
 165         }
 166
 167
 /// Returns true when `line` is empty or consists only of spaces and tabs.
 ///
 /// The function is defined in a header, so it has internal linkage (like the
 /// other helpers here) to stay includable from several translation units.
 static bool empty_comment_line(char* line)
 {
     while(*line == ' ' || *line == '\t')
         ++line;
     // Only blanks were seen if the scan stopped at the terminator.
     return *line == '\0';
 }
 180
 /// Returns true when start_char is one of comment_chars.
 ///
 /// The function is defined in a header, so it has internal linkage (like the
 /// other helpers here) to stay includable from several translation units.
 static bool single_line_comment(char start_char, std::vector<char>& comment_chars)
 {
     return std::find(comment_chars.begin(), comment_chars.end(), start_char) != comment_chars.end();
 }
 188
 189         bool is_comment(char*&line, bool& comment_empty_line, std::vector<char>& comment_chars)
 190         {
 191             if(!comment_empty_line && comment_chars.empty())
 192                 return false;
 193             else if(comment_empty_line && comment_chars.empty())
 194                 return empty_comment_line(line);
 195             else if(!comment_empty_line && !comment_chars.empty())
 196                 return single_line_comment(*line, comment_chars);
 197             else
 198                 return empty_comment_line(line) || single_line_comment(*line, comment_chars);
 199         }
 200
 201     }
 202 }
 203
 204
 205 class CSVParser
 206 {
 207     private:
 208         char row_delimiter;
 209         char col_delimiter;
 210         char quote;
 211         char escape_char;
 212         bool comment_empty_line;
 213         std::vector<char> comment_characters;
 214         std::vector<char> trim_characters;
 215
 216         static const int block_len = 1<<20;
 217         std::unique_ptr<char[]>buffer; // must be constructed before (and thus destructed after) the reader!
 218         #ifdef CSV_IO_NO_THREAD
 219         io::detail::SynchronousReader reader;
 220         #else
 221         io::detail::AsynchronousReader reader;
 222         #endif
 223         int data_begin;
 224         int data_end;
 225
 226         char file_name[io::error::max_file_name_length+1];
 227         unsigned file_line;
 228
 229         void init(std::unique_ptr<io::ByteSourceBase>byte_source)
 230         {
 231             file_line = 0;
 232
 233             buffer = std::unique_ptr<char[]>(new char[3*block_len]);
 234             data_begin = 0;
 235             data_end = byte_source->read(buffer.get(), 2*block_len);
 236
 237             // Ignore UTF-8 BOM
 238             if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF')
 239                 data_begin = 3;
 240
 241             if(data_end == 2*block_len){
 242                 reader.init(std::move(byte_source));
 243                 reader.start_read(buffer.get() + 2*block_len, block_len);
 244             }
 245         }
 246
 247     public:
 248         CSVParser() = delete;
 249         CSVParser(const CSVParser&) = delete;
 250         CSVParser&operator=(const CSVParser&);
 251
 252         CSVParser(const char*file_name, const char*data_begin, const char*data_end)
 253         {
 254             set_file_name(file_name);
 255             init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin)));
 256         }
 257
 258         CSVParser(const std::string&file_name, const char*data_begin, const char*data_end)
 259         {
 260             set_file_name(file_name.c_str());
 261             init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin)));
 262         }
 263
 264         void set_file_name(const std::string&file_name)
 265         {
 266             set_file_name(file_name.c_str());
 267         }
 268
 269         void set_file_name(const char*file_name)
 270         {
 271             if(file_name != nullptr)
 272             {
 273                 strncpy(this->file_name, file_name, sizeof(this->file_name));
 274                 this->file_name[sizeof(this->file_name)-1] = '\0';
 275             }
 276             else
 277             {
 278                 this->file_name[0] = '\0';
 279             }
 280         }
 281
 282         const char*get_truncated_file_name()const
 283         {
 284             return file_name;
 285         }
 286
 287         void set_file_line(unsigned file_line)
 288         {
 289             this->file_line = file_line;
 290         }
 291
 292         unsigned get_file_line()const
 293         {
 294             return file_line;
 295         }
 296
 297         void set_csv_def(char& row_delimit, char& col_delimit, char& quote_char, char& escp_char, bool& cmnt_empty_line, std::vector<char>& comment_chars , std::vector<char>& trim_chars)
 298         {
 299             row_delimiter = row_delimit;
 300             col_delimiter = col_delimit;
 301             quote = quote_char;
 302             escape_char = escp_char;
 303             comment_empty_line  = cmnt_empty_line;
 304             comment_characters.assign(comment_chars.begin(), comment_chars.end());
 305             trim_characters.assign(trim_chars.begin(), trim_chars.end());
 306         }
 307
 308         char*next_line()
 309         {
 310             if(data_begin == data_end)
 311                 return nullptr;
 312
 313             ++file_line;
 314
 315             assert(data_begin < data_end);
 316             assert(data_end <= block_len*2);
 317
 318             if(data_begin >= block_len)
 319             {
 320                 std::memcpy(buffer.get(), buffer.get()+block_len, block_len);
 321                 data_begin -= block_len;
 322                 data_end -= block_len;
 323                 if(reader.is_valid())
 324                 {
 325                     data_end += reader.finish_read();
 326                     std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len);
 327                     reader.start_read(buffer.get() + 2*block_len, block_len);
 328                 }
 329             }
 330
 331             int line_end = data_begin;
 332             while(line_end != data_end && buffer[line_end] != row_delimiter)
 333             {
 334                 if(buffer[line_end] == quote || buffer[line_end] == escape_char)
 335                 {
 336                     if(buffer[line_end] == escape_char)
 337                     {
 338                         ++line_end;
 339                         if(line_end == data_end)
 340                         {
 341                             throw io::error::escaped_char_missing();
 342                         }
 343                         else if(buffer[line_end] == '\r' && buffer[line_end + 1] == '\n')  // handle windows \r\n-line breaks
 344                         {
 345                             ++line_end;
 346                         }
 347                     }
 348                     else
 349                     {
 350                         ++line_end;
 351                         while(buffer[line_end] != quote)
 352                         {
 353                             if(line_end == data_end)
 354                                 throw io::error::escaped_string_not_closed();
 355                             ++line_end;
 356                         }
 357                     }
 358                 }
 359                 ++line_end;
 360             }
 361
 362             if(line_end - data_begin + 1 > block_len)
 363             {
 364                 io::error::line_length_limit_exceeded err;
 365                 err.set_file_name(file_name);
 366                 err.set_file_line(file_line);
 367                 throw err;
 368             }
 369
 370             if(line_end != data_end && buffer[line_end] == row_delimiter)
 371             {
 372                 buffer[line_end] = '\0';
 373             }
 374             else
 375             {
 376                 // some files are missing the newline at the end of the
 377                 // last line
 378                 ++data_end;
 379                 buffer[line_end] = '\0';
 380             }
 381
 382             // handle windows \r\n-line breaks
 383             if(row_delimiter == '\n')
 384             {
 385                 if(line_end != data_begin && buffer[line_end-1] == '\r')
 386                     buffer[line_end-1] = '\0';
 387             }
 388
 389             char*ret = buffer.get() + data_begin;
 390             data_begin = line_end+1;
 391             return ret;
 392         }
 393
 394         bool read_row(std::vector<char*>& cols)
 395         {
 396             try{
 397                 try{
 398                     char*line;
 399                     do{
 400                         line = next_line();
 401                         if(!line)
 402                             return false;
 403                     }while(io::detail::is_comment(line, comment_empty_line, comment_characters));
 404
 405                     io::detail::parse_line(line, cols, col_delimiter, quote, escape_char, trim_characters);
 406
 407                 }catch(io::error::with_file_name&err){
 408                     err.set_file_name(get_truncated_file_name());
 409                     throw;
 410                 }
 411             }catch(io::error::with_file_line&err){
 412                 err.set_file_line(get_file_line());
 413                 throw;
 414             }
 415
 416             return true;
 417         }
 418 };