ceph/src/arrow/cpp/src/gandiva/precompiled/string_ops.cc

   1 // Licensed to the Apache Software Foundation (ASF) under one
   2 // or more contributor license agreements.  See the NOTICE file
   3 // distributed with this work for additional information
   4 // regarding copyright ownership.  The ASF licenses this file
   5 // to you under the Apache License, Version 2.0 (the
   6 // "License"); you may not use this file except in compliance
   7 // with the License.  You may obtain a copy of the License at
   8 //
   9 //   http://www.apache.org/licenses/LICENSE-2.0
  10 //
  11 // Unless required by applicable law or agreed to in writing,
  12 // software distributed under the License is distributed on an
  13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14 // KIND, either express or implied.  See the License for the
  15 // specific language governing permissions and limitations
  16 // under the License.
  17
  18 // String functions
  19 #include "arrow/util/value_parsing.h"
  20
  21 extern "C" {
  22
  23 #include <algorithm>
  24 #include <climits>
  25 #include <cstdio>
  26 #include <cstdlib>
  27 #include <cstring>
  28
  29 #include "./types.h"
  30
  31 FORCE_INLINE
  32 gdv_int32 octet_length_utf8(const gdv_utf8 input, gdv_int32 length) { return length; }
  33
  34 FORCE_INLINE
  35 gdv_int32 bit_length_utf8(const gdv_utf8 input, gdv_int32 length) { return length * 8; }
  36
  37 FORCE_INLINE
  38 gdv_int32 octet_length_binary(const gdv_binary input, gdv_int32 length) { return length; }
  39
  40 FORCE_INLINE
  41 gdv_int32 bit_length_binary(const gdv_binary input, gdv_int32 length) {
  42   return length * 8;
  43 }
  44
  45 FORCE_INLINE
  46 int match_string(const char* input, gdv_int32 input_len, gdv_int32 start_pos,
  47                  const char* delim, gdv_int32 delim_len) {
  48   for (int i = start_pos; i < input_len; i++) {
  49     int left_chars = input_len - i;
  50     if ((left_chars >= delim_len) && memcmp(input + i, delim, delim_len) == 0) {
  51       return i + delim_len;
  52     }
  53   }
  54
  55   return -1;
  56 }
  57
  58 FORCE_INLINE
  59 gdv_int32 mem_compare(const char* left, gdv_int32 left_len, const char* right,
  60                       gdv_int32 right_len) {
  61   int min = left_len;
  62   if (right_len < min) {
  63     min = right_len;
  64   }
  65
  66   int cmp_ret = memcmp(left, right, min);
  67   if (cmp_ret != 0) {
  68     return cmp_ret;
  69   } else {
  70     return left_len - right_len;
  71   }
  72 }
  73
// Expand inner macro for all varlen types.
// INNER is invoked once for utf8 and once for binary; NAME and the operator
// token OP are forwarded unchanged to each expansion.
#define VAR_LEN_OP_TYPES(INNER, NAME, OP) \
  INNER(NAME, utf8, OP)                   \
  INNER(NAME, binary, OP)

// Relational binary fns : left, right params are same, return is bool.
// Defines NAME_TYPE_TYPE(left, left_len, right, right_len), which evaluates
// `mem_compare(left, left_len, right, right_len) OP 0`.
#define BINARY_RELATIONAL(NAME, TYPE, OP)                                    \
  FORCE_INLINE                                                               \
  bool NAME##_##TYPE##_##TYPE(const gdv_##TYPE left, gdv_int32 left_len,     \
                              const gdv_##TYPE right, gdv_int32 right_len) { \
    return mem_compare(left, left_len, right, right_len) OP 0;               \
  }

// Instantiate the six comparison functions for both utf8 and binary,
// e.g. equal_utf8_utf8 and less_than_binary_binary.
VAR_LEN_OP_TYPES(BINARY_RELATIONAL, equal, ==)
VAR_LEN_OP_TYPES(BINARY_RELATIONAL, not_equal, !=)
VAR_LEN_OP_TYPES(BINARY_RELATIONAL, less_than, <)
VAR_LEN_OP_TYPES(BINARY_RELATIONAL, less_than_or_equal_to, <=)
VAR_LEN_OP_TYPES(BINARY_RELATIONAL, greater_than, >)
VAR_LEN_OP_TYPES(BINARY_RELATIONAL, greater_than_or_equal_to, >=)

// The helper macros are only needed for the instantiations above.
#undef BINARY_RELATIONAL
#undef VAR_LEN_OP_TYPES
  96
// Expand inner macro for all varlen types.
// INNER is invoked once as INNER(NAME, utf8) and once as INNER(NAME, binary).
#define VAR_LEN_TYPES(INNER, NAME) \
  INNER(NAME, utf8)                \
  INNER(NAME, binary)
 101
 102 FORCE_INLINE
 103 int to_binary_from_hex(char ch) {
 104   if (ch >= 'A' && ch <= 'F') {
 105     return 10 + (ch - 'A');
 106   } else if (ch >= 'a' && ch <= 'f') {
 107     return 10 + (ch - 'a');
 108   }
 109   return ch - '0';
 110 }
 111
 112 FORCE_INLINE
 113 bool starts_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* prefix,
 114                            gdv_int32 prefix_len) {
 115   return ((data_len >= prefix_len) && (memcmp(data, prefix, prefix_len) == 0));
 116 }
 117
 118 FORCE_INLINE
 119 bool ends_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* suffix,
 120                          gdv_int32 suffix_len) {
 121   return ((data_len >= suffix_len) &&
 122           (memcmp(data + data_len - suffix_len, suffix, suffix_len) == 0));
 123 }
 124
 125 FORCE_INLINE
 126 bool is_substr_utf8_utf8(const char* data, int32_t data_len, const char* substr,
 127                          int32_t substr_len) {
 128   for (int32_t i = 0; i <= data_len - substr_len; ++i) {
 129     if (memcmp(data + i, substr, substr_len) == 0) {
 130       return true;
 131     }
 132   }
 133   return false;
 134 }
 135
 136 FORCE_INLINE
 137 gdv_int32 utf8_char_length(char c) {
 138   if ((signed char)c >= 0) {  // 1-byte char (0x00 ~ 0x7F)
 139     return 1;
 140   } else if ((c & 0xE0) == 0xC0) {  // 2-byte char
 141     return 2;
 142   } else if ((c & 0xF0) == 0xE0) {  // 3-byte char
 143     return 3;
 144   } else if ((c & 0xF8) == 0xF0) {  // 4-byte char
 145     return 4;
 146   }
 147   // invalid char
 148   return 0;
 149 }
 150
 151 FORCE_INLINE
 152 void set_error_for_invalid_utf(int64_t execution_context, char val) {
 153   char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
 154   int size = static_cast<int>(strlen(fmt)) + 64;
 155   char* error = reinterpret_cast<char*>(malloc(size));
 156   snprintf(error, size, fmt, (unsigned char)val);
 157   gdv_fn_context_set_error_msg(execution_context, error);
 158   free(error);
 159 }
 160
 161 FORCE_INLINE
 162 bool validate_utf8_following_bytes(const char* data, int32_t data_len,
 163                                    int32_t char_index) {
 164   for (int j = 1; j < data_len; ++j) {
 165     if ((data[char_index + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
 166       return false;
 167     }
 168   }
 169   return true;
 170 }
 171
 172 // Count the number of utf8 characters
 173 // return 0 for invalid/incomplete input byte sequences
 174 FORCE_INLINE
 175 gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
 176   int char_len = 0;
 177   int count = 0;
 178   for (int i = 0; i < data_len; i += char_len) {
 179     char_len = utf8_char_length(data[i]);
 180     if (char_len == 0 || i + char_len > data_len) {  // invalid byte or incomplete glyph
 181       set_error_for_invalid_utf(context, data[i]);
 182       return 0;
 183     }
 184     for (int j = 1; j < char_len; ++j) {
 185       if ((data[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
 186         set_error_for_invalid_utf(context, data[i + j]);
 187         return 0;
 188       }
 189     }
 190     ++count;
 191   }
 192   return count;
 193 }
 194
 195 // Count the number of utf8 characters, ignoring invalid char, considering size 1
 196 FORCE_INLINE
 197 gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
 198   int char_len = 0;
 199   int count = 0;
 200   for (int i = 0; i < data_len; i += char_len) {
 201     char_len = utf8_char_length(data[i]);
 202     if (char_len == 0 || i + char_len > data_len) {  // invalid byte or incomplete glyph
 203       // if invalid byte or incomplete glyph, ignore it
 204       char_len = 1;
 205     }
 206     for (int j = 1; j < char_len; ++j) {
 207       if ((data[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
 208         char_len += 1;
 209       }
 210     }
 211     ++count;
 212   }
 213   return count;
 214 }
 215
 216 // Get the byte position corresponding to a character position for a non-empty utf8
 217 // sequence
 218 FORCE_INLINE
 219 gdv_int32 utf8_byte_pos(gdv_int64 context, const char* str, gdv_int32 str_len,
 220                         gdv_int32 char_pos) {
 221   int char_len = 0;
 222   int byte_index = 0;
 223   for (gdv_int32 char_index = 0; char_index < char_pos && byte_index < str_len;
 224        char_index++) {
 225     char_len = utf8_char_length(str[byte_index]);
 226     if (char_len == 0 ||
 227         byte_index + char_len > str_len) {  // invalid byte or incomplete glyph
 228       set_error_for_invalid_utf(context, str[byte_index]);
 229       return -1;
 230     }
 231     byte_index += char_len;
 232   }
 233   return byte_index;
 234 }
 235
// Defines NAME_TYPE(context, in, in_len) as a thin wrapper that returns
// utf8_length(context, in, in_len), i.e. the number of utf8 characters in the
// input (0, with an error set on the context, for invalid input).
#define UTF8_LENGTH(NAME, TYPE)                                                 \
  FORCE_INLINE                                                                  \
  gdv_int32 NAME##_##TYPE(gdv_int64 context, gdv_##TYPE in, gdv_int32 in_len) { \
    return utf8_length(context, in, in_len);                                    \
  }

// Generates char_length_utf8, length_utf8 and lengthUtf8_binary.
UTF8_LENGTH(char_length, utf8)
UTF8_LENGTH(length, utf8)
UTF8_LENGTH(lengthUtf8, binary)
 245
 246 // Returns a string of 'n' spaces.
 247 #define SPACE_STR(IN_TYPE)                                                              \
 248   GANDIVA_EXPORT                                                                        \
 249   const char* space_##IN_TYPE(gdv_int64 ctx, gdv_##IN_TYPE n, int32_t* out_len) {       \
 250     gdv_int32 n_times = static_cast<gdv_int32>(n);                                      \
 251     if (n_times <= 0) {                                                                 \
 252       *out_len = 0;                                                                     \
 253       return "";                                                                        \
 254     }                                                                                   \
 255     char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ctx, n_times));     \
 256     if (ret == nullptr) {                                                               \
 257       gdv_fn_context_set_error_msg(ctx, "Could not allocate memory for output string"); \
 258       *out_len = 0;                                                                     \
 259       return "";                                                                        \
 260     }                                                                                   \
 261     for (int i = 0; i < n_times; i++) {                                                 \
 262       ret[i] = ' ';                                                                     \
 263     }                                                                                   \
 264     *out_len = n_times;                                                                 \
 265     return ret;                                                                         \
 266   }
 267
 268 SPACE_STR(int32)
 269 SPACE_STR(int64)
 270
 271 // Reverse a utf8 sequence
 272 FORCE_INLINE
 273 const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
 274                          int32_t* out_len) {
 275   if (data_len == 0) {
 276     *out_len = 0;
 277     return "";
 278   }
 279
 280   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len));
 281   if (ret == nullptr) {
 282     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
 283     *out_len = 0;
 284     return "";
 285   }
 286
 287   gdv_int32 char_len;
 288   for (gdv_int32 i = 0; i < data_len; i += char_len) {
 289     char_len = utf8_char_length(data[i]);
 290
 291     if (char_len == 0 || i + char_len > data_len) {  // invalid byte or incomplete glyph
 292       set_error_for_invalid_utf(context, data[i]);
 293       *out_len = 0;
 294       return "";
 295     }
 296
 297     for (gdv_int32 j = 0; j < char_len; ++j) {
 298       if (j > 0 && (data[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
 299         set_error_for_invalid_utf(context, data[i + j]);
 300         *out_len = 0;
 301         return "";
 302       }
 303       ret[data_len - i - char_len + j] = data[i + j];
 304     }
 305   }
 306   *out_len = data_len;
 307   return ret;
 308 }
 309
 310 // Trims whitespaces from the left end of the input utf8 sequence
 311 FORCE_INLINE
 312 const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
 313                        int32_t* out_len) {
 314   if (data_len == 0) {
 315     *out_len = 0;
 316     return "";
 317   }
 318
 319   gdv_int32 start = 0;
 320   // start denotes the first position of non-space characters in the input string
 321   while (start < data_len && data[start] == ' ') {
 322     ++start;
 323   }
 324
 325   *out_len = data_len - start;
 326   return data + start;
 327 }
 328
 329 // Trims whitespaces from the right end of the input utf8 sequence
 330 FORCE_INLINE
 331 const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
 332                        int32_t* out_len) {
 333   if (data_len == 0) {
 334     *out_len = 0;
 335     return "";
 336   }
 337
 338   gdv_int32 end = data_len - 1;
 339   // end denotes the last position of non-space characters in the input string
 340   while (end >= 0 && data[end] == ' ') {
 341     --end;
 342   }
 343
 344   *out_len = end + 1;
 345   return data;
 346 }
 347
 348 // Trims whitespaces from both the ends of the input utf8 sequence
 349 FORCE_INLINE
 350 const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
 351                        int32_t* out_len) {
 352   if (data_len == 0) {
 353     *out_len = 0;
 354     return "";
 355   }
 356
 357   gdv_int32 start = 0, end = data_len - 1;
 358   // start and end denote the first and last positions of non-space
 359   // characters in the input string respectively
 360   while (start <= end && data[start] == ' ') {
 361     ++start;
 362   }
 363   while (end >= start && data[end] == ' ') {
 364     --end;
 365   }
 366
 367   // string has some leading/trailing spaces and some non-space characters
 368   *out_len = end - start + 1;
 369   return data + start;
 370 }
 371
 372 // Trims characters present in the trim text from the left end of the base text
 373 FORCE_INLINE
 374 const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
 375                             gdv_int32 basetext_len, const char* trimtext,
 376                             gdv_int32 trimtext_len, int32_t* out_len) {
 377   if (basetext_len == 0) {
 378     *out_len = 0;
 379     return "";
 380   } else if (trimtext_len == 0) {
 381     *out_len = basetext_len;
 382     return basetext;
 383   }
 384
 385   gdv_int32 start_ptr, char_len;
 386   // scan the base text from left to right and increment the start pointer till
 387   // there is a character which is not present in the trim text
 388   for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
 389     char_len = utf8_char_length(basetext[start_ptr]);
 390     if (char_len == 0 || start_ptr + char_len > basetext_len) {
 391       // invalid byte or incomplete glyph
 392       set_error_for_invalid_utf(context, basetext[start_ptr]);
 393       *out_len = 0;
 394       return "";
 395     }
 396     if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
 397       break;
 398     }
 399   }
 400
 401   *out_len = basetext_len - start_ptr;
 402   return basetext + start_ptr;
 403 }
 404
 405 // Trims characters present in the trim text from the right end of the base text
 406 FORCE_INLINE
 407 const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
 408                             gdv_int32 basetext_len, const char* trimtext,
 409                             gdv_int32 trimtext_len, int32_t* out_len) {
 410   if (basetext_len == 0) {
 411     *out_len = 0;
 412     return "";
 413   } else if (trimtext_len == 0) {
 414     *out_len = basetext_len;
 415     return basetext;
 416   }
 417
 418   gdv_int32 char_len, end_ptr, byte_cnt = 1;
 419   // scan the base text from right to left and decrement the end pointer till
 420   // there is a character which is not present in the trim text
 421   for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) {
 422     char_len = utf8_char_length(basetext[end_ptr]);
 423     if (char_len == 0) {  // trailing bytes of multibyte character
 424       ++byte_cnt;
 425       continue;
 426     }
 427     // this is the first byte of a character, hence check if char_len = char_cnt
 428     if (byte_cnt != char_len) {  // invalid byte or incomplete glyph
 429       set_error_for_invalid_utf(context, basetext[end_ptr]);
 430       *out_len = 0;
 431       return "";
 432     }
 433     byte_cnt = 1;  // reset the counter*/
 434     if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
 435       break;
 436     }
 437   }
 438
 439   // when all characters in the basetext are part of the trimtext
 440   if (end_ptr == -1) {
 441     *out_len = 0;
 442     return "";
 443   }
 444
 445   end_ptr += utf8_char_length(basetext[end_ptr]);  // point to the next character
 446   *out_len = end_ptr;
 447   return basetext;
 448 }
 449
 450 // Trims characters present in the trim text from both ends of the base text
 451 FORCE_INLINE
 452 const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
 453                             gdv_int32 basetext_len, const char* trimtext,
 454                             gdv_int32 trimtext_len, int32_t* out_len) {
 455   if (basetext_len == 0) {
 456     *out_len = 0;
 457     return "";
 458   } else if (trimtext_len == 0) {
 459     *out_len = basetext_len;
 460     return basetext;
 461   }
 462
 463   gdv_int32 start_ptr, end_ptr, char_len, byte_cnt = 1;
 464   // scan the base text from left to right and increment the start and decrement the
 465   // end pointers till there are characters which are not present in the trim text
 466   for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
 467     char_len = utf8_char_length(basetext[start_ptr]);
 468     if (char_len == 0 || start_ptr + char_len > basetext_len) {
 469       // invalid byte or incomplete glyph
 470       set_error_for_invalid_utf(context, basetext[start_ptr]);
 471       *out_len = 0;
 472       return "";
 473     }
 474     if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
 475       break;
 476     }
 477   }
 478   for (end_ptr = basetext_len - 1; end_ptr >= start_ptr; --end_ptr) {
 479     char_len = utf8_char_length(basetext[end_ptr]);
 480     if (char_len == 0) {  // trailing byte in multibyte character
 481       ++byte_cnt;
 482       continue;
 483     }
 484     // this is the first byte of a character, hence check if char_len = char_cnt
 485     if (byte_cnt != char_len) {  // invalid byte or incomplete glyph
 486       set_error_for_invalid_utf(context, basetext[end_ptr]);
 487       *out_len = 0;
 488       return "";
 489     }
 490     byte_cnt = 1;  // reset the counter*/
 491     if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
 492       break;
 493     }
 494   }
 495
 496   // when all characters are trimmed, start_ptr has been incremented to basetext_len and
 497   // end_ptr still points to basetext_len - 1, hence we need to handle this case
 498   if (start_ptr > end_ptr) {
 499     *out_len = 0;
 500     return "";
 501   }
 502
 503   end_ptr += utf8_char_length(basetext[end_ptr]);  // point to the next character
 504   *out_len = end_ptr - start_ptr;
 505   return basetext + start_ptr;
 506 }
 507
 508 FORCE_INLINE
 509 gdv_boolean compare_lower_strings(const char* base_str, gdv_int32 base_str_len,
 510                                   const char* str, gdv_int32 str_len) {
 511   if (base_str_len != str_len) {
 512     return false;
 513   }
 514   for (int i = 0; i < str_len; i++) {
 515     // convert char to lower
 516     char cur = str[i];
 517     // 'A' - 'Z' : 0x41 - 0x5a
 518     // 'a' - 'z' : 0x61 - 0x7a
 519     if (cur >= 0x41 && cur <= 0x5a) {
 520       cur = static_cast<char>(cur + 0x20);
 521     }
 522     // if the character does not match, break the flow
 523     if (cur != base_str[i]) break;
 524     // if the character matches and it is the last iteration, return true
 525     if (i == str_len - 1) return true;
 526   }
 527   return false;
 528 }
 529
 530 // Try to cast the received string ('0', '1', 'true', 'false'), ignoring leading
 531 // and trailing spaces, also ignoring lower and upper case.
 532 FORCE_INLINE
 533 gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len) {
 534   if (data_len <= 0) {
 535     gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");
 536     return false;
 537   }
 538
 539   // trim leading and trailing spaces
 540   int32_t trimmed_len;
 541   int32_t start = 0, end = data_len - 1;
 542   while (start <= end && data[start] == ' ') {
 543     ++start;
 544   }
 545   while (end >= start && data[end] == ' ') {
 546     --end;
 547   }
 548   trimmed_len = end - start + 1;
 549   const char* trimmed_data = data + start;
 550
 551   // compare received string with the valid bool string values '1', '0', 'true', 'false'
 552   if (trimmed_len == 1) {
 553     // case for '0' and '1' value
 554     if (trimmed_data[0] == '1') return true;
 555     if (trimmed_data[0] == '0') return false;
 556   } else if (trimmed_len == 4) {
 557     // case for matching 'true'
 558     if (compare_lower_strings("true", 4, trimmed_data, trimmed_len)) return true;
 559   } else if (trimmed_len == 5) {
 560     // case for matching 'false'
 561     if (compare_lower_strings("false", 5, trimmed_data, trimmed_len)) return false;
 562   }
 563   // if no 'true', 'false', '0' or '1' value is found, set an error
 564   gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");
 565   return false;
 566 }
 567
 568 FORCE_INLINE
 569 const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value,
 570                                    gdv_int64 out_len, gdv_int32* out_length) {
 571   gdv_int32 len = static_cast<gdv_int32>(out_len);
 572   if (len < 0) {
 573     gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative");
 574     *out_length = 0;
 575     return "";
 576   }
 577   const char* out =
 578       reinterpret_cast<const char*>(gdv_fn_context_arena_malloc(context, 5));
 579   out = value ? "true" : "false";
 580   *out_length = value ? ((len > 4) ? 4 : len) : ((len > 5) ? 5 : len);
 581   return out;
 582 }
 583
 584 // Truncates the string to given length
 585 #define CAST_VARCHAR_FROM_VARLEN_TYPE(TYPE)                                            \
 586   FORCE_INLINE                                                                         \
 587   const char* castVARCHAR_##TYPE##_int64(gdv_int64 context, const char* data,          \
 588                                          gdv_int32 data_len, int64_t out_len,          \
 589                                          int32_t* out_length) {                        \
 590     int32_t len = static_cast<int32_t>(out_len);                                       \
 591                                                                                        \
 592     if (len < 0) {                                                                     \
 593       gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \
 594       *out_length = 0;                                                                 \
 595       return "";                                                                       \
 596     }                                                                                  \
 597                                                                                        \
 598     if (len >= data_len || len == 0) {                                                 \
 599       *out_length = data_len;                                                          \
 600       return data;                                                                     \
 601     }                                                                                  \
 602                                                                                        \
 603     int32_t remaining = len;                                                           \
 604     int32_t index = 0;                                                                 \
 605     bool is_multibyte = false;                                                         \
 606     do {                                                                               \
 607       /* In utf8, MSB of a single byte unicode char is always 0,                       \
 608        * whereas for a multibyte character the MSB of each byte is 1.                  \
 609        * So for a single byte char, a bitwise-and with x80 (10000000) will be 0        \
 610        * and it won't be 0 for bytes of a multibyte char.                              \
 611        */                                                                              \
 612       char* data_ptr = const_cast<char*>(data);                                        \
 613                                                                                        \
 614       /* advance byte by byte till the 8-byte boundary then advance 8 bytes */         \
 615       auto num_bytes = reinterpret_cast<uintptr_t>(data_ptr) & 0x07;                   \
 616       num_bytes = (8 - num_bytes) & 0x07;                                              \
 617       while (num_bytes > 0) {                                                          \
 618         uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index);                   \
 619         if ((*ptr & 0x80) != 0) {                                                      \
 620           is_multibyte = true;                                                         \
 621           break;                                                                       \
 622         }                                                                              \
 623         index++;                                                                       \
 624         remaining--;                                                                   \
 625         num_bytes--;                                                                   \
 626       }                                                                                \
 627       if (is_multibyte) break;                                                         \
 628       while (remaining >= 8) {                                                         \
 629         uint64_t* ptr = reinterpret_cast<uint64_t*>(data_ptr + index);                 \
 630         if ((*ptr & 0x8080808080808080) != 0) {                                        \
 631           is_multibyte = true;                                                         \
 632           break;                                                                       \
 633         }                                                                              \
 634         index += 8;                                                                    \
 635         remaining -= 8;                                                                \
 636       }                                                                                \
 637       if (is_multibyte) break;                                                         \
 638       if (remaining >= 4) {                                                            \
 639         uint32_t* ptr = reinterpret_cast<uint32_t*>(data_ptr + index);                 \
 640         if ((*ptr & 0x80808080) != 0) break;                                           \
 641         index += 4;                                                                    \
 642         remaining -= 4;                                                                \
 643       }                                                                                \
 644       while (remaining > 0) {                                                          \
 645         uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index);                   \
 646         if ((*ptr & 0x80) != 0) {                                                      \
 647           is_multibyte = true;                                                         \
 648           break;                                                                       \
 649         }                                                                              \
 650         index++;                                                                       \
 651         remaining--;                                                                   \
 652       }                                                                                \
 653       if (is_multibyte) break;                                                         \
 654       /* reached here; all are single byte characters */                               \
 655       *out_length = len;                                                               \
 656       return data;                                                                     \
 657     } while (false);                                                                   \
 658                                                                                        \
 659     /* detected multibyte utf8 characters; slow path */                                \
 660     int32_t byte_pos =                                                                 \
 661         utf8_byte_pos(context, data + index, data_len - index, len - index);           \
 662     if (byte_pos < 0) {                                                                \
 663       *out_length = 0;                                                                 \
 664       return "";                                                                       \
 665     }                                                                                  \
 666                                                                                        \
 667     *out_length = index + byte_pos;                                                    \
 668     return data;                                                                       \
 669   }
 670
 671 CAST_VARCHAR_FROM_VARLEN_TYPE(utf8)
 672 CAST_VARCHAR_FROM_VARLEN_TYPE(binary)
 673
 674 #undef CAST_VARCHAR_FROM_VARLEN_TYPE
 675
 676 // Add functions for castVARBINARY
 677 #define CAST_VARBINARY_FROM_STRING_AND_BINARY(TYPE)                                    \
 678   GANDIVA_EXPORT                                                                       \
 679   const char* castVARBINARY_##TYPE##_int64(gdv_int64 context, const char* data,        \
 680                                            gdv_int32 data_len, int64_t out_len,        \
 681                                            int32_t* out_length) {                      \
 682     int32_t len = static_cast<int32_t>(out_len);                                       \
 683     if (len < 0) {                                                                     \
 684       gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \
 685       *out_length = 0;                                                                 \
 686       return "";                                                                       \
 687     }                                                                                  \
 688                                                                                        \
 689     if (len >= data_len || len == 0) {                                                 \
 690       *out_length = data_len;                                                          \
 691     } else {                                                                           \
 692       *out_length = len;                                                               \
 693     }                                                                                  \
 694     return data;                                                                       \
 695   }
 696
 697 CAST_VARBINARY_FROM_STRING_AND_BINARY(utf8)
 698 CAST_VARBINARY_FROM_STRING_AND_BINARY(binary)
 699
 700 #undef CAST_VARBINARY_FROM_STRING_AND_BINARY
 701
 702 #define IS_NULL(NAME, TYPE)                                                \
 703   FORCE_INLINE                                                             \
 704   bool NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
 705     return !is_valid;                                                      \
 706   }
 707
 708 VAR_LEN_TYPES(IS_NULL, isnull)
 709
 710 #undef IS_NULL
 711
 712 #define IS_NOT_NULL(NAME, TYPE)                                            \
 713   FORCE_INLINE                                                             \
 714   bool NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
 715     return is_valid;                                                       \
 716   }
 717
 718 VAR_LEN_TYPES(IS_NOT_NULL, isnotnull)
 719
 720 #undef IS_NOT_NULL
 721 #undef VAR_LEN_TYPES
 722
 723 /*
 724  We follow Oracle semantics for offset:
 725  - If position is positive, then the first glyph in the substring is determined by
 726  counting that many glyphs forward from the beginning of the input. (i.e., for position ==
 727  1 the first glyph in the substring will be identical to the first glyph in the input)
 728
 729  - If position is negative, then the first glyph in the substring is determined by
 730  counting that many glyphs backward from the end of the input. (i.e., for position == -1
 731  the first glyph in the substring will be identical to the last glyph in the input)
 732
 733  - If position is 0 then it is treated as 1.
 734  */
 735 FORCE_INLINE
 736 const char* substr_utf8_int64_int64(gdv_int64 context, const char* input,
 737                                     gdv_int32 in_data_len, gdv_int64 position,
 738                                     gdv_int64 substring_length, gdv_int32* out_data_len) {
 739   if (substring_length <= 0 || input == nullptr || in_data_len <= 0) {
 740     *out_data_len = 0;
 741     return "";
 742   }
 743
 744   gdv_int64 in_glyphs_count =
 745       static_cast<gdv_int64>(utf8_length(context, input, in_data_len));
 746
 747   // in_glyphs_count is zero if input has invalid glyphs
 748   if (in_glyphs_count == 0) {
 749     *out_data_len = 0;
 750     return "";
 751   }
 752
 753   gdv_int64 from_glyph;  // from_glyph==0 indicates the first glyph of the input
 754   if (position > 0) {
 755     from_glyph = position - 1;
 756   } else if (position < 0) {
 757     from_glyph = in_glyphs_count + position;
 758   } else {
 759     from_glyph = 0;
 760   }
 761
 762   if (from_glyph < 0 || from_glyph >= in_glyphs_count) {
 763     *out_data_len = 0;
 764     return "";
 765   }
 766
 767   gdv_int64 out_glyphs_count = substring_length;
 768   if (substring_length > in_glyphs_count - from_glyph) {
 769     out_glyphs_count = in_glyphs_count - from_glyph;
 770   }
 771
 772   gdv_int64 in_data_len64 = static_cast<gdv_int64>(in_data_len);
 773   gdv_int64 start_pos = 0;
 774   gdv_int64 end_pos = in_data_len64;
 775
 776   gdv_int64 current_glyph = 0;
 777   gdv_int64 pos = 0;
 778   while (pos < in_data_len64) {
 779     if (current_glyph == from_glyph) {
 780       start_pos = pos;
 781     }
 782     pos += static_cast<gdv_int64>(utf8_char_length(input[pos]));
 783     if (current_glyph - from_glyph + 1 == out_glyphs_count) {
 784       end_pos = pos;
 785     }
 786     current_glyph++;
 787   }
 788
 789   if (end_pos > in_data_len64 || end_pos > INT_MAX) {
 790     end_pos = in_data_len64;
 791   }
 792
 793   *out_data_len = static_cast<gdv_int32>(end_pos - start_pos);
 794   char* ret =
 795       reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_data_len));
 796   if (ret == nullptr) {
 797     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
 798     *out_data_len = 0;
 799     return "";
 800   }
 801   memcpy(ret, input + start_pos, *out_data_len);
 802   return ret;
 803 }
 804
 805 FORCE_INLINE
 806 const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in_len,
 807                               gdv_int64 offset64, gdv_int32* out_len) {
 808   return substr_utf8_int64_int64(context, input, in_len, offset64, in_len, out_len);
 809 }
 810
 811 FORCE_INLINE
 812 const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len,
 813                               gdv_int32 repeat_number, gdv_int32* out_len) {
 814   // if the repeat number is zero, then return empty string
 815   if (repeat_number == 0 || in_len <= 0) {
 816     *out_len = 0;
 817     return "";
 818   }
 819   // if the repeat number is a negative number, an error is set on context
 820   if (repeat_number < 0) {
 821     gdv_fn_context_set_error_msg(context, "Repeat number can't be negative");
 822     *out_len = 0;
 823     return "";
 824   }
 825   *out_len = repeat_number * in_len;
 826   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
 827   if (ret == nullptr) {
 828     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
 829     *out_len = 0;
 830     return "";
 831   }
 832   for (int i = 0; i < repeat_number; ++i) {
 833     memcpy(ret + (i * in_len), in, in_len);
 834   }
 835   return ret;
 836 }
 837
 838 FORCE_INLINE
 839 const char* concat_utf8_utf8(gdv_int64 context, const char* left, gdv_int32 left_len,
 840                              bool left_validity, const char* right, gdv_int32 right_len,
 841                              bool right_validity, gdv_int32* out_len) {
 842   if (!left_validity) {
 843     left_len = 0;
 844   }
 845   if (!right_validity) {
 846     right_len = 0;
 847   }
 848   return concatOperator_utf8_utf8(context, left, left_len, right, right_len, out_len);
 849 }
 850
 851 FORCE_INLINE
 852 const char* concatOperator_utf8_utf8(gdv_int64 context, const char* left,
 853                                      gdv_int32 left_len, const char* right,
 854                                      gdv_int32 right_len, gdv_int32* out_len) {
 855   *out_len = left_len + right_len;
 856   if (*out_len <= 0) {
 857     *out_len = 0;
 858     return "";
 859   }
 860   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
 861   if (ret == nullptr) {
 862     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
 863     *out_len = 0;
 864     return "";
 865   }
 866   memcpy(ret, left, left_len);
 867   memcpy(ret + left_len, right, right_len);
 868   return ret;
 869 }
 870
 871 FORCE_INLINE
 872 const char* concat_utf8_utf8_utf8(gdv_int64 context, const char* in1, gdv_int32 in1_len,
 873                                   bool in1_validity, const char* in2, gdv_int32 in2_len,
 874                                   bool in2_validity, const char* in3, gdv_int32 in3_len,
 875                                   bool in3_validity, gdv_int32* out_len) {
 876   if (!in1_validity) {
 877     in1_len = 0;
 878   }
 879   if (!in2_validity) {
 880     in2_len = 0;
 881   }
 882   if (!in3_validity) {
 883     in3_len = 0;
 884   }
 885   return concatOperator_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3, in3_len,
 886                                        out_len);
 887 }
 888
 889 FORCE_INLINE
 890 const char* concatOperator_utf8_utf8_utf8(gdv_int64 context, const char* in1,
 891                                           gdv_int32 in1_len, const char* in2,
 892                                           gdv_int32 in2_len, const char* in3,
 893                                           gdv_int32 in3_len, gdv_int32* out_len) {
 894   *out_len = in1_len + in2_len + in3_len;
 895   if (*out_len <= 0) {
 896     *out_len = 0;
 897     return "";
 898   }
 899   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
 900   if (ret == nullptr) {
 901     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
 902     *out_len = 0;
 903     return "";
 904   }
 905   memcpy(ret, in1, in1_len);
 906   memcpy(ret + in1_len, in2, in2_len);
 907   memcpy(ret + in1_len + in2_len, in3, in3_len);
 908   return ret;
 909 }
 910
 911 FORCE_INLINE
 912 const char* concat_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
 913                                        gdv_int32 in1_len, bool in1_validity,
 914                                        const char* in2, gdv_int32 in2_len,
 915                                        bool in2_validity, const char* in3,
 916                                        gdv_int32 in3_len, bool in3_validity,
 917                                        const char* in4, gdv_int32 in4_len,
 918                                        bool in4_validity, gdv_int32* out_len) {
 919   if (!in1_validity) {
 920     in1_len = 0;
 921   }
 922   if (!in2_validity) {
 923     in2_len = 0;
 924   }
 925   if (!in3_validity) {
 926     in3_len = 0;
 927   }
 928   if (!in4_validity) {
 929     in4_len = 0;
 930   }
 931   return concatOperator_utf8_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3,
 932                                             in3_len, in4, in4_len, out_len);
 933 }
 934
 935 FORCE_INLINE
 936 const char* concatOperator_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
 937                                                gdv_int32 in1_len, const char* in2,
 938                                                gdv_int32 in2_len, const char* in3,
 939                                                gdv_int32 in3_len, const char* in4,
 940                                                gdv_int32 in4_len, gdv_int32* out_len) {
 941   *out_len = in1_len + in2_len + in3_len + in4_len;
 942   if (*out_len <= 0) {
 943     *out_len = 0;
 944     return "";
 945   }
 946   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
 947   if (ret == nullptr) {
 948     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
 949     *out_len = 0;
 950     return "";
 951   }
 952   memcpy(ret, in1, in1_len);
 953   memcpy(ret + in1_len, in2, in2_len);
 954   memcpy(ret + in1_len + in2_len, in3, in3_len);
 955   memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
 956   return ret;
 957 }
 958
 959 FORCE_INLINE
 960 const char* concat_utf8_utf8_utf8_utf8_utf8(
 961     gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
 962     const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
 963     gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
 964     bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
 965     gdv_int32* out_len) {
 966   if (!in1_validity) {
 967     in1_len = 0;
 968   }
 969   if (!in2_validity) {
 970     in2_len = 0;
 971   }
 972   if (!in3_validity) {
 973     in3_len = 0;
 974   }
 975   if (!in4_validity) {
 976     in4_len = 0;
 977   }
 978   if (!in5_validity) {
 979     in5_len = 0;
 980   }
 981   return concatOperator_utf8_utf8_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3,
 982                                                  in3_len, in4, in4_len, in5, in5_len,
 983                                                  out_len);
 984 }
 985
 986 FORCE_INLINE
 987 const char* concatOperator_utf8_utf8_utf8_utf8_utf8(
 988     gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
 989     gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
 990     gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, gdv_int32* out_len) {
 991   *out_len = in1_len + in2_len + in3_len + in4_len + in5_len;
 992   if (*out_len <= 0) {
 993     *out_len = 0;
 994     return "";
 995   }
 996   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
 997   if (ret == nullptr) {
 998     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
 999     *out_len = 0;
1000     return "";
1001   }
1002   memcpy(ret, in1, in1_len);
1003   memcpy(ret + in1_len, in2, in2_len);
1004   memcpy(ret + in1_len + in2_len, in3, in3_len);
1005   memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1006   memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1007   return ret;
1008 }
1009
1010 FORCE_INLINE
1011 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8(
1012     gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1013     const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1014     gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1015     bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1016     const char* in6, gdv_int32 in6_len, bool in6_validity, gdv_int32* out_len) {
1017   if (!in1_validity) {
1018     in1_len = 0;
1019   }
1020   if (!in2_validity) {
1021     in2_len = 0;
1022   }
1023   if (!in3_validity) {
1024     in3_len = 0;
1025   }
1026   if (!in4_validity) {
1027     in4_len = 0;
1028   }
1029   if (!in5_validity) {
1030     in5_len = 0;
1031   }
1032   if (!in6_validity) {
1033     in6_len = 0;
1034   }
1035   return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len,
1036                                                       in3, in3_len, in4, in4_len, in5,
1037                                                       in5_len, in6, in6_len, out_len);
1038 }
1039
1040 FORCE_INLINE
1041 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(
1042     gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1043     gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1044     gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1045     gdv_int32 in6_len, gdv_int32* out_len) {
1046   *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len;
1047   if (*out_len <= 0) {
1048     *out_len = 0;
1049     return "";
1050   }
1051   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1052   if (ret == nullptr) {
1053     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1054     *out_len = 0;
1055     return "";
1056   }
1057   memcpy(ret, in1, in1_len);
1058   memcpy(ret + in1_len, in2, in2_len);
1059   memcpy(ret + in1_len + in2_len, in3, in3_len);
1060   memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1061   memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1062   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1063   return ret;
1064 }
1065
1066 FORCE_INLINE
1067 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1068     gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1069     const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1070     gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1071     bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1072     const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1073     gdv_int32 in7_len, bool in7_validity, gdv_int32* out_len) {
1074   if (!in1_validity) {
1075     in1_len = 0;
1076   }
1077   if (!in2_validity) {
1078     in2_len = 0;
1079   }
1080   if (!in3_validity) {
1081     in3_len = 0;
1082   }
1083   if (!in4_validity) {
1084     in4_len = 0;
1085   }
1086   if (!in5_validity) {
1087     in5_len = 0;
1088   }
1089   if (!in6_validity) {
1090     in6_len = 0;
1091   }
1092   if (!in7_validity) {
1093     in7_len = 0;
1094   }
1095   return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1096       context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1097       in6_len, in7, in7_len, out_len);
1098 }
1099
1100 FORCE_INLINE
1101 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1102     gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1103     gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1104     gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1105     gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, gdv_int32* out_len) {
1106   *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len;
1107   if (*out_len <= 0) {
1108     *out_len = 0;
1109     return "";
1110   }
1111   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1112   if (ret == nullptr) {
1113     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1114     *out_len = 0;
1115     return "";
1116   }
1117   memcpy(ret, in1, in1_len);
1118   memcpy(ret + in1_len, in2, in2_len);
1119   memcpy(ret + in1_len + in2_len, in3, in3_len);
1120   memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1121   memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1122   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1123   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1124   return ret;
1125 }
1126
1127 FORCE_INLINE
1128 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1129     gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1130     const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1131     gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1132     bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1133     const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1134     gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
1135     bool in8_validity, gdv_int32* out_len) {
1136   if (!in1_validity) {
1137     in1_len = 0;
1138   }
1139   if (!in2_validity) {
1140     in2_len = 0;
1141   }
1142   if (!in3_validity) {
1143     in3_len = 0;
1144   }
1145   if (!in4_validity) {
1146     in4_len = 0;
1147   }
1148   if (!in5_validity) {
1149     in5_len = 0;
1150   }
1151   if (!in6_validity) {
1152     in6_len = 0;
1153   }
1154   if (!in7_validity) {
1155     in7_len = 0;
1156   }
1157   if (!in8_validity) {
1158     in8_len = 0;
1159   }
1160   return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1161       context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1162       in6_len, in7, in7_len, in8, in8_len, out_len);
1163 }
1164
1165 FORCE_INLINE
1166 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1167     gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1168     gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1169     gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1170     gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
1171     gdv_int32 in8_len, gdv_int32* out_len) {
1172   *out_len =
1173       in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len + in8_len;
1174   if (*out_len <= 0) {
1175     *out_len = 0;
1176     return "";
1177   }
1178   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1179   if (ret == nullptr) {
1180     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1181     *out_len = 0;
1182     return "";
1183   }
1184   memcpy(ret, in1, in1_len);
1185   memcpy(ret + in1_len, in2, in2_len);
1186   memcpy(ret + in1_len + in2_len, in3, in3_len);
1187   memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1188   memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1189   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1190   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1191   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len, in8,
1192          in8_len);
1193   return ret;
1194 }
1195
1196 FORCE_INLINE
1197 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1198     gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1199     const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1200     gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1201     bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1202     const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1203     gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
1204     bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
1205     gdv_int32* out_len) {
1206   if (!in1_validity) {
1207     in1_len = 0;
1208   }
1209   if (!in2_validity) {
1210     in2_len = 0;
1211   }
1212   if (!in3_validity) {
1213     in3_len = 0;
1214   }
1215   if (!in4_validity) {
1216     in4_len = 0;
1217   }
1218   if (!in5_validity) {
1219     in5_len = 0;
1220   }
1221   if (!in6_validity) {
1222     in6_len = 0;
1223   }
1224   if (!in7_validity) {
1225     in7_len = 0;
1226   }
1227   if (!in8_validity) {
1228     in8_len = 0;
1229   }
1230   if (!in9_validity) {
1231     in9_len = 0;
1232   }
1233   return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1234       context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1235       in6_len, in7, in7_len, in8, in8_len, in9, in9_len, out_len);
1236 }
1237
1238 FORCE_INLINE
1239 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1240     gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1241     gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1242     gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1243     gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
1244     gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, gdv_int32* out_len) {
1245   *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len +
1246              in8_len + in9_len;
1247   if (*out_len <= 0) {
1248     *out_len = 0;
1249     return "";
1250   }
1251   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1252   if (ret == nullptr) {
1253     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1254     *out_len = 0;
1255     return "";
1256   }
1257   memcpy(ret, in1, in1_len);
1258   memcpy(ret + in1_len, in2, in2_len);
1259   memcpy(ret + in1_len + in2_len, in3, in3_len);
1260   memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1261   memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1262   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1263   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1264   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len, in8,
1265          in8_len);
1266   memcpy(
1267       ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len + in8_len,
1268       in9, in9_len);
1269   return ret;
1270 }
1271
1272 FORCE_INLINE
1273 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1274     gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1275     const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1276     gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1277     bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1278     const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1279     gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
1280     bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
1281     const char* in10, gdv_int32 in10_len, bool in10_validity, gdv_int32* out_len) {
1282   if (!in1_validity) {
1283     in1_len = 0;
1284   }
1285   if (!in2_validity) {
1286     in2_len = 0;
1287   }
1288   if (!in3_validity) {
1289     in3_len = 0;
1290   }
1291   if (!in4_validity) {
1292     in4_len = 0;
1293   }
1294   if (!in5_validity) {
1295     in5_len = 0;
1296   }
1297   if (!in6_validity) {
1298     in6_len = 0;
1299   }
1300   if (!in7_validity) {
1301     in7_len = 0;
1302   }
1303   if (!in8_validity) {
1304     in8_len = 0;
1305   }
1306   if (!in9_validity) {
1307     in9_len = 0;
1308   }
1309   if (!in10_validity) {
1310     in10_len = 0;
1311   }
1312   return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1313       context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1314       in6_len, in7, in7_len, in8, in8_len, in9, in9_len, in10, in10_len, out_len);
1315 }
1316
1317 FORCE_INLINE
1318 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1319     gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1320     gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1321     gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1322     gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
1323     gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, const char* in10,
1324     gdv_int32 in10_len, gdv_int32* out_len) {
1325   *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len +
1326              in8_len + in9_len + in10_len;
1327   if (*out_len <= 0) {
1328     *out_len = 0;
1329     return "";
1330   }
1331   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1332   if (ret == nullptr) {
1333     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1334     *out_len = 0;
1335     return "";
1336   }
1337   memcpy(ret, in1, in1_len);
1338   memcpy(ret + in1_len, in2, in2_len);
1339   memcpy(ret + in1_len + in2_len, in3, in3_len);
1340   memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1341   memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1342   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1343   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1344   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len, in8,
1345          in8_len);
1346   memcpy(
1347       ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len + in8_len,
1348       in9, in9_len);
1349   memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len +
1350              in8_len + in9_len,
1351          in10, in10_len);
1352   return ret;
1353 }
1354
1355 // Returns the numeric value of the first character of str.
1356 GANDIVA_EXPORT
1357 gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) {
1358   if (data_len == 0) {
1359     return 0;
1360   }
1361   return static_cast<gdv_int32>(data[0]);
1362 }
1363
1364 FORCE_INLINE
1365 const char* convert_fromUTF8_binary(gdv_int64 context, const char* bin_in, gdv_int32 len,
1366                                     gdv_int32* out_len) {
1367   *out_len = len;
1368   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1369   if (ret == nullptr) {
1370     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1371     *out_len = 0;
1372     return "";
1373   }
1374   memcpy(ret, bin_in, *out_len);
1375   return ret;
1376 }
1377
1378 FORCE_INLINE
1379 const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char* text_in,
1380                                                     int32_t text_len,
1381                                                     const char* char_to_replace,
1382                                                     int32_t char_to_replace_len,
1383                                                     int32_t* out_len) {
1384   if (char_to_replace_len > 1) {
1385     gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not supported");
1386     *out_len = 0;
1387     return "";
1388   }
1389   // actually the convert_replace function replaces invalid chars with an ASCII
1390   // character so the output length will be the same as the input length
1391   *out_len = text_len;
1392   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1393   if (ret == nullptr) {
1394     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1395     *out_len = 0;
1396     return "";
1397   }
1398   int32_t valid_bytes_to_cpy = 0;
1399   int32_t out_byte_counter = 0;
1400   int32_t in_byte_counter = 0;
1401   int32_t char_len;
1402   // scan the base text from left to right and increment the start pointer till
1403   // looking for invalid chars to substitute
1404   for (int text_index = 0; text_index < text_len; text_index += char_len) {
1405     char_len = utf8_char_length(text_in[text_index]);
1406     // only memory copy the bytes when detect invalid char
1407     if (char_len == 0 || text_index + char_len > text_len ||
1408         !validate_utf8_following_bytes(text_in, char_len, text_index)) {
1409       // define char_len = 1 to increase text_index by 1 (as ASCII char fits in 1 byte)
1410       char_len = 1;
1411       // first copy the valid bytes until now and then replace the invalid character
1412       memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
1413       // if the replacement char is empty, the invalid char should be ignored
1414       if (char_to_replace_len == 0) {
1415         out_byte_counter += valid_bytes_to_cpy;
1416       } else {
1417         ret[out_byte_counter + valid_bytes_to_cpy] = char_to_replace[0];
1418         out_byte_counter += valid_bytes_to_cpy + char_len;
1419       }
1420       in_byte_counter += valid_bytes_to_cpy + char_len;
1421       valid_bytes_to_cpy = 0;
1422       continue;
1423     }
1424     valid_bytes_to_cpy += char_len;
1425   }
1426   // if invalid chars were not found, return the original string
1427   if (out_byte_counter == 0 && in_byte_counter == 0) return text_in;
1428   // if there are still valid bytes to copy, do it
1429   if (valid_bytes_to_cpy != 0) {
1430     memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
1431   }
1432   // the out length will be the out bytes copied + the missing end bytes copied
1433   *out_len = valid_bytes_to_cpy + out_byte_counter;
1434   return ret;
1435 }
1436
// Reverses the first len bytes of buf in place. Buffers of fewer than two bytes are
// left untouched.
static inline void reverse_char_buf(char* buf, int32_t len) {
  if (len < 2) {
    return;
  }

  // Swap the outermost pair and move both ends toward the middle.
  int32_t front = 0;
  int32_t back = len - 1;
  while (front < back) {
    const char held = buf[front];
    buf[front] = buf[back];
    buf[back] = held;
    ++front;
    --back;
  }
}
1448
1449 // Converts a double variable to binary
1450 FORCE_INLINE
1451 const char* convert_toDOUBLE(int64_t context, double value, int32_t* out_len) {
1452   *out_len = sizeof(value);
1453   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1454
1455   if (ret == nullptr) {
1456     gdv_fn_context_set_error_msg(context,
1457                                  "Could not allocate memory for the output string");
1458
1459     *out_len = 0;
1460     return "";
1461   }
1462
1463   memcpy(ret, &value, *out_len);
1464
1465   return ret;
1466 }
1467
1468 FORCE_INLINE
1469 const char* convert_toDOUBLE_be(int64_t context, double value, int32_t* out_len) {
1470   // The function behaves like convert_toDOUBLE, but always return the result
1471   // in big endian format
1472   char* ret = const_cast<char*>(convert_toDOUBLE(context, value, out_len));
1473
1474 #if ARROW_LITTLE_ENDIAN
1475   reverse_char_buf(ret, *out_len);
1476 #endif
1477
1478   return ret;
1479 }
1480
1481 // Converts a float variable to binary
1482 FORCE_INLINE
1483 const char* convert_toFLOAT(int64_t context, float value, int32_t* out_len) {
1484   *out_len = sizeof(value);
1485   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1486
1487   if (ret == nullptr) {
1488     gdv_fn_context_set_error_msg(context,
1489                                  "Could not allocate memory for the output string");
1490
1491     *out_len = 0;
1492     return "";
1493   }
1494
1495   memcpy(ret, &value, *out_len);
1496
1497   return ret;
1498 }
1499
1500 FORCE_INLINE
1501 const char* convert_toFLOAT_be(int64_t context, float value, int32_t* out_len) {
1502   // The function behaves like convert_toFLOAT, but always return the result
1503   // in big endian format
1504   char* ret = const_cast<char*>(convert_toFLOAT(context, value, out_len));
1505
1506 #if ARROW_LITTLE_ENDIAN
1507   reverse_char_buf(ret, *out_len);
1508 #endif
1509
1510   return ret;
1511 }
1512
1513 // Converts a bigint(int with 64 bits) variable to binary
1514 FORCE_INLINE
1515 const char* convert_toBIGINT(int64_t context, int64_t value, int32_t* out_len) {
1516   *out_len = sizeof(value);
1517   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1518
1519   if (ret == nullptr) {
1520     gdv_fn_context_set_error_msg(context,
1521                                  "Could not allocate memory for the output string");
1522
1523     *out_len = 0;
1524     return "";
1525   }
1526
1527   memcpy(ret, &value, *out_len);
1528
1529   return ret;
1530 }
1531
1532 FORCE_INLINE
1533 const char* convert_toBIGINT_be(int64_t context, int64_t value, int32_t* out_len) {
1534   // The function behaves like convert_toBIGINT, but always return the result
1535   // in big endian format
1536   char* ret = const_cast<char*>(convert_toBIGINT(context, value, out_len));
1537
1538 #if ARROW_LITTLE_ENDIAN
1539   reverse_char_buf(ret, *out_len);
1540 #endif
1541
1542   return ret;
1543 }
1544
1545 // Converts an integer(with 32 bits) variable to binary
1546 FORCE_INLINE
1547 const char* convert_toINT(int64_t context, int32_t value, int32_t* out_len) {
1548   *out_len = sizeof(value);
1549   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1550
1551   if (ret == nullptr) {
1552     gdv_fn_context_set_error_msg(context,
1553                                  "Could not allocate memory for the output string");
1554
1555     *out_len = 0;
1556     return "";
1557   }
1558
1559   memcpy(ret, &value, *out_len);
1560
1561   return ret;
1562 }
1563
1564 FORCE_INLINE
1565 const char* convert_toINT_be(int64_t context, int32_t value, int32_t* out_len) {
1566   // The function behaves like convert_toINT, but always return the result
1567   // in big endian format
1568   char* ret = const_cast<char*>(convert_toINT(context, value, out_len));
1569
1570 #if ARROW_LITTLE_ENDIAN
1571   reverse_char_buf(ret, *out_len);
1572 #endif
1573
1574   return ret;
1575 }
1576
1577 // Converts a boolean variable to binary
1578 FORCE_INLINE
1579 const char* convert_toBOOLEAN(int64_t context, bool value, int32_t* out_len) {
1580   *out_len = sizeof(value);
1581   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1582
1583   if (ret == nullptr) {
1584     gdv_fn_context_set_error_msg(context,
1585                                  "Could not allocate memory for the output string");
1586
1587     *out_len = 0;
1588     return "";
1589   }
1590
1591   memcpy(ret, &value, *out_len);
1592
1593   return ret;
1594 }
1595
1596 // Converts a time variable to binary
1597 FORCE_INLINE
1598 const char* convert_toTIME_EPOCH(int64_t context, int32_t value, int32_t* out_len) {
1599   return convert_toINT(context, value, out_len);
1600 }
1601
1602 FORCE_INLINE
1603 const char* convert_toTIME_EPOCH_be(int64_t context, int32_t value, int32_t* out_len) {
1604   // The function behaves as convert_toTIME_EPOCH, but
1605   // returns the bytes in big endian format
1606   return convert_toINT_be(context, value, out_len);
1607 }
1608
1609 // Converts a timestamp variable to binary
1610 FORCE_INLINE
1611 const char* convert_toTIMESTAMP_EPOCH(int64_t context, int64_t timestamp,
1612                                       int32_t* out_len) {
1613   return convert_toBIGINT(context, timestamp, out_len);
1614 }
1615
1616 FORCE_INLINE
1617 const char* convert_toTIMESTAMP_EPOCH_be(int64_t context, int64_t timestamp,
1618                                          int32_t* out_len) {
1619   // The function behaves as convert_toTIMESTAMP_EPOCH, but
1620   // returns the bytes in big endian format
1621   return convert_toBIGINT_be(context, timestamp, out_len);
1622 }
1623
1624 // Converts a date variable to binary
1625 FORCE_INLINE
1626 const char* convert_toDATE_EPOCH(int64_t context, int64_t date, int32_t* out_len) {
1627   return convert_toBIGINT(context, date, out_len);
1628 }
1629
1630 FORCE_INLINE
1631 const char* convert_toDATE_EPOCH_be(int64_t context, int64_t date, int32_t* out_len) {
1632   // The function behaves as convert_toDATE_EPOCH, but
1633   // returns the bytes in big endian format
1634   return convert_toBIGINT_be(context, date, out_len);
1635 }
1636
1637 // Converts a string variable to binary
1638 FORCE_INLINE
1639 const char* convert_toUTF8(int64_t context, const char* value, int32_t value_len,
1640                            int32_t* out_len) {
1641   *out_len = value_len;
1642   return value;
1643 }
1644
1645 // Search for a string within another string
1646 // Same as "locate(substr, str)", except for the reverse order of the arguments.
1647 FORCE_INLINE
1648 gdv_int32 strpos_utf8_utf8(gdv_int64 context, const char* str, gdv_int32 str_len,
1649                            const char* sub_str, gdv_int32 sub_str_len) {
1650   return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len, 1);
1651 }
1652
1653 // Search for a string within another string
1654 FORCE_INLINE
1655 gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len,
1656                            const char* str, gdv_int32 str_len) {
1657   return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len, 1);
1658 }
1659
1660 // Search for a string within another string starting at position start-pos (1-indexed)
1661 FORCE_INLINE
1662 gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
1663                                  gdv_int32 sub_str_len, const char* str,
1664                                  gdv_int32 str_len, gdv_int32 start_pos) {
1665   if (start_pos < 1) {
1666     gdv_fn_context_set_error_msg(context, "Start position must be greater than 0");
1667     return 0;
1668   }
1669
1670   if (str_len == 0 || sub_str_len == 0) {
1671     return 0;
1672   }
1673
1674   gdv_int32 byte_pos = utf8_byte_pos(context, str, str_len, start_pos - 1);
1675   if (byte_pos < 0 || byte_pos >= str_len) {
1676     return 0;
1677   }
1678   for (gdv_int32 i = byte_pos; i <= str_len - sub_str_len; ++i) {
1679     if (memcmp(str + i, sub_str, sub_str_len) == 0) {
1680       return utf8_length(context, str, i) + 1;
1681     }
1682   }
1683   return 0;
1684 }
1685
1686 FORCE_INLINE
1687 const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
1688                                                 gdv_int32 text_len, const char* from_str,
1689                                                 gdv_int32 from_str_len,
1690                                                 const char* to_str, gdv_int32 to_str_len,
1691                                                 gdv_int32 max_length,
1692                                                 gdv_int32* out_len) {
1693   // if from_str is empty or its length exceeds that of original string,
1694   // return the original string
1695   if (from_str_len <= 0 || from_str_len > text_len) {
1696     *out_len = text_len;
1697     return text;
1698   }
1699
1700   bool found = false;
1701   gdv_int32 text_index = 0;
1702   char* out;
1703   gdv_int32 out_index = 0;
1704   gdv_int32 last_match_index =
1705       0;  // defer copying string from last_match_index till next match is found
1706
1707   for (; text_index <= text_len - from_str_len;) {
1708     if (memcmp(text + text_index, from_str, from_str_len) == 0) {
1709       if (out_index + text_index - last_match_index + to_str_len > max_length) {
1710         gdv_fn_context_set_error_msg(context, "Buffer overflow for output string");
1711         *out_len = 0;
1712         return "";
1713       }
1714       if (!found) {
1715         // found match for first time
1716         out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, max_length));
1717         if (out == nullptr) {
1718           gdv_fn_context_set_error_msg(context,
1719                                        "Could not allocate memory for output string");
1720           *out_len = 0;
1721           return "";
1722         }
1723         found = true;
1724       }
1725       // first copy the part deferred till now
1726       memcpy(out + out_index, text + last_match_index, (text_index - last_match_index));
1727       out_index += text_index - last_match_index;
1728       // then copy the target string
1729       memcpy(out + out_index, to_str, to_str_len);
1730       out_index += to_str_len;
1731
1732       text_index += from_str_len;
1733       last_match_index = text_index;
1734     } else {
1735       text_index++;
1736     }
1737   }
1738
1739   if (!found) {
1740     *out_len = text_len;
1741     return text;
1742   }
1743
1744   if (out_index + text_len - last_match_index > max_length) {
1745     gdv_fn_context_set_error_msg(context, "Buffer overflow for output string");
1746     *out_len = 0;
1747     return "";
1748   }
1749   memcpy(out + out_index, text + last_match_index, text_len - last_match_index);
1750   out_index += text_len - last_match_index;
1751   *out_len = out_index;
1752   return out;
1753 }
1754
1755 FORCE_INLINE
1756 const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
1757                                    gdv_int32 text_len, const char* from_str,
1758                                    gdv_int32 from_str_len, const char* to_str,
1759                                    gdv_int32 to_str_len, gdv_int32* out_len) {
1760   return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
1761                                              from_str_len, to_str, to_str_len, 65535,
1762                                              out_len);
1763 }
1764
1765 FORCE_INLINE
1766 const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
1767                                  gdv_int32 return_length, const char* fill_text,
1768                                  gdv_int32 fill_text_len, gdv_int32* out_len) {
1769   // if the text length or the defined return length (number of characters to return)
1770   // is <=0, then return an empty string.
1771   if (text_len == 0 || return_length <= 0) {
1772     *out_len = 0;
1773     return "";
1774   }
1775
1776   // count the number of utf8 characters on text, ignoring invalid bytes
1777   int text_char_count = utf8_length_ignore_invalid(text, text_len);
1778
1779   if (return_length == text_char_count ||
1780       (return_length > text_char_count && fill_text_len == 0)) {
1781     // case where the return length is same as the text's length, or if it need to
1782     // fill into text but "fill_text" is empty, then return text directly.
1783     *out_len = text_len;
1784     return text;
1785   } else if (return_length < text_char_count) {
1786     // case where it truncates the result on return length.
1787     *out_len = utf8_byte_pos(context, text, text_len, return_length);
1788     return text;
1789   } else {
1790     // case (return_length > text_char_count)
1791     // case where it needs to copy "fill_text" on the string left. The total number
1792     // of chars to copy is given by (return_length -  text_char_count)
1793     char* ret =
1794         reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, return_length));
1795     if (ret == nullptr) {
1796       gdv_fn_context_set_error_msg(context,
1797                                    "Could not allocate memory for output string");
1798       *out_len = 0;
1799       return "";
1800     }
1801     // try to fulfill the return string with the "fill_text" continuously
1802     int32_t copied_chars_count = 0;
1803     int32_t copied_chars_position = 0;
1804     while (copied_chars_count < return_length - text_char_count) {
1805       int32_t char_len;
1806       int32_t fill_index;
1807       // for each char, evaluate its length to consider it when mem copying
1808       for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len) {
1809         if (copied_chars_count >= return_length - text_char_count) {
1810           break;
1811         }
1812         char_len = utf8_char_length(fill_text[fill_index]);
1813         // ignore invalid char on the fill text, considering it as size 1
1814         if (char_len == 0) char_len += 1;
1815         copied_chars_count++;
1816       }
1817       memcpy(ret + copied_chars_position, fill_text, fill_index);
1818       copied_chars_position += fill_index;
1819     }
1820     // after fulfilling the text, copy the main string
1821     memcpy(ret + copied_chars_position, text, text_len);
1822     *out_len = copied_chars_position + text_len;
1823     return ret;
1824   }
1825 }
1826
1827 FORCE_INLINE
1828 const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
1829                                  gdv_int32 return_length, const char* fill_text,
1830                                  gdv_int32 fill_text_len, gdv_int32* out_len) {
1831   // if the text length or the defined return length (number of characters to return)
1832   // is <=0, then return an empty string.
1833   if (text_len == 0 || return_length <= 0) {
1834     *out_len = 0;
1835     return "";
1836   }
1837
1838   // count the number of utf8 characters on text, ignoring invalid bytes
1839   int text_char_count = utf8_length_ignore_invalid(text, text_len);
1840
1841   if (return_length == text_char_count ||
1842       (return_length > text_char_count && fill_text_len == 0)) {
1843     // case where the return length is same as the text's length, or if it need to
1844     // fill into text but "fill_text" is empty, then return text directly.
1845     *out_len = text_len;
1846     return text;
1847   } else if (return_length < text_char_count) {
1848     // case where it truncates the result on return length.
1849     *out_len = utf8_byte_pos(context, text, text_len, return_length);
1850     return text;
1851   } else {
1852     // case (return_length > text_char_count)
1853     // case where it needs to copy "fill_text" on the string right
1854     char* ret =
1855         reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, return_length));
1856     if (ret == nullptr) {
1857       gdv_fn_context_set_error_msg(context,
1858                                    "Could not allocate memory for output string");
1859       *out_len = 0;
1860       return "";
1861     }
1862     // fulfill the initial text copying the main input string
1863     memcpy(ret, text, text_len);
1864     // try to fulfill the return string with the "fill_text" continuously
1865     int32_t copied_chars_count = 0;
1866     int32_t copied_chars_position = 0;
1867     while (text_char_count + copied_chars_count < return_length) {
1868       int32_t char_len;
1869       int32_t fill_length;
1870       // for each char, evaluate its length to consider it when mem copying
1871       for (fill_length = 0; fill_length < fill_text_len; fill_length += char_len) {
1872         if (text_char_count + copied_chars_count >= return_length) {
1873           break;
1874         }
1875         char_len = utf8_char_length(fill_text[fill_length]);
1876         // ignore invalid char on the fill text, considering it as size 1
1877         if (char_len == 0) char_len += 1;
1878         copied_chars_count++;
1879       }
1880       memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);
1881       copied_chars_position += fill_length;
1882     }
1883     *out_len = copied_chars_position + text_len;
1884     return ret;
1885   }
1886 }
1887
1888 FORCE_INLINE
1889 const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
1890                             gdv_int32 return_length, gdv_int32* out_len) {
1891   return lpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len);
1892 }
1893
1894 FORCE_INLINE
1895 const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
1896                             gdv_int32 return_length, gdv_int32* out_len) {
1897   return rpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len);
1898 }
1899
1900 FORCE_INLINE
1901 const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
1902                        const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
1903                        gdv_int32* out_len) {
1904   *out_len = 0;
1905   if (index < 1) {
1906     char error_message[100];
1907     snprintf(error_message, sizeof(error_message),
1908              "Index in split_part must be positive, value provided was %d", index);
1909     gdv_fn_context_set_error_msg(context, error_message);
1910     return "";
1911   }
1912
1913   if (delim_len == 0 || text_len == 0) {
1914     // output will just be text if no delimiter is provided
1915     *out_len = text_len;
1916     return text;
1917   }
1918
1919   int i = 0, match_no = 1;
1920
1921   while (i < text_len) {
1922     // find the position where delimiter matched for the first time
1923     int match_pos = match_string(text, text_len, i, delimiter, delim_len);
1924     if (match_pos == -1 && match_no != index) {
1925       // reached the end without finding a match.
1926       return "";
1927     } else {
1928       // Found a match. If the match number is index then return this match
1929       if (match_no == index) {
1930         int end_pos = match_pos - delim_len;
1931
1932         if (match_pos == -1) {
1933           // end position should be last position of the string as we have the last
1934           // delimiter
1935           end_pos = text_len;
1936         }
1937
1938         *out_len = end_pos - i;
1939         char* out_str =
1940             reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1941         if (out_str == nullptr) {
1942           gdv_fn_context_set_error_msg(context,
1943                                        "Could not allocate memory for output string");
1944           *out_len = 0;
1945           return "";
1946         }
1947         memcpy(out_str, text + i, *out_len);
1948         return out_str;
1949       } else {
1950         i = match_pos;
1951         match_no++;
1952       }
1953     }
1954   }
1955
1956   return "";
1957 }
1958
1959 // Returns the x leftmost characters of a given string. Cases:
1960 //     LEFT("TestString", 10) => "TestString"
1961 //     LEFT("TestString", 3) => "Tes"
1962 //     LEFT("TestString", -3) => "TestStr"
1963 FORCE_INLINE
1964 const char* left_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
1965                             gdv_int32 number, gdv_int32* out_len) {
1966   // returns the 'number' left most characters of a given text
1967   if (text_len == 0 || number == 0) {
1968     *out_len = 0;
1969     return "";
1970   }
1971
1972   // iterate over the utf8 string validating each character
1973   int char_len;
1974   int char_count = 0;
1975   int byte_index = 0;
1976   for (int i = 0; i < text_len; i += char_len) {
1977     char_len = utf8_char_length(text[i]);
1978     if (char_len == 0 || i + char_len > text_len) {  // invalid byte or incomplete glyph
1979       set_error_for_invalid_utf(context, text[i]);
1980       *out_len = 0;
1981       return "";
1982     }
1983     for (int j = 1; j < char_len; ++j) {
1984       if ((text[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
1985         set_error_for_invalid_utf(context, text[i + j]);
1986         *out_len = 0;
1987         return "";
1988       }
1989     }
1990     byte_index += char_len;
1991     ++char_count;
1992     // Define the rules to stop the iteration over the string
1993     // case where left('abc', 5) -> 'abc'
1994     if (number > 0 && char_count == number) break;
1995     // case where left('abc', -5) ==> ''
1996     if (number < 0 && char_count == number + text_len) break;
1997   }
1998
1999   *out_len = byte_index;
2000   return text;
2001 }
2002
2003 // Returns the x rightmost characters of a given string. Cases:
2004 //     RIGHT("TestString", 10) => "TestString"
2005 //     RIGHT("TestString", 3) => "ing"
2006 //     RIGHT("TestString", -3) => "tString"
2007 FORCE_INLINE
2008 const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
2009                              gdv_int32 number, gdv_int32* out_len) {
2010   // returns the 'number' left most characters of a given text
2011   if (text_len == 0 || number == 0) {
2012     *out_len = 0;
2013     return "";
2014   }
2015
2016   // initially counts the number of utf8 characters in the defined text
2017   int32_t char_count = utf8_length(context, text, text_len);
2018   // char_count is zero if input has invalid utf8 char
2019   if (char_count == 0) {
2020     *out_len = 0;
2021     return "";
2022   }
2023
2024   int32_t start_char_pos;  // the char result start position (inclusive)
2025   int32_t end_char_len;    // the char result end position (inclusive)
2026   if (number > 0) {
2027     // case where right('abc', 5) ==> 'abc' start_char_pos=1.
2028     start_char_pos = (char_count > number) ? char_count - number : 0;
2029     end_char_len = char_count - start_char_pos;
2030   } else {
2031     start_char_pos = number * -1;
2032     end_char_len = char_count - start_char_pos;
2033   }
2034
2035   // calculate the start byte position and the output length
2036   int32_t start_byte_pos = utf8_byte_pos(context, text, text_len, start_char_pos);
2037   *out_len = utf8_byte_pos(context, text, text_len, end_char_len);
2038
2039   // try to allocate memory for the response
2040   char* ret =
2041       reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, *out_len));
2042   if (ret == nullptr) {
2043     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2044     *out_len = 0;
2045     return "";
2046   }
2047   memcpy(ret, text + start_byte_pos, *out_len);
2048   return ret;
2049 }
2050
2051 FORCE_INLINE
2052 const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_len,
2053                           gdv_int32* out_len) {
2054   gdv_binary ret =
2055       reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, text_len));
2056
2057   if (ret == nullptr) {
2058     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2059     *out_len = 0;
2060     return "";
2061   }
2062
2063   if (text_len == 0) {
2064     *out_len = 0;
2065     return "";
2066   }
2067
2068   // converting hex encoded string to normal string
2069   int j = 0;
2070   for (int i = 0; i < text_len; i++, j++) {
2071     if (text[i] == '\\' && i + 3 < text_len &&
2072         (text[i + 1] == 'x' || text[i + 1] == 'X')) {
2073       char hd1 = text[i + 2];
2074       char hd2 = text[i + 3];
2075       if (isxdigit(hd1) && isxdigit(hd2)) {
2076         // [a-fA-F0-9]
2077         ret[j] = to_binary_from_hex(hd1) * 16 + to_binary_from_hex(hd2);
2078         i += 3;
2079       } else {
2080         ret[j] = text[i];
2081       }
2082     } else {
2083       ret[j] = text[i];
2084     }
2085   }
2086   *out_len = j;
2087   return ret;
2088 }
2089
2090 #define CAST_INT_BIGINT_VARBINARY(OUT_TYPE, TYPE_NAME)                                 \
2091   FORCE_INLINE                                                                         \
2092   OUT_TYPE                                                                             \
2093   cast##TYPE_NAME##_varbinary(gdv_int64 context, const char* in, int32_t in_len) {     \
2094     if (in_len == 0) {                                                                 \
2095       gdv_fn_context_set_error_msg(context, "Can't cast an empty string.");            \
2096       return -1;                                                                       \
2097     }                                                                                  \
2098     char sign = in[0];                                                                 \
2099                                                                                        \
2100     bool negative = false;                                                             \
2101     if (sign == '-') {                                                                 \
2102       negative = true;                                                                 \
2103       /* Ignores the sign char in the hexadecimal string */                            \
2104       in++;                                                                            \
2105       in_len--;                                                                        \
2106     }                                                                                  \
2107                                                                                        \
2108     if (negative && in_len == 0) {                                                     \
2109       gdv_fn_context_set_error_msg(context,                                            \
2110                                    "Can't cast hexadecimal with only a minus sign.");  \
2111       return -1;                                                                       \
2112     }                                                                                  \
2113                                                                                        \
2114     OUT_TYPE result = 0;                                                               \
2115     int digit;                                                                         \
2116                                                                                        \
2117     int read_index = 0;                                                                \
2118     while (read_index < in_len) {                                                      \
2119       char c1 = in[read_index];                                                        \
2120       if (isxdigit(c1)) {                                                              \
2121         digit = to_binary_from_hex(c1);                                                \
2122                                                                                        \
2123         OUT_TYPE next = result * 16 - digit;                                           \
2124                                                                                        \
2125         if (next > result) {                                                           \
2126           gdv_fn_context_set_error_msg(context, "Integer overflow.");                  \
2127           return -1;                                                                   \
2128         }                                                                              \
2129         result = next;                                                                 \
2130         read_index++;                                                                  \
2131       } else {                                                                         \
2132         gdv_fn_context_set_error_msg(context,                                          \
2133                                      "The hexadecimal given has invalid characters."); \
2134         return -1;                                                                     \
2135       }                                                                                \
2136     }                                                                                  \
2137     if (!negative) {                                                                   \
2138       result *= -1;                                                                    \
2139                                                                                        \
2140       if (result < 0) {                                                                \
2141         gdv_fn_context_set_error_msg(context, "Integer overflow.");                    \
2142         return -1;                                                                     \
2143       }                                                                                \
2144     }                                                                                  \
2145     return result;                                                                     \
2146   }
2147
2148 CAST_INT_BIGINT_VARBINARY(int32_t, INT)
2149 CAST_INT_BIGINT_VARBINARY(int64_t, BIGINT)
2150
2151 #undef CAST_INT_BIGINT_VARBINARY
2152
2153 // Produces the binary representation of a string y characters long derived by starting
2154 // at offset 'x' and considering the defined length 'y'. Notice that the offset index
2155 // may be a negative number (starting from the end of the string), or a positive number
2156 // starting on index 1. Cases:
2157 //     BYTE_SUBSTR("TestString", 1, 10) => "TestString"
2158 //     BYTE_SUBSTR("TestString", 5, 10) => "String"
2159 //     BYTE_SUBSTR("TestString", -6, 10) => "String"
2160 //     BYTE_SUBSTR("TestString", -600, 10) => "TestString"
2161 FORCE_INLINE
2162 const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
2163                                            gdv_int32 text_len, gdv_int32 offset,
2164                                            gdv_int32 length, gdv_int32* out_len) {
2165   // the first offset position for a string is 1, so not consider offset == 0
2166   // also, the length should be always a positive number
2167   if (text_len == 0 || offset == 0 || length <= 0) {
2168     *out_len = 0;
2169     return "";
2170   }
2171
2172   char* ret =
2173       reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, text_len));
2174
2175   if (ret == nullptr) {
2176     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2177     *out_len = 0;
2178     return "";
2179   }
2180
2181   int32_t startPos = 0;
2182   if (offset >= 0) {
2183     startPos = offset - 1;
2184   } else if (text_len + offset >= 0) {
2185     startPos = text_len + offset;
2186   }
2187
2188   // calculate end position from length and truncate to upper value bounds
2189   if (startPos + length > text_len) {
2190     *out_len = text_len - startPos;
2191   } else {
2192     *out_len = length;
2193   }
2194
2195   memcpy(ret, text + startPos, *out_len);
2196   return ret;
2197 }
2198 }  // extern "C"