]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/gandiva/precompiled/string_ops.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / gandiva / precompiled / string_ops.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 // String functions
19 #include "arrow/util/value_parsing.h"
20
21 extern "C" {
22
23 #include <algorithm>
24 #include <climits>
25 #include <cstdio>
26 #include <cstdlib>
27 #include <cstring>
28
29 #include "./types.h"
30
31 FORCE_INLINE
32 gdv_int32 octet_length_utf8(const gdv_utf8 input, gdv_int32 length) { return length; }
33
34 FORCE_INLINE
35 gdv_int32 bit_length_utf8(const gdv_utf8 input, gdv_int32 length) { return length * 8; }
36
37 FORCE_INLINE
38 gdv_int32 octet_length_binary(const gdv_binary input, gdv_int32 length) { return length; }
39
40 FORCE_INLINE
41 gdv_int32 bit_length_binary(const gdv_binary input, gdv_int32 length) {
42 return length * 8;
43 }
44
45 FORCE_INLINE
46 int match_string(const char* input, gdv_int32 input_len, gdv_int32 start_pos,
47 const char* delim, gdv_int32 delim_len) {
48 for (int i = start_pos; i < input_len; i++) {
49 int left_chars = input_len - i;
50 if ((left_chars >= delim_len) && memcmp(input + i, delim, delim_len) == 0) {
51 return i + delim_len;
52 }
53 }
54
55 return -1;
56 }
57
58 FORCE_INLINE
59 gdv_int32 mem_compare(const char* left, gdv_int32 left_len, const char* right,
60 gdv_int32 right_len) {
61 int min = left_len;
62 if (right_len < min) {
63 min = right_len;
64 }
65
66 int cmp_ret = memcmp(left, right, min);
67 if (cmp_ret != 0) {
68 return cmp_ret;
69 } else {
70 return left_len - right_len;
71 }
72 }
73
74 // Expand inner macro for all varlen types.
75 #define VAR_LEN_OP_TYPES(INNER, NAME, OP) \
76 INNER(NAME, utf8, OP) \
77 INNER(NAME, binary, OP)
78
79 // Relational binary fns : left, right params are same, return is bool.
80 #define BINARY_RELATIONAL(NAME, TYPE, OP) \
81 FORCE_INLINE \
82 bool NAME##_##TYPE##_##TYPE(const gdv_##TYPE left, gdv_int32 left_len, \
83 const gdv_##TYPE right, gdv_int32 right_len) { \
84 return mem_compare(left, left_len, right, right_len) OP 0; \
85 }
86
87 VAR_LEN_OP_TYPES(BINARY_RELATIONAL, equal, ==)
88 VAR_LEN_OP_TYPES(BINARY_RELATIONAL, not_equal, !=)
89 VAR_LEN_OP_TYPES(BINARY_RELATIONAL, less_than, <)
90 VAR_LEN_OP_TYPES(BINARY_RELATIONAL, less_than_or_equal_to, <=)
91 VAR_LEN_OP_TYPES(BINARY_RELATIONAL, greater_than, >)
92 VAR_LEN_OP_TYPES(BINARY_RELATIONAL, greater_than_or_equal_to, >=)
93
94 #undef BINARY_RELATIONAL
95 #undef VAR_LEN_OP_TYPES
96
// Invokes the two-argument macro EXPAND once for each variable-length type
// (utf8 and binary), forwarding FN_NAME.
#define VAR_LEN_TYPES(EXPAND, FN_NAME) \
  EXPAND(FN_NAME, utf8)                \
  EXPAND(FN_NAME, binary)
101
102 FORCE_INLINE
103 int to_binary_from_hex(char ch) {
104 if (ch >= 'A' && ch <= 'F') {
105 return 10 + (ch - 'A');
106 } else if (ch >= 'a' && ch <= 'f') {
107 return 10 + (ch - 'a');
108 }
109 return ch - '0';
110 }
111
112 FORCE_INLINE
113 bool starts_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* prefix,
114 gdv_int32 prefix_len) {
115 return ((data_len >= prefix_len) && (memcmp(data, prefix, prefix_len) == 0));
116 }
117
118 FORCE_INLINE
119 bool ends_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* suffix,
120 gdv_int32 suffix_len) {
121 return ((data_len >= suffix_len) &&
122 (memcmp(data + data_len - suffix_len, suffix, suffix_len) == 0));
123 }
124
125 FORCE_INLINE
126 bool is_substr_utf8_utf8(const char* data, int32_t data_len, const char* substr,
127 int32_t substr_len) {
128 for (int32_t i = 0; i <= data_len - substr_len; ++i) {
129 if (memcmp(data + i, substr, substr_len) == 0) {
130 return true;
131 }
132 }
133 return false;
134 }
135
136 FORCE_INLINE
137 gdv_int32 utf8_char_length(char c) {
138 if ((signed char)c >= 0) { // 1-byte char (0x00 ~ 0x7F)
139 return 1;
140 } else if ((c & 0xE0) == 0xC0) { // 2-byte char
141 return 2;
142 } else if ((c & 0xF0) == 0xE0) { // 3-byte char
143 return 3;
144 } else if ((c & 0xF8) == 0xF0) { // 4-byte char
145 return 4;
146 }
147 // invalid char
148 return 0;
149 }
150
151 FORCE_INLINE
152 void set_error_for_invalid_utf(int64_t execution_context, char val) {
153 char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
154 int size = static_cast<int>(strlen(fmt)) + 64;
155 char* error = reinterpret_cast<char*>(malloc(size));
156 snprintf(error, size, fmt, (unsigned char)val);
157 gdv_fn_context_set_error_msg(execution_context, error);
158 free(error);
159 }
160
161 FORCE_INLINE
162 bool validate_utf8_following_bytes(const char* data, int32_t data_len,
163 int32_t char_index) {
164 for (int j = 1; j < data_len; ++j) {
165 if ((data[char_index + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
166 return false;
167 }
168 }
169 return true;
170 }
171
172 // Count the number of utf8 characters
173 // return 0 for invalid/incomplete input byte sequences
174 FORCE_INLINE
175 gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
176 int char_len = 0;
177 int count = 0;
178 for (int i = 0; i < data_len; i += char_len) {
179 char_len = utf8_char_length(data[i]);
180 if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph
181 set_error_for_invalid_utf(context, data[i]);
182 return 0;
183 }
184 for (int j = 1; j < char_len; ++j) {
185 if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
186 set_error_for_invalid_utf(context, data[i + j]);
187 return 0;
188 }
189 }
190 ++count;
191 }
192 return count;
193 }
194
195 // Count the number of utf8 characters, ignoring invalid char, considering size 1
196 FORCE_INLINE
197 gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
198 int char_len = 0;
199 int count = 0;
200 for (int i = 0; i < data_len; i += char_len) {
201 char_len = utf8_char_length(data[i]);
202 if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph
203 // if invalid byte or incomplete glyph, ignore it
204 char_len = 1;
205 }
206 for (int j = 1; j < char_len; ++j) {
207 if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
208 char_len += 1;
209 }
210 }
211 ++count;
212 }
213 return count;
214 }
215
216 // Get the byte position corresponding to a character position for a non-empty utf8
217 // sequence
218 FORCE_INLINE
219 gdv_int32 utf8_byte_pos(gdv_int64 context, const char* str, gdv_int32 str_len,
220 gdv_int32 char_pos) {
221 int char_len = 0;
222 int byte_index = 0;
223 for (gdv_int32 char_index = 0; char_index < char_pos && byte_index < str_len;
224 char_index++) {
225 char_len = utf8_char_length(str[byte_index]);
226 if (char_len == 0 ||
227 byte_index + char_len > str_len) { // invalid byte or incomplete glyph
228 set_error_for_invalid_utf(context, str[byte_index]);
229 return -1;
230 }
231 byte_index += char_len;
232 }
233 return byte_index;
234 }
235
236 #define UTF8_LENGTH(NAME, TYPE) \
237 FORCE_INLINE \
238 gdv_int32 NAME##_##TYPE(gdv_int64 context, gdv_##TYPE in, gdv_int32 in_len) { \
239 return utf8_length(context, in, in_len); \
240 }
241
242 UTF8_LENGTH(char_length, utf8)
243 UTF8_LENGTH(length, utf8)
244 UTF8_LENGTH(lengthUtf8, binary)
245
246 // Returns a string of 'n' spaces.
247 #define SPACE_STR(IN_TYPE) \
248 GANDIVA_EXPORT \
249 const char* space_##IN_TYPE(gdv_int64 ctx, gdv_##IN_TYPE n, int32_t* out_len) { \
250 gdv_int32 n_times = static_cast<gdv_int32>(n); \
251 if (n_times <= 0) { \
252 *out_len = 0; \
253 return ""; \
254 } \
255 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ctx, n_times)); \
256 if (ret == nullptr) { \
257 gdv_fn_context_set_error_msg(ctx, "Could not allocate memory for output string"); \
258 *out_len = 0; \
259 return ""; \
260 } \
261 for (int i = 0; i < n_times; i++) { \
262 ret[i] = ' '; \
263 } \
264 *out_len = n_times; \
265 return ret; \
266 }
267
268 SPACE_STR(int32)
269 SPACE_STR(int64)
270
271 // Reverse a utf8 sequence
272 FORCE_INLINE
273 const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
274 int32_t* out_len) {
275 if (data_len == 0) {
276 *out_len = 0;
277 return "";
278 }
279
280 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len));
281 if (ret == nullptr) {
282 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
283 *out_len = 0;
284 return "";
285 }
286
287 gdv_int32 char_len;
288 for (gdv_int32 i = 0; i < data_len; i += char_len) {
289 char_len = utf8_char_length(data[i]);
290
291 if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph
292 set_error_for_invalid_utf(context, data[i]);
293 *out_len = 0;
294 return "";
295 }
296
297 for (gdv_int32 j = 0; j < char_len; ++j) {
298 if (j > 0 && (data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
299 set_error_for_invalid_utf(context, data[i + j]);
300 *out_len = 0;
301 return "";
302 }
303 ret[data_len - i - char_len + j] = data[i + j];
304 }
305 }
306 *out_len = data_len;
307 return ret;
308 }
309
310 // Trims whitespaces from the left end of the input utf8 sequence
311 FORCE_INLINE
312 const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
313 int32_t* out_len) {
314 if (data_len == 0) {
315 *out_len = 0;
316 return "";
317 }
318
319 gdv_int32 start = 0;
320 // start denotes the first position of non-space characters in the input string
321 while (start < data_len && data[start] == ' ') {
322 ++start;
323 }
324
325 *out_len = data_len - start;
326 return data + start;
327 }
328
329 // Trims whitespaces from the right end of the input utf8 sequence
330 FORCE_INLINE
331 const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
332 int32_t* out_len) {
333 if (data_len == 0) {
334 *out_len = 0;
335 return "";
336 }
337
338 gdv_int32 end = data_len - 1;
339 // end denotes the last position of non-space characters in the input string
340 while (end >= 0 && data[end] == ' ') {
341 --end;
342 }
343
344 *out_len = end + 1;
345 return data;
346 }
347
348 // Trims whitespaces from both the ends of the input utf8 sequence
349 FORCE_INLINE
350 const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
351 int32_t* out_len) {
352 if (data_len == 0) {
353 *out_len = 0;
354 return "";
355 }
356
357 gdv_int32 start = 0, end = data_len - 1;
358 // start and end denote the first and last positions of non-space
359 // characters in the input string respectively
360 while (start <= end && data[start] == ' ') {
361 ++start;
362 }
363 while (end >= start && data[end] == ' ') {
364 --end;
365 }
366
367 // string has some leading/trailing spaces and some non-space characters
368 *out_len = end - start + 1;
369 return data + start;
370 }
371
372 // Trims characters present in the trim text from the left end of the base text
373 FORCE_INLINE
374 const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
375 gdv_int32 basetext_len, const char* trimtext,
376 gdv_int32 trimtext_len, int32_t* out_len) {
377 if (basetext_len == 0) {
378 *out_len = 0;
379 return "";
380 } else if (trimtext_len == 0) {
381 *out_len = basetext_len;
382 return basetext;
383 }
384
385 gdv_int32 start_ptr, char_len;
386 // scan the base text from left to right and increment the start pointer till
387 // there is a character which is not present in the trim text
388 for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
389 char_len = utf8_char_length(basetext[start_ptr]);
390 if (char_len == 0 || start_ptr + char_len > basetext_len) {
391 // invalid byte or incomplete glyph
392 set_error_for_invalid_utf(context, basetext[start_ptr]);
393 *out_len = 0;
394 return "";
395 }
396 if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
397 break;
398 }
399 }
400
401 *out_len = basetext_len - start_ptr;
402 return basetext + start_ptr;
403 }
404
405 // Trims characters present in the trim text from the right end of the base text
406 FORCE_INLINE
407 const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
408 gdv_int32 basetext_len, const char* trimtext,
409 gdv_int32 trimtext_len, int32_t* out_len) {
410 if (basetext_len == 0) {
411 *out_len = 0;
412 return "";
413 } else if (trimtext_len == 0) {
414 *out_len = basetext_len;
415 return basetext;
416 }
417
418 gdv_int32 char_len, end_ptr, byte_cnt = 1;
419 // scan the base text from right to left and decrement the end pointer till
420 // there is a character which is not present in the trim text
421 for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) {
422 char_len = utf8_char_length(basetext[end_ptr]);
423 if (char_len == 0) { // trailing bytes of multibyte character
424 ++byte_cnt;
425 continue;
426 }
427 // this is the first byte of a character, hence check if char_len = char_cnt
428 if (byte_cnt != char_len) { // invalid byte or incomplete glyph
429 set_error_for_invalid_utf(context, basetext[end_ptr]);
430 *out_len = 0;
431 return "";
432 }
433 byte_cnt = 1; // reset the counter*/
434 if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
435 break;
436 }
437 }
438
439 // when all characters in the basetext are part of the trimtext
440 if (end_ptr == -1) {
441 *out_len = 0;
442 return "";
443 }
444
445 end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character
446 *out_len = end_ptr;
447 return basetext;
448 }
449
450 // Trims characters present in the trim text from both ends of the base text
451 FORCE_INLINE
452 const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
453 gdv_int32 basetext_len, const char* trimtext,
454 gdv_int32 trimtext_len, int32_t* out_len) {
455 if (basetext_len == 0) {
456 *out_len = 0;
457 return "";
458 } else if (trimtext_len == 0) {
459 *out_len = basetext_len;
460 return basetext;
461 }
462
463 gdv_int32 start_ptr, end_ptr, char_len, byte_cnt = 1;
464 // scan the base text from left to right and increment the start and decrement the
465 // end pointers till there are characters which are not present in the trim text
466 for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
467 char_len = utf8_char_length(basetext[start_ptr]);
468 if (char_len == 0 || start_ptr + char_len > basetext_len) {
469 // invalid byte or incomplete glyph
470 set_error_for_invalid_utf(context, basetext[start_ptr]);
471 *out_len = 0;
472 return "";
473 }
474 if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
475 break;
476 }
477 }
478 for (end_ptr = basetext_len - 1; end_ptr >= start_ptr; --end_ptr) {
479 char_len = utf8_char_length(basetext[end_ptr]);
480 if (char_len == 0) { // trailing byte in multibyte character
481 ++byte_cnt;
482 continue;
483 }
484 // this is the first byte of a character, hence check if char_len = char_cnt
485 if (byte_cnt != char_len) { // invalid byte or incomplete glyph
486 set_error_for_invalid_utf(context, basetext[end_ptr]);
487 *out_len = 0;
488 return "";
489 }
490 byte_cnt = 1; // reset the counter*/
491 if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
492 break;
493 }
494 }
495
496 // when all characters are trimmed, start_ptr has been incremented to basetext_len and
497 // end_ptr still points to basetext_len - 1, hence we need to handle this case
498 if (start_ptr > end_ptr) {
499 *out_len = 0;
500 return "";
501 }
502
503 end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character
504 *out_len = end_ptr - start_ptr;
505 return basetext + start_ptr;
506 }
507
508 FORCE_INLINE
509 gdv_boolean compare_lower_strings(const char* base_str, gdv_int32 base_str_len,
510 const char* str, gdv_int32 str_len) {
511 if (base_str_len != str_len) {
512 return false;
513 }
514 for (int i = 0; i < str_len; i++) {
515 // convert char to lower
516 char cur = str[i];
517 // 'A' - 'Z' : 0x41 - 0x5a
518 // 'a' - 'z' : 0x61 - 0x7a
519 if (cur >= 0x41 && cur <= 0x5a) {
520 cur = static_cast<char>(cur + 0x20);
521 }
522 // if the character does not match, break the flow
523 if (cur != base_str[i]) break;
524 // if the character matches and it is the last iteration, return true
525 if (i == str_len - 1) return true;
526 }
527 return false;
528 }
529
530 // Try to cast the received string ('0', '1', 'true', 'false'), ignoring leading
531 // and trailing spaces, also ignoring lower and upper case.
532 FORCE_INLINE
533 gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len) {
534 if (data_len <= 0) {
535 gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");
536 return false;
537 }
538
539 // trim leading and trailing spaces
540 int32_t trimmed_len;
541 int32_t start = 0, end = data_len - 1;
542 while (start <= end && data[start] == ' ') {
543 ++start;
544 }
545 while (end >= start && data[end] == ' ') {
546 --end;
547 }
548 trimmed_len = end - start + 1;
549 const char* trimmed_data = data + start;
550
551 // compare received string with the valid bool string values '1', '0', 'true', 'false'
552 if (trimmed_len == 1) {
553 // case for '0' and '1' value
554 if (trimmed_data[0] == '1') return true;
555 if (trimmed_data[0] == '0') return false;
556 } else if (trimmed_len == 4) {
557 // case for matching 'true'
558 if (compare_lower_strings("true", 4, trimmed_data, trimmed_len)) return true;
559 } else if (trimmed_len == 5) {
560 // case for matching 'false'
561 if (compare_lower_strings("false", 5, trimmed_data, trimmed_len)) return false;
562 }
563 // if no 'true', 'false', '0' or '1' value is found, set an error
564 gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");
565 return false;
566 }
567
568 FORCE_INLINE
569 const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value,
570 gdv_int64 out_len, gdv_int32* out_length) {
571 gdv_int32 len = static_cast<gdv_int32>(out_len);
572 if (len < 0) {
573 gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative");
574 *out_length = 0;
575 return "";
576 }
577 const char* out =
578 reinterpret_cast<const char*>(gdv_fn_context_arena_malloc(context, 5));
579 out = value ? "true" : "false";
580 *out_length = value ? ((len > 4) ? 4 : len) : ((len > 5) ? 5 : len);
581 return out;
582 }
583
584 // Truncates the string to given length
585 #define CAST_VARCHAR_FROM_VARLEN_TYPE(TYPE) \
586 FORCE_INLINE \
587 const char* castVARCHAR_##TYPE##_int64(gdv_int64 context, const char* data, \
588 gdv_int32 data_len, int64_t out_len, \
589 int32_t* out_length) { \
590 int32_t len = static_cast<int32_t>(out_len); \
591 \
592 if (len < 0) { \
593 gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \
594 *out_length = 0; \
595 return ""; \
596 } \
597 \
598 if (len >= data_len || len == 0) { \
599 *out_length = data_len; \
600 return data; \
601 } \
602 \
603 int32_t remaining = len; \
604 int32_t index = 0; \
605 bool is_multibyte = false; \
606 do { \
607 /* In utf8, MSB of a single byte unicode char is always 0, \
608 * whereas for a multibyte character the MSB of each byte is 1. \
609 * So for a single byte char, a bitwise-and with x80 (10000000) will be 0 \
610 * and it won't be 0 for bytes of a multibyte char. \
611 */ \
612 char* data_ptr = const_cast<char*>(data); \
613 \
614 /* advance byte by byte till the 8-byte boundary then advance 8 bytes */ \
615 auto num_bytes = reinterpret_cast<uintptr_t>(data_ptr) & 0x07; \
616 num_bytes = (8 - num_bytes) & 0x07; \
617 while (num_bytes > 0) { \
618 uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); \
619 if ((*ptr & 0x80) != 0) { \
620 is_multibyte = true; \
621 break; \
622 } \
623 index++; \
624 remaining--; \
625 num_bytes--; \
626 } \
627 if (is_multibyte) break; \
628 while (remaining >= 8) { \
629 uint64_t* ptr = reinterpret_cast<uint64_t*>(data_ptr + index); \
630 if ((*ptr & 0x8080808080808080) != 0) { \
631 is_multibyte = true; \
632 break; \
633 } \
634 index += 8; \
635 remaining -= 8; \
636 } \
637 if (is_multibyte) break; \
638 if (remaining >= 4) { \
639 uint32_t* ptr = reinterpret_cast<uint32_t*>(data_ptr + index); \
640 if ((*ptr & 0x80808080) != 0) break; \
641 index += 4; \
642 remaining -= 4; \
643 } \
644 while (remaining > 0) { \
645 uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); \
646 if ((*ptr & 0x80) != 0) { \
647 is_multibyte = true; \
648 break; \
649 } \
650 index++; \
651 remaining--; \
652 } \
653 if (is_multibyte) break; \
654 /* reached here; all are single byte characters */ \
655 *out_length = len; \
656 return data; \
657 } while (false); \
658 \
659 /* detected multibyte utf8 characters; slow path */ \
660 int32_t byte_pos = \
661 utf8_byte_pos(context, data + index, data_len - index, len - index); \
662 if (byte_pos < 0) { \
663 *out_length = 0; \
664 return ""; \
665 } \
666 \
667 *out_length = index + byte_pos; \
668 return data; \
669 }
670
671 CAST_VARCHAR_FROM_VARLEN_TYPE(utf8)
672 CAST_VARCHAR_FROM_VARLEN_TYPE(binary)
673
674 #undef CAST_VARCHAR_FROM_VARLEN_TYPE
675
676 // Add functions for castVARBINARY
677 #define CAST_VARBINARY_FROM_STRING_AND_BINARY(TYPE) \
678 GANDIVA_EXPORT \
679 const char* castVARBINARY_##TYPE##_int64(gdv_int64 context, const char* data, \
680 gdv_int32 data_len, int64_t out_len, \
681 int32_t* out_length) { \
682 int32_t len = static_cast<int32_t>(out_len); \
683 if (len < 0) { \
684 gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \
685 *out_length = 0; \
686 return ""; \
687 } \
688 \
689 if (len >= data_len || len == 0) { \
690 *out_length = data_len; \
691 } else { \
692 *out_length = len; \
693 } \
694 return data; \
695 }
696
697 CAST_VARBINARY_FROM_STRING_AND_BINARY(utf8)
698 CAST_VARBINARY_FROM_STRING_AND_BINARY(binary)
699
700 #undef CAST_VARBINARY_FROM_STRING_AND_BINARY
701
702 #define IS_NULL(NAME, TYPE) \
703 FORCE_INLINE \
704 bool NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
705 return !is_valid; \
706 }
707
708 VAR_LEN_TYPES(IS_NULL, isnull)
709
710 #undef IS_NULL
711
712 #define IS_NOT_NULL(NAME, TYPE) \
713 FORCE_INLINE \
714 bool NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
715 return is_valid; \
716 }
717
718 VAR_LEN_TYPES(IS_NOT_NULL, isnotnull)
719
720 #undef IS_NOT_NULL
721 #undef VAR_LEN_TYPES
722
723 /*
724 We follow Oracle semantics for offset:
725 - If position is positive, then the first glyph in the substring is determined by
726 counting that many glyphs forward from the beginning of the input. (i.e., for position ==
727 1 the first glyph in the substring will be identical to the first glyph in the input)
728
729 - If position is negative, then the first glyph in the substring is determined by
730 counting that many glyphs backward from the end of the input. (i.e., for position == -1
731 the first glyph in the substring will be identical to the last glyph in the input)
732
733 - If position is 0 then it is treated as 1.
734 */
735 FORCE_INLINE
736 const char* substr_utf8_int64_int64(gdv_int64 context, const char* input,
737 gdv_int32 in_data_len, gdv_int64 position,
738 gdv_int64 substring_length, gdv_int32* out_data_len) {
739 if (substring_length <= 0 || input == nullptr || in_data_len <= 0) {
740 *out_data_len = 0;
741 return "";
742 }
743
744 gdv_int64 in_glyphs_count =
745 static_cast<gdv_int64>(utf8_length(context, input, in_data_len));
746
747 // in_glyphs_count is zero if input has invalid glyphs
748 if (in_glyphs_count == 0) {
749 *out_data_len = 0;
750 return "";
751 }
752
753 gdv_int64 from_glyph; // from_glyph==0 indicates the first glyph of the input
754 if (position > 0) {
755 from_glyph = position - 1;
756 } else if (position < 0) {
757 from_glyph = in_glyphs_count + position;
758 } else {
759 from_glyph = 0;
760 }
761
762 if (from_glyph < 0 || from_glyph >= in_glyphs_count) {
763 *out_data_len = 0;
764 return "";
765 }
766
767 gdv_int64 out_glyphs_count = substring_length;
768 if (substring_length > in_glyphs_count - from_glyph) {
769 out_glyphs_count = in_glyphs_count - from_glyph;
770 }
771
772 gdv_int64 in_data_len64 = static_cast<gdv_int64>(in_data_len);
773 gdv_int64 start_pos = 0;
774 gdv_int64 end_pos = in_data_len64;
775
776 gdv_int64 current_glyph = 0;
777 gdv_int64 pos = 0;
778 while (pos < in_data_len64) {
779 if (current_glyph == from_glyph) {
780 start_pos = pos;
781 }
782 pos += static_cast<gdv_int64>(utf8_char_length(input[pos]));
783 if (current_glyph - from_glyph + 1 == out_glyphs_count) {
784 end_pos = pos;
785 }
786 current_glyph++;
787 }
788
789 if (end_pos > in_data_len64 || end_pos > INT_MAX) {
790 end_pos = in_data_len64;
791 }
792
793 *out_data_len = static_cast<gdv_int32>(end_pos - start_pos);
794 char* ret =
795 reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_data_len));
796 if (ret == nullptr) {
797 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
798 *out_data_len = 0;
799 return "";
800 }
801 memcpy(ret, input + start_pos, *out_data_len);
802 return ret;
803 }
804
805 FORCE_INLINE
806 const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in_len,
807 gdv_int64 offset64, gdv_int32* out_len) {
808 return substr_utf8_int64_int64(context, input, in_len, offset64, in_len, out_len);
809 }
810
811 FORCE_INLINE
812 const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len,
813 gdv_int32 repeat_number, gdv_int32* out_len) {
814 // if the repeat number is zero, then return empty string
815 if (repeat_number == 0 || in_len <= 0) {
816 *out_len = 0;
817 return "";
818 }
819 // if the repeat number is a negative number, an error is set on context
820 if (repeat_number < 0) {
821 gdv_fn_context_set_error_msg(context, "Repeat number can't be negative");
822 *out_len = 0;
823 return "";
824 }
825 *out_len = repeat_number * in_len;
826 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
827 if (ret == nullptr) {
828 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
829 *out_len = 0;
830 return "";
831 }
832 for (int i = 0; i < repeat_number; ++i) {
833 memcpy(ret + (i * in_len), in, in_len);
834 }
835 return ret;
836 }
837
838 FORCE_INLINE
839 const char* concat_utf8_utf8(gdv_int64 context, const char* left, gdv_int32 left_len,
840 bool left_validity, const char* right, gdv_int32 right_len,
841 bool right_validity, gdv_int32* out_len) {
842 if (!left_validity) {
843 left_len = 0;
844 }
845 if (!right_validity) {
846 right_len = 0;
847 }
848 return concatOperator_utf8_utf8(context, left, left_len, right, right_len, out_len);
849 }
850
851 FORCE_INLINE
852 const char* concatOperator_utf8_utf8(gdv_int64 context, const char* left,
853 gdv_int32 left_len, const char* right,
854 gdv_int32 right_len, gdv_int32* out_len) {
855 *out_len = left_len + right_len;
856 if (*out_len <= 0) {
857 *out_len = 0;
858 return "";
859 }
860 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
861 if (ret == nullptr) {
862 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
863 *out_len = 0;
864 return "";
865 }
866 memcpy(ret, left, left_len);
867 memcpy(ret + left_len, right, right_len);
868 return ret;
869 }
870
871 FORCE_INLINE
872 const char* concat_utf8_utf8_utf8(gdv_int64 context, const char* in1, gdv_int32 in1_len,
873 bool in1_validity, const char* in2, gdv_int32 in2_len,
874 bool in2_validity, const char* in3, gdv_int32 in3_len,
875 bool in3_validity, gdv_int32* out_len) {
876 if (!in1_validity) {
877 in1_len = 0;
878 }
879 if (!in2_validity) {
880 in2_len = 0;
881 }
882 if (!in3_validity) {
883 in3_len = 0;
884 }
885 return concatOperator_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3, in3_len,
886 out_len);
887 }
888
889 FORCE_INLINE
890 const char* concatOperator_utf8_utf8_utf8(gdv_int64 context, const char* in1,
891 gdv_int32 in1_len, const char* in2,
892 gdv_int32 in2_len, const char* in3,
893 gdv_int32 in3_len, gdv_int32* out_len) {
894 *out_len = in1_len + in2_len + in3_len;
895 if (*out_len <= 0) {
896 *out_len = 0;
897 return "";
898 }
899 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
900 if (ret == nullptr) {
901 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
902 *out_len = 0;
903 return "";
904 }
905 memcpy(ret, in1, in1_len);
906 memcpy(ret + in1_len, in2, in2_len);
907 memcpy(ret + in1_len + in2_len, in3, in3_len);
908 return ret;
909 }
910
911 FORCE_INLINE
912 const char* concat_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
913 gdv_int32 in1_len, bool in1_validity,
914 const char* in2, gdv_int32 in2_len,
915 bool in2_validity, const char* in3,
916 gdv_int32 in3_len, bool in3_validity,
917 const char* in4, gdv_int32 in4_len,
918 bool in4_validity, gdv_int32* out_len) {
919 if (!in1_validity) {
920 in1_len = 0;
921 }
922 if (!in2_validity) {
923 in2_len = 0;
924 }
925 if (!in3_validity) {
926 in3_len = 0;
927 }
928 if (!in4_validity) {
929 in4_len = 0;
930 }
931 return concatOperator_utf8_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3,
932 in3_len, in4, in4_len, out_len);
933 }
934
935 FORCE_INLINE
936 const char* concatOperator_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
937 gdv_int32 in1_len, const char* in2,
938 gdv_int32 in2_len, const char* in3,
939 gdv_int32 in3_len, const char* in4,
940 gdv_int32 in4_len, gdv_int32* out_len) {
941 *out_len = in1_len + in2_len + in3_len + in4_len;
942 if (*out_len <= 0) {
943 *out_len = 0;
944 return "";
945 }
946 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
947 if (ret == nullptr) {
948 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
949 *out_len = 0;
950 return "";
951 }
952 memcpy(ret, in1, in1_len);
953 memcpy(ret + in1_len, in2, in2_len);
954 memcpy(ret + in1_len + in2_len, in3, in3_len);
955 memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
956 return ret;
957 }
958
959 FORCE_INLINE
960 const char* concat_utf8_utf8_utf8_utf8_utf8(
961 gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
962 const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
963 gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
964 bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
965 gdv_int32* out_len) {
966 if (!in1_validity) {
967 in1_len = 0;
968 }
969 if (!in2_validity) {
970 in2_len = 0;
971 }
972 if (!in3_validity) {
973 in3_len = 0;
974 }
975 if (!in4_validity) {
976 in4_len = 0;
977 }
978 if (!in5_validity) {
979 in5_len = 0;
980 }
981 return concatOperator_utf8_utf8_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3,
982 in3_len, in4, in4_len, in5, in5_len,
983 out_len);
984 }
985
986 FORCE_INLINE
987 const char* concatOperator_utf8_utf8_utf8_utf8_utf8(
988 gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
989 gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
990 gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, gdv_int32* out_len) {
991 *out_len = in1_len + in2_len + in3_len + in4_len + in5_len;
992 if (*out_len <= 0) {
993 *out_len = 0;
994 return "";
995 }
996 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
997 if (ret == nullptr) {
998 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
999 *out_len = 0;
1000 return "";
1001 }
1002 memcpy(ret, in1, in1_len);
1003 memcpy(ret + in1_len, in2, in2_len);
1004 memcpy(ret + in1_len + in2_len, in3, in3_len);
1005 memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1006 memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1007 return ret;
1008 }
1009
1010 FORCE_INLINE
1011 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8(
1012 gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1013 const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1014 gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1015 bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1016 const char* in6, gdv_int32 in6_len, bool in6_validity, gdv_int32* out_len) {
1017 if (!in1_validity) {
1018 in1_len = 0;
1019 }
1020 if (!in2_validity) {
1021 in2_len = 0;
1022 }
1023 if (!in3_validity) {
1024 in3_len = 0;
1025 }
1026 if (!in4_validity) {
1027 in4_len = 0;
1028 }
1029 if (!in5_validity) {
1030 in5_len = 0;
1031 }
1032 if (!in6_validity) {
1033 in6_len = 0;
1034 }
1035 return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len,
1036 in3, in3_len, in4, in4_len, in5,
1037 in5_len, in6, in6_len, out_len);
1038 }
1039
1040 FORCE_INLINE
1041 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(
1042 gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1043 gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1044 gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1045 gdv_int32 in6_len, gdv_int32* out_len) {
1046 *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len;
1047 if (*out_len <= 0) {
1048 *out_len = 0;
1049 return "";
1050 }
1051 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1052 if (ret == nullptr) {
1053 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1054 *out_len = 0;
1055 return "";
1056 }
1057 memcpy(ret, in1, in1_len);
1058 memcpy(ret + in1_len, in2, in2_len);
1059 memcpy(ret + in1_len + in2_len, in3, in3_len);
1060 memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1061 memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1062 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1063 return ret;
1064 }
1065
1066 FORCE_INLINE
1067 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1068 gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1069 const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1070 gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1071 bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1072 const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1073 gdv_int32 in7_len, bool in7_validity, gdv_int32* out_len) {
1074 if (!in1_validity) {
1075 in1_len = 0;
1076 }
1077 if (!in2_validity) {
1078 in2_len = 0;
1079 }
1080 if (!in3_validity) {
1081 in3_len = 0;
1082 }
1083 if (!in4_validity) {
1084 in4_len = 0;
1085 }
1086 if (!in5_validity) {
1087 in5_len = 0;
1088 }
1089 if (!in6_validity) {
1090 in6_len = 0;
1091 }
1092 if (!in7_validity) {
1093 in7_len = 0;
1094 }
1095 return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1096 context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1097 in6_len, in7, in7_len, out_len);
1098 }
1099
1100 FORCE_INLINE
1101 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1102 gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1103 gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1104 gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1105 gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, gdv_int32* out_len) {
1106 *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len;
1107 if (*out_len <= 0) {
1108 *out_len = 0;
1109 return "";
1110 }
1111 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1112 if (ret == nullptr) {
1113 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1114 *out_len = 0;
1115 return "";
1116 }
1117 memcpy(ret, in1, in1_len);
1118 memcpy(ret + in1_len, in2, in2_len);
1119 memcpy(ret + in1_len + in2_len, in3, in3_len);
1120 memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1121 memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1122 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1123 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1124 return ret;
1125 }
1126
1127 FORCE_INLINE
1128 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1129 gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1130 const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1131 gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1132 bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1133 const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1134 gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
1135 bool in8_validity, gdv_int32* out_len) {
1136 if (!in1_validity) {
1137 in1_len = 0;
1138 }
1139 if (!in2_validity) {
1140 in2_len = 0;
1141 }
1142 if (!in3_validity) {
1143 in3_len = 0;
1144 }
1145 if (!in4_validity) {
1146 in4_len = 0;
1147 }
1148 if (!in5_validity) {
1149 in5_len = 0;
1150 }
1151 if (!in6_validity) {
1152 in6_len = 0;
1153 }
1154 if (!in7_validity) {
1155 in7_len = 0;
1156 }
1157 if (!in8_validity) {
1158 in8_len = 0;
1159 }
1160 return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1161 context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1162 in6_len, in7, in7_len, in8, in8_len, out_len);
1163 }
1164
1165 FORCE_INLINE
1166 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1167 gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1168 gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1169 gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1170 gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
1171 gdv_int32 in8_len, gdv_int32* out_len) {
1172 *out_len =
1173 in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len + in8_len;
1174 if (*out_len <= 0) {
1175 *out_len = 0;
1176 return "";
1177 }
1178 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1179 if (ret == nullptr) {
1180 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1181 *out_len = 0;
1182 return "";
1183 }
1184 memcpy(ret, in1, in1_len);
1185 memcpy(ret + in1_len, in2, in2_len);
1186 memcpy(ret + in1_len + in2_len, in3, in3_len);
1187 memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1188 memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1189 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1190 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1191 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len, in8,
1192 in8_len);
1193 return ret;
1194 }
1195
1196 FORCE_INLINE
1197 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1198 gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1199 const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1200 gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1201 bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1202 const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1203 gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
1204 bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
1205 gdv_int32* out_len) {
1206 if (!in1_validity) {
1207 in1_len = 0;
1208 }
1209 if (!in2_validity) {
1210 in2_len = 0;
1211 }
1212 if (!in3_validity) {
1213 in3_len = 0;
1214 }
1215 if (!in4_validity) {
1216 in4_len = 0;
1217 }
1218 if (!in5_validity) {
1219 in5_len = 0;
1220 }
1221 if (!in6_validity) {
1222 in6_len = 0;
1223 }
1224 if (!in7_validity) {
1225 in7_len = 0;
1226 }
1227 if (!in8_validity) {
1228 in8_len = 0;
1229 }
1230 if (!in9_validity) {
1231 in9_len = 0;
1232 }
1233 return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1234 context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1235 in6_len, in7, in7_len, in8, in8_len, in9, in9_len, out_len);
1236 }
1237
1238 FORCE_INLINE
1239 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1240 gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1241 gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1242 gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1243 gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
1244 gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, gdv_int32* out_len) {
1245 *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len +
1246 in8_len + in9_len;
1247 if (*out_len <= 0) {
1248 *out_len = 0;
1249 return "";
1250 }
1251 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1252 if (ret == nullptr) {
1253 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1254 *out_len = 0;
1255 return "";
1256 }
1257 memcpy(ret, in1, in1_len);
1258 memcpy(ret + in1_len, in2, in2_len);
1259 memcpy(ret + in1_len + in2_len, in3, in3_len);
1260 memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1261 memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1262 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1263 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1264 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len, in8,
1265 in8_len);
1266 memcpy(
1267 ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len + in8_len,
1268 in9, in9_len);
1269 return ret;
1270 }
1271
1272 FORCE_INLINE
1273 const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1274 gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
1275 const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
1276 gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
1277 bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
1278 const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
1279 gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
1280 bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
1281 const char* in10, gdv_int32 in10_len, bool in10_validity, gdv_int32* out_len) {
1282 if (!in1_validity) {
1283 in1_len = 0;
1284 }
1285 if (!in2_validity) {
1286 in2_len = 0;
1287 }
1288 if (!in3_validity) {
1289 in3_len = 0;
1290 }
1291 if (!in4_validity) {
1292 in4_len = 0;
1293 }
1294 if (!in5_validity) {
1295 in5_len = 0;
1296 }
1297 if (!in6_validity) {
1298 in6_len = 0;
1299 }
1300 if (!in7_validity) {
1301 in7_len = 0;
1302 }
1303 if (!in8_validity) {
1304 in8_len = 0;
1305 }
1306 if (!in9_validity) {
1307 in9_len = 0;
1308 }
1309 if (!in10_validity) {
1310 in10_len = 0;
1311 }
1312 return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1313 context, in1, in1_len, in2, in2_len, in3, in3_len, in4, in4_len, in5, in5_len, in6,
1314 in6_len, in7, in7_len, in8, in8_len, in9, in9_len, in10, in10_len, out_len);
1315 }
1316
1317 FORCE_INLINE
1318 const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
1319 gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
1320 gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
1321 gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
1322 gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
1323 gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, const char* in10,
1324 gdv_int32 in10_len, gdv_int32* out_len) {
1325 *out_len = in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len +
1326 in8_len + in9_len + in10_len;
1327 if (*out_len <= 0) {
1328 *out_len = 0;
1329 return "";
1330 }
1331 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1332 if (ret == nullptr) {
1333 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1334 *out_len = 0;
1335 return "";
1336 }
1337 memcpy(ret, in1, in1_len);
1338 memcpy(ret + in1_len, in2, in2_len);
1339 memcpy(ret + in1_len + in2_len, in3, in3_len);
1340 memcpy(ret + in1_len + in2_len + in3_len, in4, in4_len);
1341 memcpy(ret + in1_len + in2_len + in3_len + in4_len, in5, in5_len);
1342 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len, in6, in6_len);
1343 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len, in7, in7_len);
1344 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len, in8,
1345 in8_len);
1346 memcpy(
1347 ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len + in8_len,
1348 in9, in9_len);
1349 memcpy(ret + in1_len + in2_len + in3_len + in4_len + in5_len + in6_len + in7_len +
1350 in8_len + in9_len,
1351 in10, in10_len);
1352 return ret;
1353 }
1354
1355 // Returns the numeric value of the first character of str.
1356 GANDIVA_EXPORT
1357 gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) {
1358 if (data_len == 0) {
1359 return 0;
1360 }
1361 return static_cast<gdv_int32>(data[0]);
1362 }
1363
1364 FORCE_INLINE
1365 const char* convert_fromUTF8_binary(gdv_int64 context, const char* bin_in, gdv_int32 len,
1366 gdv_int32* out_len) {
1367 *out_len = len;
1368 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1369 if (ret == nullptr) {
1370 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1371 *out_len = 0;
1372 return "";
1373 }
1374 memcpy(ret, bin_in, *out_len);
1375 return ret;
1376 }
1377
1378 FORCE_INLINE
1379 const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char* text_in,
1380 int32_t text_len,
1381 const char* char_to_replace,
1382 int32_t char_to_replace_len,
1383 int32_t* out_len) {
1384 if (char_to_replace_len > 1) {
1385 gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not supported");
1386 *out_len = 0;
1387 return "";
1388 }
1389 // actually the convert_replace function replaces invalid chars with an ASCII
1390 // character so the output length will be the same as the input length
1391 *out_len = text_len;
1392 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1393 if (ret == nullptr) {
1394 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
1395 *out_len = 0;
1396 return "";
1397 }
1398 int32_t valid_bytes_to_cpy = 0;
1399 int32_t out_byte_counter = 0;
1400 int32_t in_byte_counter = 0;
1401 int32_t char_len;
1402 // scan the base text from left to right and increment the start pointer till
1403 // looking for invalid chars to substitute
1404 for (int text_index = 0; text_index < text_len; text_index += char_len) {
1405 char_len = utf8_char_length(text_in[text_index]);
1406 // only memory copy the bytes when detect invalid char
1407 if (char_len == 0 || text_index + char_len > text_len ||
1408 !validate_utf8_following_bytes(text_in, char_len, text_index)) {
1409 // define char_len = 1 to increase text_index by 1 (as ASCII char fits in 1 byte)
1410 char_len = 1;
1411 // first copy the valid bytes until now and then replace the invalid character
1412 memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
1413 // if the replacement char is empty, the invalid char should be ignored
1414 if (char_to_replace_len == 0) {
1415 out_byte_counter += valid_bytes_to_cpy;
1416 } else {
1417 ret[out_byte_counter + valid_bytes_to_cpy] = char_to_replace[0];
1418 out_byte_counter += valid_bytes_to_cpy + char_len;
1419 }
1420 in_byte_counter += valid_bytes_to_cpy + char_len;
1421 valid_bytes_to_cpy = 0;
1422 continue;
1423 }
1424 valid_bytes_to_cpy += char_len;
1425 }
1426 // if invalid chars were not found, return the original string
1427 if (out_byte_counter == 0 && in_byte_counter == 0) return text_in;
1428 // if there are still valid bytes to copy, do it
1429 if (valid_bytes_to_cpy != 0) {
1430 memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
1431 }
1432 // the out length will be the out bytes copied + the missing end bytes copied
1433 *out_len = valid_bytes_to_cpy + out_byte_counter;
1434 return ret;
1435 }
1436
// Reverses the first len bytes of buf in place by swapping from both ends inward.
static inline void reverse_char_buf(char* buf, int32_t len) {
  int32_t lo = 0;
  int32_t hi = len - 1;
  while (lo < hi) {
    char held = buf[lo];
    buf[lo] = buf[hi];
    buf[hi] = held;
    ++lo;
    --hi;
  }
}
1448
1449 // Converts a double variable to binary
1450 FORCE_INLINE
1451 const char* convert_toDOUBLE(int64_t context, double value, int32_t* out_len) {
1452 *out_len = sizeof(value);
1453 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1454
1455 if (ret == nullptr) {
1456 gdv_fn_context_set_error_msg(context,
1457 "Could not allocate memory for the output string");
1458
1459 *out_len = 0;
1460 return "";
1461 }
1462
1463 memcpy(ret, &value, *out_len);
1464
1465 return ret;
1466 }
1467
1468 FORCE_INLINE
1469 const char* convert_toDOUBLE_be(int64_t context, double value, int32_t* out_len) {
1470 // The function behaves like convert_toDOUBLE, but always return the result
1471 // in big endian format
1472 char* ret = const_cast<char*>(convert_toDOUBLE(context, value, out_len));
1473
1474 #if ARROW_LITTLE_ENDIAN
1475 reverse_char_buf(ret, *out_len);
1476 #endif
1477
1478 return ret;
1479 }
1480
1481 // Converts a float variable to binary
1482 FORCE_INLINE
1483 const char* convert_toFLOAT(int64_t context, float value, int32_t* out_len) {
1484 *out_len = sizeof(value);
1485 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1486
1487 if (ret == nullptr) {
1488 gdv_fn_context_set_error_msg(context,
1489 "Could not allocate memory for the output string");
1490
1491 *out_len = 0;
1492 return "";
1493 }
1494
1495 memcpy(ret, &value, *out_len);
1496
1497 return ret;
1498 }
1499
1500 FORCE_INLINE
1501 const char* convert_toFLOAT_be(int64_t context, float value, int32_t* out_len) {
1502 // The function behaves like convert_toFLOAT, but always return the result
1503 // in big endian format
1504 char* ret = const_cast<char*>(convert_toFLOAT(context, value, out_len));
1505
1506 #if ARROW_LITTLE_ENDIAN
1507 reverse_char_buf(ret, *out_len);
1508 #endif
1509
1510 return ret;
1511 }
1512
1513 // Converts a bigint(int with 64 bits) variable to binary
1514 FORCE_INLINE
1515 const char* convert_toBIGINT(int64_t context, int64_t value, int32_t* out_len) {
1516 *out_len = sizeof(value);
1517 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1518
1519 if (ret == nullptr) {
1520 gdv_fn_context_set_error_msg(context,
1521 "Could not allocate memory for the output string");
1522
1523 *out_len = 0;
1524 return "";
1525 }
1526
1527 memcpy(ret, &value, *out_len);
1528
1529 return ret;
1530 }
1531
1532 FORCE_INLINE
1533 const char* convert_toBIGINT_be(int64_t context, int64_t value, int32_t* out_len) {
1534 // The function behaves like convert_toBIGINT, but always return the result
1535 // in big endian format
1536 char* ret = const_cast<char*>(convert_toBIGINT(context, value, out_len));
1537
1538 #if ARROW_LITTLE_ENDIAN
1539 reverse_char_buf(ret, *out_len);
1540 #endif
1541
1542 return ret;
1543 }
1544
1545 // Converts an integer(with 32 bits) variable to binary
1546 FORCE_INLINE
1547 const char* convert_toINT(int64_t context, int32_t value, int32_t* out_len) {
1548 *out_len = sizeof(value);
1549 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1550
1551 if (ret == nullptr) {
1552 gdv_fn_context_set_error_msg(context,
1553 "Could not allocate memory for the output string");
1554
1555 *out_len = 0;
1556 return "";
1557 }
1558
1559 memcpy(ret, &value, *out_len);
1560
1561 return ret;
1562 }
1563
1564 FORCE_INLINE
1565 const char* convert_toINT_be(int64_t context, int32_t value, int32_t* out_len) {
1566 // The function behaves like convert_toINT, but always return the result
1567 // in big endian format
1568 char* ret = const_cast<char*>(convert_toINT(context, value, out_len));
1569
1570 #if ARROW_LITTLE_ENDIAN
1571 reverse_char_buf(ret, *out_len);
1572 #endif
1573
1574 return ret;
1575 }
1576
1577 // Converts a boolean variable to binary
1578 FORCE_INLINE
1579 const char* convert_toBOOLEAN(int64_t context, bool value, int32_t* out_len) {
1580 *out_len = sizeof(value);
1581 char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1582
1583 if (ret == nullptr) {
1584 gdv_fn_context_set_error_msg(context,
1585 "Could not allocate memory for the output string");
1586
1587 *out_len = 0;
1588 return "";
1589 }
1590
1591 memcpy(ret, &value, *out_len);
1592
1593 return ret;
1594 }
1595
1596 // Converts a time variable to binary
1597 FORCE_INLINE
1598 const char* convert_toTIME_EPOCH(int64_t context, int32_t value, int32_t* out_len) {
1599 return convert_toINT(context, value, out_len);
1600 }
1601
1602 FORCE_INLINE
1603 const char* convert_toTIME_EPOCH_be(int64_t context, int32_t value, int32_t* out_len) {
1604 // The function behaves as convert_toTIME_EPOCH, but
1605 // returns the bytes in big endian format
1606 return convert_toINT_be(context, value, out_len);
1607 }
1608
1609 // Converts a timestamp variable to binary
1610 FORCE_INLINE
1611 const char* convert_toTIMESTAMP_EPOCH(int64_t context, int64_t timestamp,
1612 int32_t* out_len) {
1613 return convert_toBIGINT(context, timestamp, out_len);
1614 }
1615
1616 FORCE_INLINE
1617 const char* convert_toTIMESTAMP_EPOCH_be(int64_t context, int64_t timestamp,
1618 int32_t* out_len) {
1619 // The function behaves as convert_toTIMESTAMP_EPOCH, but
1620 // returns the bytes in big endian format
1621 return convert_toBIGINT_be(context, timestamp, out_len);
1622 }
1623
1624 // Converts a date variable to binary
1625 FORCE_INLINE
1626 const char* convert_toDATE_EPOCH(int64_t context, int64_t date, int32_t* out_len) {
1627 return convert_toBIGINT(context, date, out_len);
1628 }
1629
1630 FORCE_INLINE
1631 const char* convert_toDATE_EPOCH_be(int64_t context, int64_t date, int32_t* out_len) {
1632 // The function behaves as convert_toDATE_EPOCH, but
1633 // returns the bytes in big endian format
1634 return convert_toBIGINT_be(context, date, out_len);
1635 }
1636
1637 // Converts a string variable to binary
1638 FORCE_INLINE
1639 const char* convert_toUTF8(int64_t context, const char* value, int32_t value_len,
1640 int32_t* out_len) {
1641 *out_len = value_len;
1642 return value;
1643 }
1644
1645 // Search for a string within another string
1646 // Same as "locate(substr, str)", except for the reverse order of the arguments.
1647 FORCE_INLINE
1648 gdv_int32 strpos_utf8_utf8(gdv_int64 context, const char* str, gdv_int32 str_len,
1649 const char* sub_str, gdv_int32 sub_str_len) {
1650 return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len, 1);
1651 }
1652
1653 // Search for a string within another string
1654 FORCE_INLINE
1655 gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len,
1656 const char* str, gdv_int32 str_len) {
1657 return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len, 1);
1658 }
1659
1660 // Search for a string within another string starting at position start-pos (1-indexed)
1661 FORCE_INLINE
1662 gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
1663 gdv_int32 sub_str_len, const char* str,
1664 gdv_int32 str_len, gdv_int32 start_pos) {
1665 if (start_pos < 1) {
1666 gdv_fn_context_set_error_msg(context, "Start position must be greater than 0");
1667 return 0;
1668 }
1669
1670 if (str_len == 0 || sub_str_len == 0) {
1671 return 0;
1672 }
1673
1674 gdv_int32 byte_pos = utf8_byte_pos(context, str, str_len, start_pos - 1);
1675 if (byte_pos < 0 || byte_pos >= str_len) {
1676 return 0;
1677 }
1678 for (gdv_int32 i = byte_pos; i <= str_len - sub_str_len; ++i) {
1679 if (memcmp(str + i, sub_str, sub_str_len) == 0) {
1680 return utf8_length(context, str, i) + 1;
1681 }
1682 }
1683 return 0;
1684 }
1685
1686 FORCE_INLINE
1687 const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
1688 gdv_int32 text_len, const char* from_str,
1689 gdv_int32 from_str_len,
1690 const char* to_str, gdv_int32 to_str_len,
1691 gdv_int32 max_length,
1692 gdv_int32* out_len) {
1693 // if from_str is empty or its length exceeds that of original string,
1694 // return the original string
1695 if (from_str_len <= 0 || from_str_len > text_len) {
1696 *out_len = text_len;
1697 return text;
1698 }
1699
1700 bool found = false;
1701 gdv_int32 text_index = 0;
1702 char* out;
1703 gdv_int32 out_index = 0;
1704 gdv_int32 last_match_index =
1705 0; // defer copying string from last_match_index till next match is found
1706
1707 for (; text_index <= text_len - from_str_len;) {
1708 if (memcmp(text + text_index, from_str, from_str_len) == 0) {
1709 if (out_index + text_index - last_match_index + to_str_len > max_length) {
1710 gdv_fn_context_set_error_msg(context, "Buffer overflow for output string");
1711 *out_len = 0;
1712 return "";
1713 }
1714 if (!found) {
1715 // found match for first time
1716 out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, max_length));
1717 if (out == nullptr) {
1718 gdv_fn_context_set_error_msg(context,
1719 "Could not allocate memory for output string");
1720 *out_len = 0;
1721 return "";
1722 }
1723 found = true;
1724 }
1725 // first copy the part deferred till now
1726 memcpy(out + out_index, text + last_match_index, (text_index - last_match_index));
1727 out_index += text_index - last_match_index;
1728 // then copy the target string
1729 memcpy(out + out_index, to_str, to_str_len);
1730 out_index += to_str_len;
1731
1732 text_index += from_str_len;
1733 last_match_index = text_index;
1734 } else {
1735 text_index++;
1736 }
1737 }
1738
1739 if (!found) {
1740 *out_len = text_len;
1741 return text;
1742 }
1743
1744 if (out_index + text_len - last_match_index > max_length) {
1745 gdv_fn_context_set_error_msg(context, "Buffer overflow for output string");
1746 *out_len = 0;
1747 return "";
1748 }
1749 memcpy(out + out_index, text + last_match_index, text_len - last_match_index);
1750 out_index += text_len - last_match_index;
1751 *out_len = out_index;
1752 return out;
1753 }
1754
1755 FORCE_INLINE
1756 const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
1757 gdv_int32 text_len, const char* from_str,
1758 gdv_int32 from_str_len, const char* to_str,
1759 gdv_int32 to_str_len, gdv_int32* out_len) {
1760 return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
1761 from_str_len, to_str, to_str_len, 65535,
1762 out_len);
1763 }
1764
1765 FORCE_INLINE
1766 const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
1767 gdv_int32 return_length, const char* fill_text,
1768 gdv_int32 fill_text_len, gdv_int32* out_len) {
1769 // if the text length or the defined return length (number of characters to return)
1770 // is <=0, then return an empty string.
1771 if (text_len == 0 || return_length <= 0) {
1772 *out_len = 0;
1773 return "";
1774 }
1775
1776 // count the number of utf8 characters on text, ignoring invalid bytes
1777 int text_char_count = utf8_length_ignore_invalid(text, text_len);
1778
1779 if (return_length == text_char_count ||
1780 (return_length > text_char_count && fill_text_len == 0)) {
1781 // case where the return length is same as the text's length, or if it need to
1782 // fill into text but "fill_text" is empty, then return text directly.
1783 *out_len = text_len;
1784 return text;
1785 } else if (return_length < text_char_count) {
1786 // case where it truncates the result on return length.
1787 *out_len = utf8_byte_pos(context, text, text_len, return_length);
1788 return text;
1789 } else {
1790 // case (return_length > text_char_count)
1791 // case where it needs to copy "fill_text" on the string left. The total number
1792 // of chars to copy is given by (return_length - text_char_count)
1793 char* ret =
1794 reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, return_length));
1795 if (ret == nullptr) {
1796 gdv_fn_context_set_error_msg(context,
1797 "Could not allocate memory for output string");
1798 *out_len = 0;
1799 return "";
1800 }
1801 // try to fulfill the return string with the "fill_text" continuously
1802 int32_t copied_chars_count = 0;
1803 int32_t copied_chars_position = 0;
1804 while (copied_chars_count < return_length - text_char_count) {
1805 int32_t char_len;
1806 int32_t fill_index;
1807 // for each char, evaluate its length to consider it when mem copying
1808 for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len) {
1809 if (copied_chars_count >= return_length - text_char_count) {
1810 break;
1811 }
1812 char_len = utf8_char_length(fill_text[fill_index]);
1813 // ignore invalid char on the fill text, considering it as size 1
1814 if (char_len == 0) char_len += 1;
1815 copied_chars_count++;
1816 }
1817 memcpy(ret + copied_chars_position, fill_text, fill_index);
1818 copied_chars_position += fill_index;
1819 }
1820 // after fulfilling the text, copy the main string
1821 memcpy(ret + copied_chars_position, text, text_len);
1822 *out_len = copied_chars_position + text_len;
1823 return ret;
1824 }
1825 }
1826
1827 FORCE_INLINE
1828 const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
1829 gdv_int32 return_length, const char* fill_text,
1830 gdv_int32 fill_text_len, gdv_int32* out_len) {
1831 // if the text length or the defined return length (number of characters to return)
1832 // is <=0, then return an empty string.
1833 if (text_len == 0 || return_length <= 0) {
1834 *out_len = 0;
1835 return "";
1836 }
1837
1838 // count the number of utf8 characters on text, ignoring invalid bytes
1839 int text_char_count = utf8_length_ignore_invalid(text, text_len);
1840
1841 if (return_length == text_char_count ||
1842 (return_length > text_char_count && fill_text_len == 0)) {
1843 // case where the return length is same as the text's length, or if it need to
1844 // fill into text but "fill_text" is empty, then return text directly.
1845 *out_len = text_len;
1846 return text;
1847 } else if (return_length < text_char_count) {
1848 // case where it truncates the result on return length.
1849 *out_len = utf8_byte_pos(context, text, text_len, return_length);
1850 return text;
1851 } else {
1852 // case (return_length > text_char_count)
1853 // case where it needs to copy "fill_text" on the string right
1854 char* ret =
1855 reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, return_length));
1856 if (ret == nullptr) {
1857 gdv_fn_context_set_error_msg(context,
1858 "Could not allocate memory for output string");
1859 *out_len = 0;
1860 return "";
1861 }
1862 // fulfill the initial text copying the main input string
1863 memcpy(ret, text, text_len);
1864 // try to fulfill the return string with the "fill_text" continuously
1865 int32_t copied_chars_count = 0;
1866 int32_t copied_chars_position = 0;
1867 while (text_char_count + copied_chars_count < return_length) {
1868 int32_t char_len;
1869 int32_t fill_length;
1870 // for each char, evaluate its length to consider it when mem copying
1871 for (fill_length = 0; fill_length < fill_text_len; fill_length += char_len) {
1872 if (text_char_count + copied_chars_count >= return_length) {
1873 break;
1874 }
1875 char_len = utf8_char_length(fill_text[fill_length]);
1876 // ignore invalid char on the fill text, considering it as size 1
1877 if (char_len == 0) char_len += 1;
1878 copied_chars_count++;
1879 }
1880 memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);
1881 copied_chars_position += fill_length;
1882 }
1883 *out_len = copied_chars_position + text_len;
1884 return ret;
1885 }
1886 }
1887
1888 FORCE_INLINE
1889 const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
1890 gdv_int32 return_length, gdv_int32* out_len) {
1891 return lpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len);
1892 }
1893
1894 FORCE_INLINE
1895 const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
1896 gdv_int32 return_length, gdv_int32* out_len) {
1897 return rpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len);
1898 }
1899
1900 FORCE_INLINE
1901 const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
1902 const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
1903 gdv_int32* out_len) {
1904 *out_len = 0;
1905 if (index < 1) {
1906 char error_message[100];
1907 snprintf(error_message, sizeof(error_message),
1908 "Index in split_part must be positive, value provided was %d", index);
1909 gdv_fn_context_set_error_msg(context, error_message);
1910 return "";
1911 }
1912
1913 if (delim_len == 0 || text_len == 0) {
1914 // output will just be text if no delimiter is provided
1915 *out_len = text_len;
1916 return text;
1917 }
1918
1919 int i = 0, match_no = 1;
1920
1921 while (i < text_len) {
1922 // find the position where delimiter matched for the first time
1923 int match_pos = match_string(text, text_len, i, delimiter, delim_len);
1924 if (match_pos == -1 && match_no != index) {
1925 // reached the end without finding a match.
1926 return "";
1927 } else {
1928 // Found a match. If the match number is index then return this match
1929 if (match_no == index) {
1930 int end_pos = match_pos - delim_len;
1931
1932 if (match_pos == -1) {
1933 // end position should be last position of the string as we have the last
1934 // delimiter
1935 end_pos = text_len;
1936 }
1937
1938 *out_len = end_pos - i;
1939 char* out_str =
1940 reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
1941 if (out_str == nullptr) {
1942 gdv_fn_context_set_error_msg(context,
1943 "Could not allocate memory for output string");
1944 *out_len = 0;
1945 return "";
1946 }
1947 memcpy(out_str, text + i, *out_len);
1948 return out_str;
1949 } else {
1950 i = match_pos;
1951 match_no++;
1952 }
1953 }
1954 }
1955
1956 return "";
1957 }
1958
1959 // Returns the x leftmost characters of a given string. Cases:
1960 // LEFT("TestString", 10) => "TestString"
1961 // LEFT("TestString", 3) => "Tes"
1962 // LEFT("TestString", -3) => "TestStr"
1963 FORCE_INLINE
1964 const char* left_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
1965 gdv_int32 number, gdv_int32* out_len) {
1966 // returns the 'number' left most characters of a given text
1967 if (text_len == 0 || number == 0) {
1968 *out_len = 0;
1969 return "";
1970 }
1971
1972 // iterate over the utf8 string validating each character
1973 int char_len;
1974 int char_count = 0;
1975 int byte_index = 0;
1976 for (int i = 0; i < text_len; i += char_len) {
1977 char_len = utf8_char_length(text[i]);
1978 if (char_len == 0 || i + char_len > text_len) { // invalid byte or incomplete glyph
1979 set_error_for_invalid_utf(context, text[i]);
1980 *out_len = 0;
1981 return "";
1982 }
1983 for (int j = 1; j < char_len; ++j) {
1984 if ((text[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
1985 set_error_for_invalid_utf(context, text[i + j]);
1986 *out_len = 0;
1987 return "";
1988 }
1989 }
1990 byte_index += char_len;
1991 ++char_count;
1992 // Define the rules to stop the iteration over the string
1993 // case where left('abc', 5) -> 'abc'
1994 if (number > 0 && char_count == number) break;
1995 // case where left('abc', -5) ==> ''
1996 if (number < 0 && char_count == number + text_len) break;
1997 }
1998
1999 *out_len = byte_index;
2000 return text;
2001 }
2002
2003 // Returns the x rightmost characters of a given string. Cases:
2004 // RIGHT("TestString", 10) => "TestString"
2005 // RIGHT("TestString", 3) => "ing"
2006 // RIGHT("TestString", -3) => "tString"
2007 FORCE_INLINE
2008 const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
2009 gdv_int32 number, gdv_int32* out_len) {
2010 // returns the 'number' left most characters of a given text
2011 if (text_len == 0 || number == 0) {
2012 *out_len = 0;
2013 return "";
2014 }
2015
2016 // initially counts the number of utf8 characters in the defined text
2017 int32_t char_count = utf8_length(context, text, text_len);
2018 // char_count is zero if input has invalid utf8 char
2019 if (char_count == 0) {
2020 *out_len = 0;
2021 return "";
2022 }
2023
2024 int32_t start_char_pos; // the char result start position (inclusive)
2025 int32_t end_char_len; // the char result end position (inclusive)
2026 if (number > 0) {
2027 // case where right('abc', 5) ==> 'abc' start_char_pos=1.
2028 start_char_pos = (char_count > number) ? char_count - number : 0;
2029 end_char_len = char_count - start_char_pos;
2030 } else {
2031 start_char_pos = number * -1;
2032 end_char_len = char_count - start_char_pos;
2033 }
2034
2035 // calculate the start byte position and the output length
2036 int32_t start_byte_pos = utf8_byte_pos(context, text, text_len, start_char_pos);
2037 *out_len = utf8_byte_pos(context, text, text_len, end_char_len);
2038
2039 // try to allocate memory for the response
2040 char* ret =
2041 reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, *out_len));
2042 if (ret == nullptr) {
2043 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2044 *out_len = 0;
2045 return "";
2046 }
2047 memcpy(ret, text + start_byte_pos, *out_len);
2048 return ret;
2049 }
2050
2051 FORCE_INLINE
2052 const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_len,
2053 gdv_int32* out_len) {
2054 gdv_binary ret =
2055 reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, text_len));
2056
2057 if (ret == nullptr) {
2058 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2059 *out_len = 0;
2060 return "";
2061 }
2062
2063 if (text_len == 0) {
2064 *out_len = 0;
2065 return "";
2066 }
2067
2068 // converting hex encoded string to normal string
2069 int j = 0;
2070 for (int i = 0; i < text_len; i++, j++) {
2071 if (text[i] == '\\' && i + 3 < text_len &&
2072 (text[i + 1] == 'x' || text[i + 1] == 'X')) {
2073 char hd1 = text[i + 2];
2074 char hd2 = text[i + 3];
2075 if (isxdigit(hd1) && isxdigit(hd2)) {
2076 // [a-fA-F0-9]
2077 ret[j] = to_binary_from_hex(hd1) * 16 + to_binary_from_hex(hd2);
2078 i += 3;
2079 } else {
2080 ret[j] = text[i];
2081 }
2082 } else {
2083 ret[j] = text[i];
2084 }
2085 }
2086 *out_len = j;
2087 return ret;
2088 }
2089
2090 #define CAST_INT_BIGINT_VARBINARY(OUT_TYPE, TYPE_NAME) \
2091 FORCE_INLINE \
2092 OUT_TYPE \
2093 cast##TYPE_NAME##_varbinary(gdv_int64 context, const char* in, int32_t in_len) { \
2094 if (in_len == 0) { \
2095 gdv_fn_context_set_error_msg(context, "Can't cast an empty string."); \
2096 return -1; \
2097 } \
2098 char sign = in[0]; \
2099 \
2100 bool negative = false; \
2101 if (sign == '-') { \
2102 negative = true; \
2103 /* Ignores the sign char in the hexadecimal string */ \
2104 in++; \
2105 in_len--; \
2106 } \
2107 \
2108 if (negative && in_len == 0) { \
2109 gdv_fn_context_set_error_msg(context, \
2110 "Can't cast hexadecimal with only a minus sign."); \
2111 return -1; \
2112 } \
2113 \
2114 OUT_TYPE result = 0; \
2115 int digit; \
2116 \
2117 int read_index = 0; \
2118 while (read_index < in_len) { \
2119 char c1 = in[read_index]; \
2120 if (isxdigit(c1)) { \
2121 digit = to_binary_from_hex(c1); \
2122 \
2123 OUT_TYPE next = result * 16 - digit; \
2124 \
2125 if (next > result) { \
2126 gdv_fn_context_set_error_msg(context, "Integer overflow."); \
2127 return -1; \
2128 } \
2129 result = next; \
2130 read_index++; \
2131 } else { \
2132 gdv_fn_context_set_error_msg(context, \
2133 "The hexadecimal given has invalid characters."); \
2134 return -1; \
2135 } \
2136 } \
2137 if (!negative) { \
2138 result *= -1; \
2139 \
2140 if (result < 0) { \
2141 gdv_fn_context_set_error_msg(context, "Integer overflow."); \
2142 return -1; \
2143 } \
2144 } \
2145 return result; \
2146 }
2147
2148 CAST_INT_BIGINT_VARBINARY(int32_t, INT)
2149 CAST_INT_BIGINT_VARBINARY(int64_t, BIGINT)
2150
2151 #undef CAST_INT_BIGINT_VARBINARY
2152
2153 // Produces the binary representation of a string y characters long derived by starting
2154 // at offset 'x' and considering the defined length 'y'. Notice that the offset index
2155 // may be a negative number (starting from the end of the string), or a positive number
2156 // starting on index 1. Cases:
2157 // BYTE_SUBSTR("TestString", 1, 10) => "TestString"
2158 // BYTE_SUBSTR("TestString", 5, 10) => "String"
2159 // BYTE_SUBSTR("TestString", -6, 10) => "String"
2160 // BYTE_SUBSTR("TestString", -600, 10) => "TestString"
2161 FORCE_INLINE
2162 const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
2163 gdv_int32 text_len, gdv_int32 offset,
2164 gdv_int32 length, gdv_int32* out_len) {
2165 // the first offset position for a string is 1, so not consider offset == 0
2166 // also, the length should be always a positive number
2167 if (text_len == 0 || offset == 0 || length <= 0) {
2168 *out_len = 0;
2169 return "";
2170 }
2171
2172 char* ret =
2173 reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, text_len));
2174
2175 if (ret == nullptr) {
2176 gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2177 *out_len = 0;
2178 return "";
2179 }
2180
2181 int32_t startPos = 0;
2182 if (offset >= 0) {
2183 startPos = offset - 1;
2184 } else if (text_len + offset >= 0) {
2185 startPos = text_len + offset;
2186 }
2187
2188 // calculate end position from length and truncate to upper value bounds
2189 if (startPos + length > text_len) {
2190 *out_len = text_len - startPos;
2191 } else {
2192 *out_len = length;
2193 }
2194
2195 memcpy(ret, text + startPos, *out_len);
2196 return ret;
2197 }
2198 } // extern "C"