]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / compute / kernels / scalar_string.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include <algorithm>
19 #include <cctype>
20 #include <iterator>
21 #include <string>
22
23 #ifdef ARROW_WITH_UTF8PROC
24 #include <utf8proc.h>
25 #endif
26
27 #ifdef ARROW_WITH_RE2
28 #include <re2/re2.h>
29 #endif
30
31 #include "arrow/array/builder_binary.h"
32 #include "arrow/array/builder_nested.h"
33 #include "arrow/buffer_builder.h"
34
35 #include "arrow/builder.h"
36 #include "arrow/compute/api_scalar.h"
37 #include "arrow/compute/kernels/common.h"
38 #include "arrow/util/checked_cast.h"
39 #include "arrow/util/utf8.h"
40 #include "arrow/util/value_parsing.h"
41 #include "arrow/visitor_inline.h"
42
43 namespace arrow {
44
45 using internal::checked_cast;
46
47 namespace compute {
48 namespace internal {
49
50 namespace {
51
52 #ifdef ARROW_WITH_RE2
53 util::string_view ToStringView(re2::StringPiece piece) {
54 return {piece.data(), piece.length()};
55 }
56
57 re2::StringPiece ToStringPiece(util::string_view view) {
58 return {view.data(), view.length()};
59 }
60
61 Status RegexStatus(const RE2& regex) {
62 if (!regex.ok()) {
63 return Status::Invalid("Invalid regular expression: ", regex.error());
64 }
65 return Status::OK();
66 }
67 #endif
68
69 // Code units in the range [a-z] can only be an encoding of an ASCII
70 // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different
71 // codepoint. This is guaranteed by the non-overlap design of the Unicode
72 // standard. (see section 2.5 of Unicode Standard Core Specification v13.0)
73
74 // IsAlpha/Digit etc
75
76 static inline bool IsAsciiCharacter(uint8_t character) { return character < 128; }
77
78 static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
79 return (ascii_character >= 'a') && (ascii_character <= 'z');
80 }
81
82 static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
83 return (ascii_character >= 'A') && (ascii_character <= 'Z');
84 }
85
86 static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
87 // Note: Non-ASCII characters are seen as uncased.
88 return IsLowerCaseCharacterAscii(ascii_character) ||
89 IsUpperCaseCharacterAscii(ascii_character);
90 }
91
92 static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
93 return IsCasedCharacterAscii(ascii_character);
94 }
95
96 static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) {
97 return ((ascii_character >= '0') && (ascii_character <= '9')) ||
98 ((ascii_character >= 'a') && (ascii_character <= 'z')) ||
99 ((ascii_character >= 'A') && (ascii_character <= 'Z'));
100 }
101
102 static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
103 return ((ascii_character >= '0') && (ascii_character <= '9'));
104 }
105
106 static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
107 return ((ascii_character >= 9) && (ascii_character <= 13)) || (ascii_character == ' ');
108 }
109
110 static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
111 return ((ascii_character >= ' ') && (ascii_character <= '~'));
112 }
113
114 struct BinaryLength {
115 template <typename OutValue, typename Arg0Value = util::string_view>
116 static OutValue Call(KernelContext*, Arg0Value val, Status*) {
117 return static_cast<OutValue>(val.size());
118 }
119
120 static Status FixedSizeExec(KernelContext*, const ExecBatch& batch, Datum* out) {
121 // Output is preallocated and validity buffer is precomputed
122 const int32_t width =
123 checked_cast<const FixedSizeBinaryType&>(*batch[0].type()).byte_width();
124 if (batch.values[0].is_array()) {
125 int32_t* buffer = out->mutable_array()->GetMutableValues<int32_t>(1);
126 std::fill(buffer, buffer + batch.length, width);
127 } else {
128 checked_cast<Int32Scalar*>(out->scalar().get())->value = width;
129 }
130 return Status::OK();
131 }
132 };
133
134 struct Utf8Length {
135 template <typename OutValue, typename Arg0Value = util::string_view>
136 static OutValue Call(KernelContext*, Arg0Value val, Status*) {
137 auto str = reinterpret_cast<const uint8_t*>(val.data());
138 auto strlen = val.size();
139 return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
140 }
141 };
142
143 static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
144 return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32)
145 : utf8_code_unit;
146 }
147
148 static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
149 return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32)
150 : utf8_code_unit;
151 }
152
153 static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) {
154 if (IsLowerCaseCharacterAscii(utf8_code_unit)) {
155 utf8_code_unit -= 32;
156 } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) {
157 utf8_code_unit += 32;
158 }
159 return utf8_code_unit;
160 }
161
162 #ifdef ARROW_WITH_UTF8PROC
163
164 // Direct lookup tables for unicode properties
165 constexpr uint32_t kMaxCodepointLookup =
166 0xffff; // up to this codepoint is in a lookup table
167 std::vector<uint32_t> lut_upper_codepoint;
168 std::vector<uint32_t> lut_lower_codepoint;
169 std::vector<uint32_t> lut_swapcase_codepoint;
170 std::vector<utf8proc_category_t> lut_category;
171 std::once_flag flag_case_luts;
172
173 // IsAlpha/Digit etc
174
175 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
176 utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup
177 ? lut_category[codepoint]
178 : utf8proc_category(codepoint);
179 uint32_t general_category_bit = 1 << general_category;
180 // for e.g. undefined (but valid) codepoints, general_category == 0 ==
181 // UTF8PROC_CATEGORY_CN
182 return (general_category != UTF8PROC_CATEGORY_CN) &&
183 ((general_category_bit & mask) != 0);
184 }
185
186 template <typename... Categories>
187 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask,
188 utf8proc_category_t category,
189 Categories... categories) {
190 return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...);
191 }
192
193 template <typename... Categories>
194 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint,
195 utf8proc_category_t category,
196 Categories... categories) {
197 return HasAnyUnicodeGeneralCategory(codepoint, static_cast<uint32_t>(1u << category),
198 categories...);
199 }
200
201 static inline bool IsCasedCharacterUnicode(uint32_t codepoint) {
202 return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
203 UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) ||
204 ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) ||
205 (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint));
206 }
207
208 static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) {
209 // although this trick seems to work for upper case, this is not enough for lower case
210 // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the
211 // best we can do
212 return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) ||
213 ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) &&
214 (static_cast<uint32_t>(utf8proc_tolower(codepoint)) == codepoint))) &&
215 !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
216 }
217
218 static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) {
219 // this seems to be a good workaround for utf8proc not having case information
220 // https://github.com/JuliaStrings/utf8proc/issues/195
221 return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) ||
222 ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) == codepoint) &&
223 (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint))) &&
224 !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
225 }
226
227 static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) {
228 return HasAnyUnicodeGeneralCategory(
229 codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
230 UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND,
231 UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
232 }
233
234 static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
235 return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
236 UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
237 UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO);
238 }
239
240 static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
241 return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
242 }
243
244 static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
245 // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
246 // utf8proc has no support for this, this is the best we can do:
247 return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
248 }
249
250 static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {
251 // Formally this is not correct, but utf8proc does not allow us to query for Numerical
252 // properties, e.g. Numeric_Value and Numeric_Type
253 // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or
254 // Numeric_Type=Numeric.
255 return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND,
256 UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
257 }
258
259 static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) {
260 auto property = utf8proc_get_property(codepoint);
261 return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) ||
262 property->bidi_class == UTF8PROC_BIDI_CLASS_WS ||
263 property->bidi_class == UTF8PROC_BIDI_CLASS_B ||
264 property->bidi_class == UTF8PROC_BIDI_CLASS_S;
265 }
266
267 static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
268 uint32_t general_category = utf8proc_category(codepoint);
269 return (general_category != UTF8PROC_CATEGORY_CN) &&
270 !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC,
271 UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS,
272 UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS,
273 UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP);
274 }
275
276 void EnsureLookupTablesFilled() {
277 std::call_once(flag_case_luts, []() {
278 lut_upper_codepoint.reserve(kMaxCodepointLookup + 1);
279 lut_lower_codepoint.reserve(kMaxCodepointLookup + 1);
280 lut_swapcase_codepoint.reserve(kMaxCodepointLookup + 1);
281 for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) {
282 lut_upper_codepoint.push_back(utf8proc_toupper(i));
283 lut_lower_codepoint.push_back(utf8proc_tolower(i));
284 lut_category.push_back(utf8proc_category(i));
285
286 if (IsLowerCaseCharacterUnicode(i)) {
287 lut_swapcase_codepoint.push_back(utf8proc_toupper(i));
288 } else if (IsUpperCaseCharacterUnicode(i)) {
289 lut_swapcase_codepoint.push_back(utf8proc_tolower(i));
290 } else {
291 lut_swapcase_codepoint.push_back(i);
292 }
293 }
294 });
295 }
296
297 #else
298
299 void EnsureLookupTablesFilled() {}
300
301 #endif // ARROW_WITH_UTF8PROC
302
303 constexpr int64_t kTransformError = -1;
304
305 struct StringTransformBase {
306 virtual ~StringTransformBase() = default;
307 virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
308 return Status::OK();
309 }
310
311 // Return the maximum total size of the output in codeunits (i.e. bytes)
312 // given input characteristics.
313 virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
314 return input_ncodeunits;
315 }
316
317 virtual Status InvalidStatus() {
318 return Status::Invalid("Invalid UTF8 sequence in input");
319 }
320
321 // Derived classes should also define this method:
322 // int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
323 // uint8_t* output);
324 };
325
326 template <typename Type, typename StringTransform>
327 struct StringTransformExecBase {
328 using offset_type = typename Type::offset_type;
329 using ArrayType = typename TypeTraits<Type>::ArrayType;
330
331 static Status Execute(KernelContext* ctx, StringTransform* transform,
332 const ExecBatch& batch, Datum* out) {
333 if (batch[0].kind() == Datum::ARRAY) {
334 return ExecArray(ctx, transform, batch[0].array(), out);
335 }
336 DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
337 return ExecScalar(ctx, transform, batch[0].scalar(), out);
338 }
339
340 static Status ExecArray(KernelContext* ctx, StringTransform* transform,
341 const std::shared_ptr<ArrayData>& data, Datum* out) {
342 ArrayType input(data);
343 ArrayData* output = out->mutable_array();
344
345 const int64_t input_ncodeunits = input.total_values_length();
346 const int64_t input_nstrings = input.length();
347
348 const int64_t output_ncodeunits_max =
349 transform->MaxCodeunits(input_nstrings, input_ncodeunits);
350 if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
351 return Status::CapacityError(
352 "Result might not fit in a 32bit utf8 array, convert to large_utf8");
353 }
354
355 ARROW_ASSIGN_OR_RAISE(auto values_buffer, ctx->Allocate(output_ncodeunits_max));
356 output->buffers[2] = values_buffer;
357
358 // String offsets are preallocated
359 offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
360 uint8_t* output_str = output->buffers[2]->mutable_data();
361 offset_type output_ncodeunits = 0;
362
363 output_string_offsets[0] = 0;
364 for (int64_t i = 0; i < input_nstrings; i++) {
365 if (!input.IsNull(i)) {
366 offset_type input_string_ncodeunits;
367 const uint8_t* input_string = input.GetValue(i, &input_string_ncodeunits);
368 auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
369 input_string, input_string_ncodeunits, output_str + output_ncodeunits));
370 if (encoded_nbytes < 0) {
371 return transform->InvalidStatus();
372 }
373 output_ncodeunits += encoded_nbytes;
374 }
375 output_string_offsets[i + 1] = output_ncodeunits;
376 }
377 DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
378
379 // Trim the codepoint buffer, since we allocated too much
380 return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
381 }
382
383 static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
384 const std::shared_ptr<Scalar>& scalar, Datum* out) {
385 const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
386 if (!input.is_valid) {
387 return Status::OK();
388 }
389 auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
390 result->is_valid = true;
391 const int64_t data_nbytes = static_cast<int64_t>(input.value->size());
392
393 const int64_t output_ncodeunits_max = transform->MaxCodeunits(1, data_nbytes);
394 if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
395 return Status::CapacityError(
396 "Result might not fit in a 32bit utf8 array, convert to large_utf8");
397 }
398 ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max));
399 result->value = value_buffer;
400 auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
401 input.value->data(), data_nbytes, value_buffer->mutable_data()));
402 if (encoded_nbytes < 0) {
403 return transform->InvalidStatus();
404 }
405 DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
406 return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
407 }
408 };
409
410 template <typename Type, typename StringTransform>
411 struct StringTransformExec : public StringTransformExecBase<Type, StringTransform> {
412 using StringTransformExecBase<Type, StringTransform>::Execute;
413
414 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
415 StringTransform transform;
416 RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
417 return Execute(ctx, &transform, batch, out);
418 }
419 };
420
421 template <typename Type, typename StringTransform>
422 struct StringTransformExecWithState
423 : public StringTransformExecBase<Type, StringTransform> {
424 using State = typename StringTransform::State;
425 using StringTransformExecBase<Type, StringTransform>::Execute;
426
427 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
428 StringTransform transform(State::Get(ctx));
429 RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
430 return Execute(ctx, &transform, batch, out);
431 }
432 };
433
434 template <typename StringTransform>
435 struct FixedSizeBinaryTransformExecBase {
436 static Status Execute(KernelContext* ctx, StringTransform* transform,
437 const ExecBatch& batch, Datum* out) {
438 if (batch[0].kind() == Datum::ARRAY) {
439 return ExecArray(ctx, transform, batch[0].array(), out);
440 }
441 DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
442 return ExecScalar(ctx, transform, batch[0].scalar(), out);
443 }
444
445 static Status ExecArray(KernelContext* ctx, StringTransform* transform,
446 const std::shared_ptr<ArrayData>& data, Datum* out) {
447 FixedSizeBinaryArray input(data);
448 ArrayData* output = out->mutable_array();
449
450 const int32_t input_width =
451 checked_cast<const FixedSizeBinaryType&>(*data->type).byte_width();
452 const int32_t output_width =
453 checked_cast<const FixedSizeBinaryType&>(*out->type()).byte_width();
454 const int64_t input_nstrings = input.length();
455 ARROW_ASSIGN_OR_RAISE(auto values_buffer,
456 ctx->Allocate(output_width * input_nstrings));
457 uint8_t* output_str = values_buffer->mutable_data();
458
459 for (int64_t i = 0; i < input_nstrings; i++) {
460 if (!input.IsNull(i)) {
461 const uint8_t* input_string = input.GetValue(i);
462 auto encoded_nbytes = static_cast<int32_t>(
463 transform->Transform(input_string, input_width, output_str));
464 if (encoded_nbytes != output_width) {
465 return transform->InvalidStatus();
466 }
467 } else {
468 std::memset(output_str, 0x00, output_width);
469 }
470 output_str += output_width;
471 }
472
473 output->buffers[1] = std::move(values_buffer);
474 return Status::OK();
475 }
476
477 static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
478 const std::shared_ptr<Scalar>& scalar, Datum* out) {
479 const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
480 if (!input.is_valid) {
481 return Status::OK();
482 }
483 const int32_t out_width =
484 checked_cast<const FixedSizeBinaryType&>(*out->type()).byte_width();
485 auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
486
487 const int32_t data_nbytes = static_cast<int32_t>(input.value->size());
488 ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(out_width));
489 auto encoded_nbytes = static_cast<int32_t>(transform->Transform(
490 input.value->data(), data_nbytes, value_buffer->mutable_data()));
491 if (encoded_nbytes != out_width) {
492 return transform->InvalidStatus();
493 }
494
495 result->is_valid = true;
496 result->value = std::move(value_buffer);
497 return Status::OK();
498 }
499 };
500
501 template <typename StringTransform>
502 struct FixedSizeBinaryTransformExecWithState
503 : public FixedSizeBinaryTransformExecBase<StringTransform> {
504 using State = typename StringTransform::State;
505 using FixedSizeBinaryTransformExecBase<StringTransform>::Execute;
506
507 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
508 StringTransform transform(State::Get(ctx));
509 RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
510 return Execute(ctx, &transform, batch, out);
511 }
512
513 static Result<ValueDescr> OutputType(KernelContext* ctx,
514 const std::vector<ValueDescr>& descrs) {
515 DCHECK_EQ(1, descrs.size());
516 const auto& options = State::Get(ctx);
517 const int32_t input_width =
518 checked_cast<const FixedSizeBinaryType&>(*descrs[0].type).byte_width();
519 const int32_t output_width = StringTransform::FixedOutputSize(options, input_width);
520 return ValueDescr(fixed_size_binary(output_width), descrs[0].shape);
521 }
522 };
523
524 #ifdef ARROW_WITH_UTF8PROC
525
526 struct FunctionalCaseMappingTransform : public StringTransformBase {
527 Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
528 EnsureLookupTablesFilled();
529 return Status::OK();
530 }
531
532 int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
533 // Section 5.18 of the Unicode spec claims that the number of codepoints for case
534 // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
535 // However, since we don't support all casings (SpecialCasing.txt) the growth
536 // in bytes is actually only at max 3/2 (as covered by the unittest).
537 // Note that rounding down the 3/2 is ok, since only codepoints encoded by
538 // two code units (even) can grow to 3 code units.
539 return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
540 }
541 };
542
543 template <typename CodepointTransform>
544 struct StringTransformCodepoint : public FunctionalCaseMappingTransform {
545 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
546 uint8_t* output) {
547 uint8_t* output_start = output;
548 if (ARROW_PREDICT_FALSE(
549 !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
550 CodepointTransform::TransformCodepoint))) {
551 return kTransformError;
552 }
553 return output - output_start;
554 }
555 };
556
557 struct UTF8UpperTransform : public FunctionalCaseMappingTransform {
558 static uint32_t TransformCodepoint(uint32_t codepoint) {
559 return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
560 : utf8proc_toupper(codepoint);
561 }
562 };
563
564 template <typename Type>
565 using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
566
567 struct UTF8LowerTransform : public FunctionalCaseMappingTransform {
568 static uint32_t TransformCodepoint(uint32_t codepoint) {
569 return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
570 : utf8proc_tolower(codepoint);
571 }
572 };
573
574 template <typename Type>
575 using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
576
577 struct UTF8SwapCaseTransform : public FunctionalCaseMappingTransform {
578 static uint32_t TransformCodepoint(uint32_t codepoint) {
579 if (codepoint <= kMaxCodepointLookup) {
580 return lut_swapcase_codepoint[codepoint];
581 } else {
582 if (IsLowerCaseCharacterUnicode(codepoint)) {
583 return utf8proc_toupper(codepoint);
584 } else if (IsUpperCaseCharacterUnicode(codepoint)) {
585 return utf8proc_tolower(codepoint);
586 }
587 }
588
589 return codepoint;
590 }
591 };
592
593 template <typename Type>
594 using UTF8SwapCase =
595 StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;
596
597 struct Utf8CapitalizeTransform : public FunctionalCaseMappingTransform {
598 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
599 uint8_t* output) {
600 uint8_t* output_start = output;
601 const uint8_t* end = input + input_string_ncodeunits;
602 const uint8_t* next = input;
603 if (input_string_ncodeunits > 0) {
604 if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) {
605 return kTransformError;
606 }
607 if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
608 input, next, &output, UTF8UpperTransform::TransformCodepoint))) {
609 return kTransformError;
610 }
611 if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
612 next, end, &output, UTF8LowerTransform::TransformCodepoint))) {
613 return kTransformError;
614 }
615 }
616 return output - output_start;
617 }
618 };
619
620 template <typename Type>
621 using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
622
623 struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
624 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
625 uint8_t* output) {
626 uint8_t* output_start = output;
627 const uint8_t* end = input + input_string_ncodeunits;
628 const uint8_t* next = input;
629 bool is_next_upper = true;
630 while ((input = next) < end) {
631 uint32_t codepoint;
632 if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) {
633 return kTransformError;
634 }
635 if (IsCasedCharacterUnicode(codepoint)) {
636 // Lower/uppercase current codepoint and
637 // prepare to lowercase next consecutive cased codepoints
638 output = is_next_upper
639 ? util::UTF8Encode(output,
640 UTF8UpperTransform::TransformCodepoint(codepoint))
641 : util::UTF8Encode(
642 output, UTF8LowerTransform::TransformCodepoint(codepoint));
643 is_next_upper = false;
644 } else {
645 // Copy current uncased codepoint and
646 // prepare to uppercase next cased codepoint
647 std::memcpy(output, input, next - input);
648 output += next - input;
649 is_next_upper = true;
650 }
651 }
652 return output - output_start;
653 }
654 };
655
656 template <typename Type>
657 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
658
659 #endif // ARROW_WITH_UTF8PROC
660
661 struct AsciiReverseTransform : public StringTransformBase {
662 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
663 uint8_t* output) {
664 uint8_t utf8_char_found = 0;
665 for (int64_t i = 0; i < input_string_ncodeunits; i++) {
666 // if a utf8 char is found, report to utf8_char_found
667 utf8_char_found |= input[i] & 0x80;
668 output[input_string_ncodeunits - i - 1] = input[i];
669 }
670 return utf8_char_found ? kTransformError : input_string_ncodeunits;
671 }
672
673 Status InvalidStatus() override {
674 return Status::Invalid("Non-ASCII sequence in input");
675 }
676 };
677
678 template <typename Type>
679 using AsciiReverse = StringTransformExec<Type, AsciiReverseTransform>;
680
681 struct Utf8ReverseTransform : public StringTransformBase {
682 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
683 uint8_t* output) {
684 int64_t i = 0;
685 while (i < input_string_ncodeunits) {
686 int64_t char_end = std::min(i + util::ValidUtf8CodepointByteSize(input + i),
687 input_string_ncodeunits);
688 std::copy(input + i, input + char_end, output + input_string_ncodeunits - char_end);
689 i = char_end;
690 }
691 return input_string_ncodeunits;
692 }
693 };
694
695 template <typename Type>
696 using Utf8Reverse = StringTransformExec<Type, Utf8ReverseTransform>;
697
698 using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
699
700 // Transform a buffer of offsets to one which begins with 0 and has same
701 // value lengths.
702 template <typename T>
703 Status GetShiftedOffsets(KernelContext* ctx, const Buffer& input_buffer, int64_t offset,
704 int64_t length, std::shared_ptr<Buffer>* out) {
705 ARROW_ASSIGN_OR_RAISE(*out, ctx->Allocate((length + 1) * sizeof(T)));
706 const T* input_offsets = reinterpret_cast<const T*>(input_buffer.data()) + offset;
707 T* out_offsets = reinterpret_cast<T*>((*out)->mutable_data());
708 T first_offset = *input_offsets;
709 for (int64_t i = 0; i < length; ++i) {
710 *out_offsets++ = input_offsets[i] - first_offset;
711 }
712 *out_offsets = input_offsets[length] - first_offset;
713 return Status::OK();
714 }
715
716 // Apply `transform` to input character data- this function cannot change the
717 // length
718 template <typename Type>
719 Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
720 TransformFunc transform, Datum* out) {
721 using ArrayType = typename TypeTraits<Type>::ArrayType;
722 using offset_type = typename Type::offset_type;
723
724 if (batch[0].kind() == Datum::ARRAY) {
725 const ArrayData& input = *batch[0].array();
726 ArrayType input_boxed(batch[0].array());
727
728 ArrayData* out_arr = out->mutable_array();
729
730 if (input.offset == 0) {
731 // We can reuse offsets from input
732 out_arr->buffers[1] = input.buffers[1];
733 } else {
734 DCHECK(input.buffers[1]);
735 // We must allocate new space for the offsets and shift the existing offsets
736 RETURN_NOT_OK(GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
737 input.length, &out_arr->buffers[1]));
738 }
739
740 // Allocate space for output data
741 int64_t data_nbytes = input_boxed.total_values_length();
742 RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
743 if (input.length > 0) {
744 transform(input.buffers[2]->data() + input_boxed.value_offset(0), data_nbytes,
745 out_arr->buffers[2]->mutable_data());
746 }
747 } else {
748 const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
749 auto result = checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
750 if (input.is_valid) {
751 result->is_valid = true;
752 int64_t data_nbytes = input.value->size();
753 RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&result->value));
754 transform(input.value->data(), data_nbytes, result->value->mutable_data());
755 }
756 out->value = result;
757 }
758
759 return Status::OK();
760 }
761
762 void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
763 std::transform(input, input + length, output, ascii_toupper);
764 }
765
766 template <typename Type>
767 struct AsciiUpper {
768 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
769 return StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
770 }
771 };
772
773 void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) {
774 std::transform(input, input + length, output, ascii_tolower);
775 }
776
777 template <typename Type>
778 struct AsciiLower {
779 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
780 return StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
781 }
782 };
783
784 void TransformAsciiSwapCase(const uint8_t* input, int64_t length, uint8_t* output) {
785 std::transform(input, input + length, output, ascii_swapcase);
786 }
787
788 template <typename Type>
789 struct AsciiSwapCase {
790 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
791 return StringDataTransform<Type>(ctx, batch, TransformAsciiSwapCase, out);
792 }
793 };
794
795 struct AsciiCapitalizeTransform : public StringTransformBase {
796 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
797 uint8_t* output) {
798 if (input_string_ncodeunits > 0) {
799 *output++ = ascii_toupper(*input++);
800 TransformAsciiLower(input, input_string_ncodeunits - 1, output);
801 }
802 return input_string_ncodeunits;
803 }
804 };
805
806 template <typename Type>
807 using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
808
809 struct AsciiTitleTransform : public StringTransformBase {
810 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
811 uint8_t* output) {
812 const uint8_t* end = input + input_string_ncodeunits;
813 const uint8_t* next = input;
814 bool is_next_upper = true;
815 while ((input = next++) < end) {
816 if (IsCasedCharacterAscii(*input)) {
817 // Lower/uppercase current character and
818 // prepare to lowercase next consecutive cased characters
819 *output++ = is_next_upper ? ascii_toupper(*input) : ascii_tolower(*input);
820 is_next_upper = false;
821 } else {
822 // Copy current uncased character and
823 // prepare to uppercase next cased character
824 *output++ = *input;
825 is_next_upper = true;
826 }
827 }
828 return input_string_ncodeunits;
829 }
830 };
831
832 template <typename Type>
833 using AsciiTitle = StringTransformExec<Type, AsciiTitleTransform>;
834
835 // ----------------------------------------------------------------------
836 // exact pattern detection
837
838 using StrToBoolTransformFunc =
839 std::function<void(const void*, const uint8_t*, int64_t, int64_t, uint8_t*)>;
840
841 // Apply `transform` to input character data- this function cannot change the
842 // length
843 template <typename Type>
844 void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
845 StrToBoolTransformFunc transform, Datum* out) {
846 using offset_type = typename Type::offset_type;
847
848 if (batch[0].kind() == Datum::ARRAY) {
849 const ArrayData& input = *batch[0].array();
850 ArrayData* out_arr = out->mutable_array();
851 if (input.length > 0) {
852 transform(
853 reinterpret_cast<const offset_type*>(input.buffers[1]->data()) + input.offset,
854 input.buffers[2]->data(), input.length, out_arr->offset,
855 out_arr->buffers[1]->mutable_data());
856 }
857 } else {
858 const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
859 if (input.is_valid) {
860 uint8_t result_value = 0;
861 std::array<offset_type, 2> offsets{0,
862 static_cast<offset_type>(input.value->size())};
863 transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
864 &result_value);
865 out->value = std::make_shared<BooleanScalar>(result_value > 0);
866 }
867 }
868 }
869
870 using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
871
872 // This is an implementation of the Knuth-Morris-Pratt algorithm
873 struct PlainSubstringMatcher {
874 const MatchSubstringOptions& options_;
875 std::vector<int64_t> prefix_table;
876
877 static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
878 const MatchSubstringOptions& options) {
879 // Should be handled by partial template specialization below
880 DCHECK(!options.ignore_case);
881 return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
882 }
883
884 explicit PlainSubstringMatcher(const MatchSubstringOptions& options)
885 : options_(options) {
886 // Phase 1: Build the prefix table
887 const auto pattern_length = options_.pattern.size();
888 prefix_table.resize(pattern_length + 1, /*value=*/0);
889 int64_t prefix_length = -1;
890 prefix_table[0] = -1;
891 for (size_t pos = 0; pos < pattern_length; ++pos) {
892 // The prefix cannot be expanded, reset.
893 while (prefix_length >= 0 &&
894 options_.pattern[pos] != options_.pattern[prefix_length]) {
895 prefix_length = prefix_table[prefix_length];
896 }
897 prefix_length++;
898 prefix_table[pos + 1] = prefix_length;
899 }
900 }
901
902 int64_t Find(util::string_view current) const {
903 // Phase 2: Find the prefix in the data
904 const auto pattern_length = options_.pattern.size();
905 int64_t pattern_pos = 0;
906 int64_t pos = 0;
907 if (pattern_length == 0) return 0;
908 for (const auto c : current) {
909 while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
910 pattern_pos = prefix_table[pattern_pos];
911 }
912 pattern_pos++;
913 if (static_cast<size_t>(pattern_pos) == pattern_length) {
914 return pos + 1 - pattern_length;
915 }
916 pos++;
917 }
918 return -1;
919 }
920
921 bool Match(util::string_view current) const { return Find(current) >= 0; }
922 };
923
924 struct PlainStartsWithMatcher {
925 const MatchSubstringOptions& options_;
926
927 explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
928 : options_(options) {}
929
930 static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
931 const MatchSubstringOptions& options) {
932 // Should be handled by partial template specialization below
933 DCHECK(!options.ignore_case);
934 return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
935 }
936
937 bool Match(util::string_view current) const {
938 // string_view::starts_with is C++20
939 return current.substr(0, options_.pattern.size()) == options_.pattern;
940 }
941 };
942
943 struct PlainEndsWithMatcher {
944 const MatchSubstringOptions& options_;
945
946 explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
947 : options_(options) {}
948
949 static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
950 const MatchSubstringOptions& options) {
951 // Should be handled by partial template specialization below
952 DCHECK(!options.ignore_case);
953 return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
954 }
955
956 bool Match(util::string_view current) const {
957 // string_view::ends_with is C++20
958 return current.size() >= options_.pattern.size() &&
959 current.substr(current.size() - options_.pattern.size(),
960 options_.pattern.size()) == options_.pattern;
961 }
962 };
963
964 #ifdef ARROW_WITH_RE2
965 struct RegexSubstringMatcher {
966 const MatchSubstringOptions& options_;
967 const RE2 regex_match_;
968
969 static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
970 const MatchSubstringOptions& options, bool literal = false) {
971 auto matcher =
972 ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
973 RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
974 return std::move(matcher);
975 }
976
977 explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
978 bool literal = false)
979 : options_(options),
980 regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
981
982 bool Match(util::string_view current) const {
983 auto piece = re2::StringPiece(current.data(), current.length());
984 return re2::RE2::PartialMatch(piece, regex_match_);
985 }
986
987 static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
988 bool literal) {
989 RE2::RE2::Options re2_options(RE2::Quiet);
990 re2_options.set_case_sensitive(!options.ignore_case);
991 re2_options.set_literal(literal);
992 return re2_options;
993 }
994 };
995 #endif
996
997 template <typename Type, typename Matcher>
998 struct MatchSubstringImpl {
999 using offset_type = typename Type::offset_type;
1000
1001 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
1002 const Matcher* matcher) {
1003 StringBoolTransform<Type>(
1004 ctx, batch,
1005 [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
1006 int64_t output_offset, uint8_t* output) {
1007 const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
1008 FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
1009 for (int64_t i = 0; i < length; ++i) {
1010 const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
1011 int64_t current_length = offsets[i + 1] - offsets[i];
1012 if (matcher->Match(util::string_view(current_data, current_length))) {
1013 bitmap_writer.Set();
1014 }
1015 bitmap_writer.Next();
1016 }
1017 bitmap_writer.Finish();
1018 },
1019 out);
1020 return Status::OK();
1021 }
1022 };
1023
1024 template <typename Type, typename Matcher>
1025 struct MatchSubstring {
1026 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1027 // TODO Cache matcher across invocations (for regex compilation)
1028 ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
1029 return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
1030 }
1031 };
1032
1033 template <typename Type>
1034 struct MatchSubstring<Type, PlainSubstringMatcher> {
1035 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1036 auto options = MatchSubstringState::Get(ctx);
1037 if (options.ignore_case) {
1038 #ifdef ARROW_WITH_RE2
1039 ARROW_ASSIGN_OR_RAISE(auto matcher,
1040 RegexSubstringMatcher::Make(options, /*literal=*/true));
1041 return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
1042 matcher.get());
1043 #else
1044 return Status::NotImplemented("ignore_case requires RE2");
1045 #endif
1046 }
1047 ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
1048 return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
1049 matcher.get());
1050 }
1051 };
1052
1053 template <typename Type>
1054 struct MatchSubstring<Type, PlainStartsWithMatcher> {
1055 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1056 auto options = MatchSubstringState::Get(ctx);
1057 if (options.ignore_case) {
1058 #ifdef ARROW_WITH_RE2
1059 MatchSubstringOptions converted_options = options;
1060 converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
1061 ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
1062 return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
1063 matcher.get());
1064 #else
1065 return Status::NotImplemented("ignore_case requires RE2");
1066 #endif
1067 }
1068 ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
1069 return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
1070 matcher.get());
1071 }
1072 };
1073
1074 template <typename Type>
1075 struct MatchSubstring<Type, PlainEndsWithMatcher> {
1076 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1077 auto options = MatchSubstringState::Get(ctx);
1078 if (options.ignore_case) {
1079 #ifdef ARROW_WITH_RE2
1080 MatchSubstringOptions converted_options = options;
1081 converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
1082 ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
1083 return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
1084 matcher.get());
1085 #else
1086 return Status::NotImplemented("ignore_case requires RE2");
1087 #endif
1088 }
1089 ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
1090 return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
1091 matcher.get());
1092 }
1093 };
1094
1095 const FunctionDoc match_substring_doc(
1096 "Match strings against literal pattern",
1097 ("For each string in `strings`, emit true iff it contains a given pattern.\n"
1098 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1099 "If ignore_case is set, only simple case folding is performed."),
1100 {"strings"}, "MatchSubstringOptions");
1101
1102 const FunctionDoc starts_with_doc(
1103 "Check if strings start with a literal pattern",
1104 ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
1105 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1106 "If ignore_case is set, only simple case folding is performed."),
1107 {"strings"}, "MatchSubstringOptions");
1108
1109 const FunctionDoc ends_with_doc(
1110 "Check if strings end with a literal pattern",
1111 ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
1112 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1113 "If ignore_case is set, only simple case folding is performed."),
1114 {"strings"}, "MatchSubstringOptions");
1115
1116 #ifdef ARROW_WITH_RE2
1117 const FunctionDoc match_substring_regex_doc(
1118 "Match strings against regex pattern",
1119 ("For each string in `strings`, emit true iff it matches a given pattern at any "
1120 "position.\n"
1121 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1122 "If ignore_case is set, only simple case folding is performed."),
1123 {"strings"}, "MatchSubstringOptions");
1124
1125 // SQL LIKE match
1126
1127 /// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
1128 std::string MakeLikeRegex(const MatchSubstringOptions& options) {
1129 // Allow . to match \n
1130 std::string like_pattern = "(?s:^";
1131 like_pattern.reserve(options.pattern.size() + 7);
1132 bool escaped = false;
1133 for (const char c : options.pattern) {
1134 if (!escaped && c == '%') {
1135 like_pattern.append(".*");
1136 } else if (!escaped && c == '_') {
1137 like_pattern.append(".");
1138 } else if (!escaped && c == '\\') {
1139 escaped = true;
1140 } else {
1141 switch (c) {
1142 case '.':
1143 case '?':
1144 case '+':
1145 case '*':
1146 case '^':
1147 case '$':
1148 case '\\':
1149 case '[':
1150 case '{':
1151 case '(':
1152 case ')':
1153 case '|': {
1154 like_pattern.push_back('\\');
1155 like_pattern.push_back(c);
1156 escaped = false;
1157 break;
1158 }
1159 default: {
1160 like_pattern.push_back(c);
1161 escaped = false;
1162 break;
1163 }
1164 }
1165 }
1166 }
1167 like_pattern.append("$)");
1168 return like_pattern;
1169 }
1170
1171 // Evaluate a SQL-like LIKE pattern by translating it to a regexp or
1172 // substring search as appropriate. See what Apache Impala does:
1173 // https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
1174 template <typename StringType>
1175 struct MatchLike {
1176 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1177 // NOTE: avoid making those constants global to avoid compiling regexes at startup
1178 // A LIKE pattern matching this regex can be translated into a substring search.
1179 static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
1180 // A LIKE pattern matching this regex can be translated into a prefix search.
1181 static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
1182 // A LIKE pattern matching this regex can be translated into a suffix search.
1183 static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
1184
1185 auto original_options = MatchSubstringState::Get(ctx);
1186 auto original_state = ctx->state();
1187
1188 Status status;
1189 std::string pattern;
1190 if (!original_options.ignore_case &&
1191 re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
1192 &pattern)) {
1193 MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
1194 MatchSubstringState converted_state(converted_options);
1195 ctx->SetState(&converted_state);
1196 status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
1197 } else if (!original_options.ignore_case &&
1198 re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
1199 &pattern)) {
1200 MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
1201 MatchSubstringState converted_state(converted_options);
1202 ctx->SetState(&converted_state);
1203 status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
1204 } else if (!original_options.ignore_case &&
1205 re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
1206 &pattern)) {
1207 MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
1208 MatchSubstringState converted_state(converted_options);
1209 ctx->SetState(&converted_state);
1210 status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
1211 } else {
1212 MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
1213 original_options.ignore_case};
1214 MatchSubstringState converted_state(converted_options);
1215 ctx->SetState(&converted_state);
1216 status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
1217 }
1218 ctx->SetState(original_state);
1219 return status;
1220 }
1221 };
1222
1223 const FunctionDoc match_like_doc(
1224 "Match strings against SQL-style LIKE pattern",
1225 ("For each string in `strings`, emit true iff it fully matches a given pattern "
1226 "at any position. That is, '%' will match any number of characters, '_' will "
1227 "match exactly one character, and any other character matches itself. To "
1228 "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
1229 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1230 {"strings"}, "MatchSubstringOptions");
1231
1232 #endif
1233
1234 void AddMatchSubstring(FunctionRegistry* registry) {
1235 {
1236 auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
1237 &match_substring_doc);
1238 auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
1239 auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
1240 DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1241 DCHECK_OK(
1242 func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1243 DCHECK_OK(registry->AddFunction(std::move(func)));
1244 }
1245 {
1246 auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
1247 &match_substring_doc);
1248 auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
1249 auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
1250 DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1251 DCHECK_OK(
1252 func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1253 DCHECK_OK(registry->AddFunction(std::move(func)));
1254 }
1255 {
1256 auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
1257 &match_substring_doc);
1258 auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
1259 auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
1260 DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1261 DCHECK_OK(
1262 func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1263 DCHECK_OK(registry->AddFunction(std::move(func)));
1264 }
1265 #ifdef ARROW_WITH_RE2
1266 {
1267 auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
1268 &match_substring_regex_doc);
1269 auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
1270 auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
1271 DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1272 DCHECK_OK(
1273 func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1274 DCHECK_OK(registry->AddFunction(std::move(func)));
1275 }
1276 {
1277 auto func =
1278 std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
1279 auto exec_32 = MatchLike<StringType>::Exec;
1280 auto exec_64 = MatchLike<LargeStringType>::Exec;
1281 DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1282 DCHECK_OK(
1283 func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1284 DCHECK_OK(registry->AddFunction(std::move(func)));
1285 }
1286 #endif
1287 }
1288
1289 // Substring find - lfind/index/etc.
1290
1291 struct FindSubstring {
1292 const PlainSubstringMatcher matcher_;
1293
1294 explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
1295
1296 template <typename OutValue, typename... Ignored>
1297 OutValue Call(KernelContext*, util::string_view val, Status*) const {
1298 return static_cast<OutValue>(matcher_.Find(val));
1299 }
1300 };
1301
1302 #ifdef ARROW_WITH_RE2
1303 struct FindSubstringRegex {
1304 std::unique_ptr<RE2> regex_match_;
1305
1306 explicit FindSubstringRegex(const MatchSubstringOptions& options,
1307 bool literal = false) {
1308 std::string regex = "(";
1309 regex.reserve(options.pattern.length() + 2);
1310 regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
1311 regex += ")";
1312 regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
1313 options, /*literal=*/false)));
1314 }
1315
1316 template <typename OutValue, typename... Ignored>
1317 OutValue Call(KernelContext*, util::string_view val, Status*) const {
1318 re2::StringPiece piece(val.data(), val.length());
1319 re2::StringPiece match;
1320 if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
1321 return static_cast<OutValue>(match.data() - piece.data());
1322 }
1323 return -1;
1324 }
1325 };
1326 #endif
1327
1328 template <typename InputType>
1329 struct FindSubstringExec {
1330 using OffsetType = typename TypeTraits<InputType>::OffsetType;
1331 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1332 const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1333 if (options.ignore_case) {
1334 #ifdef ARROW_WITH_RE2
1335 applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
1336 kernel{FindSubstringRegex(options, /*literal=*/true)};
1337 return kernel.Exec(ctx, batch, out);
1338 #endif
1339 return Status::NotImplemented("ignore_case requires RE2");
1340 }
1341 applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
1342 FindSubstring(PlainSubstringMatcher(options))};
1343 return kernel.Exec(ctx, batch, out);
1344 }
1345 };
1346
1347 const FunctionDoc find_substring_doc(
1348 "Find first occurrence of substring",
1349 ("For each string in `strings`, emit the index of the first occurrence of the given "
1350 "pattern, or -1 if not found.\n"
1351 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1352 {"strings"}, "MatchSubstringOptions");
1353
1354 #ifdef ARROW_WITH_RE2
1355 template <typename InputType>
1356 struct FindSubstringRegexExec {
1357 using OffsetType = typename TypeTraits<InputType>::OffsetType;
1358 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1359 const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1360 applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
1361 kernel{FindSubstringRegex(options, /*literal=*/false)};
1362 return kernel.Exec(ctx, batch, out);
1363 }
1364 };
1365
1366 const FunctionDoc find_substring_regex_doc(
1367 "Find location of first match of regex pattern",
1368 ("For each string in `strings`, emit the index of the first match of the given "
1369 "pattern, or -1 if not found.\n"
1370 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1371 {"strings"}, "MatchSubstringOptions");
1372 #endif
1373
1374 void AddFindSubstring(FunctionRegistry* registry) {
1375 {
1376 auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
1377 &find_substring_doc);
1378 for (const auto& ty : BaseBinaryTypes()) {
1379 auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1380 DCHECK_OK(func->AddKernel({ty}, offset_type,
1381 GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
1382 MatchSubstringState::Init));
1383 }
1384 DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1385 FindSubstringExec<FixedSizeBinaryType>::Exec,
1386 MatchSubstringState::Init));
1387 DCHECK_OK(registry->AddFunction(std::move(func)));
1388 }
1389 #ifdef ARROW_WITH_RE2
1390 {
1391 auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
1392 &find_substring_regex_doc);
1393 for (const auto& ty : BaseBinaryTypes()) {
1394 auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1395 DCHECK_OK(
1396 func->AddKernel({ty}, offset_type,
1397 GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
1398 MatchSubstringState::Init));
1399 }
1400 DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1401 FindSubstringRegexExec<FixedSizeBinaryType>::Exec,
1402 MatchSubstringState::Init));
1403 DCHECK_OK(registry->AddFunction(std::move(func)));
1404 }
1405 #endif
1406 }
1407
1408 // Substring count
1409
1410 struct CountSubstring {
1411 const PlainSubstringMatcher matcher_;
1412
1413 explicit CountSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
1414
1415 template <typename OutValue, typename... Ignored>
1416 OutValue Call(KernelContext*, util::string_view val, Status*) const {
1417 OutValue count = 0;
1418 uint64_t start = 0;
1419 const auto pattern_size = std::max<uint64_t>(1, matcher_.options_.pattern.size());
1420 while (start <= val.size()) {
1421 const int64_t index = matcher_.Find(val.substr(start));
1422 if (index >= 0) {
1423 count++;
1424 start += index + pattern_size;
1425 } else {
1426 break;
1427 }
1428 }
1429 return count;
1430 }
1431 };
1432
1433 #ifdef ARROW_WITH_RE2
1434 struct CountSubstringRegex {
1435 std::unique_ptr<RE2> regex_match_;
1436
1437 explicit CountSubstringRegex(const MatchSubstringOptions& options, bool literal = false)
1438 : regex_match_(new RE2(options.pattern,
1439 RegexSubstringMatcher::MakeRE2Options(options, literal))) {}
1440
1441 static Result<CountSubstringRegex> Make(const MatchSubstringOptions& options,
1442 bool literal = false) {
1443 CountSubstringRegex counter(options, literal);
1444 RETURN_NOT_OK(RegexStatus(*counter.regex_match_));
1445 return std::move(counter);
1446 }
1447
1448 template <typename OutValue, typename... Ignored>
1449 OutValue Call(KernelContext*, util::string_view val, Status*) const {
1450 OutValue count = 0;
1451 re2::StringPiece input(val.data(), val.size());
1452 auto last_size = input.size();
1453 while (re2::RE2::FindAndConsume(&input, *regex_match_)) {
1454 count++;
1455 if (last_size == input.size()) {
1456 // 0-length match
1457 if (input.size() > 0) {
1458 input.remove_prefix(1);
1459 } else {
1460 break;
1461 }
1462 }
1463 last_size = input.size();
1464 }
1465 return count;
1466 }
1467 };
1468
1469 template <typename InputType>
1470 struct CountSubstringRegexExec {
1471 using OffsetType = typename TypeTraits<InputType>::OffsetType;
1472 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1473 const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1474 ARROW_ASSIGN_OR_RAISE(auto counter, CountSubstringRegex::Make(options));
1475 applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
1476 kernel{std::move(counter)};
1477 return kernel.Exec(ctx, batch, out);
1478 }
1479 };
1480 #endif
1481
1482 template <typename InputType>
1483 struct CountSubstringExec {
1484 using OffsetType = typename TypeTraits<InputType>::OffsetType;
1485 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1486 const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1487 if (options.ignore_case) {
1488 #ifdef ARROW_WITH_RE2
1489 ARROW_ASSIGN_OR_RAISE(auto counter,
1490 CountSubstringRegex::Make(options, /*literal=*/true));
1491 applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
1492 kernel{std::move(counter)};
1493 return kernel.Exec(ctx, batch, out);
1494 #else
1495 return Status::NotImplemented("ignore_case requires RE2");
1496 #endif
1497 }
1498 applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstring> kernel{
1499 CountSubstring(PlainSubstringMatcher(options))};
1500 return kernel.Exec(ctx, batch, out);
1501 }
1502 };
1503
1504 const FunctionDoc count_substring_doc(
1505 "Count occurrences of substring",
1506 ("For each string in `strings`, emit the number of occurrences of the given "
1507 "pattern.\n"
1508 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1509 {"strings"}, "MatchSubstringOptions");
1510
1511 #ifdef ARROW_WITH_RE2
1512 const FunctionDoc count_substring_regex_doc(
1513 "Count occurrences of substring",
1514 ("For each string in `strings`, emit the number of occurrences of the given "
1515 "regex pattern.\n"
1516 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1517 {"strings"}, "MatchSubstringOptions");
1518 #endif
1519
1520 void AddCountSubstring(FunctionRegistry* registry) {
1521 {
1522 auto func = std::make_shared<ScalarFunction>("count_substring", Arity::Unary(),
1523 &count_substring_doc);
1524 for (const auto& ty : BaseBinaryTypes()) {
1525 auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1526 DCHECK_OK(func->AddKernel({ty}, offset_type,
1527 GenerateTypeAgnosticVarBinaryBase<CountSubstringExec>(ty),
1528 MatchSubstringState::Init));
1529 }
1530 DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1531 CountSubstringExec<FixedSizeBinaryType>::Exec,
1532 MatchSubstringState::Init));
1533 DCHECK_OK(registry->AddFunction(std::move(func)));
1534 }
1535 #ifdef ARROW_WITH_RE2
1536 {
1537 auto func = std::make_shared<ScalarFunction>("count_substring_regex", Arity::Unary(),
1538 &count_substring_regex_doc);
1539 for (const auto& ty : BaseBinaryTypes()) {
1540 auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1541 DCHECK_OK(
1542 func->AddKernel({ty}, offset_type,
1543 GenerateTypeAgnosticVarBinaryBase<CountSubstringRegexExec>(ty),
1544 MatchSubstringState::Init));
1545 }
1546 DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1547 CountSubstringRegexExec<FixedSizeBinaryType>::Exec,
1548 MatchSubstringState::Init));
1549 DCHECK_OK(registry->AddFunction(std::move(func)));
1550 }
1551 #endif
1552 }
1553
1554 // Slicing
1555
1556 struct SliceTransformBase : public StringTransformBase {
1557 using State = OptionsWrapper<SliceOptions>;
1558
1559 const SliceOptions* options;
1560
1561 Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
1562 options = &State::Get(ctx);
1563 if (options->step == 0) {
1564 return Status::Invalid("Slice step cannot be zero");
1565 }
1566 return Status::OK();
1567 }
1568 };
1569
1570 struct SliceCodeunitsTransform : SliceTransformBase {
1571 int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
1572 const SliceOptions& opt = *this->options;
1573 if ((opt.start >= 0) != (opt.stop >= 0)) {
1574 // If start and stop don't have the same sign, we can't guess an upper bound
1575 // on the resulting slice lengths, so return a worst case estimate.
1576 return input_ncodeunits;
1577 }
1578 int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
1579 // The maximum UTF8 byte size of a codepoint is 4
1580 return std::min(input_ncodeunits,
1581 4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
1582 }
1583
1584 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
1585 uint8_t* output) {
1586 if (options->step >= 1) {
1587 return SliceForward(input, input_string_ncodeunits, output);
1588 }
1589 return SliceBackward(input, input_string_ncodeunits, output);
1590 }
1591
1592 #define RETURN_IF_UTF8_ERROR(expr) \
1593 do { \
1594 if (ARROW_PREDICT_FALSE(!expr)) { \
1595 return kTransformError; \
1596 } \
1597 } while (0)
1598
1599 int64_t SliceForward(const uint8_t* input, int64_t input_string_ncodeunits,
1600 uint8_t* output) {
1601 // Slice in forward order (step > 0)
1602 const SliceOptions& opt = *this->options;
1603 const uint8_t* begin = input;
1604 const uint8_t* end = input + input_string_ncodeunits;
1605 const uint8_t* begin_sliced = begin;
1606 const uint8_t* end_sliced = end;
1607
1608 // First, compute begin_sliced and end_sliced
1609 if (opt.start >= 0) {
1610 // start counting from the left
1611 RETURN_IF_UTF8_ERROR(
1612 arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start));
1613 if (opt.stop > opt.start) {
1614 // continue counting from begin_sliced
1615 const int64_t length = opt.stop - opt.start;
1616 RETURN_IF_UTF8_ERROR(
1617 arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length));
1618 } else if (opt.stop < 0) {
1619 // or from the end (but we will never need to < begin_sliced)
1620 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1621 begin_sliced, end, &end_sliced, -opt.stop));
1622 } else {
1623 // zero length slice
1624 return 0;
1625 }
1626 } else {
1627 // start counting from the right
1628 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1629 begin, end, &begin_sliced, -opt.start));
1630 if (opt.stop > 0) {
1631 // continue counting from the left, we cannot start from begin_sliced because we
1632 // don't know how many codepoints are between begin and begin_sliced
1633 RETURN_IF_UTF8_ERROR(
1634 arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop));
1635 // and therefore we also needs this
1636 if (end_sliced <= begin_sliced) {
1637 // zero length slice
1638 return 0;
1639 }
1640 } else if ((opt.stop < 0) && (opt.stop > opt.start)) {
1641 // stop is negative, but larger than start, so we count again from the right
1642 // in some cases we can optimize this, depending on the shortest path (from end
1643 // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
1644 // for instance when start=-100, when the string length is only 10.
1645 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1646 begin_sliced, end, &end_sliced, -opt.stop));
1647 } else {
1648 // zero length slice
1649 return 0;
1650 }
1651 }
1652
1653 // Second, copy computed slice to output
1654 DCHECK(begin_sliced <= end_sliced);
1655 if (opt.step == 1) {
1656 // fast case, where we simply can finish with a memcpy
1657 std::copy(begin_sliced, end_sliced, output);
1658 return end_sliced - begin_sliced;
1659 }
1660 uint8_t* dest = output;
1661 const uint8_t* i = begin_sliced;
1662
1663 while (i < end_sliced) {
1664 uint32_t codepoint = 0;
1665 // write a single codepoint
1666 RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
1667 dest = arrow::util::UTF8Encode(dest, codepoint);
1668 // and skip the remainder
1669 int64_t skips = opt.step - 1;
1670 while ((skips--) && (i < end_sliced)) {
1671 RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
1672 }
1673 }
1674 return dest - output;
1675 }
1676
1677 int64_t SliceBackward(const uint8_t* input, int64_t input_string_ncodeunits,
1678 uint8_t* output) {
1679 // Slice in reverse order (step < 0)
1680 const SliceOptions& opt = *this->options;
1681 const uint8_t* begin = input;
1682 const uint8_t* end = input + input_string_ncodeunits;
1683 const uint8_t* begin_sliced = begin;
1684 const uint8_t* end_sliced = end;
1685
1686 // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
1687 // reverse iterators.
1688 if (opt.start >= 0) {
1689 // +1 because begin_sliced acts as as the end of a reverse iterator
1690 RETURN_IF_UTF8_ERROR(
1691 arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start + 1));
1692 } else {
1693 // -1 because start=-1 means the last codeunit, which is 0 advances
1694 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1695 begin, end, &begin_sliced, -opt.start - 1));
1696 }
1697 // make it point at the last codeunit of the previous codeunit
1698 begin_sliced--;
1699
1700 // similar to opt.start
1701 if (opt.stop >= 0) {
1702 RETURN_IF_UTF8_ERROR(
1703 arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1));
1704 } else {
1705 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1706 begin, end, &end_sliced, -opt.stop - 1));
1707 }
1708 end_sliced--;
1709
1710 // Copy computed slice to output
1711 uint8_t* dest = output;
1712 const uint8_t* i = begin_sliced;
1713 while (i > end_sliced) {
1714 uint32_t codepoint = 0;
1715 // write a single codepoint
1716 RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
1717 dest = arrow::util::UTF8Encode(dest, codepoint);
1718 // and skip the remainder
1719 int64_t skips = -opt.step - 1;
1720 while ((skips--) && (i > end_sliced)) {
1721 RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
1722 }
1723 }
1724 return dest - output;
1725 }
1726
1727 #undef RETURN_IF_UTF8_ERROR
1728 };
1729
1730 template <typename Type>
1731 using SliceCodeunits = StringTransformExec<Type, SliceCodeunitsTransform>;
1732
1733 const FunctionDoc utf8_slice_codeunits_doc(
1734 "Slice string ",
1735 ("For each string in `strings`, slice into a substring defined by\n"
1736 "`start`, `stop`, `step`) as given by `SliceOptions` where `start` is inclusive\n"
1737 "and `stop` is exclusive and are measured in codeunits. If step is negative, the\n"
1738 "string will be advanced in reversed order. A `step` of zero is considered an\n"
1739 "error.\n"
1740 "Null inputs emit null."),
1741 {"strings"}, "SliceOptions");
1742
1743 void AddSlice(FunctionRegistry* registry) {
1744 auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
1745 &utf8_slice_codeunits_doc);
1746 using t32 = SliceCodeunits<StringType>;
1747 using t64 = SliceCodeunits<LargeStringType>;
1748 DCHECK_OK(
1749 func->AddKernel({utf8()}, utf8(), t32::Exec, SliceCodeunitsTransform::State::Init));
1750 DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
1751 SliceCodeunitsTransform::State::Init));
1752 DCHECK_OK(registry->AddFunction(std::move(func)));
1753 }
1754
1755 template <typename Derived, bool allow_empty = false>
1756 struct CharacterPredicateUnicode {
1757 static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1758 Status* st) {
1759 if (allow_empty && input_string_ncodeunits == 0) {
1760 return true;
1761 }
1762 bool all;
1763 bool any = false;
1764 if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
1765 input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
1766 any |= Derived::PredicateCharacterAny(codepoint);
1767 return Derived::PredicateCharacterAll(codepoint);
1768 }))) {
1769 *st = Status::Invalid("Invalid UTF8 sequence in input");
1770 return false;
1771 }
1772 return all & any;
1773 }
1774
1775 static inline bool PredicateCharacterAny(uint32_t) {
1776 return true; // default condition make sure there is at least 1 charachter
1777 }
1778 };
1779
1780 template <typename Derived, bool allow_empty = false>
1781 struct CharacterPredicateAscii {
1782 static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1783 Status*) {
1784 if (allow_empty && input_string_ncodeunits == 0) {
1785 return true;
1786 }
1787 bool any = false;
1788 // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
1789 // benchmark. I don't consider that worth it.
1790 bool all = std::all_of(input, input + input_string_ncodeunits,
1791 [&any](uint8_t ascii_character) {
1792 any |= Derived::PredicateCharacterAny(ascii_character);
1793 return Derived::PredicateCharacterAll(ascii_character);
1794 });
1795 return all & any;
1796 }
1797
1798 static inline bool PredicateCharacterAny(uint8_t) {
1799 return true; // default condition make sure there is at least 1 charachter
1800 }
1801 };
1802
1803 #ifdef ARROW_WITH_UTF8PROC
1804 struct IsAlphaNumericUnicode : CharacterPredicateUnicode<IsAlphaNumericUnicode> {
1805 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1806 return IsAlphaNumericCharacterUnicode(codepoint);
1807 }
1808 };
1809 #endif
1810
1811 struct IsAlphaNumericAscii : CharacterPredicateAscii<IsAlphaNumericAscii> {
1812 static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1813 return IsAlphaNumericCharacterAscii(ascii_character);
1814 }
1815 };
1816
1817 #ifdef ARROW_WITH_UTF8PROC
1818 struct IsAlphaUnicode : CharacterPredicateUnicode<IsAlphaUnicode> {
1819 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1820 return IsAlphaCharacterUnicode(codepoint);
1821 }
1822 };
1823 #endif
1824
1825 struct IsAlphaAscii : CharacterPredicateAscii<IsAlphaAscii> {
1826 static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1827 return IsAlphaCharacterAscii(ascii_character);
1828 }
1829 };
1830
1831 #ifdef ARROW_WITH_UTF8PROC
1832 struct IsDecimalUnicode : CharacterPredicateUnicode<IsDecimalUnicode> {
1833 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1834 return IsDecimalCharacterUnicode(codepoint);
1835 }
1836 };
1837 #endif
1838
1839 struct IsDecimalAscii : CharacterPredicateAscii<IsDecimalAscii> {
1840 static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1841 return IsDecimalCharacterAscii(ascii_character);
1842 }
1843 };
1844
1845 #ifdef ARROW_WITH_UTF8PROC
1846 struct IsDigitUnicode : CharacterPredicateUnicode<IsDigitUnicode> {
1847 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1848 return IsDigitCharacterUnicode(codepoint);
1849 }
1850 };
1851
1852 struct IsNumericUnicode : CharacterPredicateUnicode<IsNumericUnicode> {
1853 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1854 return IsNumericCharacterUnicode(codepoint);
1855 }
1856 };
1857 #endif
1858
1859 struct IsAscii {
1860 static bool Call(KernelContext*, const uint8_t* input,
1861 size_t input_string_nascii_characters, Status*) {
1862 return std::all_of(input, input + input_string_nascii_characters, IsAsciiCharacter);
1863 }
1864 };
1865
1866 #ifdef ARROW_WITH_UTF8PROC
1867 struct IsLowerUnicode : CharacterPredicateUnicode<IsLowerUnicode> {
1868 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1869 // Only for cased character it needs to be lower case
1870 return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
1871 }
1872 static inline bool PredicateCharacterAny(uint32_t codepoint) {
1873 return IsCasedCharacterUnicode(codepoint); // at least 1 cased character
1874 }
1875 };
1876 #endif
1877
1878 struct IsLowerAscii : CharacterPredicateAscii<IsLowerAscii> {
1879 static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1880 // Only for cased character it needs to be lower case
1881 return !IsCasedCharacterAscii(ascii_character) ||
1882 IsLowerCaseCharacterAscii(ascii_character);
1883 }
1884 static inline bool PredicateCharacterAny(uint8_t ascii_character) {
1885 return IsCasedCharacterAscii(ascii_character); // at least 1 cased character
1886 }
1887 };
1888
1889 #ifdef ARROW_WITH_UTF8PROC
1890 struct IsPrintableUnicode
1891 : CharacterPredicateUnicode<IsPrintableUnicode, /*allow_empty=*/true> {
1892 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1893 return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
1894 }
1895 };
1896 #endif
1897
1898 struct IsPrintableAscii
1899 : CharacterPredicateAscii<IsPrintableAscii, /*allow_empty=*/true> {
1900 static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1901 return IsPrintableCharacterAscii(ascii_character);
1902 }
1903 };
1904
1905 #ifdef ARROW_WITH_UTF8PROC
1906 struct IsSpaceUnicode : CharacterPredicateUnicode<IsSpaceUnicode> {
1907 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1908 return IsSpaceCharacterUnicode(codepoint);
1909 }
1910 };
1911 #endif
1912
1913 struct IsSpaceAscii : CharacterPredicateAscii<IsSpaceAscii> {
1914 static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1915 return IsSpaceCharacterAscii(ascii_character);
1916 }
1917 };
1918
1919 #ifdef ARROW_WITH_UTF8PROC
1920 struct IsTitleUnicode {
1921 static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1922 Status* st) {
1923 // rules:
1924 // 1. lower case follows cased
1925 // 2. upper case follows uncased
1926 // 3. at least 1 cased character (which logically should be upper/title)
1927 bool rules_1_and_2;
1928 bool previous_cased = false; // in LL, LU or LT
1929 bool rule_3 = false;
1930 bool status =
1931 arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
1932 [&previous_cased, &rule_3](uint32_t codepoint) {
1933 if (IsLowerCaseCharacterUnicode(codepoint)) {
1934 if (!previous_cased) return false; // rule 1 broken
1935 // next should be more lower case or uncased
1936 previous_cased = true;
1937 } else if (IsCasedCharacterUnicode(codepoint)) {
1938 if (previous_cased) return false; // rule 2 broken
1939 // next should be a lower case or uncased
1940 previous_cased = true;
1941 rule_3 = true; // rule 3 obeyed
1942 } else {
1943 // an uncased char, like _ or 1
1944 // next should be upper case or more uncased
1945 previous_cased = false;
1946 }
1947 return true;
1948 });
1949 if (!ARROW_PREDICT_TRUE(status)) {
1950 *st = Status::Invalid("Invalid UTF8 sequence in input");
1951 return false;
1952 }
1953 return rules_1_and_2 & rule_3;
1954 }
1955 };
1956 #endif
1957
1958 struct IsTitleAscii {
1959 static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1960 Status*) {
1961 // Rules:
1962 // 1. lower case follows cased
1963 // 2. upper case follows uncased
1964 // 3. at least 1 cased character (which logically should be upper/title)
1965 bool rules_1_and_2 = true;
1966 bool previous_cased = false; // in LL, LU or LT
1967 bool rule_3 = false;
1968 for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
1969 if (IsLowerCaseCharacterAscii(*c)) {
1970 if (!previous_cased) {
1971 // rule 1 broken
1972 rules_1_and_2 = false;
1973 break;
1974 }
1975 // next should be more lower case or uncased
1976 previous_cased = true;
1977 } else if (IsCasedCharacterAscii(*c)) {
1978 if (previous_cased) {
1979 // rule 2 broken
1980 rules_1_and_2 = false;
1981 break;
1982 }
1983 // next should be a lower case or uncased
1984 previous_cased = true;
1985 rule_3 = true; // rule 3 obeyed
1986 } else {
1987 // an uncased character, like _ or 1
1988 // next should be upper case or more uncased
1989 previous_cased = false;
1990 }
1991 }
1992 return rules_1_and_2 & rule_3;
1993 }
1994 };
1995
1996 #ifdef ARROW_WITH_UTF8PROC
1997 struct IsUpperUnicode : CharacterPredicateUnicode<IsUpperUnicode> {
1998 static inline bool PredicateCharacterAll(uint32_t codepoint) {
1999 // Only for cased character it needs to be lower case
2000 return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
2001 }
2002 static inline bool PredicateCharacterAny(uint32_t codepoint) {
2003 return IsCasedCharacterUnicode(codepoint); // at least 1 cased character
2004 }
2005 };
2006 #endif
2007
2008 struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
2009 static inline bool PredicateCharacterAll(uint8_t ascii_character) {
2010 // Only for cased character it needs to be lower case
2011 return !IsCasedCharacterAscii(ascii_character) ||
2012 IsUpperCaseCharacterAscii(ascii_character);
2013 }
2014 static inline bool PredicateCharacterAny(uint8_t ascii_character) {
2015 return IsCasedCharacterAscii(ascii_character); // at least 1 cased character
2016 }
2017 };
2018
2019 // splitting
2020
2021 template <typename Options>
2022 struct SplitFinderBase {
2023 virtual ~SplitFinderBase() = default;
2024 virtual Status PreExec(const Options& options) { return Status::OK(); }
2025
2026 // Derived classes should also define these methods:
2027 // static bool Find(const uint8_t* begin, const uint8_t* end,
2028 // const uint8_t** separator_begin,
2029 // const uint8_t** separator_end,
2030 // const SplitPatternOptions& options);
2031 //
2032 // static bool FindReverse(const uint8_t* begin, const uint8_t* end,
2033 // const uint8_t** separator_begin,
2034 // const uint8_t** separator_end,
2035 // const SplitPatternOptions& options);
2036 };
2037
2038 template <typename Type, typename ListType, typename SplitFinder,
2039 typename Options = typename SplitFinder::Options>
2040 struct SplitExec {
2041 using string_offset_type = typename Type::offset_type;
2042 using list_offset_type = typename ListType::offset_type;
2043 using ArrayType = typename TypeTraits<Type>::ArrayType;
2044 using ArrayListType = typename TypeTraits<ListType>::ArrayType;
2045 using ListScalarType = typename TypeTraits<ListType>::ScalarType;
2046 using ScalarType = typename TypeTraits<Type>::ScalarType;
2047 using BuilderType = typename TypeTraits<Type>::BuilderType;
2048 using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
2049 using State = OptionsWrapper<Options>;
2050
2051 // Keep the temporary storage accross individual values, to minimize reallocations
2052 std::vector<util::string_view> parts;
2053 Options options;
2054
2055 explicit SplitExec(const Options& options) : options(options) {}
2056
2057 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2058 return SplitExec{State::Get(ctx)}.Execute(ctx, batch, out);
2059 }
2060
2061 Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2062 SplitFinder finder;
2063 RETURN_NOT_OK(finder.PreExec(options));
2064 if (batch[0].kind() == Datum::ARRAY) {
2065 return Execute(ctx, &finder, batch[0].array(), out);
2066 }
2067 DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
2068 return Execute(ctx, &finder, batch[0].scalar(), out);
2069 }
2070
2071 Status Execute(KernelContext* ctx, SplitFinder* finder,
2072 const std::shared_ptr<ArrayData>& data, Datum* out) {
2073 const ArrayType input(data);
2074
2075 BuilderType builder(input.type(), ctx->memory_pool());
2076 // A slight overestimate of the data needed
2077 RETURN_NOT_OK(builder.ReserveData(input.total_values_length()));
2078 // The minimum amount of strings needed
2079 RETURN_NOT_OK(builder.Resize(input.length() - input.null_count()));
2080
2081 ArrayData* output_list = out->mutable_array();
2082 // List offsets were preallocated
2083 auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
2084 DCHECK_NE(list_offsets, nullptr);
2085 // Initial value
2086 *list_offsets++ = 0;
2087 for (int64_t i = 0; i < input.length(); ++i) {
2088 if (!input.IsNull(i)) {
2089 RETURN_NOT_OK(SplitString(input.GetView(i), finder, &builder));
2090 if (ARROW_PREDICT_FALSE(builder.length() >
2091 std::numeric_limits<list_offset_type>::max())) {
2092 return Status::CapacityError("List offset does not fit into 32 bit");
2093 }
2094 }
2095 *list_offsets++ = static_cast<list_offset_type>(builder.length());
2096 }
2097 // Assign string array to list child data
2098 std::shared_ptr<Array> string_array;
2099 RETURN_NOT_OK(builder.Finish(&string_array));
2100 output_list->child_data.push_back(string_array->data());
2101 return Status::OK();
2102 }
2103
2104 Status Execute(KernelContext* ctx, SplitFinder* finder,
2105 const std::shared_ptr<Scalar>& scalar, Datum* out) {
2106 const auto& input = checked_cast<const ScalarType&>(*scalar);
2107 auto result = checked_cast<ListScalarType*>(out->scalar().get());
2108 if (input.is_valid) {
2109 result->is_valid = true;
2110 BuilderType builder(input.type, ctx->memory_pool());
2111 util::string_view s(*input.value);
2112 RETURN_NOT_OK(SplitString(s, finder, &builder));
2113 RETURN_NOT_OK(builder.Finish(&result->value));
2114 }
2115 return Status::OK();
2116 }
2117
2118 Status SplitString(const util::string_view& s, SplitFinder* finder,
2119 BuilderType* builder) {
2120 const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
2121 const uint8_t* end = begin + s.length();
2122
2123 int64_t max_splits = options.max_splits;
2124 // if there is no max splits, reversing does not make sense (and is probably less
2125 // efficient), but is useful for testing
2126 if (options.reverse) {
2127 // note that i points 1 further than the 'current'
2128 const uint8_t* i = end;
2129 // we will record the parts in reverse order
2130 parts.clear();
2131 if (max_splits > -1) {
2132 parts.reserve(max_splits + 1);
2133 }
2134 while (max_splits != 0) {
2135 const uint8_t *separator_begin, *separator_end;
2136 // find with whatever algo the part we will 'cut out'
2137 if (finder->FindReverse(begin, i, &separator_begin, &separator_end, options)) {
2138 parts.emplace_back(reinterpret_cast<const char*>(separator_end),
2139 i - separator_end);
2140 i = separator_begin;
2141 max_splits--;
2142 } else {
2143 // if we cannot find a separator, we're done
2144 break;
2145 }
2146 }
2147 parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
2148 // now we do the copying
2149 for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
2150 RETURN_NOT_OK(builder->Append(*it));
2151 }
2152 } else {
2153 const uint8_t* i = begin;
2154 while (max_splits != 0) {
2155 const uint8_t *separator_begin, *separator_end;
2156 // find with whatever algo the part we will 'cut out'
2157 if (finder->Find(i, end, &separator_begin, &separator_end, options)) {
2158 // the part till the beginning of the 'cut'
2159 RETURN_NOT_OK(
2160 builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
2161 i = separator_end;
2162 max_splits--;
2163 } else {
2164 // if we cannot find a separator, we're done
2165 break;
2166 }
2167 }
2168 // trailing part
2169 RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
2170 }
2171 return Status::OK();
2172 }
2173 };
2174
2175 struct SplitPatternFinder : public SplitFinderBase<SplitPatternOptions> {
2176 using Options = SplitPatternOptions;
2177
2178 Status PreExec(const SplitPatternOptions& options) override {
2179 if (options.pattern.length() == 0) {
2180 return Status::Invalid("Empty separator");
2181 }
2182 return Status::OK();
2183 }
2184
2185 static bool Find(const uint8_t* begin, const uint8_t* end,
2186 const uint8_t** separator_begin, const uint8_t** separator_end,
2187 const SplitPatternOptions& options) {
2188 const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
2189 const int64_t pattern_length = options.pattern.length();
2190 const uint8_t* i = begin;
2191 // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
2192 // the match kernel
2193 while ((i + pattern_length <= end)) {
2194 i = std::search(i, end, pattern, pattern + pattern_length);
2195 if (i != end) {
2196 *separator_begin = i;
2197 *separator_end = i + pattern_length;
2198 return true;
2199 }
2200 }
2201 return false;
2202 }
2203
2204 static bool FindReverse(const uint8_t* begin, const uint8_t* end,
2205 const uint8_t** separator_begin, const uint8_t** separator_end,
2206 const SplitPatternOptions& options) {
2207 const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
2208 const int64_t pattern_length = options.pattern.length();
2209 // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
2210 // the match kernel
2211 std::reverse_iterator<const uint8_t*> ri(end);
2212 std::reverse_iterator<const uint8_t*> rend(begin);
2213 std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
2214 std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
2215 while (begin <= ri.base() - pattern_length) {
2216 ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
2217 if (ri != rend) {
2218 *separator_begin = ri.base() - pattern_length;
2219 *separator_end = ri.base();
2220 return true;
2221 }
2222 }
2223 return false;
2224 }
2225 };
2226
2227 template <typename Type, typename ListType>
2228 using SplitPatternExec = SplitExec<Type, ListType, SplitPatternFinder>;
2229
2230 const FunctionDoc split_pattern_doc(
2231 "Split string according to separator",
2232 ("Split each string according to the exact `pattern` defined in\n"
2233 "SplitPatternOptions. The output for each string input is a list\n"
2234 "of strings.\n"
2235 "\n"
2236 "The maximum number of splits and direction of splitting\n"
2237 "(forward, reverse) can optionally be defined in SplitPatternOptions."),
2238 {"strings"}, "SplitPatternOptions");
2239
2240 const FunctionDoc ascii_split_whitespace_doc(
2241 "Split string according to any ASCII whitespace",
2242 ("Split each string according any non-zero length sequence of ASCII\n"
2243 "whitespace characters. The output for each string input is a list\n"
2244 "of strings.\n"
2245 "\n"
2246 "The maximum number of splits and direction of splitting\n"
2247 "(forward, reverse) can optionally be defined in SplitOptions."),
2248 {"strings"}, "SplitOptions");
2249
2250 const FunctionDoc utf8_split_whitespace_doc(
2251 "Split string according to any Unicode whitespace",
2252 ("Split each string according any non-zero length sequence of Unicode\n"
2253 "whitespace characters. The output for each string input is a list\n"
2254 "of strings.\n"
2255 "\n"
2256 "The maximum number of splits and direction of splitting\n"
2257 "(forward, reverse) can optionally be defined in SplitOptions."),
2258 {"strings"}, "SplitOptions");
2259
2260 void AddSplitPattern(FunctionRegistry* registry) {
2261 auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
2262 &split_pattern_doc);
2263 using t32 = SplitPatternExec<StringType, ListType>;
2264 using t64 = SplitPatternExec<LargeStringType, ListType>;
2265 DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2266 DCHECK_OK(
2267 func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2268 DCHECK_OK(registry->AddFunction(std::move(func)));
2269 }
2270
2271 struct SplitWhitespaceAsciiFinder : public SplitFinderBase<SplitOptions> {
2272 using Options = SplitOptions;
2273
2274 static bool Find(const uint8_t* begin, const uint8_t* end,
2275 const uint8_t** separator_begin, const uint8_t** separator_end,
2276 const SplitOptions& options) {
2277 const uint8_t* i = begin;
2278 while (i < end) {
2279 if (IsSpaceCharacterAscii(*i)) {
2280 *separator_begin = i;
2281 do {
2282 i++;
2283 } while (IsSpaceCharacterAscii(*i) && i < end);
2284 *separator_end = i;
2285 return true;
2286 }
2287 i++;
2288 }
2289 return false;
2290 }
2291
2292 static bool FindReverse(const uint8_t* begin, const uint8_t* end,
2293 const uint8_t** separator_begin, const uint8_t** separator_end,
2294 const SplitOptions& options) {
2295 const uint8_t* i = end - 1;
2296 while ((i >= begin)) {
2297 if (IsSpaceCharacterAscii(*i)) {
2298 *separator_end = i + 1;
2299 do {
2300 i--;
2301 } while (IsSpaceCharacterAscii(*i) && i >= begin);
2302 *separator_begin = i + 1;
2303 return true;
2304 }
2305 i--;
2306 }
2307 return false;
2308 }
2309 };
2310
2311 template <typename Type, typename ListType>
2312 using SplitWhitespaceAsciiExec = SplitExec<Type, ListType, SplitWhitespaceAsciiFinder>;
2313
2314 void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
2315 static const SplitOptions default_options{};
2316 auto func =
2317 std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
2318 &ascii_split_whitespace_doc, &default_options);
2319 using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
2320 using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
2321 DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2322 DCHECK_OK(
2323 func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2324 DCHECK_OK(registry->AddFunction(std::move(func)));
2325 }
2326
2327 #ifdef ARROW_WITH_UTF8PROC
2328 struct SplitWhitespaceUtf8Finder : public SplitFinderBase<SplitOptions> {
2329 using Options = SplitOptions;
2330
2331 Status PreExec(const SplitOptions& options) override {
2332 EnsureLookupTablesFilled();
2333 return Status::OK();
2334 }
2335
2336 bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
2337 const uint8_t** separator_end, const SplitOptions& options) {
2338 const uint8_t* i = begin;
2339 while ((i < end)) {
2340 uint32_t codepoint = 0;
2341 *separator_begin = i;
2342 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
2343 return false;
2344 }
2345 if (IsSpaceCharacterUnicode(codepoint)) {
2346 do {
2347 *separator_end = i;
2348 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
2349 return false;
2350 }
2351 } while (IsSpaceCharacterUnicode(codepoint) && i < end);
2352 return true;
2353 }
2354 }
2355 return false;
2356 }
2357
2358 bool FindReverse(const uint8_t* begin, const uint8_t* end,
2359 const uint8_t** separator_begin, const uint8_t** separator_end,
2360 const SplitOptions& options) {
2361 const uint8_t* i = end - 1;
2362 while ((i >= begin)) {
2363 uint32_t codepoint = 0;
2364 *separator_end = i + 1;
2365 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
2366 return false;
2367 }
2368 if (IsSpaceCharacterUnicode(codepoint)) {
2369 do {
2370 *separator_begin = i + 1;
2371 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
2372 return false;
2373 }
2374 } while (IsSpaceCharacterUnicode(codepoint) && i >= begin);
2375 return true;
2376 }
2377 }
2378 return false;
2379 }
2380 };
2381
2382 template <typename Type, typename ListType>
2383 using SplitWhitespaceUtf8Exec = SplitExec<Type, ListType, SplitWhitespaceUtf8Finder>;
2384
2385 void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
2386 static const SplitOptions default_options{};
2387 auto func =
2388 std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
2389 &utf8_split_whitespace_doc, &default_options);
2390 using t32 = SplitWhitespaceUtf8Exec<StringType, ListType>;
2391 using t64 = SplitWhitespaceUtf8Exec<LargeStringType, ListType>;
2392 DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2393 DCHECK_OK(
2394 func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2395 DCHECK_OK(registry->AddFunction(std::move(func)));
2396 }
2397 #endif // ARROW_WITH_UTF8PROC
2398
2399 #ifdef ARROW_WITH_RE2
2400 struct SplitRegexFinder : public SplitFinderBase<SplitPatternOptions> {
2401 using Options = SplitPatternOptions;
2402
2403 util::optional<RE2> regex_split;
2404
2405 Status PreExec(const SplitPatternOptions& options) override {
2406 if (options.reverse) {
2407 return Status::NotImplemented("Cannot split in reverse with regex");
2408 }
2409 // RE2 does *not* give you the full match! Must wrap the regex in a capture group
2410 // There is FindAndConsume, but it would give only the end of the separator
2411 std::string pattern = "(";
2412 pattern.reserve(options.pattern.size() + 2);
2413 pattern += options.pattern;
2414 pattern += ')';
2415 regex_split.emplace(std::move(pattern));
2416 return RegexStatus(*regex_split);
2417 }
2418
2419 bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
2420 const uint8_t** separator_end, const SplitPatternOptions& options) {
2421 re2::StringPiece piece(reinterpret_cast<const char*>(begin),
2422 std::distance(begin, end));
2423 // "StringPiece is mutated to point to matched piece"
2424 re2::StringPiece result;
2425 if (!re2::RE2::PartialMatch(piece, *regex_split, &result)) {
2426 return false;
2427 }
2428 *separator_begin = reinterpret_cast<const uint8_t*>(result.data());
2429 *separator_end = reinterpret_cast<const uint8_t*>(result.data() + result.size());
2430 return true;
2431 }
2432
2433 bool FindReverse(const uint8_t* begin, const uint8_t* end,
2434 const uint8_t** separator_begin, const uint8_t** separator_end,
2435 const SplitPatternOptions& options) {
2436 // Unsupported (see PreExec)
2437 return false;
2438 }
2439 };
2440
2441 template <typename Type, typename ListType>
2442 using SplitRegexExec = SplitExec<Type, ListType, SplitRegexFinder>;
2443
2444 const FunctionDoc split_pattern_regex_doc(
2445 "Split string according to regex pattern",
2446 ("Split each string according to the regex `pattern` defined in\n"
2447 "SplitPatternOptions. The output for each string input is a list\n"
2448 "of strings.\n"
2449 "\n"
2450 "The maximum number of splits and direction of splitting\n"
2451 "(forward, reverse) can optionally be defined in SplitPatternOptions."),
2452 {"strings"}, "SplitPatternOptions");
2453
2454 void AddSplitRegex(FunctionRegistry* registry) {
2455 auto func = std::make_shared<ScalarFunction>("split_pattern_regex", Arity::Unary(),
2456 &split_pattern_regex_doc);
2457 using t32 = SplitRegexExec<StringType, ListType>;
2458 using t64 = SplitRegexExec<LargeStringType, ListType>;
2459 DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2460 DCHECK_OK(
2461 func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2462 DCHECK_OK(registry->AddFunction(std::move(func)));
2463 }
2464 #endif // ARROW_WITH_RE2
2465
2466 void AddSplit(FunctionRegistry* registry) {
2467 AddSplitPattern(registry);
2468 AddSplitWhitespaceAscii(registry);
2469 #ifdef ARROW_WITH_UTF8PROC
2470 AddSplitWhitespaceUTF8(registry);
2471 #endif
2472 #ifdef ARROW_WITH_RE2
2473 AddSplitRegex(registry);
2474 #endif
2475 }
2476
2477 // ----------------------------------------------------------------------
2478 // Replace substring (plain, regex)
2479
2480 template <typename Type, typename Replacer>
2481 struct ReplaceSubString {
2482 using ScalarType = typename TypeTraits<Type>::ScalarType;
2483 using offset_type = typename Type::offset_type;
2484 using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
2485 using OffsetBuilder = TypedBufferBuilder<offset_type>;
2486 using State = OptionsWrapper<ReplaceSubstringOptions>;
2487
2488 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2489 // TODO Cache replacer across invocations (for regex compilation)
2490 ARROW_ASSIGN_OR_RAISE(auto replacer, Replacer::Make(State::Get(ctx)));
2491 return Replace(ctx, batch, *replacer, out);
2492 }
2493
2494 static Status Replace(KernelContext* ctx, const ExecBatch& batch,
2495 const Replacer& replacer, Datum* out) {
2496 ValueDataBuilder value_data_builder(ctx->memory_pool());
2497 OffsetBuilder offset_builder(ctx->memory_pool());
2498
2499 if (batch[0].kind() == Datum::ARRAY) {
2500 // We already know how many strings we have, so we can use Reserve/UnsafeAppend
2501 RETURN_NOT_OK(offset_builder.Reserve(batch[0].array()->length + 1));
2502 offset_builder.UnsafeAppend(0); // offsets start at 0
2503
2504 const ArrayData& input = *batch[0].array();
2505 RETURN_NOT_OK(VisitArrayDataInline<Type>(
2506 input,
2507 [&](util::string_view s) {
2508 RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
2509 offset_builder.UnsafeAppend(
2510 static_cast<offset_type>(value_data_builder.length()));
2511 return Status::OK();
2512 },
2513 [&]() {
2514 // offset for null value
2515 offset_builder.UnsafeAppend(
2516 static_cast<offset_type>(value_data_builder.length()));
2517 return Status::OK();
2518 }));
2519 ArrayData* output = out->mutable_array();
2520 RETURN_NOT_OK(value_data_builder.Finish(&output->buffers[2]));
2521 RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1]));
2522 } else {
2523 const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
2524 auto result = std::make_shared<ScalarType>();
2525 if (input.is_valid) {
2526 util::string_view s = static_cast<util::string_view>(*input.value);
2527 RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
2528 RETURN_NOT_OK(value_data_builder.Finish(&result->value));
2529 result->is_valid = true;
2530 }
2531 out->value = result;
2532 }
2533
2534 return Status::OK();
2535 }
2536 };
2537
2538 struct PlainSubStringReplacer {
2539 const ReplaceSubstringOptions& options_;
2540
2541 static Result<std::unique_ptr<PlainSubStringReplacer>> Make(
2542 const ReplaceSubstringOptions& options) {
2543 return arrow::internal::make_unique<PlainSubStringReplacer>(options);
2544 }
2545
2546 explicit PlainSubStringReplacer(const ReplaceSubstringOptions& options)
2547 : options_(options) {}
2548
2549 Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
2550 const char* i = s.begin();
2551 const char* end = s.end();
2552 int64_t max_replacements = options_.max_replacements;
2553 while ((i < end) && (max_replacements != 0)) {
2554 const char* pos =
2555 std::search(i, end, options_.pattern.begin(), options_.pattern.end());
2556 if (pos == end) {
2557 RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2558 static_cast<int64_t>(end - i)));
2559 i = end;
2560 } else {
2561 // the string before the pattern
2562 RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2563 static_cast<int64_t>(pos - i)));
2564 // the replacement
2565 RETURN_NOT_OK(
2566 builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
2567 options_.replacement.length()));
2568 // skip pattern
2569 i = pos + options_.pattern.length();
2570 max_replacements--;
2571 }
2572 }
2573 // if we exited early due to max_replacements, add the trailing part
2574 return builder->Append(reinterpret_cast<const uint8_t*>(i),
2575 static_cast<int64_t>(end - i));
2576 }
2577 };
2578
2579 #ifdef ARROW_WITH_RE2
2580 struct RegexSubStringReplacer {
2581 const ReplaceSubstringOptions& options_;
2582 const RE2 regex_find_;
2583 const RE2 regex_replacement_;
2584
2585 static Result<std::unique_ptr<RegexSubStringReplacer>> Make(
2586 const ReplaceSubstringOptions& options) {
2587 auto replacer = arrow::internal::make_unique<RegexSubStringReplacer>(options);
2588
2589 RETURN_NOT_OK(RegexStatus(replacer->regex_find_));
2590 RETURN_NOT_OK(RegexStatus(replacer->regex_replacement_));
2591
2592 std::string replacement_error;
2593 if (!replacer->regex_replacement_.CheckRewriteString(replacer->options_.replacement,
2594 &replacement_error)) {
2595 return Status::Invalid("Invalid replacement string: ",
2596 std::move(replacement_error));
2597 }
2598
2599 return std::move(replacer);
2600 }
2601
2602 // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore
2603 // we have 2 regexes, one with () around it, one without.
2604 explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
2605 : options_(options),
2606 regex_find_("(" + options_.pattern + ")", RE2::Quiet),
2607 regex_replacement_(options_.pattern, RE2::Quiet) {}
2608
2609 Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
2610 re2::StringPiece replacement(options_.replacement);
2611
2612 if (options_.max_replacements == -1) {
2613 std::string s_copy(s.to_string());
2614 re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
2615 return builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
2616 s_copy.length());
2617 }
2618
2619 // Since RE2 does not have the concept of max_replacements, we have to do some work
2620 // ourselves.
2621 // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite
2622 const char* i = s.begin();
2623 const char* end = s.end();
2624 re2::StringPiece piece(s.data(), s.length());
2625
2626 int64_t max_replacements = options_.max_replacements;
2627 while ((i < end) && (max_replacements != 0)) {
2628 std::string found;
2629 if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
2630 RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2631 static_cast<int64_t>(end - i)));
2632 i = end;
2633 } else {
2634 // wind back to the beginning of the match
2635 const char* pos = piece.begin() - found.length();
2636 // the string before the pattern
2637 RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2638 static_cast<int64_t>(pos - i)));
2639 // replace the pattern in what we found
2640 if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
2641 return Status::Invalid("Regex found, but replacement failed");
2642 }
2643 RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
2644 static_cast<int64_t>(found.length())));
2645 // skip pattern
2646 i = piece.begin();
2647 max_replacements--;
2648 }
2649 }
2650 // If we exited early due to max_replacements, add the trailing part
2651 return builder->Append(reinterpret_cast<const uint8_t*>(i),
2652 static_cast<int64_t>(end - i));
2653 }
2654 };
2655 #endif
2656
2657 template <typename Type>
2658 using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
2659
2660 const FunctionDoc replace_substring_doc(
2661 "Replace non-overlapping substrings that match pattern by replacement",
2662 ("For each string in `strings`, replace non-overlapping substrings that match\n"
2663 "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
2664 "maximum amount of replacements made, counting from the left. Null values emit\n"
2665 "null."),
2666 {"strings"}, "ReplaceSubstringOptions");
2667
2668 #ifdef ARROW_WITH_RE2
2669 template <typename Type>
2670 using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
2671
2672 const FunctionDoc replace_substring_regex_doc(
2673 "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
2674 ("For each string in `strings`, replace non-overlapping substrings that match the\n"
2675 "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
2676 "If `max_replacements != -1`, it determines the maximum amount of replacements\n"
2677 "made, counting from the left. Note that if the pattern contains groups,\n"
2678 "backreferencing macan be used. Null values emit null."),
2679 {"strings"}, "ReplaceSubstringOptions");
2680 #endif
2681
2682 // ----------------------------------------------------------------------
2683 // Replace slice
2684
2685 struct ReplaceSliceTransformBase : public StringTransformBase {
2686 using State = OptionsWrapper<ReplaceSliceOptions>;
2687
2688 const ReplaceSliceOptions* options;
2689
2690 explicit ReplaceSliceTransformBase(const ReplaceSliceOptions& options)
2691 : options{&options} {}
2692
2693 int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
2694 return ninputs * options->replacement.size() + input_ncodeunits;
2695 }
2696 };
2697
2698 struct BinaryReplaceSliceTransform : ReplaceSliceTransformBase {
2699 using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
2700 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
2701 uint8_t* output) {
2702 const auto& opts = *options;
2703 int64_t before_slice = 0;
2704 int64_t after_slice = 0;
2705 uint8_t* output_start = output;
2706
2707 if (opts.start >= 0) {
2708 // Count from left
2709 before_slice = std::min<int64_t>(input_string_ncodeunits, opts.start);
2710 } else {
2711 // Count from right
2712 before_slice = std::max<int64_t>(0, input_string_ncodeunits + opts.start);
2713 }
2714 // Mimic Pandas: if stop would be before start, treat as 0-length slice
2715 if (opts.stop >= 0) {
2716 // Count from left
2717 after_slice =
2718 std::min<int64_t>(input_string_ncodeunits, std::max(before_slice, opts.stop));
2719 } else {
2720 // Count from right
2721 after_slice = std::max<int64_t>(before_slice, input_string_ncodeunits + opts.stop);
2722 }
2723 output = std::copy(input, input + before_slice, output);
2724 output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
2725 output = std::copy(input + after_slice, input + input_string_ncodeunits, output);
2726 return output - output_start;
2727 }
2728
2729 static int32_t FixedOutputSize(const ReplaceSliceOptions& opts, int32_t input_width) {
2730 int32_t before_slice = 0;
2731 int32_t after_slice = 0;
2732 const int32_t start = static_cast<int32_t>(opts.start);
2733 const int32_t stop = static_cast<int32_t>(opts.stop);
2734 if (opts.start >= 0) {
2735 // Count from left
2736 before_slice = std::min<int32_t>(input_width, start);
2737 } else {
2738 // Count from right
2739 before_slice = std::max<int32_t>(0, input_width + start);
2740 }
2741 if (opts.stop >= 0) {
2742 // Count from left
2743 after_slice = std::min<int32_t>(input_width, std::max<int32_t>(before_slice, stop));
2744 } else {
2745 // Count from right
2746 after_slice = std::max<int32_t>(before_slice, input_width + stop);
2747 }
2748 return static_cast<int32_t>(before_slice + opts.replacement.size() +
2749 (input_width - after_slice));
2750 }
2751 };
2752
2753 struct Utf8ReplaceSliceTransform : ReplaceSliceTransformBase {
2754 using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
2755 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
2756 uint8_t* output) {
2757 const auto& opts = *options;
2758 const uint8_t* begin = input;
2759 const uint8_t* end = input + input_string_ncodeunits;
2760 const uint8_t *begin_sliced, *end_sliced;
2761 uint8_t* output_start = output;
2762
2763 // Mimic Pandas: if stop would be before start, treat as 0-length slice
2764 if (opts.start >= 0) {
2765 // Count from left
2766 if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opts.start)) {
2767 return kTransformError;
2768 }
2769 if (opts.stop > options->start) {
2770 // Continue counting from left
2771 const int64_t length = opts.stop - options->start;
2772 if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)) {
2773 return kTransformError;
2774 }
2775 } else if (opts.stop < 0) {
2776 // Count from right
2777 if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
2778 -opts.stop)) {
2779 return kTransformError;
2780 }
2781 } else {
2782 // Zero-length slice
2783 end_sliced = begin_sliced;
2784 }
2785 } else {
2786 // Count from right
2787 if (!arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
2788 -opts.start)) {
2789 return kTransformError;
2790 }
2791 if (opts.stop >= 0) {
2792 // Restart counting from left
2793 if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opts.stop)) {
2794 return kTransformError;
2795 }
2796 if (end_sliced <= begin_sliced) {
2797 // Zero-length slice
2798 end_sliced = begin_sliced;
2799 }
2800 } else if ((opts.stop < 0) && (options->stop > options->start)) {
2801 // Count from right
2802 if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
2803 -opts.stop)) {
2804 return kTransformError;
2805 }
2806 } else {
2807 // zero-length slice
2808 end_sliced = begin_sliced;
2809 }
2810 }
2811 output = std::copy(begin, begin_sliced, output);
2812 output = std::copy(opts.replacement.begin(), options->replacement.end(), output);
2813 output = std::copy(end_sliced, end, output);
2814 return output - output_start;
2815 }
2816 };
2817
2818 template <typename Type>
2819 using BinaryReplaceSlice =
2820 StringTransformExecWithState<Type, BinaryReplaceSliceTransform>;
2821 template <typename Type>
2822 using Utf8ReplaceSlice = StringTransformExecWithState<Type, Utf8ReplaceSliceTransform>;
2823
2824 const FunctionDoc binary_replace_slice_doc(
2825 "Replace a slice of a binary string with `replacement`",
2826 ("For each string in `strings`, replace a slice of the string defined by `start`"
2827 "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
2828 "and both are measured in bytes.\n"
2829 "Null values emit null."),
2830 {"strings"}, "ReplaceSliceOptions");
2831
2832 const FunctionDoc utf8_replace_slice_doc(
2833 "Replace a slice of a string with `replacement`",
2834 ("For each string in `strings`, replace a slice of the string defined by `start`"
2835 "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
2836 "and both are measured in codeunits.\n"
2837 "Null values emit null."),
2838 {"strings"}, "ReplaceSliceOptions");
2839
2840 void AddReplaceSlice(FunctionRegistry* registry) {
2841 {
2842 auto func = std::make_shared<ScalarFunction>("binary_replace_slice", Arity::Unary(),
2843 &binary_replace_slice_doc);
2844 for (const auto& ty : BaseBinaryTypes()) {
2845 DCHECK_OK(func->AddKernel({ty}, ty,
2846 GenerateTypeAgnosticVarBinaryBase<BinaryReplaceSlice>(ty),
2847 ReplaceSliceTransformBase::State::Init));
2848 }
2849 using TransformExec =
2850 FixedSizeBinaryTransformExecWithState<BinaryReplaceSliceTransform>;
2851 DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)},
2852 OutputType(TransformExec::OutputType), TransformExec::Exec,
2853 ReplaceSliceTransformBase::State::Init));
2854 DCHECK_OK(registry->AddFunction(std::move(func)));
2855 }
2856
2857 {
2858 auto func = std::make_shared<ScalarFunction>("utf8_replace_slice", Arity::Unary(),
2859 &utf8_replace_slice_doc);
2860 DCHECK_OK(func->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice<StringType>::Exec,
2861 ReplaceSliceTransformBase::State::Init));
2862 DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(),
2863 Utf8ReplaceSlice<LargeStringType>::Exec,
2864 ReplaceSliceTransformBase::State::Init));
2865 DCHECK_OK(registry->AddFunction(std::move(func)));
2866 }
2867 }
2868
2869 // ----------------------------------------------------------------------
2870 // Extract with regex
2871
2872 #ifdef ARROW_WITH_RE2
2873
2874 // TODO cache this once per ExtractRegexOptions
2875 struct ExtractRegexData {
2876 // Use unique_ptr<> because RE2 is non-movable
2877 std::unique_ptr<RE2> regex;
2878 std::vector<std::string> group_names;
2879
2880 static Result<ExtractRegexData> Make(const ExtractRegexOptions& options) {
2881 ExtractRegexData data(options.pattern);
2882 RETURN_NOT_OK(RegexStatus(*data.regex));
2883
2884 const int group_count = data.regex->NumberOfCapturingGroups();
2885 const auto& name_map = data.regex->CapturingGroupNames();
2886 data.group_names.reserve(group_count);
2887
2888 for (int i = 0; i < group_count; i++) {
2889 auto item = name_map.find(i + 1); // re2 starts counting from 1
2890 if (item == name_map.end()) {
2891 // XXX should we instead just create fields with an empty name?
2892 return Status::Invalid("Regular expression contains unnamed groups");
2893 }
2894 data.group_names.emplace_back(item->second);
2895 }
2896 return std::move(data);
2897 }
2898
2899 Result<ValueDescr> ResolveOutputType(const std::vector<ValueDescr>& args) const {
2900 const auto& input_type = args[0].type;
2901 if (input_type == nullptr) {
2902 // No input type specified => propagate shape
2903 return args[0];
2904 }
2905 // Input type is either String or LargeString and is also the type of each
2906 // field in the output struct type.
2907 DCHECK(input_type->id() == Type::STRING || input_type->id() == Type::LARGE_STRING);
2908 FieldVector fields;
2909 fields.reserve(group_names.size());
2910 std::transform(group_names.begin(), group_names.end(), std::back_inserter(fields),
2911 [&](const std::string& name) { return field(name, input_type); });
2912 return struct_(std::move(fields));
2913 }
2914
2915 private:
2916 explicit ExtractRegexData(const std::string& pattern)
2917 : regex(new RE2(pattern, RE2::Quiet)) {}
2918 };
2919
2920 Result<ValueDescr> ResolveExtractRegexOutput(KernelContext* ctx,
2921 const std::vector<ValueDescr>& args) {
2922 using State = OptionsWrapper<ExtractRegexOptions>;
2923 ExtractRegexOptions options = State::Get(ctx);
2924 ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
2925 return data.ResolveOutputType(args);
2926 }
2927
2928 struct ExtractRegexBase {
2929 const ExtractRegexData& data;
2930 const int group_count;
2931 std::vector<re2::StringPiece> found_values;
2932 std::vector<re2::RE2::Arg> args;
2933 std::vector<const re2::RE2::Arg*> args_pointers;
2934 const re2::RE2::Arg** args_pointers_start;
2935 const re2::RE2::Arg* null_arg = nullptr;
2936
2937 explicit ExtractRegexBase(const ExtractRegexData& data)
2938 : data(data),
2939 group_count(static_cast<int>(data.group_names.size())),
2940 found_values(group_count) {
2941 args.reserve(group_count);
2942 args_pointers.reserve(group_count);
2943
2944 for (int i = 0; i < group_count; i++) {
2945 args.emplace_back(&found_values[i]);
2946 // Since we reserved capacity, we're guaranteed the pointer remains valid
2947 args_pointers.push_back(&args[i]);
2948 }
2949 // Avoid null pointer if there is no capture group
2950 args_pointers_start = (group_count > 0) ? args_pointers.data() : &null_arg;
2951 }
2952
2953 bool Match(util::string_view s) {
2954 return re2::RE2::PartialMatchN(ToStringPiece(s), *data.regex, args_pointers_start,
2955 group_count);
2956 }
2957 };
2958
2959 template <typename Type>
2960 struct ExtractRegex : public ExtractRegexBase {
2961 using ArrayType = typename TypeTraits<Type>::ArrayType;
2962 using ScalarType = typename TypeTraits<Type>::ScalarType;
2963 using BuilderType = typename TypeTraits<Type>::BuilderType;
2964 using State = OptionsWrapper<ExtractRegexOptions>;
2965
2966 using ExtractRegexBase::ExtractRegexBase;
2967
2968 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2969 ExtractRegexOptions options = State::Get(ctx);
2970 ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
2971 return ExtractRegex{data}.Extract(ctx, batch, out);
2972 }
2973
2974 Status Extract(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2975 ARROW_ASSIGN_OR_RAISE(auto descr, data.ResolveOutputType(batch.GetDescriptors()));
2976 DCHECK_NE(descr.type, nullptr);
2977 const auto& type = descr.type;
2978
2979 if (batch[0].kind() == Datum::ARRAY) {
2980 std::unique_ptr<ArrayBuilder> array_builder;
2981 RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type, &array_builder));
2982 StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
2983
2984 std::vector<BuilderType*> field_builders;
2985 field_builders.reserve(group_count);
2986 for (int i = 0; i < group_count; i++) {
2987 field_builders.push_back(
2988 checked_cast<BuilderType*>(struct_builder->field_builder(i)));
2989 }
2990
2991 auto visit_null = [&]() { return struct_builder->AppendNull(); };
2992 auto visit_value = [&](util::string_view s) {
2993 if (Match(s)) {
2994 for (int i = 0; i < group_count; i++) {
2995 RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
2996 }
2997 return struct_builder->Append();
2998 } else {
2999 return struct_builder->AppendNull();
3000 }
3001 };
3002 const ArrayData& input = *batch[0].array();
3003 RETURN_NOT_OK(VisitArrayDataInline<Type>(input, visit_value, visit_null));
3004
3005 std::shared_ptr<Array> out_array;
3006 RETURN_NOT_OK(struct_builder->Finish(&out_array));
3007 *out = std::move(out_array);
3008 } else {
3009 const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
3010 auto result = std::make_shared<StructScalar>(type);
3011 if (input.is_valid && Match(util::string_view(*input.value))) {
3012 result->value.reserve(group_count);
3013 for (int i = 0; i < group_count; i++) {
3014 result->value.push_back(
3015 std::make_shared<ScalarType>(found_values[i].as_string()));
3016 }
3017 result->is_valid = true;
3018 } else {
3019 result->is_valid = false;
3020 }
3021 out->value = std::move(result);
3022 }
3023
3024 return Status::OK();
3025 }
3026 };
3027
3028 const FunctionDoc extract_regex_doc(
3029 "Extract substrings captured by a regex pattern",
3030 ("For each string in `strings`, match the regular expression and, if\n"
3031 "successful, emit a struct with field names and values coming from the\n"
3032 "regular expression's named capture groups. If the input is null or the\n"
3033 "regular expression fails matching, a null output value is emitted.\n"
3034 "\n"
3035 "Regular expression matching is done using the Google RE2 library."),
3036 {"strings"}, "ExtractRegexOptions");
3037
3038 void AddExtractRegex(FunctionRegistry* registry) {
3039 auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
3040 &extract_regex_doc);
3041 using t32 = ExtractRegex<StringType>;
3042 using t64 = ExtractRegex<LargeStringType>;
3043 OutputType out_ty(ResolveExtractRegexOutput);
3044 ScalarKernel kernel;
3045
3046 // Null values will be computed based on regex match or not
3047 kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
3048 kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
3049 kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
3050 kernel.exec = t32::Exec;
3051 kernel.init = t32::State::Init;
3052 DCHECK_OK(func->AddKernel(kernel));
3053 kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
3054 kernel.exec = t64::Exec;
3055 kernel.init = t64::State::Init;
3056 DCHECK_OK(func->AddKernel(kernel));
3057
3058 DCHECK_OK(registry->AddFunction(std::move(func)));
3059 }
3060 #endif // ARROW_WITH_RE2
3061
3062 // ----------------------------------------------------------------------
3063 // strptime string parsing
3064
3065 using StrptimeState = OptionsWrapper<StrptimeOptions>;
3066
3067 struct ParseStrptime {
3068 explicit ParseStrptime(const StrptimeOptions& options)
3069 : parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {}
3070
3071 template <typename... Ignored>
3072 int64_t Call(KernelContext*, util::string_view val, Status* st) const {
3073 int64_t result = 0;
3074 if (!(*parser)(val.data(), val.size(), unit, &result)) {
3075 *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
3076 TimestampType(unit).ToString());
3077 }
3078 return result;
3079 }
3080
3081 std::shared_ptr<TimestampParser> parser;
3082 TimeUnit::type unit;
3083 };
3084
3085 template <typename InputType>
3086 Status StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
3087 applicator::ScalarUnaryNotNullStateful<TimestampType, InputType, ParseStrptime> kernel{
3088 ParseStrptime(StrptimeState::Get(ctx))};
3089 return kernel.Exec(ctx, batch, out);
3090 }
3091
3092 Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDescr>&) {
3093 if (ctx->state()) {
3094 return ::arrow::timestamp(StrptimeState::Get(ctx).unit);
3095 }
3096
3097 return Status::Invalid("strptime does not provide default StrptimeOptions");
3098 }
3099
3100 // ----------------------------------------------------------------------
3101 // string padding
3102
3103 template <bool PadLeft, bool PadRight>
3104 struct AsciiPadTransform : public StringTransformBase {
3105 using State = OptionsWrapper<PadOptions>;
3106
3107 const PadOptions& options_;
3108
3109 explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}
3110
3111 Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3112 if (options_.padding.size() != 1) {
3113 return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
3114 }
3115 return Status::OK();
3116 }
3117
3118 int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
3119 // This is likely very overallocated but hard to do better without
3120 // actually looking at each string (because of strings that may be
3121 // longer than the given width)
3122 return input_ncodeunits + ninputs * options_.width;
3123 }
3124
3125 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3126 uint8_t* output) {
3127 if (input_string_ncodeunits >= options_.width) {
3128 std::copy(input, input + input_string_ncodeunits, output);
3129 return input_string_ncodeunits;
3130 }
3131 const int64_t spaces = options_.width - input_string_ncodeunits;
3132 int64_t left = 0;
3133 int64_t right = 0;
3134 if (PadLeft && PadRight) {
3135 // If odd number of spaces, put the extra space on the right
3136 left = spaces / 2;
3137 right = spaces - left;
3138 } else if (PadLeft) {
3139 left = spaces;
3140 } else if (PadRight) {
3141 right = spaces;
3142 } else {
3143 DCHECK(false) << "unreachable";
3144 return 0;
3145 }
3146 std::fill(output, output + left, options_.padding[0]);
3147 output += left;
3148 output = std::copy(input, input + input_string_ncodeunits, output);
3149 std::fill(output, output + right, options_.padding[0]);
3150 return options_.width;
3151 }
3152 };
3153
3154 template <bool PadLeft, bool PadRight>
3155 struct Utf8PadTransform : public StringTransformBase {
3156 using State = OptionsWrapper<PadOptions>;
3157
3158 const PadOptions& options_;
3159
3160 explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}
3161
3162 Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3163 auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
3164 auto strlen = options_.padding.size();
3165 if (util::UTF8Length(str, str + strlen) != 1) {
3166 return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
3167 "'");
3168 }
3169 return Status::OK();
3170 }
3171
3172 int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
3173 // This is likely very overallocated but hard to do better without
3174 // actually looking at each string (because of strings that may be
3175 // longer than the given width)
3176 // One codepoint may be up to 4 bytes
3177 return input_ncodeunits + 4 * ninputs * options_.width;
3178 }
3179
3180 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3181 uint8_t* output) {
3182 const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
3183 if (input_width >= options_.width) {
3184 std::copy(input, input + input_string_ncodeunits, output);
3185 return input_string_ncodeunits;
3186 }
3187 const int64_t spaces = options_.width - input_width;
3188 int64_t left = 0;
3189 int64_t right = 0;
3190 if (PadLeft && PadRight) {
3191 // If odd number of spaces, put the extra space on the right
3192 left = spaces / 2;
3193 right = spaces - left;
3194 } else if (PadLeft) {
3195 left = spaces;
3196 } else if (PadRight) {
3197 right = spaces;
3198 } else {
3199 DCHECK(false) << "unreachable";
3200 return 0;
3201 }
3202 uint8_t* start = output;
3203 while (left) {
3204 output = std::copy(options_.padding.begin(), options_.padding.end(), output);
3205 left--;
3206 }
3207 output = std::copy(input, input + input_string_ncodeunits, output);
3208 while (right) {
3209 output = std::copy(options_.padding.begin(), options_.padding.end(), output);
3210 right--;
3211 }
3212 return output - start;
3213 }
3214 };
3215
3216 template <typename Type>
3217 using AsciiLPad = StringTransformExecWithState<Type, AsciiPadTransform<true, false>>;
3218 template <typename Type>
3219 using AsciiRPad = StringTransformExecWithState<Type, AsciiPadTransform<false, true>>;
3220 template <typename Type>
3221 using AsciiCenter = StringTransformExecWithState<Type, AsciiPadTransform<true, true>>;
3222 template <typename Type>
3223 using Utf8LPad = StringTransformExecWithState<Type, Utf8PadTransform<true, false>>;
3224 template <typename Type>
3225 using Utf8RPad = StringTransformExecWithState<Type, Utf8PadTransform<false, true>>;
3226 template <typename Type>
3227 using Utf8Center = StringTransformExecWithState<Type, Utf8PadTransform<true, true>>;
3228
3229 // ----------------------------------------------------------------------
3230 // string trimming
3231
3232 #ifdef ARROW_WITH_UTF8PROC
3233
3234 template <bool TrimLeft, bool TrimRight>
3235 struct UTF8TrimWhitespaceTransform : public StringTransformBase {
3236 Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3237 EnsureLookupTablesFilled();
3238 return Status::OK();
3239 }
3240
3241 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3242 uint8_t* output) {
3243 const uint8_t* begin = input;
3244 const uint8_t* end = input + input_string_ncodeunits;
3245 const uint8_t* end_trimmed = end;
3246 const uint8_t* begin_trimmed = begin;
3247
3248 auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
3249 if (TrimLeft && !ARROW_PREDICT_TRUE(
3250 arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
3251 return kTransformError;
3252 }
3253 if (TrimRight && begin_trimmed < end) {
3254 if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
3255 predicate, &end_trimmed))) {
3256 return kTransformError;
3257 }
3258 }
3259 std::copy(begin_trimmed, end_trimmed, output);
3260 return end_trimmed - begin_trimmed;
3261 }
3262 };
3263
3264 template <typename Type>
3265 using UTF8TrimWhitespace =
3266 StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, true>>;
3267
3268 template <typename Type>
3269 using UTF8LTrimWhitespace =
3270 StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, false>>;
3271
3272 template <typename Type>
3273 using UTF8RTrimWhitespace =
3274 StringTransformExec<Type, UTF8TrimWhitespaceTransform<false, true>>;
3275
3276 struct UTF8TrimState {
3277 TrimOptions options_;
3278 std::vector<bool> codepoints_;
3279 Status status_ = Status::OK();
3280
3281 explicit UTF8TrimState(KernelContext* ctx, TrimOptions options)
3282 : options_(std::move(options)) {
3283 if (!ARROW_PREDICT_TRUE(
3284 arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
3285 codepoints_.resize(
3286 std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
3287 codepoints_.at(c) = true;
3288 }))) {
3289 status_ = Status::Invalid("Invalid UTF8 sequence in input");
3290 }
3291 }
3292 };
3293
3294 template <bool TrimLeft, bool TrimRight>
3295 struct UTF8TrimTransform : public StringTransformBase {
3296 using State = KernelStateFromFunctionOptions<UTF8TrimState, TrimOptions>;
3297
3298 const UTF8TrimState& state_;
3299
3300 explicit UTF8TrimTransform(const UTF8TrimState& state) : state_(state) {}
3301
3302 Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3303 return state_.status_;
3304 }
3305
3306 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3307 uint8_t* output) {
3308 const uint8_t* begin = input;
3309 const uint8_t* end = input + input_string_ncodeunits;
3310 const uint8_t* end_trimmed = end;
3311 const uint8_t* begin_trimmed = begin;
3312 const auto& codepoints = state_.codepoints_;
3313
3314 auto predicate = [&](uint32_t c) { return c >= codepoints.size() || !codepoints[c]; };
3315 if (TrimLeft && !ARROW_PREDICT_TRUE(
3316 arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
3317 return kTransformError;
3318 }
3319 if (TrimRight && begin_trimmed < end) {
3320 if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
3321 predicate, &end_trimmed))) {
3322 return kTransformError;
3323 }
3324 }
3325 std::copy(begin_trimmed, end_trimmed, output);
3326 return end_trimmed - begin_trimmed;
3327 }
3328 };
3329
3330 template <typename Type>
3331 using UTF8Trim = StringTransformExecWithState<Type, UTF8TrimTransform<true, true>>;
3332
3333 template <typename Type>
3334 using UTF8LTrim = StringTransformExecWithState<Type, UTF8TrimTransform<true, false>>;
3335
3336 template <typename Type>
3337 using UTF8RTrim = StringTransformExecWithState<Type, UTF8TrimTransform<false, true>>;
3338
3339 #endif
3340
3341 template <bool TrimLeft, bool TrimRight>
3342 struct AsciiTrimWhitespaceTransform : public StringTransformBase {
3343 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3344 uint8_t* output) {
3345 const uint8_t* begin = input;
3346 const uint8_t* end = input + input_string_ncodeunits;
3347 const uint8_t* end_trimmed = end;
3348 const uint8_t* begin_trimmed = begin;
3349
3350 auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
3351 if (TrimLeft) {
3352 begin_trimmed = std::find_if(begin, end, predicate);
3353 }
3354 if (TrimRight && begin_trimmed < end) {
3355 std::reverse_iterator<const uint8_t*> rbegin(end);
3356 std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
3357 end_trimmed = std::find_if(rbegin, rend, predicate).base();
3358 }
3359 std::copy(begin_trimmed, end_trimmed, output);
3360 return end_trimmed - begin_trimmed;
3361 }
3362 };
3363
3364 template <typename Type>
3365 using AsciiTrimWhitespace =
3366 StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, true>>;
3367
3368 template <typename Type>
3369 using AsciiLTrimWhitespace =
3370 StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, false>>;
3371
3372 template <typename Type>
3373 using AsciiRTrimWhitespace =
3374 StringTransformExec<Type, AsciiTrimWhitespaceTransform<false, true>>;
3375
3376 struct AsciiTrimState {
3377 TrimOptions options_;
3378 std::vector<bool> characters_;
3379
3380 explicit AsciiTrimState(KernelContext* ctx, TrimOptions options)
3381 : options_(std::move(options)), characters_(256) {
3382 for (const auto c : options_.characters) {
3383 characters_[static_cast<unsigned char>(c)] = true;
3384 }
3385 }
3386 };
3387
3388 template <bool TrimLeft, bool TrimRight>
3389 struct AsciiTrimTransform : public StringTransformBase {
3390 using State = KernelStateFromFunctionOptions<AsciiTrimState, TrimOptions>;
3391
3392 const AsciiTrimState& state_;
3393
3394 explicit AsciiTrimTransform(const AsciiTrimState& state) : state_(state) {}
3395
3396 int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3397 uint8_t* output) {
3398 const uint8_t* begin = input;
3399 const uint8_t* end = input + input_string_ncodeunits;
3400 const uint8_t* end_trimmed = end;
3401 const uint8_t* begin_trimmed = begin;
3402 const auto& characters = state_.characters_;
3403
3404 auto predicate = [&](uint8_t c) { return !characters[c]; };
3405 if (TrimLeft) {
3406 begin_trimmed = std::find_if(begin, end, predicate);
3407 }
3408 if (TrimRight && begin_trimmed < end) {
3409 std::reverse_iterator<const uint8_t*> rbegin(end);
3410 std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
3411 end_trimmed = std::find_if(rbegin, rend, predicate).base();
3412 }
3413 std::copy(begin_trimmed, end_trimmed, output);
3414 return end_trimmed - begin_trimmed;
3415 }
3416 };
3417
3418 template <typename Type>
3419 using AsciiTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, true>>;
3420
3421 template <typename Type>
3422 using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, false>>;
3423
3424 template <typename Type>
3425 using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;
3426
3427 const FunctionDoc utf8_center_doc(
3428 "Center strings by padding with a given character",
3429 ("For each string in `strings`, emit a centered string by padding both sides \n"
3430 "with the given UTF8 codeunit.\nNull values emit null."),
3431 {"strings"}, "PadOptions");
3432
3433 const FunctionDoc utf8_lpad_doc(
3434 "Right-align strings by padding with a given character",
3435 ("For each string in `strings`, emit a right-aligned string by prepending \n"
3436 "the given UTF8 codeunit.\nNull values emit null."),
3437 {"strings"}, "PadOptions");
3438
3439 const FunctionDoc utf8_rpad_doc(
3440 "Left-align strings by padding with a given character",
3441 ("For each string in `strings`, emit a left-aligned string by appending \n"
3442 "the given UTF8 codeunit.\nNull values emit null."),
3443 {"strings"}, "PadOptions");
3444
3445 const FunctionDoc ascii_center_doc(
3446 utf8_center_doc.description + "",
3447 ("For each string in `strings`, emit a centered string by padding both sides \n"
3448 "with the given ASCII character.\nNull values emit null."),
3449 {"strings"}, "PadOptions");
3450
3451 const FunctionDoc ascii_lpad_doc(
3452 utf8_lpad_doc.description + "",
3453 ("For each string in `strings`, emit a right-aligned string by prepending \n"
3454 "the given ASCII character.\nNull values emit null."),
3455 {"strings"}, "PadOptions");
3456
3457 const FunctionDoc ascii_rpad_doc(
3458 utf8_rpad_doc.description + "",
3459 ("For each string in `strings`, emit a left-aligned string by appending \n"
3460 "the given ASCII character.\nNull values emit null."),
3461 {"strings"}, "PadOptions");
3462
3463 const FunctionDoc utf8_trim_whitespace_doc(
3464 "Trim leading and trailing whitespace characters",
3465 ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
3466 "characters removed, where whitespace characters are defined by the Unicode\n"
3467 "standard. Null values emit null."),
3468 {"strings"});
3469
3470 const FunctionDoc utf8_ltrim_whitespace_doc(
3471 "Trim leading whitespace characters",
3472 ("For each string in `strings`, emit a string with leading whitespace\n"
3473 "characters removed, where whitespace characters are defined by the Unicode\n"
3474 "standard. Null values emit null."),
3475 {"strings"});
3476
3477 const FunctionDoc utf8_rtrim_whitespace_doc(
3478 "Trim trailing whitespace characters",
3479 ("For each string in `strings`, emit a string with trailing whitespace\n"
3480 "characters removed, where whitespace characters are defined by the Unicode\n"
3481 "standard. Null values emit null."),
3482 {"strings"});
3483
3484 const FunctionDoc ascii_trim_whitespace_doc(
3485 "Trim leading and trailing ASCII whitespace characters",
3486 ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
3487 "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
3488 "whitespace characters. Null values emit null."),
3489 {"strings"});
3490
3491 const FunctionDoc ascii_ltrim_whitespace_doc(
3492 "Trim leading ASCII whitespace characters",
3493 ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
3494 "characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
3495 "whitespace characters. Null values emit null."),
3496 {"strings"});
3497
3498 const FunctionDoc ascii_rtrim_whitespace_doc(
3499 "Trim trailing ASCII whitespace characters",
3500 ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
3501 "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
3502 "whitespace characters. Null values emit null."),
3503 {"strings"});
3504
3505 const FunctionDoc utf8_trim_doc(
3506 "Trim leading and trailing characters present in the `characters` arguments",
3507 ("For each string in `strings`, emit a string with leading and trailing\n"
3508 "characters removed that are present in the `characters` argument. Null values\n"
3509 "emit null."),
3510 {"strings"}, "TrimOptions");
3511
3512 const FunctionDoc utf8_ltrim_doc(
3513 "Trim leading characters present in the `characters` arguments",
3514 ("For each string in `strings`, emit a string with leading\n"
3515 "characters removed that are present in the `characters` argument. Null values\n"
3516 "emit null."),
3517 {"strings"}, "TrimOptions");
3518
3519 const FunctionDoc utf8_rtrim_doc(
3520 "Trim trailing characters present in the `characters` arguments",
3521 ("For each string in `strings`, emit a string with leading "
3522 "characters removed that are present in the `characters` argument. Null values\n"
3523 "emit null."),
3524 {"strings"}, "TrimOptions");
3525
3526 const FunctionDoc ascii_trim_doc(
3527 utf8_trim_doc.summary + "",
3528 utf8_trim_doc.description +
3529 ("\nBoth the input string as the `characters` argument are interepreted as\n"
3530 "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3531 {"strings"}, "TrimOptions");
3532
3533 const FunctionDoc ascii_ltrim_doc(
3534 utf8_ltrim_doc.summary + "",
3535 utf8_ltrim_doc.description +
3536 ("\nBoth the input string as the `characters` argument are interepreted as\n"
3537 "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3538 {"strings"}, "TrimOptions");
3539
3540 const FunctionDoc ascii_rtrim_doc(
3541 utf8_rtrim_doc.summary + "",
3542 utf8_rtrim_doc.description +
3543 ("\nBoth the input string as the `characters` argument are interepreted as\n"
3544 "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3545 {"strings"}, "TrimOptions");
3546
3547 const FunctionDoc strptime_doc(
3548 "Parse timestamps",
3549 ("For each string in `strings`, parse it as a timestamp.\n"
3550 "The timestamp unit and the expected string pattern must be given\n"
3551 "in StrptimeOptions. Null inputs emit null. If a non-null string\n"
3552 "fails parsing, an error is returned."),
3553 {"strings"}, "StrptimeOptions");
3554
3555 const FunctionDoc binary_length_doc(
3556 "Compute string lengths",
3557 ("For each string in `strings`, emit the number of bytes. Null values emit null."),
3558 {"strings"});
3559
3560 const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
3561 ("For each string in `strings`, emit the number of "
3562 "UTF8 characters. Null values emit null."),
3563 {"strings"});
3564
3565 void AddStrptime(FunctionRegistry* registry) {
3566 auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
3567 DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
3568 StrptimeExec<StringType>, StrptimeState::Init));
3569 DCHECK_OK(func->AddKernel({large_utf8()}, OutputType(StrptimeResolve),
3570 StrptimeExec<LargeStringType>, StrptimeState::Init));
3571 DCHECK_OK(registry->AddFunction(std::move(func)));
3572 }
3573
3574 void AddBinaryLength(FunctionRegistry* registry) {
3575 auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
3576 &binary_length_doc);
3577 ArrayKernelExec exec_offset_32 =
3578 applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
3579 ArrayKernelExec exec_offset_64 =
3580 applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, BinaryLength>::Exec;
3581 for (const auto& input_type : {binary(), utf8()}) {
3582 DCHECK_OK(func->AddKernel({input_type}, int32(), exec_offset_32));
3583 }
3584 for (const auto& input_type : {large_binary(), large_utf8()}) {
3585 DCHECK_OK(func->AddKernel({input_type}, int64(), exec_offset_64));
3586 }
3587 DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
3588 BinaryLength::FixedSizeExec));
3589 DCHECK_OK(registry->AddFunction(std::move(func)));
3590 }
3591
3592 void AddUtf8Length(FunctionRegistry* registry) {
3593 auto func =
3594 std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
3595
3596 ArrayKernelExec exec_offset_32 =
3597 applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
3598 DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
3599
3600 ArrayKernelExec exec_offset_64 =
3601 applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
3602 DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
3603
3604 DCHECK_OK(registry->AddFunction(std::move(func)));
3605 }
3606
3607 template <typename BinaryType, typename ListType>
3608 struct BinaryJoin {
3609 using ArrayType = typename TypeTraits<BinaryType>::ArrayType;
3610 using ListArrayType = typename TypeTraits<ListType>::ArrayType;
3611 using ListScalarType = typename TypeTraits<ListType>::ScalarType;
3612 using ListOffsetType = typename ListArrayType::offset_type;
3613 using BuilderType = typename TypeTraits<BinaryType>::BuilderType;
3614
3615 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
3616 if (batch[0].kind() == Datum::SCALAR) {
3617 if (batch[1].kind() == Datum::SCALAR) {
3618 return ExecScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
3619 }
3620 DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
3621 return ExecScalarArray(ctx, *batch[0].scalar(), batch[1].array(), out);
3622 }
3623 DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
3624 if (batch[1].kind() == Datum::SCALAR) {
3625 return ExecArrayScalar(ctx, batch[0].array(), *batch[1].scalar(), out);
3626 }
3627 DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
3628 return ExecArrayArray(ctx, batch[0].array(), batch[1].array(), out);
3629 }
3630
3631 struct ListScalarOffsetLookup {
3632 const ArrayType& values;
3633
3634 int64_t GetStart(int64_t i) { return 0; }
3635 int64_t GetStop(int64_t i) { return values.length(); }
3636 bool IsNull(int64_t i) { return false; }
3637 };
3638
3639 struct ListArrayOffsetLookup {
3640 explicit ListArrayOffsetLookup(const ListArrayType& lists)
3641 : lists_(lists), offsets_(lists.raw_value_offsets()) {}
3642
3643 int64_t GetStart(int64_t i) { return offsets_[i]; }
3644 int64_t GetStop(int64_t i) { return offsets_[i + 1]; }
3645 bool IsNull(int64_t i) { return lists_.IsNull(i); }
3646
3647 private:
3648 const ListArrayType& lists_;
3649 const ListOffsetType* offsets_;
3650 };
3651
3652 struct SeparatorScalarLookup {
3653 const util::string_view separator;
3654
3655 bool IsNull(int64_t i) { return false; }
3656 util::string_view GetView(int64_t i) { return separator; }
3657 };
3658
3659 struct SeparatorArrayLookup {
3660 const ArrayType& separators;
3661
3662 bool IsNull(int64_t i) { return separators.IsNull(i); }
3663 util::string_view GetView(int64_t i) { return separators.GetView(i); }
3664 };
3665
3666 // Scalar, scalar -> scalar
3667 static Status ExecScalarScalar(KernelContext* ctx, const Scalar& left,
3668 const Scalar& right, Datum* out) {
3669 const auto& list = checked_cast<const ListScalarType&>(left);
3670 const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
3671 if (!list.is_valid || !separator_scalar.is_valid) {
3672 return Status::OK();
3673 }
3674 util::string_view separator(*separator_scalar.value);
3675
3676 const auto& strings = checked_cast<const ArrayType&>(*list.value);
3677 if (strings.null_count() > 0) {
3678 out->scalar()->is_valid = false;
3679 return Status::OK();
3680 }
3681
3682 TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
3683 auto Append = [&](util::string_view value) {
3684 return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
3685 static_cast<int64_t>(value.size()));
3686 };
3687 if (strings.length() > 0) {
3688 auto data_length =
3689 strings.total_values_length() + (strings.length() - 1) * separator.length();
3690 RETURN_NOT_OK(builder.Reserve(data_length));
3691 RETURN_NOT_OK(Append(strings.GetView(0)));
3692 for (int64_t j = 1; j < strings.length(); j++) {
3693 RETURN_NOT_OK(Append(separator));
3694 RETURN_NOT_OK(Append(strings.GetView(j)));
3695 }
3696 }
3697 auto out_scalar = checked_cast<BaseBinaryScalar*>(out->scalar().get());
3698 return builder.Finish(&out_scalar->value);
3699 }
3700
3701 // Scalar, array -> array
3702 static Status ExecScalarArray(KernelContext* ctx, const Scalar& left,
3703 const std::shared_ptr<ArrayData>& right, Datum* out) {
3704 const auto& list_scalar = checked_cast<const BaseListScalar&>(left);
3705 if (!list_scalar.is_valid) {
3706 ARROW_ASSIGN_OR_RAISE(
3707 auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
3708 *out = *nulls->data();
3709 return Status::OK();
3710 }
3711 const auto& strings = checked_cast<const ArrayType&>(*list_scalar.value);
3712 if (strings.null_count() != 0) {
3713 ARROW_ASSIGN_OR_RAISE(
3714 auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
3715 *out = *nulls->data();
3716 return Status::OK();
3717 }
3718 const ArrayType separators(right);
3719
3720 BuilderType builder(ctx->memory_pool());
3721 RETURN_NOT_OK(builder.Reserve(separators.length()));
3722
3723 // Presize data to avoid multiple reallocations when joining strings
3724 int64_t total_data_length = 0;
3725 const int64_t list_length = strings.length();
3726 if (list_length) {
3727 const int64_t string_length = strings.total_values_length();
3728 total_data_length +=
3729 string_length * (separators.length() - separators.null_count());
3730 for (int64_t i = 0; i < separators.length(); ++i) {
3731 if (separators.IsNull(i)) {
3732 continue;
3733 }
3734 total_data_length += (list_length - 1) * separators.value_length(i);
3735 }
3736 }
3737 RETURN_NOT_OK(builder.ReserveData(total_data_length));
3738
3739 return JoinStrings(separators.length(), strings, ListScalarOffsetLookup{strings},
3740 SeparatorArrayLookup{separators}, &builder, out);
3741 }
3742
3743 // Array, scalar -> array
3744 static Status ExecArrayScalar(KernelContext* ctx,
3745 const std::shared_ptr<ArrayData>& left,
3746 const Scalar& right, Datum* out) {
3747 const ListArrayType lists(left);
3748 const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
3749
3750 if (!separator_scalar.is_valid) {
3751 ARROW_ASSIGN_OR_RAISE(
3752 auto nulls,
3753 MakeArrayOfNull(lists.value_type(), lists.length(), ctx->memory_pool()));
3754 *out = *nulls->data();
3755 return Status::OK();
3756 }
3757
3758 util::string_view separator(*separator_scalar.value);
3759 const auto& strings = checked_cast<const ArrayType&>(*lists.values());
3760 const auto list_offsets = lists.raw_value_offsets();
3761
3762 BuilderType builder(ctx->memory_pool());
3763 RETURN_NOT_OK(builder.Reserve(lists.length()));
3764
3765 // Presize data to avoid multiple reallocations when joining strings
3766 int64_t total_data_length = strings.total_values_length();
3767 for (int64_t i = 0; i < lists.length(); ++i) {
3768 const auto start = list_offsets[i], end = list_offsets[i + 1];
3769 if (end > start && !ValuesContainNull(strings, start, end)) {
3770 total_data_length += (end - start - 1) * separator.length();
3771 }
3772 }
3773 RETURN_NOT_OK(builder.ReserveData(total_data_length));
3774
3775 return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
3776 SeparatorScalarLookup{separator}, &builder, out);
3777 }
3778
3779 // Array, array -> array
3780 static Status ExecArrayArray(KernelContext* ctx, const std::shared_ptr<ArrayData>& left,
3781 const std::shared_ptr<ArrayData>& right, Datum* out) {
3782 const ListArrayType lists(left);
3783 const auto& strings = checked_cast<const ArrayType&>(*lists.values());
3784 const auto list_offsets = lists.raw_value_offsets();
3785 const auto string_offsets = strings.raw_value_offsets();
3786 const ArrayType separators(right);
3787
3788 BuilderType builder(ctx->memory_pool());
3789 RETURN_NOT_OK(builder.Reserve(lists.length()));
3790
3791 // Presize data to avoid multiple reallocations when joining strings
3792 int64_t total_data_length = 0;
3793 for (int64_t i = 0; i < lists.length(); ++i) {
3794 if (separators.IsNull(i)) {
3795 continue;
3796 }
3797 const auto start = list_offsets[i], end = list_offsets[i + 1];
3798 if (end > start && !ValuesContainNull(strings, start, end)) {
3799 total_data_length += string_offsets[end] - string_offsets[start];
3800 total_data_length += (end - start - 1) * separators.value_length(i);
3801 }
3802 }
3803 RETURN_NOT_OK(builder.ReserveData(total_data_length));
3804
3805 struct SeparatorLookup {
3806 const ArrayType& separators;
3807
3808 bool IsNull(int64_t i) { return separators.IsNull(i); }
3809 util::string_view GetView(int64_t i) { return separators.GetView(i); }
3810 };
3811 return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
3812 SeparatorArrayLookup{separators}, &builder, out);
3813 }
3814
3815 template <typename ListOffsetLookup, typename SeparatorLookup>
3816 static Status JoinStrings(int64_t length, const ArrayType& strings,
3817 ListOffsetLookup&& list_offsets, SeparatorLookup&& separators,
3818 BuilderType* builder, Datum* out) {
3819 for (int64_t i = 0; i < length; ++i) {
3820 if (list_offsets.IsNull(i) || separators.IsNull(i)) {
3821 builder->UnsafeAppendNull();
3822 continue;
3823 }
3824 const auto j_start = list_offsets.GetStart(i), j_end = list_offsets.GetStop(i);
3825 if (j_start == j_end) {
3826 builder->UnsafeAppendEmptyValue();
3827 continue;
3828 }
3829 if (ValuesContainNull(strings, j_start, j_end)) {
3830 builder->UnsafeAppendNull();
3831 continue;
3832 }
3833 builder->UnsafeAppend(strings.GetView(j_start));
3834 for (int64_t j = j_start + 1; j < j_end; ++j) {
3835 builder->UnsafeExtendCurrent(separators.GetView(i));
3836 builder->UnsafeExtendCurrent(strings.GetView(j));
3837 }
3838 }
3839
3840 std::shared_ptr<Array> string_array;
3841 RETURN_NOT_OK(builder->Finish(&string_array));
3842 *out = *string_array->data();
3843 // Correct the output type based on the input
3844 out->mutable_array()->type = strings.type();
3845 return Status::OK();
3846 }
3847
3848 static bool ValuesContainNull(const ArrayType& values, int64_t start, int64_t end) {
3849 if (values.null_count() == 0) {
3850 return false;
3851 }
3852 for (int64_t i = start; i < end; ++i) {
3853 if (values.IsNull(i)) {
3854 return true;
3855 }
3856 }
3857 return false;
3858 }
3859 };
3860
3861 using BinaryJoinElementWiseState = OptionsWrapper<JoinOptions>;
3862
3863 template <typename Type>
3864 struct BinaryJoinElementWise {
3865 using ArrayType = typename TypeTraits<Type>::ArrayType;
3866 using BuilderType = typename TypeTraits<Type>::BuilderType;
3867 using offset_type = typename Type::offset_type;
3868
3869 static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
3870 JoinOptions options = BinaryJoinElementWiseState::Get(ctx);
3871 // Last argument is the separator (for consistency with binary_join)
3872 if (std::all_of(batch.values.begin(), batch.values.end(),
3873 [](const Datum& d) { return d.is_scalar(); })) {
3874 return ExecOnlyScalar(ctx, options, batch, out);
3875 }
3876 return ExecContainingArrays(ctx, options, batch, out);
3877 }
3878
3879 static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options,
3880 const ExecBatch& batch, Datum* out) {
3881 BaseBinaryScalar* output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
3882 const size_t num_args = batch.values.size();
3883 if (num_args == 1) {
3884 // Only separator, no values
3885 output->is_valid = batch.values[0].scalar()->is_valid;
3886 if (output->is_valid) {
3887 ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
3888 }
3889 return Status::OK();
3890 }
3891
3892 int64_t final_size = CalculateRowSize(options, batch, 0);
3893 if (final_size < 0) {
3894 output->is_valid = false;
3895 return Status::OK();
3896 }
3897 ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
3898 const auto separator = UnboxScalar<Type>::Unbox(*batch.values.back().scalar());
3899 uint8_t* buf = output->value->mutable_data();
3900 bool first = true;
3901 for (size_t i = 0; i < num_args - 1; i++) {
3902 const Scalar& scalar = *batch[i].scalar();
3903 util::string_view s;
3904 if (scalar.is_valid) {
3905 s = UnboxScalar<Type>::Unbox(scalar);
3906 } else {
3907 switch (options.null_handling) {
3908 case JoinOptions::EMIT_NULL:
3909 // Handled by CalculateRowSize
3910 DCHECK(false) << "unreachable";
3911 break;
3912 case JoinOptions::SKIP:
3913 continue;
3914 case JoinOptions::REPLACE:
3915 s = options.null_replacement;
3916 break;
3917 }
3918 }
3919 if (!first) {
3920 buf = std::copy(separator.begin(), separator.end(), buf);
3921 }
3922 first = false;
3923 buf = std::copy(s.begin(), s.end(), buf);
3924 }
3925 output->is_valid = true;
3926 DCHECK_EQ(final_size, buf - output->value->mutable_data());
3927 return Status::OK();
3928 }
3929
3930 static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options,
3931 const ExecBatch& batch, Datum* out) {
3932 // Presize data to avoid reallocations
3933 int64_t final_size = 0;
3934 for (int64_t i = 0; i < batch.length; i++) {
3935 auto size = CalculateRowSize(options, batch, i);
3936 if (size > 0) final_size += size;
3937 }
3938 BuilderType builder(ctx->memory_pool());
3939 RETURN_NOT_OK(builder.Reserve(batch.length));
3940 RETURN_NOT_OK(builder.ReserveData(final_size));
3941
3942 std::vector<util::string_view> valid_cols(batch.values.size());
3943 for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
3944 size_t num_valid = 0; // Not counting separator
3945 for (size_t col = 0; col < batch.values.size(); col++) {
3946 if (batch[col].is_scalar()) {
3947 const auto& scalar = *batch[col].scalar();
3948 if (scalar.is_valid) {
3949 valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
3950 if (col < batch.values.size() - 1) num_valid++;
3951 } else {
3952 valid_cols[col] = util::string_view();
3953 }
3954 } else {
3955 const ArrayData& array = *batch[col].array();
3956 if (!array.MayHaveNulls() ||
3957 BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) {
3958 const offset_type* offsets = array.GetValues<offset_type>(1);
3959 const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0);
3960 const int64_t length = offsets[row + 1] - offsets[row];
3961 valid_cols[col] = util::string_view(
3962 reinterpret_cast<const char*>(data + offsets[row]), length);
3963 if (col < batch.values.size() - 1) num_valid++;
3964 } else {
3965 valid_cols[col] = util::string_view();
3966 }
3967 }
3968 }
3969
3970 if (!valid_cols.back().data()) {
3971 // Separator is null
3972 builder.UnsafeAppendNull();
3973 continue;
3974 } else if (batch.values.size() == 1) {
3975 // Only given separator
3976 builder.UnsafeAppendEmptyValue();
3977 continue;
3978 } else if (num_valid < batch.values.size() - 1) {
3979 // We had some nulls
3980 if (options.null_handling == JoinOptions::EMIT_NULL) {
3981 builder.UnsafeAppendNull();
3982 continue;
3983 }
3984 }
3985 const auto separator = valid_cols.back();
3986 bool first = true;
3987 for (size_t col = 0; col < batch.values.size() - 1; col++) {
3988 util::string_view value = valid_cols[col];
3989 if (!value.data()) {
3990 switch (options.null_handling) {
3991 case JoinOptions::EMIT_NULL:
3992 DCHECK(false) << "unreachable";
3993 break;
3994 case JoinOptions::SKIP:
3995 continue;
3996 case JoinOptions::REPLACE:
3997 value = options.null_replacement;
3998 break;
3999 }
4000 }
4001 if (first) {
4002 builder.UnsafeAppend(value);
4003 first = false;
4004 continue;
4005 }
4006 builder.UnsafeExtendCurrent(separator);
4007 builder.UnsafeExtendCurrent(value);
4008 }
4009 }
4010
4011 std::shared_ptr<Array> string_array;
4012 RETURN_NOT_OK(builder.Finish(&string_array));
4013 *out = *string_array->data();
4014 out->mutable_array()->type = batch[0].type();
4015 DCHECK_EQ(batch.length, out->array()->length);
4016 DCHECK_EQ(final_size,
4017 checked_cast<const ArrayType&>(*string_array).total_values_length());
4018 return Status::OK();
4019 }
4020
4021 // Compute the length of the output for the given position, or -1 if it would be null.
4022 static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch,
4023 const int64_t index) {
4024 const auto num_args = batch.values.size();
4025 int64_t final_size = 0;
4026 int64_t num_non_null_args = 0;
4027 for (size_t i = 0; i < num_args; i++) {
4028 int64_t element_size = 0;
4029 bool valid = true;
4030 if (batch[i].is_scalar()) {
4031 const Scalar& scalar = *batch[i].scalar();
4032 valid = scalar.is_valid;
4033 element_size = UnboxScalar<Type>::Unbox(scalar).size();
4034 } else {
4035 const ArrayData& array = *batch[i].array();
4036 valid = !array.MayHaveNulls() ||
4037 BitUtil::GetBit(array.buffers[0]->data(), array.offset + index);
4038 const offset_type* offsets = array.GetValues<offset_type>(1);
4039 element_size = offsets[index + 1] - offsets[index];
4040 }
4041 if (i == num_args - 1) {
4042 if (!valid) return -1;
4043 if (num_non_null_args > 1) {
4044 // Add separator size (only if there were values to join)
4045 final_size += (num_non_null_args - 1) * element_size;
4046 }
4047 break;
4048 }
4049 if (!valid) {
4050 switch (options.null_handling) {
4051 case JoinOptions::EMIT_NULL:
4052 return -1;
4053 case JoinOptions::SKIP:
4054 continue;
4055 case JoinOptions::REPLACE:
4056 element_size = options.null_replacement.size();
4057 break;
4058 }
4059 }
4060 num_non_null_args++;
4061 final_size += element_size;
4062 }
4063 return final_size;
4064 }
4065 };
4066
4067 const FunctionDoc binary_join_doc(
4068 "Join a list of strings together with a `separator` to form a single string",
4069 ("Insert `separator` between `list` elements, and concatenate them.\n"
4070 "Any null input and any null `list` element emits a null output.\n"),
4071 {"list", "separator"});
4072
4073 const FunctionDoc binary_join_element_wise_doc(
4074 "Join string arguments into one, using the last argument as the separator",
4075 ("Insert the last argument of `strings` between the rest of the elements, "
4076 "and concatenate them.\n"
4077 "Any null separator element emits a null output. Null elements either "
4078 "emit a null (the default), are skipped, or replaced with a given string.\n"),
4079 {"*strings"}, "JoinOptions");
4080
4081 const auto kDefaultJoinOptions = JoinOptions::Defaults();
4082
4083 template <typename ListType>
4084 void AddBinaryJoinForListType(ScalarFunction* func) {
4085 for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
4086 auto exec = GenerateTypeAgnosticVarBinaryBase<BinaryJoin, ListType>(*ty);
4087 auto list_ty = std::make_shared<ListType>(ty);
4088 DCHECK_OK(func->AddKernel({InputType(list_ty), InputType(ty)}, ty, exec));
4089 }
4090 }
4091
4092 void AddBinaryJoin(FunctionRegistry* registry) {
4093 {
4094 auto func = std::make_shared<ScalarFunction>("binary_join", Arity::Binary(),
4095 &binary_join_doc);
4096 AddBinaryJoinForListType<ListType>(func.get());
4097 AddBinaryJoinForListType<LargeListType>(func.get());
4098 DCHECK_OK(registry->AddFunction(std::move(func)));
4099 }
4100 {
4101 auto func = std::make_shared<ScalarFunction>(
4102 "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
4103 &binary_join_element_wise_doc, &kDefaultJoinOptions);
4104 for (const auto& ty : BaseBinaryTypes()) {
4105 ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
4106 GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
4107 BinaryJoinElementWiseState::Init};
4108 kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
4109 kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
4110 DCHECK_OK(func->AddKernel(std::move(kernel)));
4111 }
4112 DCHECK_OK(registry->AddFunction(std::move(func)));
4113 }
4114 }
4115
4116 template <template <typename> class ExecFunctor>
4117 void MakeUnaryStringBatchKernel(
4118 std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
4119 MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
4120 auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4121 {
4122 auto exec_32 = ExecFunctor<StringType>::Exec;
4123 ScalarKernel kernel{{utf8()}, utf8(), exec_32};
4124 kernel.mem_allocation = mem_allocation;
4125 DCHECK_OK(func->AddKernel(std::move(kernel)));
4126 }
4127 {
4128 auto exec_64 = ExecFunctor<LargeStringType>::Exec;
4129 ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
4130 kernel.mem_allocation = mem_allocation;
4131 DCHECK_OK(func->AddKernel(std::move(kernel)));
4132 }
4133 DCHECK_OK(registry->AddFunction(std::move(func)));
4134 }
4135
4136 template <template <typename> class ExecFunctor>
4137 void MakeUnaryStringBatchKernelWithState(
4138 std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
4139 MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
4140 auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4141 {
4142 using t32 = ExecFunctor<StringType>;
4143 ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
4144 kernel.mem_allocation = mem_allocation;
4145 DCHECK_OK(func->AddKernel(std::move(kernel)));
4146 }
4147 {
4148 using t64 = ExecFunctor<LargeStringType>;
4149 ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
4150 kernel.mem_allocation = mem_allocation;
4151 DCHECK_OK(func->AddKernel(std::move(kernel)));
4152 }
4153 DCHECK_OK(registry->AddFunction(std::move(func)));
4154 }
4155
4156 #ifdef ARROW_WITH_UTF8PROC
4157
4158 template <template <typename> class Transformer>
4159 void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
4160 const FunctionDoc* doc) {
4161 auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4162 ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
4163 ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
4164 DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
4165 DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_64));
4166 DCHECK_OK(registry->AddFunction(std::move(func)));
4167 }
4168
4169 #endif
4170
4171 // NOTE: Predicate should only populate 'status' with errors,
4172 // leave it unmodified to indicate Status::OK()
4173 using StringPredicate =
4174 std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
4175
4176 template <typename Type>
4177 Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
4178 StringPredicate predicate, Datum* out) {
4179 Status st = Status::OK();
4180 EnsureLookupTablesFilled();
4181 if (batch[0].kind() == Datum::ARRAY) {
4182 const ArrayData& input = *batch[0].array();
4183 ArrayIterator<Type> input_it(input);
4184 ArrayData* out_arr = out->mutable_array();
4185 ::arrow::internal::GenerateBitsUnrolled(
4186 out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
4187 [&]() -> bool {
4188 util::string_view val = input_it();
4189 return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size(),
4190 &st);
4191 });
4192 } else {
4193 const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
4194 if (input.is_valid) {
4195 bool boolean_result = predicate(ctx, input.value->data(),
4196 static_cast<size_t>(input.value->size()), &st);
4197 // UTF decoding can lead to issues
4198 if (st.ok()) {
4199 out->value = std::make_shared<BooleanScalar>(boolean_result);
4200 }
4201 }
4202 }
4203 return st;
4204 }
4205
4206 template <typename Predicate>
4207 void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
4208 const FunctionDoc* doc) {
4209 auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4210 auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
4211 return ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
4212 };
4213 auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
4214 return ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
4215 };
4216 DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32)));
4217 DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64)));
4218 DCHECK_OK(registry->AddFunction(std::move(func)));
4219 }
4220
4221 FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
4222 return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
4223 }
4224
4225 FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
4226 bool non_empty) {
4227 std::string summary, description;
4228 {
4229 std::stringstream ss;
4230 ss << "Classify strings as " << class_summary;
4231 summary = ss.str();
4232 }
4233 {
4234 std::stringstream ss;
4235 if (non_empty) {
4236 ss
4237 << ("For each string in `strings`, emit true iff the string is non-empty\n"
4238 "and consists only of ");
4239 } else {
4240 ss
4241 << ("For each string in `strings`, emit true iff the string consists only\n"
4242 "of ");
4243 }
4244 ss << class_desc << ". Null strings emit null.";
4245 description = ss.str();
4246 }
4247 return StringPredicateDoc(std::move(summary), std::move(description));
4248 }
4249
4250 const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);
4251
4252 const auto ascii_is_alnum_doc =
4253 StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
4254 const auto ascii_is_alpha_doc =
4255 StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
4256 const auto ascii_is_decimal_doc =
4257 StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
4258 const auto ascii_is_lower_doc =
4259 StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
4260 const auto ascii_is_printable_doc =
4261 StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
4262 const auto ascii_is_space_doc =
4263 StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
4264 const auto ascii_is_upper_doc =
4265 StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
4266
4267 const auto ascii_is_title_doc = StringPredicateDoc(
4268 "Classify strings as ASCII titlecase",
4269 ("For each string in `strings`, emit true iff the string is title-cased,\n"
4270 "i.e. it has at least one cased character, each uppercase character\n"
4271 "follows an uncased character, and each lowercase character follows\n"
4272 "an uppercase character.\n"));
4273
4274 const auto utf8_is_alnum_doc =
4275 StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
4276 const auto utf8_is_alpha_doc =
4277 StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
4278 const auto utf8_is_decimal_doc =
4279 StringClassifyDoc("decimal", "decimal Unicode characters", true);
4280 const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
4281 const auto utf8_is_lower_doc =
4282 StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
4283 const auto utf8_is_numeric_doc =
4284 StringClassifyDoc("numeric", "numeric Unicode characters", true);
4285 const auto utf8_is_printable_doc =
4286 StringClassifyDoc("printable", "printable Unicode characters", true);
4287 const auto utf8_is_space_doc =
4288 StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
4289 const auto utf8_is_upper_doc =
4290 StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
4291
4292 const auto utf8_is_title_doc = StringPredicateDoc(
4293 "Classify strings as titlecase",
4294 ("For each string in `strings`, emit true iff the string is title-cased,\n"
4295 "i.e. it has at least one cased character, each uppercase character\n"
4296 "follows an uncased character, and each lowercase character follows\n"
4297 "an uppercase character.\n"));
4298
4299 const FunctionDoc ascii_upper_doc(
4300 "Transform ASCII input to uppercase",
4301 ("For each string in `strings`, return an uppercase version.\n\n"
4302 "This function assumes the input is fully ASCII. It it may contain\n"
4303 "non-ASCII characters, use \"utf8_upper\" instead."),
4304 {"strings"});
4305
4306 const FunctionDoc ascii_lower_doc(
4307 "Transform ASCII input to lowercase",
4308 ("For each string in `strings`, return a lowercase version.\n\n"
4309 "This function assumes the input is fully ASCII. If it may contain\n"
4310 "non-ASCII characters, use \"utf8_lower\" instead."),
4311 {"strings"});
4312
4313 const FunctionDoc ascii_swapcase_doc(
4314 "Transform ASCII input lowercase characters to uppercase and uppercase characters to "
4315 "lowercase",
4316 ("For each string in `strings`, return a string with opposite casing.\n\n"
4317 "This function assumes the input is fully ASCII. If it may contain\n"
4318 "non-ASCII characters, use \"utf8_swapcase\" instead."),
4319 {"strings"});
4320
4321 const FunctionDoc ascii_capitalize_doc(
4322 "Capitalize the first character of ASCII input",
4323 ("For each string in `strings`, return a capitalized version.\n\n"
4324 "This function assumes the input is fully ASCII. If it may contain\n"
4325 "non-ASCII characters, use \"utf8_capitalize\" instead."),
4326 {"strings"});
4327
4328 const FunctionDoc ascii_title_doc(
4329 "Titlecase each word of ASCII input",
4330 ("For each string in `strings`, return a titlecased version.\n"
4331 "Each word in the output will start with an uppercase character and its\n"
4332 "remaining characters will be lowercase.\n\n"
4333 "This function assumes the input is fully ASCII. If it may contain\n"
4334 "non-ASCII characters, use \"utf8_title\" instead."),
4335 {"strings"});
4336
4337 const FunctionDoc ascii_reverse_doc(
4338 "Reverse ASCII input",
4339 ("For each ASCII string in `strings`, return a reversed version.\n\n"
4340 "This function assumes the input is fully ASCII. If it may contain\n"
4341 "non-ASCII characters, use \"utf8_reverse\" instead."),
4342 {"strings"});
4343
4344 const FunctionDoc utf8_upper_doc(
4345 "Transform input to uppercase",
4346 ("For each string in `strings`, return an uppercase version."), {"strings"});
4347
4348 const FunctionDoc utf8_lower_doc(
4349 "Transform input to lowercase",
4350 ("For each string in `strings`, return a lowercase version."), {"strings"});
4351
4352 const FunctionDoc utf8_swapcase_doc(
4353 "Transform input lowercase characters to uppercase and uppercase characters to "
4354 "lowercase",
4355 ("For each string in `strings`, return an opposite case version."), {"strings"});
4356
4357 const FunctionDoc utf8_capitalize_doc(
4358 "Capitalize the first character of input",
4359 ("For each string in `strings`, return a capitalized version,\n"
4360 "with the first character uppercased and the others lowercased."),
4361 {"strings"});
4362
4363 const FunctionDoc utf8_title_doc(
4364 "Titlecase each word of input",
4365 ("For each string in `strings`, return a titlecased version.\n"
4366 "Each word in the output will start with an uppercase character and its\n"
4367 "remaining characters will be lowercase."),
4368 {"strings"});
4369
4370 const FunctionDoc utf8_reverse_doc(
4371 "Reverse input",
4372 ("For each string in `strings`, return a reversed version.\n\n"
4373 "This function operates on Unicode codepoints, not grapheme\n"
4374 "clusters. Hence, it will not correctly reverse grapheme clusters\n"
4375 "composed of multiple codepoints."),
4376 {"strings"});
4377
4378 } // namespace
4379
4380 void RegisterScalarStringAscii(FunctionRegistry* registry) {
4381 // Some kernels are able to reuse the original offsets buffer, so don't
4382 // preallocate them in the output. Only kernels that invoke
4383 // "StringDataTransform" support no preallocation.
4384 MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
4385 MemAllocation::NO_PREALLOCATE);
4386 MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
4387 MemAllocation::NO_PREALLOCATE);
4388 MakeUnaryStringBatchKernel<AsciiSwapCase>(
4389 "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE);
4390 MakeUnaryStringBatchKernel<AsciiCapitalize>("ascii_capitalize", registry,
4391 &ascii_capitalize_doc);
4392 MakeUnaryStringBatchKernel<AsciiTitle>("ascii_title", registry, &ascii_title_doc);
4393 MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
4394 &ascii_trim_whitespace_doc);
4395 MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
4396 &ascii_ltrim_whitespace_doc);
4397 MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
4398 &ascii_rtrim_whitespace_doc);
4399 MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
4400 MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
4401
4402 MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
4403 &ascii_center_doc);
4404 MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
4405 MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
4406 MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
4407 &utf8_center_doc);
4408 MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
4409 MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);
4410
4411 MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
4412 MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
4413 &ascii_ltrim_doc);
4414 MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
4415 &ascii_rtrim_doc);
4416
4417 AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);
4418
4419 AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
4420 &ascii_is_alnum_doc);
4421 AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
4422 AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
4423 &ascii_is_decimal_doc);
4424 // no is_digit for ascii, since it is the same as is_decimal
4425 AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
4426 // no is_numeric for ascii, since it is the same as is_decimal
4427 AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
4428 &ascii_is_printable_doc);
4429 AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
4430 AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
4431 AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);
4432
4433 #ifdef ARROW_WITH_UTF8PROC
4434 MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
4435 MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
4436 MakeUnaryStringUTF8TransformKernel<UTF8SwapCase>("utf8_swapcase", registry,
4437 &utf8_swapcase_doc);
4438 MakeUnaryStringBatchKernel<Utf8Capitalize>("utf8_capitalize", registry,
4439 &utf8_capitalize_doc);
4440 MakeUnaryStringBatchKernel<Utf8Title>("utf8_title", registry, &utf8_title_doc);
4441 MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
4442 &utf8_trim_whitespace_doc);
4443 MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
4444 &utf8_ltrim_whitespace_doc);
4445 MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
4446 &utf8_rtrim_whitespace_doc);
4447 MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
4448 MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
4449 MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);
4450
4451 AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
4452 &utf8_is_alnum_doc);
4453 AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
4454 AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
4455 &utf8_is_decimal_doc);
4456 AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
4457 AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
4458 AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
4459 &utf8_is_numeric_doc);
4460 AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
4461 &utf8_is_printable_doc);
4462 AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
4463 AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
4464 AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
4465 #endif
4466
4467 AddBinaryLength(registry);
4468 AddUtf8Length(registry);
4469 AddMatchSubstring(registry);
4470 AddFindSubstring(registry);
4471 AddCountSubstring(registry);
4472 MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
4473 "replace_substring", registry, &replace_substring_doc,
4474 MemAllocation::NO_PREALLOCATE);
4475 #ifdef ARROW_WITH_RE2
4476 MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
4477 "replace_substring_regex", registry, &replace_substring_regex_doc,
4478 MemAllocation::NO_PREALLOCATE);
4479 AddExtractRegex(registry);
4480 #endif
4481 AddReplaceSlice(registry);
4482 AddSlice(registry);
4483 AddSplit(registry);
4484 AddStrptime(registry);
4485 AddBinaryJoin(registry);
4486 }
4487
4488 } // namespace internal
4489 } // namespace compute
4490 } // namespace arrow