1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
23 #ifdef ARROW_WITH_UTF8PROC
31 #include "arrow/array/builder_binary.h"
32 #include "arrow/array/builder_nested.h"
33 #include "arrow/buffer_builder.h"
35 #include "arrow/builder.h"
36 #include "arrow/compute/api_scalar.h"
37 #include "arrow/compute/kernels/common.h"
38 #include "arrow/util/checked_cast.h"
39 #include "arrow/util/utf8.h"
40 #include "arrow/util/value_parsing.h"
41 #include "arrow/visitor_inline.h"
45 using internal::checked_cast
;
53 util::string_view
ToStringView(re2::StringPiece piece
) {
54 return {piece
.data(), piece
.length()};
57 re2::StringPiece
ToStringPiece(util::string_view view
) {
58 return {view
.data(), view
.length()};
61 Status
RegexStatus(const RE2
& regex
) {
63 return Status::Invalid("Invalid regular expression: ", regex
.error());
69 // Code units in the range [a-z] can only be an encoding of an ASCII
70 // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different
71 // codepoint. This is guaranteed by the non-overlap design of the Unicode
72 // standard. (see section 2.5 of Unicode Standard Core Specification v13.0)
// True when the code unit is below 128, i.e. its high bit is clear.
static inline bool IsAsciiCharacter(uint8_t character) {
  return (character & 0x80) == 0;
}
// True when the code unit is one of 'a'..'z'.
static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
  if (ascii_character < 'a') {
    return false;
  }
  return ascii_character <= 'z';
}
// True when the code unit is one of 'A'..'Z'.
static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
  if (ascii_character < 'A') {
    return false;
  }
  return ascii_character <= 'Z';
}
86 static inline bool IsCasedCharacterAscii(uint8_t ascii_character
) {
87 // Note: Non-ASCII characters are seen as uncased.
88 return IsLowerCaseCharacterAscii(ascii_character
) ||
89 IsUpperCaseCharacterAscii(ascii_character
);
92 static inline bool IsAlphaCharacterAscii(uint8_t ascii_character
) {
93 return IsCasedCharacterAscii(ascii_character
);
// True when the code unit is an ASCII digit or an ASCII letter of either case.
static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) {
  const bool is_digit = (ascii_character >= '0') && (ascii_character <= '9');
  const bool is_lower = (ascii_character >= 'a') && (ascii_character <= 'z');
  const bool is_upper = (ascii_character >= 'A') && (ascii_character <= 'Z');
  return is_digit || is_lower || is_upper;
}
// True when the code unit is one of '0'..'9'.
static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
  if (ascii_character < '0') {
    return false;
  }
  return ascii_character <= '9';
}
// True for the space character and for the control codes 9..13
// (tab, line feed, vertical tab, form feed, carriage return).
static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
  if (ascii_character == ' ') {
    return true;
  }
  return (ascii_character >= 9) && (ascii_character <= 13);
}
// True when the code unit lies in the range ' '..'~' (inclusive).
static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
  if (ascii_character < ' ') {
    return false;
  }
  return ascii_character <= '~';
}
114 struct BinaryLength
{
115 template <typename OutValue
, typename Arg0Value
= util::string_view
>
116 static OutValue
Call(KernelContext
*, Arg0Value val
, Status
*) {
117 return static_cast<OutValue
>(val
.size());
120 static Status
FixedSizeExec(KernelContext
*, const ExecBatch
& batch
, Datum
* out
) {
121 // Output is preallocated and validity buffer is precomputed
122 const int32_t width
=
123 checked_cast
<const FixedSizeBinaryType
&>(*batch
[0].type()).byte_width();
124 if (batch
.values
[0].is_array()) {
125 int32_t* buffer
= out
->mutable_array()->GetMutableValues
<int32_t>(1);
126 std::fill(buffer
, buffer
+ batch
.length
, width
);
128 checked_cast
<Int32Scalar
*>(out
->scalar().get())->value
= width
;
135 template <typename OutValue
, typename Arg0Value
= util::string_view
>
136 static OutValue
Call(KernelContext
*, Arg0Value val
, Status
*) {
137 auto str
= reinterpret_cast<const uint8_t*>(val
.data());
138 auto strlen
= val
.size();
139 return static_cast<OutValue
>(util::UTF8Length(str
, str
+ strlen
));
// Map 'A'..'Z' to 'a'..'z'; every other code unit (including bytes >= 0x80)
// is returned unchanged.
static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
  return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z'))
             ? static_cast<uint8_t>(utf8_code_unit + 32)
             : utf8_code_unit;
}
// Map 'a'..'z' to 'A'..'Z'; every other code unit (including bytes >= 0x80)
// is returned unchanged.
static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
  return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z'))
             ? static_cast<uint8_t>(utf8_code_unit - 32)
             : utf8_code_unit;
}
// Flip the case of an ASCII letter; every other code unit is returned as is.
static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) {
  const bool is_lower = (utf8_code_unit >= 'a') && (utf8_code_unit <= 'z');
  const bool is_upper = (utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z');
  if (is_lower) {
    return static_cast<uint8_t>(utf8_code_unit - 32);
  }
  if (is_upper) {
    return static_cast<uint8_t>(utf8_code_unit + 32);
  }
  return utf8_code_unit;
}
162 #ifdef ARROW_WITH_UTF8PROC
164 // Direct lookup tables for unicode properties
165 constexpr uint32_t kMaxCodepointLookup
=
166 0xffff; // up to this codepoint is in a lookup table
167 std::vector
<uint32_t> lut_upper_codepoint
;
168 std::vector
<uint32_t> lut_lower_codepoint
;
169 std::vector
<uint32_t> lut_swapcase_codepoint
;
170 std::vector
<utf8proc_category_t
> lut_category
;
171 std::once_flag flag_case_luts
;
175 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint
, uint32_t mask
) {
176 utf8proc_category_t general_category
= codepoint
<= kMaxCodepointLookup
177 ? lut_category
[codepoint
]
178 : utf8proc_category(codepoint
);
179 uint32_t general_category_bit
= 1 << general_category
;
180 // for e.g. undefined (but valid) codepoints, general_category == 0 ==
181 // UTF8PROC_CATEGORY_CN
182 return (general_category
!= UTF8PROC_CATEGORY_CN
) &&
183 ((general_category_bit
& mask
) != 0);
186 template <typename
... Categories
>
187 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint
, uint32_t mask
,
188 utf8proc_category_t category
,
189 Categories
... categories
) {
190 return HasAnyUnicodeGeneralCategory(codepoint
, mask
| (1 << category
), categories
...);
193 template <typename
... Categories
>
194 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint
,
195 utf8proc_category_t category
,
196 Categories
... categories
) {
197 return HasAnyUnicodeGeneralCategory(codepoint
, static_cast<uint32_t>(1u << category
),
201 static inline bool IsCasedCharacterUnicode(uint32_t codepoint
) {
202 return HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_LU
,
203 UTF8PROC_CATEGORY_LL
, UTF8PROC_CATEGORY_LT
) ||
204 ((static_cast<uint32_t>(utf8proc_toupper(codepoint
)) != codepoint
) ||
205 (static_cast<uint32_t>(utf8proc_tolower(codepoint
)) != codepoint
));
208 static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint
) {
209 // although this trick seems to work for upper case, this is not enough for lower case
210 // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the
212 return (HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_LL
) ||
213 ((static_cast<uint32_t>(utf8proc_toupper(codepoint
)) != codepoint
) &&
214 (static_cast<uint32_t>(utf8proc_tolower(codepoint
)) == codepoint
))) &&
215 !HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_LT
);
218 static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint
) {
219 // this seems to be a good workaround for utf8proc not having case information
220 // https://github.com/JuliaStrings/utf8proc/issues/195
221 return (HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_LU
) ||
222 ((static_cast<uint32_t>(utf8proc_toupper(codepoint
)) == codepoint
) &&
223 (static_cast<uint32_t>(utf8proc_tolower(codepoint
)) != codepoint
))) &&
224 !HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_LT
);
227 static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint
) {
228 return HasAnyUnicodeGeneralCategory(
229 codepoint
, UTF8PROC_CATEGORY_LU
, UTF8PROC_CATEGORY_LL
, UTF8PROC_CATEGORY_LT
,
230 UTF8PROC_CATEGORY_LM
, UTF8PROC_CATEGORY_LO
, UTF8PROC_CATEGORY_ND
,
231 UTF8PROC_CATEGORY_NL
, UTF8PROC_CATEGORY_NO
);
234 static inline bool IsAlphaCharacterUnicode(uint32_t codepoint
) {
235 return HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_LU
,
236 UTF8PROC_CATEGORY_LL
, UTF8PROC_CATEGORY_LT
,
237 UTF8PROC_CATEGORY_LM
, UTF8PROC_CATEGORY_LO
);
240 static inline bool IsDecimalCharacterUnicode(uint32_t codepoint
) {
241 return HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_ND
);
244 static inline bool IsDigitCharacterUnicode(uint32_t codepoint
) {
245 // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
246 // utf8proc has no support for this, this is the best we can do:
247 return HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_ND
);
250 static inline bool IsNumericCharacterUnicode(uint32_t codepoint
) {
251 // Formally this is not correct, but utf8proc does not allow us to query for Numerical
252 // properties, e.g. Numeric_Value and Numeric_Type
253 // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or
254 // Numeric_Type=Numeric.
255 return HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_ND
,
256 UTF8PROC_CATEGORY_NL
, UTF8PROC_CATEGORY_NO
);
259 static inline bool IsSpaceCharacterUnicode(uint32_t codepoint
) {
260 auto property
= utf8proc_get_property(codepoint
);
261 return HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_ZS
) ||
262 property
->bidi_class
== UTF8PROC_BIDI_CLASS_WS
||
263 property
->bidi_class
== UTF8PROC_BIDI_CLASS_B
||
264 property
->bidi_class
== UTF8PROC_BIDI_CLASS_S
;
267 static inline bool IsPrintableCharacterUnicode(uint32_t codepoint
) {
268 uint32_t general_category
= utf8proc_category(codepoint
);
269 return (general_category
!= UTF8PROC_CATEGORY_CN
) &&
270 !HasAnyUnicodeGeneralCategory(codepoint
, UTF8PROC_CATEGORY_CC
,
271 UTF8PROC_CATEGORY_CF
, UTF8PROC_CATEGORY_CS
,
272 UTF8PROC_CATEGORY_CO
, UTF8PROC_CATEGORY_ZS
,
273 UTF8PROC_CATEGORY_ZL
, UTF8PROC_CATEGORY_ZP
);
276 void EnsureLookupTablesFilled() {
277 std::call_once(flag_case_luts
, []() {
278 lut_upper_codepoint
.reserve(kMaxCodepointLookup
+ 1);
279 lut_lower_codepoint
.reserve(kMaxCodepointLookup
+ 1);
280 lut_swapcase_codepoint
.reserve(kMaxCodepointLookup
+ 1);
281 for (uint32_t i
= 0; i
<= kMaxCodepointLookup
; i
++) {
282 lut_upper_codepoint
.push_back(utf8proc_toupper(i
));
283 lut_lower_codepoint
.push_back(utf8proc_tolower(i
));
284 lut_category
.push_back(utf8proc_category(i
));
286 if (IsLowerCaseCharacterUnicode(i
)) {
287 lut_swapcase_codepoint
.push_back(utf8proc_toupper(i
));
288 } else if (IsUpperCaseCharacterUnicode(i
)) {
289 lut_swapcase_codepoint
.push_back(utf8proc_tolower(i
));
291 lut_swapcase_codepoint
.push_back(i
);
299 void EnsureLookupTablesFilled() {}
301 #endif // ARROW_WITH_UTF8PROC
303 constexpr int64_t kTransformError
= -1;
305 struct StringTransformBase
{
306 virtual ~StringTransformBase() = default;
307 virtual Status
PreExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
311 // Return the maximum total size of the output in codeunits (i.e. bytes)
312 // given input characteristics.
313 virtual int64_t MaxCodeunits(int64_t ninputs
, int64_t input_ncodeunits
) {
314 return input_ncodeunits
;
317 virtual Status
InvalidStatus() {
318 return Status::Invalid("Invalid UTF8 sequence in input");
321 // Derived classes should also define this method:
322 // int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
326 template <typename Type
, typename StringTransform
>
327 struct StringTransformExecBase
{
328 using offset_type
= typename
Type::offset_type
;
329 using ArrayType
= typename TypeTraits
<Type
>::ArrayType
;
331 static Status
Execute(KernelContext
* ctx
, StringTransform
* transform
,
332 const ExecBatch
& batch
, Datum
* out
) {
333 if (batch
[0].kind() == Datum::ARRAY
) {
334 return ExecArray(ctx
, transform
, batch
[0].array(), out
);
336 DCHECK_EQ(batch
[0].kind(), Datum::SCALAR
);
337 return ExecScalar(ctx
, transform
, batch
[0].scalar(), out
);
340 static Status
ExecArray(KernelContext
* ctx
, StringTransform
* transform
,
341 const std::shared_ptr
<ArrayData
>& data
, Datum
* out
) {
342 ArrayType
input(data
);
343 ArrayData
* output
= out
->mutable_array();
345 const int64_t input_ncodeunits
= input
.total_values_length();
346 const int64_t input_nstrings
= input
.length();
348 const int64_t output_ncodeunits_max
=
349 transform
->MaxCodeunits(input_nstrings
, input_ncodeunits
);
350 if (output_ncodeunits_max
> std::numeric_limits
<offset_type
>::max()) {
351 return Status::CapacityError(
352 "Result might not fit in a 32bit utf8 array, convert to large_utf8");
355 ARROW_ASSIGN_OR_RAISE(auto values_buffer
, ctx
->Allocate(output_ncodeunits_max
));
356 output
->buffers
[2] = values_buffer
;
358 // String offsets are preallocated
359 offset_type
* output_string_offsets
= output
->GetMutableValues
<offset_type
>(1);
360 uint8_t* output_str
= output
->buffers
[2]->mutable_data();
361 offset_type output_ncodeunits
= 0;
363 output_string_offsets
[0] = 0;
364 for (int64_t i
= 0; i
< input_nstrings
; i
++) {
365 if (!input
.IsNull(i
)) {
366 offset_type input_string_ncodeunits
;
367 const uint8_t* input_string
= input
.GetValue(i
, &input_string_ncodeunits
);
368 auto encoded_nbytes
= static_cast<offset_type
>(transform
->Transform(
369 input_string
, input_string_ncodeunits
, output_str
+ output_ncodeunits
));
370 if (encoded_nbytes
< 0) {
371 return transform
->InvalidStatus();
373 output_ncodeunits
+= encoded_nbytes
;
375 output_string_offsets
[i
+ 1] = output_ncodeunits
;
377 DCHECK_LE(output_ncodeunits
, output_ncodeunits_max
);
379 // Trim the codepoint buffer, since we allocated too much
380 return values_buffer
->Resize(output_ncodeunits
, /*shrink_to_fit=*/true);
383 static Status
ExecScalar(KernelContext
* ctx
, StringTransform
* transform
,
384 const std::shared_ptr
<Scalar
>& scalar
, Datum
* out
) {
385 const auto& input
= checked_cast
<const BaseBinaryScalar
&>(*scalar
);
386 if (!input
.is_valid
) {
389 auto* result
= checked_cast
<BaseBinaryScalar
*>(out
->scalar().get());
390 result
->is_valid
= true;
391 const int64_t data_nbytes
= static_cast<int64_t>(input
.value
->size());
393 const int64_t output_ncodeunits_max
= transform
->MaxCodeunits(1, data_nbytes
);
394 if (output_ncodeunits_max
> std::numeric_limits
<offset_type
>::max()) {
395 return Status::CapacityError(
396 "Result might not fit in a 32bit utf8 array, convert to large_utf8");
398 ARROW_ASSIGN_OR_RAISE(auto value_buffer
, ctx
->Allocate(output_ncodeunits_max
));
399 result
->value
= value_buffer
;
400 auto encoded_nbytes
= static_cast<offset_type
>(transform
->Transform(
401 input
.value
->data(), data_nbytes
, value_buffer
->mutable_data()));
402 if (encoded_nbytes
< 0) {
403 return transform
->InvalidStatus();
405 DCHECK_LE(encoded_nbytes
, output_ncodeunits_max
);
406 return value_buffer
->Resize(encoded_nbytes
, /*shrink_to_fit=*/true);
410 template <typename Type
, typename StringTransform
>
411 struct StringTransformExec
: public StringTransformExecBase
<Type
, StringTransform
> {
412 using StringTransformExecBase
<Type
, StringTransform
>::Execute
;
414 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
415 StringTransform transform
;
416 RETURN_NOT_OK(transform
.PreExec(ctx
, batch
, out
));
417 return Execute(ctx
, &transform
, batch
, out
);
421 template <typename Type
, typename StringTransform
>
422 struct StringTransformExecWithState
423 : public StringTransformExecBase
<Type
, StringTransform
> {
424 using State
= typename
StringTransform::State
;
425 using StringTransformExecBase
<Type
, StringTransform
>::Execute
;
427 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
428 StringTransform
transform(State::Get(ctx
));
429 RETURN_NOT_OK(transform
.PreExec(ctx
, batch
, out
));
430 return Execute(ctx
, &transform
, batch
, out
);
434 template <typename StringTransform
>
435 struct FixedSizeBinaryTransformExecBase
{
436 static Status
Execute(KernelContext
* ctx
, StringTransform
* transform
,
437 const ExecBatch
& batch
, Datum
* out
) {
438 if (batch
[0].kind() == Datum::ARRAY
) {
439 return ExecArray(ctx
, transform
, batch
[0].array(), out
);
441 DCHECK_EQ(batch
[0].kind(), Datum::SCALAR
);
442 return ExecScalar(ctx
, transform
, batch
[0].scalar(), out
);
445 static Status
ExecArray(KernelContext
* ctx
, StringTransform
* transform
,
446 const std::shared_ptr
<ArrayData
>& data
, Datum
* out
) {
447 FixedSizeBinaryArray
input(data
);
448 ArrayData
* output
= out
->mutable_array();
450 const int32_t input_width
=
451 checked_cast
<const FixedSizeBinaryType
&>(*data
->type
).byte_width();
452 const int32_t output_width
=
453 checked_cast
<const FixedSizeBinaryType
&>(*out
->type()).byte_width();
454 const int64_t input_nstrings
= input
.length();
455 ARROW_ASSIGN_OR_RAISE(auto values_buffer
,
456 ctx
->Allocate(output_width
* input_nstrings
));
457 uint8_t* output_str
= values_buffer
->mutable_data();
459 for (int64_t i
= 0; i
< input_nstrings
; i
++) {
460 if (!input
.IsNull(i
)) {
461 const uint8_t* input_string
= input
.GetValue(i
);
462 auto encoded_nbytes
= static_cast<int32_t>(
463 transform
->Transform(input_string
, input_width
, output_str
));
464 if (encoded_nbytes
!= output_width
) {
465 return transform
->InvalidStatus();
468 std::memset(output_str
, 0x00, output_width
);
470 output_str
+= output_width
;
473 output
->buffers
[1] = std::move(values_buffer
);
477 static Status
ExecScalar(KernelContext
* ctx
, StringTransform
* transform
,
478 const std::shared_ptr
<Scalar
>& scalar
, Datum
* out
) {
479 const auto& input
= checked_cast
<const BaseBinaryScalar
&>(*scalar
);
480 if (!input
.is_valid
) {
483 const int32_t out_width
=
484 checked_cast
<const FixedSizeBinaryType
&>(*out
->type()).byte_width();
485 auto* result
= checked_cast
<BaseBinaryScalar
*>(out
->scalar().get());
487 const int32_t data_nbytes
= static_cast<int32_t>(input
.value
->size());
488 ARROW_ASSIGN_OR_RAISE(auto value_buffer
, ctx
->Allocate(out_width
));
489 auto encoded_nbytes
= static_cast<int32_t>(transform
->Transform(
490 input
.value
->data(), data_nbytes
, value_buffer
->mutable_data()));
491 if (encoded_nbytes
!= out_width
) {
492 return transform
->InvalidStatus();
495 result
->is_valid
= true;
496 result
->value
= std::move(value_buffer
);
501 template <typename StringTransform
>
502 struct FixedSizeBinaryTransformExecWithState
503 : public FixedSizeBinaryTransformExecBase
<StringTransform
> {
504 using State
= typename
StringTransform::State
;
505 using FixedSizeBinaryTransformExecBase
<StringTransform
>::Execute
;
507 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
508 StringTransform
transform(State::Get(ctx
));
509 RETURN_NOT_OK(transform
.PreExec(ctx
, batch
, out
));
510 return Execute(ctx
, &transform
, batch
, out
);
513 static Result
<ValueDescr
> OutputType(KernelContext
* ctx
,
514 const std::vector
<ValueDescr
>& descrs
) {
515 DCHECK_EQ(1, descrs
.size());
516 const auto& options
= State::Get(ctx
);
517 const int32_t input_width
=
518 checked_cast
<const FixedSizeBinaryType
&>(*descrs
[0].type
).byte_width();
519 const int32_t output_width
= StringTransform::FixedOutputSize(options
, input_width
);
520 return ValueDescr(fixed_size_binary(output_width
), descrs
[0].shape
);
524 #ifdef ARROW_WITH_UTF8PROC
526 struct FunctionalCaseMappingTransform
: public StringTransformBase
{
527 Status
PreExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) override
{
528 EnsureLookupTablesFilled();
532 int64_t MaxCodeunits(int64_t ninputs
, int64_t input_ncodeunits
) override
{
533 // Section 5.18 of the Unicode spec claims that the number of codepoints for case
534 // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
535 // However, since we don't support all casings (SpecialCasing.txt) the growth
536 // in bytes is actually only at max 3/2 (as covered by the unittest).
537 // Note that rounding down the 3/2 is ok, since only codepoints encoded by
538 // two code units (even) can grow to 3 code units.
539 return static_cast<int64_t>(input_ncodeunits
) * 3 / 2;
543 template <typename CodepointTransform
>
544 struct StringTransformCodepoint
: public FunctionalCaseMappingTransform
{
545 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
547 uint8_t* output_start
= output
;
548 if (ARROW_PREDICT_FALSE(
549 !arrow::util::UTF8Transform(input
, input
+ input_string_ncodeunits
, &output
,
550 CodepointTransform::TransformCodepoint
))) {
551 return kTransformError
;
553 return output
- output_start
;
557 struct UTF8UpperTransform
: public FunctionalCaseMappingTransform
{
558 static uint32_t TransformCodepoint(uint32_t codepoint
) {
559 return codepoint
<= kMaxCodepointLookup
? lut_upper_codepoint
[codepoint
]
560 : utf8proc_toupper(codepoint
);
564 template <typename Type
>
565 using UTF8Upper
= StringTransformExec
<Type
, StringTransformCodepoint
<UTF8UpperTransform
>>;
567 struct UTF8LowerTransform
: public FunctionalCaseMappingTransform
{
568 static uint32_t TransformCodepoint(uint32_t codepoint
) {
569 return codepoint
<= kMaxCodepointLookup
? lut_lower_codepoint
[codepoint
]
570 : utf8proc_tolower(codepoint
);
574 template <typename Type
>
575 using UTF8Lower
= StringTransformExec
<Type
, StringTransformCodepoint
<UTF8LowerTransform
>>;
577 struct UTF8SwapCaseTransform
: public FunctionalCaseMappingTransform
{
578 static uint32_t TransformCodepoint(uint32_t codepoint
) {
579 if (codepoint
<= kMaxCodepointLookup
) {
580 return lut_swapcase_codepoint
[codepoint
];
582 if (IsLowerCaseCharacterUnicode(codepoint
)) {
583 return utf8proc_toupper(codepoint
);
584 } else if (IsUpperCaseCharacterUnicode(codepoint
)) {
585 return utf8proc_tolower(codepoint
);
593 template <typename Type
>
595 StringTransformExec
<Type
, StringTransformCodepoint
<UTF8SwapCaseTransform
>>;
597 struct Utf8CapitalizeTransform
: public FunctionalCaseMappingTransform
{
598 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
600 uint8_t* output_start
= output
;
601 const uint8_t* end
= input
+ input_string_ncodeunits
;
602 const uint8_t* next
= input
;
603 if (input_string_ncodeunits
> 0) {
604 if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input
, end
, &next
, 1))) {
605 return kTransformError
;
607 if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
608 input
, next
, &output
, UTF8UpperTransform::TransformCodepoint
))) {
609 return kTransformError
;
611 if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
612 next
, end
, &output
, UTF8LowerTransform::TransformCodepoint
))) {
613 return kTransformError
;
616 return output
- output_start
;
620 template <typename Type
>
621 using Utf8Capitalize
= StringTransformExec
<Type
, Utf8CapitalizeTransform
>;
623 struct Utf8TitleTransform
: public FunctionalCaseMappingTransform
{
624 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
626 uint8_t* output_start
= output
;
627 const uint8_t* end
= input
+ input_string_ncodeunits
;
628 const uint8_t* next
= input
;
629 bool is_next_upper
= true;
630 while ((input
= next
) < end
) {
632 if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next
, &codepoint
))) {
633 return kTransformError
;
635 if (IsCasedCharacterUnicode(codepoint
)) {
636 // Lower/uppercase current codepoint and
637 // prepare to lowercase next consecutive cased codepoints
638 output
= is_next_upper
639 ? util::UTF8Encode(output
,
640 UTF8UpperTransform::TransformCodepoint(codepoint
))
642 output
, UTF8LowerTransform::TransformCodepoint(codepoint
));
643 is_next_upper
= false;
645 // Copy current uncased codepoint and
646 // prepare to uppercase next cased codepoint
647 std::memcpy(output
, input
, next
- input
);
648 output
+= next
- input
;
649 is_next_upper
= true;
652 return output
- output_start
;
656 template <typename Type
>
657 using Utf8Title
= StringTransformExec
<Type
, Utf8TitleTransform
>;
659 #endif // ARROW_WITH_UTF8PROC
661 struct AsciiReverseTransform
: public StringTransformBase
{
662 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
664 uint8_t utf8_char_found
= 0;
665 for (int64_t i
= 0; i
< input_string_ncodeunits
; i
++) {
666 // if a utf8 char is found, report to utf8_char_found
667 utf8_char_found
|= input
[i
] & 0x80;
668 output
[input_string_ncodeunits
- i
- 1] = input
[i
];
670 return utf8_char_found
? kTransformError
: input_string_ncodeunits
;
673 Status
InvalidStatus() override
{
674 return Status::Invalid("Non-ASCII sequence in input");
678 template <typename Type
>
679 using AsciiReverse
= StringTransformExec
<Type
, AsciiReverseTransform
>;
681 struct Utf8ReverseTransform
: public StringTransformBase
{
682 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
685 while (i
< input_string_ncodeunits
) {
686 int64_t char_end
= std::min(i
+ util::ValidUtf8CodepointByteSize(input
+ i
),
687 input_string_ncodeunits
);
688 std::copy(input
+ i
, input
+ char_end
, output
+ input_string_ncodeunits
- char_end
);
691 return input_string_ncodeunits
;
695 template <typename Type
>
696 using Utf8Reverse
= StringTransformExec
<Type
, Utf8ReverseTransform
>;
// Callback signature used by StringDataTransform:
// (input bytes, number of bytes, output bytes).
using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
700 // Transform a buffer of offsets to one which begins with 0 and has same
702 template <typename T
>
703 Status
GetShiftedOffsets(KernelContext
* ctx
, const Buffer
& input_buffer
, int64_t offset
,
704 int64_t length
, std::shared_ptr
<Buffer
>* out
) {
705 ARROW_ASSIGN_OR_RAISE(*out
, ctx
->Allocate((length
+ 1) * sizeof(T
)));
706 const T
* input_offsets
= reinterpret_cast<const T
*>(input_buffer
.data()) + offset
;
707 T
* out_offsets
= reinterpret_cast<T
*>((*out
)->mutable_data());
708 T first_offset
= *input_offsets
;
709 for (int64_t i
= 0; i
< length
; ++i
) {
710 *out_offsets
++ = input_offsets
[i
] - first_offset
;
712 *out_offsets
= input_offsets
[length
] - first_offset
;
716 // Apply `transform` to input character data- this function cannot change the
718 template <typename Type
>
719 Status
StringDataTransform(KernelContext
* ctx
, const ExecBatch
& batch
,
720 TransformFunc transform
, Datum
* out
) {
721 using ArrayType
= typename TypeTraits
<Type
>::ArrayType
;
722 using offset_type
= typename
Type::offset_type
;
724 if (batch
[0].kind() == Datum::ARRAY
) {
725 const ArrayData
& input
= *batch
[0].array();
726 ArrayType
input_boxed(batch
[0].array());
728 ArrayData
* out_arr
= out
->mutable_array();
730 if (input
.offset
== 0) {
731 // We can reuse offsets from input
732 out_arr
->buffers
[1] = input
.buffers
[1];
734 DCHECK(input
.buffers
[1]);
735 // We must allocate new space for the offsets and shift the existing offsets
736 RETURN_NOT_OK(GetShiftedOffsets
<offset_type
>(ctx
, *input
.buffers
[1], input
.offset
,
737 input
.length
, &out_arr
->buffers
[1]));
740 // Allocate space for output data
741 int64_t data_nbytes
= input_boxed
.total_values_length();
742 RETURN_NOT_OK(ctx
->Allocate(data_nbytes
).Value(&out_arr
->buffers
[2]));
743 if (input
.length
> 0) {
744 transform(input
.buffers
[2]->data() + input_boxed
.value_offset(0), data_nbytes
,
745 out_arr
->buffers
[2]->mutable_data());
748 const auto& input
= checked_cast
<const BaseBinaryScalar
&>(*batch
[0].scalar());
749 auto result
= checked_pointer_cast
<BaseBinaryScalar
>(MakeNullScalar(out
->type()));
750 if (input
.is_valid
) {
751 result
->is_valid
= true;
752 int64_t data_nbytes
= input
.value
->size();
753 RETURN_NOT_OK(ctx
->Allocate(data_nbytes
).Value(&result
->value
));
754 transform(input
.value
->data(), data_nbytes
, result
->value
->mutable_data());
762 void TransformAsciiUpper(const uint8_t* input
, int64_t length
, uint8_t* output
) {
763 std::transform(input
, input
+ length
, output
, ascii_toupper
);
766 template <typename Type
>
768 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
769 return StringDataTransform
<Type
>(ctx
, batch
, TransformAsciiUpper
, out
);
773 void TransformAsciiLower(const uint8_t* input
, int64_t length
, uint8_t* output
) {
774 std::transform(input
, input
+ length
, output
, ascii_tolower
);
777 template <typename Type
>
779 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
780 return StringDataTransform
<Type
>(ctx
, batch
, TransformAsciiLower
, out
);
784 void TransformAsciiSwapCase(const uint8_t* input
, int64_t length
, uint8_t* output
) {
785 std::transform(input
, input
+ length
, output
, ascii_swapcase
);
788 template <typename Type
>
789 struct AsciiSwapCase
{
790 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
791 return StringDataTransform
<Type
>(ctx
, batch
, TransformAsciiSwapCase
, out
);
795 struct AsciiCapitalizeTransform
: public StringTransformBase
{
796 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
798 if (input_string_ncodeunits
> 0) {
799 *output
++ = ascii_toupper(*input
++);
800 TransformAsciiLower(input
, input_string_ncodeunits
- 1, output
);
802 return input_string_ncodeunits
;
806 template <typename Type
>
807 using AsciiCapitalize
= StringTransformExec
<Type
, AsciiCapitalizeTransform
>;
809 struct AsciiTitleTransform
: public StringTransformBase
{
810 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
812 const uint8_t* end
= input
+ input_string_ncodeunits
;
813 const uint8_t* next
= input
;
814 bool is_next_upper
= true;
815 while ((input
= next
++) < end
) {
816 if (IsCasedCharacterAscii(*input
)) {
817 // Lower/uppercase current character and
818 // prepare to lowercase next consecutive cased characters
819 *output
++ = is_next_upper
? ascii_toupper(*input
) : ascii_tolower(*input
);
820 is_next_upper
= false;
822 // Copy current uncased character and
823 // prepare to uppercase next cased character
825 is_next_upper
= true;
828 return input_string_ncodeunits
;
832 template <typename Type
>
833 using AsciiTitle
= StringTransformExec
<Type
, AsciiTitleTransform
>;
835 // ----------------------------------------------------------------------
836 // exact pattern detection
// Callback signature used by StringBoolTransform:
// (offsets, data bytes, number of values, output bit offset, output bytes).
using StrToBoolTransformFunc =
    std::function<void(const void*, const uint8_t*, int64_t, int64_t, uint8_t*)>;
841 // Apply `transform` to input character data- this function cannot change the
843 template <typename Type
>
844 void StringBoolTransform(KernelContext
* ctx
, const ExecBatch
& batch
,
845 StrToBoolTransformFunc transform
, Datum
* out
) {
846 using offset_type
= typename
Type::offset_type
;
848 if (batch
[0].kind() == Datum::ARRAY
) {
849 const ArrayData
& input
= *batch
[0].array();
850 ArrayData
* out_arr
= out
->mutable_array();
851 if (input
.length
> 0) {
853 reinterpret_cast<const offset_type
*>(input
.buffers
[1]->data()) + input
.offset
,
854 input
.buffers
[2]->data(), input
.length
, out_arr
->offset
,
855 out_arr
->buffers
[1]->mutable_data());
858 const auto& input
= checked_cast
<const BaseBinaryScalar
&>(*batch
[0].scalar());
859 if (input
.is_valid
) {
860 uint8_t result_value
= 0;
861 std::array
<offset_type
, 2> offsets
{0,
862 static_cast<offset_type
>(input
.value
->size())};
863 transform(offsets
.data(), input
.value
->data(), 1, /*output_offset=*/0,
865 out
->value
= std::make_shared
<BooleanScalar
>(result_value
> 0);
870 using MatchSubstringState
= OptionsWrapper
<MatchSubstringOptions
>;
872 // This is an implementation of the Knuth-Morris-Pratt algorithm
873 struct PlainSubstringMatcher
{
874 const MatchSubstringOptions
& options_
;
875 std::vector
<int64_t> prefix_table
;
877 static Result
<std::unique_ptr
<PlainSubstringMatcher
>> Make(
878 const MatchSubstringOptions
& options
) {
879 // Should be handled by partial template specialization below
880 DCHECK(!options
.ignore_case
);
881 return ::arrow::internal::make_unique
<PlainSubstringMatcher
>(options
);
884 explicit PlainSubstringMatcher(const MatchSubstringOptions
& options
)
885 : options_(options
) {
886 // Phase 1: Build the prefix table
887 const auto pattern_length
= options_
.pattern
.size();
888 prefix_table
.resize(pattern_length
+ 1, /*value=*/0);
889 int64_t prefix_length
= -1;
890 prefix_table
[0] = -1;
891 for (size_t pos
= 0; pos
< pattern_length
; ++pos
) {
892 // The prefix cannot be expanded, reset.
893 while (prefix_length
>= 0 &&
894 options_
.pattern
[pos
] != options_
.pattern
[prefix_length
]) {
895 prefix_length
= prefix_table
[prefix_length
];
898 prefix_table
[pos
+ 1] = prefix_length
;
902 int64_t Find(util::string_view current
) const {
903 // Phase 2: Find the prefix in the data
904 const auto pattern_length
= options_
.pattern
.size();
905 int64_t pattern_pos
= 0;
907 if (pattern_length
== 0) return 0;
908 for (const auto c
: current
) {
909 while ((pattern_pos
>= 0) && (options_
.pattern
[pattern_pos
] != c
)) {
910 pattern_pos
= prefix_table
[pattern_pos
];
913 if (static_cast<size_t>(pattern_pos
) == pattern_length
) {
914 return pos
+ 1 - pattern_length
;
921 bool Match(util::string_view current
) const { return Find(current
) >= 0; }
924 struct PlainStartsWithMatcher
{
925 const MatchSubstringOptions
& options_
;
927 explicit PlainStartsWithMatcher(const MatchSubstringOptions
& options
)
928 : options_(options
) {}
930 static Result
<std::unique_ptr
<PlainStartsWithMatcher
>> Make(
931 const MatchSubstringOptions
& options
) {
932 // Should be handled by partial template specialization below
933 DCHECK(!options
.ignore_case
);
934 return ::arrow::internal::make_unique
<PlainStartsWithMatcher
>(options
);
937 bool Match(util::string_view current
) const {
938 // string_view::starts_with is C++20
939 return current
.substr(0, options_
.pattern
.size()) == options_
.pattern
;
943 struct PlainEndsWithMatcher
{
944 const MatchSubstringOptions
& options_
;
946 explicit PlainEndsWithMatcher(const MatchSubstringOptions
& options
)
947 : options_(options
) {}
949 static Result
<std::unique_ptr
<PlainEndsWithMatcher
>> Make(
950 const MatchSubstringOptions
& options
) {
951 // Should be handled by partial template specialization below
952 DCHECK(!options
.ignore_case
);
953 return ::arrow::internal::make_unique
<PlainEndsWithMatcher
>(options
);
956 bool Match(util::string_view current
) const {
957 // string_view::ends_with is C++20
958 return current
.size() >= options_
.pattern
.size() &&
959 current
.substr(current
.size() - options_
.pattern
.size(),
960 options_
.pattern
.size()) == options_
.pattern
;
964 #ifdef ARROW_WITH_RE2
965 struct RegexSubstringMatcher
{
966 const MatchSubstringOptions
& options_
;
967 const RE2 regex_match_
;
969 static Result
<std::unique_ptr
<RegexSubstringMatcher
>> Make(
970 const MatchSubstringOptions
& options
, bool literal
= false) {
972 ::arrow::internal::make_unique
<RegexSubstringMatcher
>(options
, literal
);
973 RETURN_NOT_OK(RegexStatus(matcher
->regex_match_
));
974 return std::move(matcher
);
977 explicit RegexSubstringMatcher(const MatchSubstringOptions
& options
,
978 bool literal
= false)
980 regex_match_(options_
.pattern
, MakeRE2Options(options
, literal
)) {}
982 bool Match(util::string_view current
) const {
983 auto piece
= re2::StringPiece(current
.data(), current
.length());
984 return re2::RE2::PartialMatch(piece
, regex_match_
);
987 static RE2::RE2::Options
MakeRE2Options(const MatchSubstringOptions
& options
,
989 RE2::RE2::Options
re2_options(RE2::Quiet
);
990 re2_options
.set_case_sensitive(!options
.ignore_case
);
991 re2_options
.set_literal(literal
);
997 template <typename Type
, typename Matcher
>
998 struct MatchSubstringImpl
{
999 using offset_type
= typename
Type::offset_type
;
1001 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
,
1002 const Matcher
* matcher
) {
1003 StringBoolTransform
<Type
>(
1005 [&matcher
](const void* raw_offsets
, const uint8_t* data
, int64_t length
,
1006 int64_t output_offset
, uint8_t* output
) {
1007 const offset_type
* offsets
= reinterpret_cast<const offset_type
*>(raw_offsets
);
1008 FirstTimeBitmapWriter
bitmap_writer(output
, output_offset
, length
);
1009 for (int64_t i
= 0; i
< length
; ++i
) {
1010 const char* current_data
= reinterpret_cast<const char*>(data
+ offsets
[i
]);
1011 int64_t current_length
= offsets
[i
+ 1] - offsets
[i
];
1012 if (matcher
->Match(util::string_view(current_data
, current_length
))) {
1013 bitmap_writer
.Set();
1015 bitmap_writer
.Next();
1017 bitmap_writer
.Finish();
1020 return Status::OK();
1024 template <typename Type
, typename Matcher
>
1025 struct MatchSubstring
{
1026 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1027 // TODO Cache matcher across invocations (for regex compilation)
1028 ARROW_ASSIGN_OR_RAISE(auto matcher
, Matcher::Make(MatchSubstringState::Get(ctx
)));
1029 return MatchSubstringImpl
<Type
, Matcher
>::Exec(ctx
, batch
, out
, matcher
.get());
1033 template <typename Type
>
1034 struct MatchSubstring
<Type
, PlainSubstringMatcher
> {
1035 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1036 auto options
= MatchSubstringState::Get(ctx
);
1037 if (options
.ignore_case
) {
1038 #ifdef ARROW_WITH_RE2
1039 ARROW_ASSIGN_OR_RAISE(auto matcher
,
1040 RegexSubstringMatcher::Make(options
, /*literal=*/true));
1041 return MatchSubstringImpl
<Type
, RegexSubstringMatcher
>::Exec(ctx
, batch
, out
,
1044 return Status::NotImplemented("ignore_case requires RE2");
1047 ARROW_ASSIGN_OR_RAISE(auto matcher
, PlainSubstringMatcher::Make(options
));
1048 return MatchSubstringImpl
<Type
, PlainSubstringMatcher
>::Exec(ctx
, batch
, out
,
1053 template <typename Type
>
1054 struct MatchSubstring
<Type
, PlainStartsWithMatcher
> {
1055 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1056 auto options
= MatchSubstringState::Get(ctx
);
1057 if (options
.ignore_case
) {
1058 #ifdef ARROW_WITH_RE2
1059 MatchSubstringOptions converted_options
= options
;
1060 converted_options
.pattern
= "^" + RE2::QuoteMeta(options
.pattern
);
1061 ARROW_ASSIGN_OR_RAISE(auto matcher
, RegexSubstringMatcher::Make(converted_options
));
1062 return MatchSubstringImpl
<Type
, RegexSubstringMatcher
>::Exec(ctx
, batch
, out
,
1065 return Status::NotImplemented("ignore_case requires RE2");
1068 ARROW_ASSIGN_OR_RAISE(auto matcher
, PlainStartsWithMatcher::Make(options
));
1069 return MatchSubstringImpl
<Type
, PlainStartsWithMatcher
>::Exec(ctx
, batch
, out
,
1074 template <typename Type
>
1075 struct MatchSubstring
<Type
, PlainEndsWithMatcher
> {
1076 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1077 auto options
= MatchSubstringState::Get(ctx
);
1078 if (options
.ignore_case
) {
1079 #ifdef ARROW_WITH_RE2
1080 MatchSubstringOptions converted_options
= options
;
1081 converted_options
.pattern
= RE2::QuoteMeta(options
.pattern
) + "$";
1082 ARROW_ASSIGN_OR_RAISE(auto matcher
, RegexSubstringMatcher::Make(converted_options
));
1083 return MatchSubstringImpl
<Type
, RegexSubstringMatcher
>::Exec(ctx
, batch
, out
,
1086 return Status::NotImplemented("ignore_case requires RE2");
1089 ARROW_ASSIGN_OR_RAISE(auto matcher
, PlainEndsWithMatcher::Make(options
));
1090 return MatchSubstringImpl
<Type
, PlainEndsWithMatcher
>::Exec(ctx
, batch
, out
,
1095 const FunctionDoc
match_substring_doc(
1096 "Match strings against literal pattern",
1097 ("For each string in `strings`, emit true iff it contains a given pattern.\n"
1098 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1099 "If ignore_case is set, only simple case folding is performed."),
1100 {"strings"}, "MatchSubstringOptions");
1102 const FunctionDoc
starts_with_doc(
1103 "Check if strings start with a literal pattern",
1104 ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
1105 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1106 "If ignore_case is set, only simple case folding is performed."),
1107 {"strings"}, "MatchSubstringOptions");
1109 const FunctionDoc
ends_with_doc(
1110 "Check if strings end with a literal pattern",
1111 ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
1112 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1113 "If ignore_case is set, only simple case folding is performed."),
1114 {"strings"}, "MatchSubstringOptions");
1116 #ifdef ARROW_WITH_RE2
// User-facing documentation attached to the "match_substring_regex" function
// when it is registered in AddMatchSubstring.
// NOTE(review): the description string appears to be missing a continuation
// line in this view (the sentence ending in "at any " is cut off) — confirm
// against the full file before editing the text.
1117 const FunctionDoc
match_substring_regex_doc(
1118 "Match strings against regex pattern",
1119 ("For each string in `strings`, emit true iff it matches a given pattern at any "
1121 "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
1122 "If ignore_case is set, only simple case folding is performed."),
1123 {"strings"}, "MatchSubstringOptions");
// Builds a regular expression equivalent to the LIKE pattern in
// `options.pattern`. The result is wrapped as "(?s:^" ... "$)", so it is
// anchored at both ends and '.' also matches newlines.
// NOTE(review): part of this function's body (the branch taken after a
// backslash and the handling of ordinary characters) is not visible in this
// view; the comments below only describe the visible statements.
1127 /// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
1128 std::string
MakeLikeRegex(const MatchSubstringOptions
& options
) {
1129 // Allow . to match \n
1130 std::string like_pattern
= "(?s:^";
// Reserve room for the pattern plus the 7 characters of the fixed
// prefix "(?s:^" and suffix "$)".
1131 like_pattern
.reserve(options
.pattern
.size() + 7);
// Tracks whether the previous character was an unescaped backslash
// (presumably set in the backslash branch below — TODO confirm).
1132 bool escaped
= false;
1133 for (const char c
: options
.pattern
) {
// Unescaped '%' matches any run of characters.
1134 if (!escaped
&& c
== '%') {
1135 like_pattern
.append(".*");
// Unescaped '_' matches exactly one character.
1136 } else if (!escaped
&& c
== '_') {
1137 like_pattern
.append(".");
// Unescaped backslash: starts an escape sequence.
1138 } else if (!escaped
&& c
== '\\') {
// Emit the character preceded by a backslash
// (presumably for regex metacharacters — TODO confirm).
1154 like_pattern
.push_back('\\');
1155 like_pattern
.push_back(c
);
// Emit the character unchanged.
1160 like_pattern
.push_back(c
);
// Close the group and anchor the match at the end of the input.
1167 like_pattern
.append("$)");
1168 return like_pattern
;
1171 // Evaluate a SQL-like LIKE pattern by translating it to a regexp or
1172 // substring search as appropriate. See what Apache Impala does:
1173 // https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
1174 template <typename StringType
>
1176 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1177 // NOTE: avoid making those constants global to avoid compiling regexes at startup
1178 // A LIKE pattern matching this regex can be translated into a substring search.
1179 static const RE2
kLikePatternIsSubstringMatch(R
"(%+([^%_]*[^\\%_])?%+)");
1180 // A LIKE pattern matching this regex can be translated into a prefix search.
1181 static const RE2
kLikePatternIsStartsWith(R
"(([^%_]*[^\\%_])?%+)");
1182 // A LIKE pattern matching this regex can be translated into a suffix search.
1183 static const RE2
kLikePatternIsEndsWith(R
"(%+([^%_]*))");
1185 auto original_options
= MatchSubstringState::Get(ctx
);
1186 auto original_state
= ctx
->state();
1189 std::string pattern
;
1190 if (!original_options
.ignore_case
&&
1191 re2::RE2::FullMatch(original_options
.pattern
, kLikePatternIsSubstringMatch
,
1193 MatchSubstringOptions converted_options
{pattern
, original_options
.ignore_case
};
1194 MatchSubstringState
converted_state(converted_options
);
1195 ctx
->SetState(&converted_state
);
1196 status
= MatchSubstring
<StringType
, PlainSubstringMatcher
>::Exec(ctx
, batch
, out
);
1197 } else if (!original_options
.ignore_case
&&
1198 re2::RE2::FullMatch(original_options
.pattern
, kLikePatternIsStartsWith
,
1200 MatchSubstringOptions converted_options
{pattern
, original_options
.ignore_case
};
1201 MatchSubstringState
converted_state(converted_options
);
1202 ctx
->SetState(&converted_state
);
1203 status
= MatchSubstring
<StringType
, PlainStartsWithMatcher
>::Exec(ctx
, batch
, out
);
1204 } else if (!original_options
.ignore_case
&&
1205 re2::RE2::FullMatch(original_options
.pattern
, kLikePatternIsEndsWith
,
1207 MatchSubstringOptions converted_options
{pattern
, original_options
.ignore_case
};
1208 MatchSubstringState
converted_state(converted_options
);
1209 ctx
->SetState(&converted_state
);
1210 status
= MatchSubstring
<StringType
, PlainEndsWithMatcher
>::Exec(ctx
, batch
, out
);
1212 MatchSubstringOptions converted_options
{MakeLikeRegex(original_options
),
1213 original_options
.ignore_case
};
1214 MatchSubstringState
converted_state(converted_options
);
1215 ctx
->SetState(&converted_state
);
1216 status
= MatchSubstring
<StringType
, RegexSubstringMatcher
>::Exec(ctx
, batch
, out
);
1218 ctx
->SetState(original_state
);
1223 const FunctionDoc
match_like_doc(
1224 "Match strings against SQL-style LIKE pattern",
1225 ("For each string in `strings`, emit true iff it fully matches a given pattern "
1226 "at any position. That is, '%' will match any number of characters, '_' will "
1227 "match exactly one character, and any other character matches itself. To "
1228 "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
1229 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1230 {"strings"}, "MatchSubstringOptions");
1234 void AddMatchSubstring(FunctionRegistry
* registry
) {
1236 auto func
= std::make_shared
<ScalarFunction
>("match_substring", Arity::Unary(),
1237 &match_substring_doc
);
1238 auto exec_32
= MatchSubstring
<StringType
, PlainSubstringMatcher
>::Exec
;
1239 auto exec_64
= MatchSubstring
<LargeStringType
, PlainSubstringMatcher
>::Exec
;
1240 DCHECK_OK(func
->AddKernel({utf8()}, boolean(), exec_32
, MatchSubstringState::Init
));
1242 func
->AddKernel({large_utf8()}, boolean(), exec_64
, MatchSubstringState::Init
));
1243 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1246 auto func
= std::make_shared
<ScalarFunction
>("starts_with", Arity::Unary(),
1247 &match_substring_doc
);
1248 auto exec_32
= MatchSubstring
<StringType
, PlainStartsWithMatcher
>::Exec
;
1249 auto exec_64
= MatchSubstring
<LargeStringType
, PlainStartsWithMatcher
>::Exec
;
1250 DCHECK_OK(func
->AddKernel({utf8()}, boolean(), exec_32
, MatchSubstringState::Init
));
1252 func
->AddKernel({large_utf8()}, boolean(), exec_64
, MatchSubstringState::Init
));
1253 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1256 auto func
= std::make_shared
<ScalarFunction
>("ends_with", Arity::Unary(),
1257 &match_substring_doc
);
1258 auto exec_32
= MatchSubstring
<StringType
, PlainEndsWithMatcher
>::Exec
;
1259 auto exec_64
= MatchSubstring
<LargeStringType
, PlainEndsWithMatcher
>::Exec
;
1260 DCHECK_OK(func
->AddKernel({utf8()}, boolean(), exec_32
, MatchSubstringState::Init
));
1262 func
->AddKernel({large_utf8()}, boolean(), exec_64
, MatchSubstringState::Init
));
1263 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1265 #ifdef ARROW_WITH_RE2
1267 auto func
= std::make_shared
<ScalarFunction
>("match_substring_regex", Arity::Unary(),
1268 &match_substring_regex_doc
);
1269 auto exec_32
= MatchSubstring
<StringType
, RegexSubstringMatcher
>::Exec
;
1270 auto exec_64
= MatchSubstring
<LargeStringType
, RegexSubstringMatcher
>::Exec
;
1271 DCHECK_OK(func
->AddKernel({utf8()}, boolean(), exec_32
, MatchSubstringState::Init
));
1273 func
->AddKernel({large_utf8()}, boolean(), exec_64
, MatchSubstringState::Init
));
1274 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1278 std::make_shared
<ScalarFunction
>("match_like", Arity::Unary(), &match_like_doc
);
1279 auto exec_32
= MatchLike
<StringType
>::Exec
;
1280 auto exec_64
= MatchLike
<LargeStringType
>::Exec
;
1281 DCHECK_OK(func
->AddKernel({utf8()}, boolean(), exec_32
, MatchSubstringState::Init
));
1283 func
->AddKernel({large_utf8()}, boolean(), exec_64
, MatchSubstringState::Init
));
1284 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1289 // Substring find - lfind/index/etc.
1291 struct FindSubstring
{
1292 const PlainSubstringMatcher matcher_
;
1294 explicit FindSubstring(PlainSubstringMatcher matcher
) : matcher_(std::move(matcher
)) {}
1296 template <typename OutValue
, typename
... Ignored
>
1297 OutValue
Call(KernelContext
*, util::string_view val
, Status
*) const {
1298 return static_cast<OutValue
>(matcher_
.Find(val
));
1302 #ifdef ARROW_WITH_RE2
1303 struct FindSubstringRegex
{
1304 std::unique_ptr
<RE2
> regex_match_
;
1306 explicit FindSubstringRegex(const MatchSubstringOptions
& options
,
1307 bool literal
= false) {
1308 std::string regex
= "(";
1309 regex
.reserve(options
.pattern
.length() + 2);
1310 regex
+= literal
? RE2::QuoteMeta(options
.pattern
) : options
.pattern
;
1312 regex_match_
.reset(new RE2(std::move(regex
), RegexSubstringMatcher::MakeRE2Options(
1313 options
, /*literal=*/false)));
1316 template <typename OutValue
, typename
... Ignored
>
1317 OutValue
Call(KernelContext
*, util::string_view val
, Status
*) const {
1318 re2::StringPiece
piece(val
.data(), val
.length());
1319 re2::StringPiece match
;
1320 if (re2::RE2::PartialMatch(piece
, *regex_match_
, &match
)) {
1321 return static_cast<OutValue
>(match
.data() - piece
.data());
1328 template <typename InputType
>
1329 struct FindSubstringExec
{
1330 using OffsetType
= typename TypeTraits
<InputType
>::OffsetType
;
1331 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1332 const MatchSubstringOptions
& options
= MatchSubstringState::Get(ctx
);
1333 if (options
.ignore_case
) {
1334 #ifdef ARROW_WITH_RE2
1335 applicator::ScalarUnaryNotNullStateful
<OffsetType
, InputType
, FindSubstringRegex
>
1336 kernel
{FindSubstringRegex(options
, /*literal=*/true)};
1337 return kernel
.Exec(ctx
, batch
, out
);
1339 return Status::NotImplemented("ignore_case requires RE2");
1341 applicator::ScalarUnaryNotNullStateful
<OffsetType
, InputType
, FindSubstring
> kernel
{
1342 FindSubstring(PlainSubstringMatcher(options
))};
1343 return kernel
.Exec(ctx
, batch
, out
);
1347 const FunctionDoc
find_substring_doc(
1348 "Find first occurrence of substring",
1349 ("For each string in `strings`, emit the index of the first occurrence of the given "
1350 "pattern, or -1 if not found.\n"
1351 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1352 {"strings"}, "MatchSubstringOptions");
1354 #ifdef ARROW_WITH_RE2
1355 template <typename InputType
>
1356 struct FindSubstringRegexExec
{
1357 using OffsetType
= typename TypeTraits
<InputType
>::OffsetType
;
1358 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1359 const MatchSubstringOptions
& options
= MatchSubstringState::Get(ctx
);
1360 applicator::ScalarUnaryNotNullStateful
<OffsetType
, InputType
, FindSubstringRegex
>
1361 kernel
{FindSubstringRegex(options
, /*literal=*/false)};
1362 return kernel
.Exec(ctx
, batch
, out
);
1366 const FunctionDoc
find_substring_regex_doc(
1367 "Find location of first match of regex pattern",
1368 ("For each string in `strings`, emit the index of the first match of the given "
1369 "pattern, or -1 if not found.\n"
1370 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1371 {"strings"}, "MatchSubstringOptions");
1374 void AddFindSubstring(FunctionRegistry
* registry
) {
1376 auto func
= std::make_shared
<ScalarFunction
>("find_substring", Arity::Unary(),
1377 &find_substring_doc
);
1378 for (const auto& ty
: BaseBinaryTypes()) {
1379 auto offset_type
= offset_bit_width(ty
->id()) == 64 ? int64() : int32();
1380 DCHECK_OK(func
->AddKernel({ty
}, offset_type
,
1381 GenerateTypeAgnosticVarBinaryBase
<FindSubstringExec
>(ty
),
1382 MatchSubstringState::Init
));
1384 DCHECK_OK(func
->AddKernel({InputType(Type::FIXED_SIZE_BINARY
)}, int32(),
1385 FindSubstringExec
<FixedSizeBinaryType
>::Exec
,
1386 MatchSubstringState::Init
));
1387 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1389 #ifdef ARROW_WITH_RE2
1391 auto func
= std::make_shared
<ScalarFunction
>("find_substring_regex", Arity::Unary(),
1392 &find_substring_regex_doc
);
1393 for (const auto& ty
: BaseBinaryTypes()) {
1394 auto offset_type
= offset_bit_width(ty
->id()) == 64 ? int64() : int32();
1396 func
->AddKernel({ty
}, offset_type
,
1397 GenerateTypeAgnosticVarBinaryBase
<FindSubstringRegexExec
>(ty
),
1398 MatchSubstringState::Init
));
1400 DCHECK_OK(func
->AddKernel({InputType(Type::FIXED_SIZE_BINARY
)}, int32(),
1401 FindSubstringRegexExec
<FixedSizeBinaryType
>::Exec
,
1402 MatchSubstringState::Init
));
1403 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1410 struct CountSubstring
{
1411 const PlainSubstringMatcher matcher_
;
1413 explicit CountSubstring(PlainSubstringMatcher matcher
) : matcher_(std::move(matcher
)) {}
1415 template <typename OutValue
, typename
... Ignored
>
1416 OutValue
Call(KernelContext
*, util::string_view val
, Status
*) const {
1419 const auto pattern_size
= std::max
<uint64_t>(1, matcher_
.options_
.pattern
.size());
1420 while (start
<= val
.size()) {
1421 const int64_t index
= matcher_
.Find(val
.substr(start
));
1424 start
+= index
+ pattern_size
;
1433 #ifdef ARROW_WITH_RE2
1434 struct CountSubstringRegex
{
1435 std::unique_ptr
<RE2
> regex_match_
;
1437 explicit CountSubstringRegex(const MatchSubstringOptions
& options
, bool literal
= false)
1438 : regex_match_(new RE2(options
.pattern
,
1439 RegexSubstringMatcher::MakeRE2Options(options
, literal
))) {}
1441 static Result
<CountSubstringRegex
> Make(const MatchSubstringOptions
& options
,
1442 bool literal
= false) {
1443 CountSubstringRegex
counter(options
, literal
);
1444 RETURN_NOT_OK(RegexStatus(*counter
.regex_match_
));
1445 return std::move(counter
);
1448 template <typename OutValue
, typename
... Ignored
>
1449 OutValue
Call(KernelContext
*, util::string_view val
, Status
*) const {
1451 re2::StringPiece
input(val
.data(), val
.size());
1452 auto last_size
= input
.size();
1453 while (re2::RE2::FindAndConsume(&input
, *regex_match_
)) {
1455 if (last_size
== input
.size()) {
1457 if (input
.size() > 0) {
1458 input
.remove_prefix(1);
1463 last_size
= input
.size();
1469 template <typename InputType
>
1470 struct CountSubstringRegexExec
{
1471 using OffsetType
= typename TypeTraits
<InputType
>::OffsetType
;
1472 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1473 const MatchSubstringOptions
& options
= MatchSubstringState::Get(ctx
);
1474 ARROW_ASSIGN_OR_RAISE(auto counter
, CountSubstringRegex::Make(options
));
1475 applicator::ScalarUnaryNotNullStateful
<OffsetType
, InputType
, CountSubstringRegex
>
1476 kernel
{std::move(counter
)};
1477 return kernel
.Exec(ctx
, batch
, out
);
1482 template <typename InputType
>
1483 struct CountSubstringExec
{
1484 using OffsetType
= typename TypeTraits
<InputType
>::OffsetType
;
1485 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
1486 const MatchSubstringOptions
& options
= MatchSubstringState::Get(ctx
);
1487 if (options
.ignore_case
) {
1488 #ifdef ARROW_WITH_RE2
1489 ARROW_ASSIGN_OR_RAISE(auto counter
,
1490 CountSubstringRegex::Make(options
, /*literal=*/true));
1491 applicator::ScalarUnaryNotNullStateful
<OffsetType
, InputType
, CountSubstringRegex
>
1492 kernel
{std::move(counter
)};
1493 return kernel
.Exec(ctx
, batch
, out
);
1495 return Status::NotImplemented("ignore_case requires RE2");
1498 applicator::ScalarUnaryNotNullStateful
<OffsetType
, InputType
, CountSubstring
> kernel
{
1499 CountSubstring(PlainSubstringMatcher(options
))};
1500 return kernel
.Exec(ctx
, batch
, out
);
// User-facing documentation attached to the "count_substring" function when
// it is registered in AddCountSubstring.
// NOTE(review): the description string appears to be missing a continuation
// line in this view (the sentence ending in "of the given " is cut off) —
// confirm against the full file before editing the text.
1504 const FunctionDoc
count_substring_doc(
1505 "Count occurrences of substring",
1506 ("For each string in `strings`, emit the number of occurrences of the given "
1508 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1509 {"strings"}, "MatchSubstringOptions");
1511 #ifdef ARROW_WITH_RE2
// User-facing documentation attached to the "count_substring_regex" function
// when it is registered in AddCountSubstring.
// NOTE(review): the summary is identical to the one used for the literal
// "count_substring" function, and the description string appears to be
// missing a continuation line in this view — confirm against the full file.
1512 const FunctionDoc
count_substring_regex_doc(
1513 "Count occurrences of substring",
1514 ("For each string in `strings`, emit the number of occurrences of the given "
1516 "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1517 {"strings"}, "MatchSubstringOptions");
1520 void AddCountSubstring(FunctionRegistry
* registry
) {
1522 auto func
= std::make_shared
<ScalarFunction
>("count_substring", Arity::Unary(),
1523 &count_substring_doc
);
1524 for (const auto& ty
: BaseBinaryTypes()) {
1525 auto offset_type
= offset_bit_width(ty
->id()) == 64 ? int64() : int32();
1526 DCHECK_OK(func
->AddKernel({ty
}, offset_type
,
1527 GenerateTypeAgnosticVarBinaryBase
<CountSubstringExec
>(ty
),
1528 MatchSubstringState::Init
));
1530 DCHECK_OK(func
->AddKernel({InputType(Type::FIXED_SIZE_BINARY
)}, int32(),
1531 CountSubstringExec
<FixedSizeBinaryType
>::Exec
,
1532 MatchSubstringState::Init
));
1533 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1535 #ifdef ARROW_WITH_RE2
1537 auto func
= std::make_shared
<ScalarFunction
>("count_substring_regex", Arity::Unary(),
1538 &count_substring_regex_doc
);
1539 for (const auto& ty
: BaseBinaryTypes()) {
1540 auto offset_type
= offset_bit_width(ty
->id()) == 64 ? int64() : int32();
1542 func
->AddKernel({ty
}, offset_type
,
1543 GenerateTypeAgnosticVarBinaryBase
<CountSubstringRegexExec
>(ty
),
1544 MatchSubstringState::Init
));
1546 DCHECK_OK(func
->AddKernel({InputType(Type::FIXED_SIZE_BINARY
)}, int32(),
1547 CountSubstringRegexExec
<FixedSizeBinaryType
>::Exec
,
1548 MatchSubstringState::Init
));
1549 DCHECK_OK(registry
->AddFunction(std::move(func
)));
1556 struct SliceTransformBase
: public StringTransformBase
{
1557 using State
= OptionsWrapper
<SliceOptions
>;
1559 const SliceOptions
* options
;
1561 Status
PreExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) override
{
1562 options
= &State::Get(ctx
);
1563 if (options
->step
== 0) {
1564 return Status::Invalid("Slice step cannot be zero");
1566 return Status::OK();
1570 struct SliceCodeunitsTransform
: SliceTransformBase
{
1571 int64_t MaxCodeunits(int64_t ninputs
, int64_t input_ncodeunits
) override
{
1572 const SliceOptions
& opt
= *this->options
;
1573 if ((opt
.start
>= 0) != (opt
.stop
>= 0)) {
1574 // If start and stop don't have the same sign, we can't guess an upper bound
1575 // on the resulting slice lengths, so return a worst case estimate.
1576 return input_ncodeunits
;
1578 int64_t max_slice_codepoints
= (opt
.stop
- opt
.start
+ opt
.step
- 1) / opt
.step
;
1579 // The maximum UTF8 byte size of a codepoint is 4
1580 return std::min(input_ncodeunits
,
1581 4 * ninputs
* std::max
<int64_t>(0, max_slice_codepoints
));
1584 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
1586 if (options
->step
>= 1) {
1587 return SliceForward(input
, input_string_ncodeunits
, output
);
1589 return SliceBackward(input
, input_string_ncodeunits
, output
);
1592 #define RETURN_IF_UTF8_ERROR(expr) \
1594 if (ARROW_PREDICT_FALSE(!expr)) { \
1595 return kTransformError; \
1599 int64_t SliceForward(const uint8_t* input
, int64_t input_string_ncodeunits
,
1601 // Slice in forward order (step > 0)
1602 const SliceOptions
& opt
= *this->options
;
1603 const uint8_t* begin
= input
;
1604 const uint8_t* end
= input
+ input_string_ncodeunits
;
1605 const uint8_t* begin_sliced
= begin
;
1606 const uint8_t* end_sliced
= end
;
1608 // First, compute begin_sliced and end_sliced
1609 if (opt
.start
>= 0) {
1610 // start counting from the left
1611 RETURN_IF_UTF8_ERROR(
1612 arrow::util::UTF8AdvanceCodepoints(begin
, end
, &begin_sliced
, opt
.start
));
1613 if (opt
.stop
> opt
.start
) {
1614 // continue counting from begin_sliced
1615 const int64_t length
= opt
.stop
- opt
.start
;
1616 RETURN_IF_UTF8_ERROR(
1617 arrow::util::UTF8AdvanceCodepoints(begin_sliced
, end
, &end_sliced
, length
));
1618 } else if (opt
.stop
< 0) {
1619 // or from the end (but we will never need to < begin_sliced)
1620 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1621 begin_sliced
, end
, &end_sliced
, -opt
.stop
));
1623 // zero length slice
1627 // start counting from the right
1628 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1629 begin
, end
, &begin_sliced
, -opt
.start
));
1631 // continue counting from the left, we cannot start from begin_sliced because we
1632 // don't know how many codepoints are between begin and begin_sliced
1633 RETURN_IF_UTF8_ERROR(
1634 arrow::util::UTF8AdvanceCodepoints(begin
, end
, &end_sliced
, opt
.stop
));
1635 // and therefore we also needs this
1636 if (end_sliced
<= begin_sliced
) {
1637 // zero length slice
1640 } else if ((opt
.stop
< 0) && (opt
.stop
> opt
.start
)) {
1641 // stop is negative, but larger than start, so we count again from the right
1642 // in some cases we can optimize this, depending on the shortest path (from end
1643 // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
1644 // for instance when start=-100, when the string length is only 10.
1645 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1646 begin_sliced
, end
, &end_sliced
, -opt
.stop
));
1648 // zero length slice
1653 // Second, copy computed slice to output
1654 DCHECK(begin_sliced
<= end_sliced
);
1655 if (opt
.step
== 1) {
1656 // fast case, where we simply can finish with a memcpy
1657 std::copy(begin_sliced
, end_sliced
, output
);
1658 return end_sliced
- begin_sliced
;
1660 uint8_t* dest
= output
;
1661 const uint8_t* i
= begin_sliced
;
1663 while (i
< end_sliced
) {
1664 uint32_t codepoint
= 0;
1665 // write a single codepoint
1666 RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i
, &codepoint
));
1667 dest
= arrow::util::UTF8Encode(dest
, codepoint
);
1668 // and skip the remainder
1669 int64_t skips
= opt
.step
- 1;
1670 while ((skips
--) && (i
< end_sliced
)) {
1671 RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i
, &codepoint
));
1674 return dest
- output
;
1677 int64_t SliceBackward(const uint8_t* input
, int64_t input_string_ncodeunits
,
1679 // Slice in reverse order (step < 0)
1680 const SliceOptions
& opt
= *this->options
;
1681 const uint8_t* begin
= input
;
1682 const uint8_t* end
= input
+ input_string_ncodeunits
;
1683 const uint8_t* begin_sliced
= begin
;
1684 const uint8_t* end_sliced
= end
;
1686 // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
1687 // reverse iterators.
1688 if (opt
.start
>= 0) {
1689 // +1 because begin_sliced acts as as the end of a reverse iterator
1690 RETURN_IF_UTF8_ERROR(
1691 arrow::util::UTF8AdvanceCodepoints(begin
, end
, &begin_sliced
, opt
.start
+ 1));
1693 // -1 because start=-1 means the last codeunit, which is 0 advances
1694 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1695 begin
, end
, &begin_sliced
, -opt
.start
- 1));
1697 // make it point at the last codeunit of the previous codeunit
1700 // similar to opt.start
1701 if (opt
.stop
>= 0) {
1702 RETURN_IF_UTF8_ERROR(
1703 arrow::util::UTF8AdvanceCodepoints(begin
, end
, &end_sliced
, opt
.stop
+ 1));
1705 RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1706 begin
, end
, &end_sliced
, -opt
.stop
- 1));
1710 // Copy computed slice to output
1711 uint8_t* dest
= output
;
1712 const uint8_t* i
= begin_sliced
;
1713 while (i
> end_sliced
) {
1714 uint32_t codepoint
= 0;
1715 // write a single codepoint
1716 RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i
, &codepoint
));
1717 dest
= arrow::util::UTF8Encode(dest
, codepoint
);
1718 // and skip the remainder
1719 int64_t skips
= -opt
.step
- 1;
1720 while ((skips
--) && (i
> end_sliced
)) {
1721 RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i
, &codepoint
));
1724 return dest
- output
;
1727 #undef RETURN_IF_UTF8_ERROR
1730 template <typename Type
>
1731 using SliceCodeunits
= StringTransformExec
<Type
, SliceCodeunitsTransform
>;
// User-facing documentation attached to the "utf8_slice_codeunits" function
// when it is registered in AddSlice.
// NOTE(review): the description has an unbalanced parenthesis — "`step`)" has
// no opening "(" before "`start`". Some lines of this definition also appear
// to be missing in this view (the summary argument and the end of the "is
// considered an" sentence) — confirm against the full file before editing.
1733 const FunctionDoc
utf8_slice_codeunits_doc(
1735 ("For each string in `strings`, slice into a substring defined by\n"
1736 "`start`, `stop`, `step`) as given by `SliceOptions` where `start` is inclusive\n"
1737 "and `stop` is exclusive and are measured in codeunits. If step is negative, the\n"
1738 "string will be advanced in reversed order. A `step` of zero is considered an\n"
1740 "Null inputs emit null."),
1741 {"strings"}, "SliceOptions");
// Registers the unary scalar function "utf8_slice_codeunits" in `registry`.
// Two kernels are added: utf8 -> utf8 and large_utf8 -> large_utf8, both using
// SliceCodeunitsTransform::State::Init to initialize the kernel state.
1743 void AddSlice(FunctionRegistry
* registry
) {
1744 auto func
= std::make_shared
<ScalarFunction
>("utf8_slice_codeunits", Arity::Unary(),
1745 &utf8_slice_codeunits_doc
);
// t32 / t64: executors for 32-bit-offset and 64-bit-offset string types
1746 using t32
= SliceCodeunits
<StringType
>;
1747 using t64
= SliceCodeunits
<LargeStringType
>;
1749 func
->AddKernel({utf8()}, utf8(), t32::Exec
, SliceCodeunitsTransform::State::Init
));
1750 DCHECK_OK(func
->AddKernel({large_utf8()}, large_utf8(), t64::Exec
,
1751 SliceCodeunitsTransform::State::Init
));
1752 DCHECK_OK(registry
->AddFunction(std::move(func
)));
// Shared implementation for string predicates evaluated per UTF-8 codepoint.
//
// `Derived` supplies the per-codepoint tests: `Derived::PredicateCharacterAll`
// is the value returned by the lambda handed to UTF8AllOf, and
// `Derived::PredicateCharacterAny` is OR-ed into `any` for each visited codepoint.
// `allow_empty` (default false) enables a dedicated early branch for
// zero-length input.
1755 template <typename Derived
, bool allow_empty
= false>
1756 struct CharacterPredicateUnicode
{
1757 static bool Call(KernelContext
*, const uint8_t* input
, size_t input_string_ncodeunits
,
1759 if (allow_empty
&& input_string_ncodeunits
== 0) {
// A false result from UTF8AllOf is reported as an invalid UTF-8 error via *st.
1764 if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
1765 input
, input
+ input_string_ncodeunits
, &all
, [&any
](uint32_t codepoint
) {
1766 any
|= Derived::PredicateCharacterAny(codepoint
);
1767 return Derived::PredicateCharacterAll(codepoint
);
1769 *st
= Status::Invalid("Invalid UTF8 sequence in input");
// Default "any" test, used when Derived does not declare its own.
1775 static inline bool PredicateCharacterAny(uint32_t) {
1776 return true; // default condition makes sure there is at least 1 character
// Shared implementation for string predicates evaluated per byte (ASCII).
//
// `Derived::PredicateCharacterAll` is the predicate passed to std::all_of and
// `Derived::PredicateCharacterAny` is OR-ed into `any` for each visited byte.
// `allow_empty` (default false) enables a dedicated early branch for
// zero-length input.
1780 template <typename Derived
, bool allow_empty
= false>
1781 struct CharacterPredicateAscii
{
1782 static bool Call(KernelContext
*, const uint8_t* input
, size_t input_string_ncodeunits
,
1784 if (allow_empty
&& input_string_ncodeunits
== 0) {
1788 // MB: A simple for loop seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
1789 // benchmark. I don't consider that worth it.
1790 bool all
= std::all_of(input
, input
+ input_string_ncodeunits
,
1791 [&any
](uint8_t ascii_character
) {
1792 any
|= Derived::PredicateCharacterAny(ascii_character
);
1793 return Derived::PredicateCharacterAll(ascii_character
);
// Default "any" test, used when Derived does not declare its own.
1798 static inline bool PredicateCharacterAny(uint8_t) {
1799 return true; // default condition makes sure there is at least 1 character
1803 #ifdef ARROW_WITH_UTF8PROC
// Unicode predicate: every codepoint must satisfy IsAlphaNumericCharacterUnicode.
1804 struct IsAlphaNumericUnicode
: CharacterPredicateUnicode
<IsAlphaNumericUnicode
> {
1805 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1806 return IsAlphaNumericCharacterUnicode(codepoint
);
// ASCII predicate: every byte must satisfy IsAlphaNumericCharacterAscii.
1811 struct IsAlphaNumericAscii
: CharacterPredicateAscii
<IsAlphaNumericAscii
> {
1812 static inline bool PredicateCharacterAll(uint8_t ascii_character
) {
1813 return IsAlphaNumericCharacterAscii(ascii_character
);
1817 #ifdef ARROW_WITH_UTF8PROC
// Unicode predicate: every codepoint must satisfy IsAlphaCharacterUnicode.
1818 struct IsAlphaUnicode
: CharacterPredicateUnicode
<IsAlphaUnicode
> {
1819 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1820 return IsAlphaCharacterUnicode(codepoint
);
// ASCII predicate: every byte must satisfy IsAlphaCharacterAscii.
1825 struct IsAlphaAscii
: CharacterPredicateAscii
<IsAlphaAscii
> {
1826 static inline bool PredicateCharacterAll(uint8_t ascii_character
) {
1827 return IsAlphaCharacterAscii(ascii_character
);
1831 #ifdef ARROW_WITH_UTF8PROC
// Unicode predicate: every codepoint must satisfy IsDecimalCharacterUnicode.
1832 struct IsDecimalUnicode
: CharacterPredicateUnicode
<IsDecimalUnicode
> {
1833 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1834 return IsDecimalCharacterUnicode(codepoint
);
// ASCII predicate: every byte must satisfy IsDecimalCharacterAscii.
1839 struct IsDecimalAscii
: CharacterPredicateAscii
<IsDecimalAscii
> {
1840 static inline bool PredicateCharacterAll(uint8_t ascii_character
) {
1841 return IsDecimalCharacterAscii(ascii_character
);
1845 #ifdef ARROW_WITH_UTF8PROC
// Unicode predicate: every codepoint must satisfy IsDigitCharacterUnicode.
1846 struct IsDigitUnicode
: CharacterPredicateUnicode
<IsDigitUnicode
> {
1847 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1848 return IsDigitCharacterUnicode(codepoint
);
// Unicode predicate: every codepoint must satisfy IsNumericCharacterUnicode.
1852 struct IsNumericUnicode
: CharacterPredicateUnicode
<IsNumericUnicode
> {
1853 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1854 return IsNumericCharacterUnicode(codepoint
);
// Returns true when every byte in [input, input + input_string_nascii_characters)
// satisfies IsAsciiCharacter. The KernelContext and Status arguments are unused.
1860 static bool Call(KernelContext
*, const uint8_t* input
,
1861 size_t input_string_nascii_characters
, Status
*) {
1862 return std::all_of(input
, input
+ input_string_nascii_characters
, IsAsciiCharacter
);
1866 #ifdef ARROW_WITH_UTF8PROC
// Unicode "is lower" predicate: no cased codepoint may be non-lower-case, and
// at least one cased codepoint must be seen.
1867 struct IsLowerUnicode
: CharacterPredicateUnicode
<IsLowerUnicode
> {
1868 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1869 // Only cased characters are required to be lower case; uncased ones pass
1870 return !IsCasedCharacterUnicode(codepoint
) || IsLowerCaseCharacterUnicode(codepoint
);
1872 static inline bool PredicateCharacterAny(uint32_t codepoint
) {
1873 return IsCasedCharacterUnicode(codepoint
); // at least 1 cased character
// ASCII "is lower" predicate: no cased byte may be non-lower-case, and at least
// one cased byte must be seen.
1878 struct IsLowerAscii
: CharacterPredicateAscii
<IsLowerAscii
> {
1879 static inline bool PredicateCharacterAll(uint8_t ascii_character
) {
1880 // Only cased characters are required to be lower case; uncased ones pass
1881 return !IsCasedCharacterAscii(ascii_character
) ||
1882 IsLowerCaseCharacterAscii(ascii_character
);
1884 static inline bool PredicateCharacterAny(uint8_t ascii_character
) {
1885 return IsCasedCharacterAscii(ascii_character
); // at least 1 cased character
1889 #ifdef ARROW_WITH_UTF8PROC
// Unicode "is printable" predicate. Instantiated with allow_empty=true, and the
// space character (' ') is accepted explicitly in addition to whatever
// IsPrintableCharacterUnicode accepts.
1890 struct IsPrintableUnicode
1891 : CharacterPredicateUnicode
<IsPrintableUnicode
, /*allow_empty=*/true> {
1892 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1893 return codepoint
== ' ' || IsPrintableCharacterUnicode(codepoint
);
// ASCII "is printable" predicate. Instantiated with allow_empty=true; every byte
// must satisfy IsPrintableCharacterAscii.
1898 struct IsPrintableAscii
1899 : CharacterPredicateAscii
<IsPrintableAscii
, /*allow_empty=*/true> {
1900 static inline bool PredicateCharacterAll(uint8_t ascii_character
) {
1901 return IsPrintableCharacterAscii(ascii_character
);
1905 #ifdef ARROW_WITH_UTF8PROC
// Unicode predicate: every codepoint must satisfy IsSpaceCharacterUnicode.
1906 struct IsSpaceUnicode
: CharacterPredicateUnicode
<IsSpaceUnicode
> {
1907 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1908 return IsSpaceCharacterUnicode(codepoint
);
// ASCII predicate: every byte must satisfy IsSpaceCharacterAscii.
1913 struct IsSpaceAscii
: CharacterPredicateAscii
<IsSpaceAscii
> {
1914 static inline bool PredicateCharacterAll(uint8_t ascii_character
) {
1915 return IsSpaceCharacterAscii(ascii_character
);
1919 #ifdef ARROW_WITH_UTF8PROC
// Title-case check over the UTF-8 codepoints of one string.
// The lambda passed to UTF8AllOf tracks whether the previous codepoint was
// cased and rejects the string as soon as rule 1 or rule 2 is violated; rule 3
// is recorded separately and combined with the other two in the final result.
1920 struct IsTitleUnicode
{
1921 static bool Call(KernelContext
*, const uint8_t* input
, size_t input_string_ncodeunits
,
1924 // 1. a lower case character must follow a cased character
1925 // 2. an upper case character must follow an uncased character
1926 // 3. at least 1 cased character (which logically should be upper/title)
1928 bool previous_cased
= false; // in LL, LU or LT
1929 bool rule_3
= false;
1931 arrow::util::UTF8AllOf(input
, input
+ input_string_ncodeunits
, &rules_1_and_2
,
1932 [&previous_cased
, &rule_3
](uint32_t codepoint
) {
1933 if (IsLowerCaseCharacterUnicode(codepoint
)) {
1934 if (!previous_cased
) return false; // rule 1 broken
1935 // next should be more lower case or uncased
1936 previous_cased
= true;
// cased but not lower case, i.e. an upper/title case codepoint
1937 } else if (IsCasedCharacterUnicode(codepoint
)) {
1938 if (previous_cased
) return false; // rule 2 broken
1939 // next should be a lower case or uncased
1940 previous_cased
= true;
1941 rule_3
= true; // rule 3 obeyed
1943 // an uncased char, like _ or 1
1944 // next should be upper case or more uncased
1945 previous_cased
= false;
// a false `status` is reported to the caller as invalid UTF-8
1949 if (!ARROW_PREDICT_TRUE(status
)) {
1950 *st
= Status::Invalid("Invalid UTF8 sequence in input");
1953 return rules_1_and_2
& rule_3
;
// Title-case check over the bytes of one ASCII string.
// Walks the input once, tracking whether the previous byte was cased; the final
// result requires rules 1 and 2 to hold and rule 3 to have been satisfied.
1958 struct IsTitleAscii
{
1959 static bool Call(KernelContext
*, const uint8_t* input
, size_t input_string_ncodeunits
,
1962 // 1. a lower case character must follow a cased character
1963 // 2. an upper case character must follow an uncased character
1964 // 3. at least 1 cased character (which logically should be upper/title)
1965 bool rules_1_and_2
= true;
1966 bool previous_cased
= false; // in LL, LU or LT
1967 bool rule_3
= false;
1968 for (const uint8_t* c
= input
; c
< input
+ input_string_ncodeunits
; ++c
) {
1969 if (IsLowerCaseCharacterAscii(*c
)) {
// lower case without a preceding cased character violates rule 1
1970 if (!previous_cased
) {
1972 rules_1_and_2
= false;
1975 // next should be more lower case or uncased
1976 previous_cased
= true;
// cased but not lower case, i.e. an upper case character
1977 } else if (IsCasedCharacterAscii(*c
)) {
// upper case directly after a cased character violates rule 2
1978 if (previous_cased
) {
1980 rules_1_and_2
= false;
1983 // next should be a lower case or uncased
1984 previous_cased
= true;
1985 rule_3
= true; // rule 3 obeyed
1987 // an uncased character, like _ or 1
1988 // next should be upper case or more uncased
1989 previous_cased
= false;
1992 return rules_1_and_2
& rule_3
;
1996 #ifdef ARROW_WITH_UTF8PROC
// Unicode "is upper" predicate: no cased codepoint may be non-upper-case, and
// at least one cased codepoint must be seen.
1997 struct IsUpperUnicode
: CharacterPredicateUnicode
<IsUpperUnicode
> {
1998 static inline bool PredicateCharacterAll(uint32_t codepoint
) {
1999 // Only cased characters are required to be upper case; uncased ones pass
2000 return !IsCasedCharacterUnicode(codepoint
) || IsUpperCaseCharacterUnicode(codepoint
);
2002 static inline bool PredicateCharacterAny(uint32_t codepoint
) {
2003 return IsCasedCharacterUnicode(codepoint
); // at least 1 cased character
// ASCII "is upper" predicate: no cased byte may be non-upper-case, and at least
// one cased byte must be seen.
2008 struct IsUpperAscii
: CharacterPredicateAscii
<IsUpperAscii
> {
2009 static inline bool PredicateCharacterAll(uint8_t ascii_character
) {
2010 // Only cased characters are required to be upper case; uncased ones pass
2011 return !IsCasedCharacterAscii(ascii_character
) ||
2012 IsUpperCaseCharacterAscii(ascii_character
);
2014 static inline bool PredicateCharacterAny(uint8_t ascii_character
) {
2015 return IsCasedCharacterAscii(ascii_character
); // at least 1 cased character
// Base class for the separator finders used by the split kernels, templated on
// the options type. PreExec is a virtual hook called once before splitting;
// the default implementation accepts any options and returns OK.
2021 template <typename Options
>
2022 struct SplitFinderBase
{
2023 virtual ~SplitFinderBase() = default;
2024 virtual Status
PreExec(const Options
& options
) { return Status::OK(); }
2026 // Derived classes should also define these methods:
2027 // static bool Find(const uint8_t* begin, const uint8_t* end,
2028 // const uint8_t** separator_begin,
2029 // const uint8_t** separator_end,
2030 // const SplitPatternOptions& options);
2032 // static bool FindReverse(const uint8_t* begin, const uint8_t* end,
2033 // const uint8_t** separator_begin,
2034 // const uint8_t** separator_end,
2035 // const SplitPatternOptions& options);
// Generic executor for the split kernels.
//
// Template parameters:
//   Type        - input string type (its offset_type sizes the appended parts)
//   ListType    - output list type (its offset_type sizes the list offsets)
//   SplitFinder - provides PreExec / Find / FindReverse to locate separators
//   Options     - options type, defaults to SplitFinder::Options
//
// Each valid input string is split into parts that are appended to a string
// builder; for array input the list offsets are written into the output's
// preallocated offsets buffer.
2038 template <typename Type
, typename ListType
, typename SplitFinder
,
2039 typename Options
= typename
SplitFinder::Options
>
2041 using string_offset_type
= typename
Type::offset_type
;
2042 using list_offset_type
= typename
ListType::offset_type
;
2043 using ArrayType
= typename TypeTraits
<Type
>::ArrayType
;
2044 using ArrayListType
= typename TypeTraits
<ListType
>::ArrayType
;
2045 using ListScalarType
= typename TypeTraits
<ListType
>::ScalarType
;
2046 using ScalarType
= typename TypeTraits
<Type
>::ScalarType
;
2047 using BuilderType
= typename TypeTraits
<Type
>::BuilderType
;
2048 using ListOffsetsBuilderType
= TypedBufferBuilder
<list_offset_type
>;
2049 using State
= OptionsWrapper
<Options
>;
2051 // Keep the temporary storage across individual values, to minimize reallocations
2052 std::vector
<util::string_view
> parts
;
2055 explicit SplitExec(const Options
& options
) : options(options
) {}
// Kernel entry point: builds a SplitExec from the options stored in the kernel
// state and runs it on the batch.
2057 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
2058 return SplitExec
{State::Get(ctx
)}.Execute(ctx
, batch
, out
);
// Runs the finder's PreExec hook, then dispatches on whether the first batch
// value is an array or a scalar.
2061 Status
Execute(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
2063 RETURN_NOT_OK(finder
.PreExec(options
));
2064 if (batch
[0].kind() == Datum::ARRAY
) {
2065 return Execute(ctx
, &finder
, batch
[0].array(), out
);
2067 DCHECK_EQ(batch
[0].kind(), Datum::SCALAR
);
2068 return Execute(ctx
, &finder
, batch
[0].scalar(), out
);
// Array overload: splits every non-null string of `data`, collecting all parts
// in one string builder that becomes the child array of the output list.
2071 Status
Execute(KernelContext
* ctx
, SplitFinder
* finder
,
2072 const std::shared_ptr
<ArrayData
>& data
, Datum
* out
) {
2073 const ArrayType
input(data
);
2075 BuilderType
builder(input
.type(), ctx
->memory_pool());
2076 // A slight overestimate of the data needed
2077 RETURN_NOT_OK(builder
.ReserveData(input
.total_values_length()));
2078 // The minimum amount of strings needed
2079 RETURN_NOT_OK(builder
.Resize(input
.length() - input
.null_count()));
2081 ArrayData
* output_list
= out
->mutable_array();
2082 // List offsets were preallocated
2083 auto* list_offsets
= output_list
->GetMutableValues
<list_offset_type
>(1);
2084 DCHECK_NE(list_offsets
, nullptr);
// first offset is always 0; one more offset is written per input element
2086 *list_offsets
++ = 0;
2087 for (int64_t i
= 0; i
< input
.length(); ++i
) {
2088 if (!input
.IsNull(i
)) {
2089 RETURN_NOT_OK(SplitString(input
.GetView(i
), finder
, &builder
));
// the running number of parts must stay representable as a list offset
2090 if (ARROW_PREDICT_FALSE(builder
.length() >
2091 std::numeric_limits
<list_offset_type
>::max())) {
2092 return Status::CapacityError("List offset does not fit into 32 bit");
2095 *list_offsets
++ = static_cast<list_offset_type
>(builder
.length());
2097 // Assign string array to list child data
2098 std::shared_ptr
<Array
> string_array
;
2099 RETURN_NOT_OK(builder
.Finish(&string_array
));
2100 output_list
->child_data
.push_back(string_array
->data());
2101 return Status::OK();
// Scalar overload: when the input scalar is valid, splits its value and stores
// the resulting string array as the value of the output list scalar.
2104 Status
Execute(KernelContext
* ctx
, SplitFinder
* finder
,
2105 const std::shared_ptr
<Scalar
>& scalar
, Datum
* out
) {
2106 const auto& input
= checked_cast
<const ScalarType
&>(*scalar
);
2107 auto result
= checked_cast
<ListScalarType
*>(out
->scalar().get());
2108 if (input
.is_valid
) {
2109 result
->is_valid
= true;
2110 BuilderType
builder(input
.type
, ctx
->memory_pool());
2111 util::string_view
s(*input
.value
);
2112 RETURN_NOT_OK(SplitString(s
, finder
, &builder
));
2113 RETURN_NOT_OK(builder
.Finish(&result
->value
));
2115 return Status::OK();
// Splits one string and appends its parts to `builder`.
// The loops run while `max_splits != 0`. In reverse mode, separators are
// searched from the end, the parts are recorded back-to-front in `parts`, and
// then appended in reverse so the output keeps the original left-to-right order.
2118 Status
SplitString(const util::string_view
& s
, SplitFinder
* finder
,
2119 BuilderType
* builder
) {
2120 const uint8_t* begin
= reinterpret_cast<const uint8_t*>(s
.data());
2121 const uint8_t* end
= begin
+ s
.length();
2123 int64_t max_splits
= options
.max_splits
;
2124 // if there is no max splits, reversing does not make sense (and is probably less
2125 // efficient), but is useful for testing
2126 if (options
.reverse
) {
2127 // note that i points 1 further than the 'current'
2128 const uint8_t* i
= end
;
2129 // we will record the parts in reverse order
2131 if (max_splits
> -1) {
2132 parts
.reserve(max_splits
+ 1);
2134 while (max_splits
!= 0) {
2135 const uint8_t *separator_begin
, *separator_end
;
2136 // find with whatever algo the part we will 'cut out'
2137 if (finder
->FindReverse(begin
, i
, &separator_begin
, &separator_end
, options
)) {
2138 parts
.emplace_back(reinterpret_cast<const char*>(separator_end
),
// continue searching to the left of the separator just found
2140 i
= separator_begin
;
2143 // if we cannot find a separator, we're done
// the remaining prefix [begin, i) is the last recorded (left-most) part
2147 parts
.emplace_back(reinterpret_cast<const char*>(begin
), i
- begin
);
2148 // now we do the copying
2149 for (auto it
= parts
.rbegin(); it
!= parts
.rend(); ++it
) {
2150 RETURN_NOT_OK(builder
->Append(*it
));
2153 const uint8_t* i
= begin
;
2154 while (max_splits
!= 0) {
2155 const uint8_t *separator_begin
, *separator_end
;
2156 // find with whatever algo the part we will 'cut out'
2157 if (finder
->Find(i
, end
, &separator_begin
, &separator_end
, options
)) {
2158 // the part till the beginning of the 'cut'
2160 builder
->Append(i
, static_cast<string_offset_type
>(separator_begin
- i
)));
2164 // if we cannot find a separator, we're done
// the remaining suffix [i, end) is the final part
2169 RETURN_NOT_OK(builder
->Append(i
, static_cast<string_offset_type
>(end
- i
)));
2171 return Status::OK();
// Finder that splits on an exact byte pattern taken from SplitPatternOptions.
// PreExec rejects an empty pattern; Find searches left-to-right with
// std::search, FindReverse searches right-to-left using reverse iterators over
// both the input and the pattern.
2175 struct SplitPatternFinder
: public SplitFinderBase
<SplitPatternOptions
> {
2176 using Options
= SplitPatternOptions
;
2178 Status
PreExec(const SplitPatternOptions
& options
) override
{
2179 if (options
.pattern
.length() == 0) {
2180 return Status::Invalid("Empty separator");
2182 return Status::OK();
// Forward search in [begin, end); on a match the separator bounds are written
// to *separator_begin / *separator_end.
2185 static bool Find(const uint8_t* begin
, const uint8_t* end
,
2186 const uint8_t** separator_begin
, const uint8_t** separator_end
,
2187 const SplitPatternOptions
& options
) {
2188 const uint8_t* pattern
= reinterpret_cast<const uint8_t*>(options
.pattern
.c_str());
2189 const int64_t pattern_length
= options
.pattern
.length();
2190 const uint8_t* i
= begin
;
2191 // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
// only iterate while a full pattern still fits between i and end
2193 while ((i
+ pattern_length
<= end
)) {
2194 i
= std::search(i
, end
, pattern
, pattern
+ pattern_length
);
2196 *separator_begin
= i
;
2197 *separator_end
= i
+ pattern_length
;
// Backward search in [begin, end): the reversed pattern is searched in the
// reversed input, and the match is converted back to forward pointers through
// ri.base().
2204 static bool FindReverse(const uint8_t* begin
, const uint8_t* end
,
2205 const uint8_t** separator_begin
, const uint8_t** separator_end
,
2206 const SplitPatternOptions
& options
) {
2207 const uint8_t* pattern
= reinterpret_cast<const uint8_t*>(options
.pattern
.c_str());
2208 const int64_t pattern_length
= options
.pattern
.length();
2209 // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
2211 std::reverse_iterator
<const uint8_t*> ri(end
);
2212 std::reverse_iterator
<const uint8_t*> rend(begin
);
2213 std::reverse_iterator
<const uint8_t*> pattern_rbegin(pattern
+ pattern_length
);
2214 std::reverse_iterator
<const uint8_t*> pattern_rend(pattern
);
// only iterate while a full pattern still fits between begin and ri.base()
2215 while (begin
<= ri
.base() - pattern_length
) {
2216 ri
= std::search(ri
, rend
, pattern_rbegin
, pattern_rend
);
2218 *separator_begin
= ri
.base() - pattern_length
;
2219 *separator_end
= ri
.base();
// Split executor that uses SplitPatternFinder to locate separators.
2227 template <typename Type
, typename ListType
>
2228 using SplitPatternExec
= SplitExec
<Type
, ListType
, SplitPatternFinder
>;
// User-facing documentation for the "split_pattern" function (exact-pattern
// split): one argument (`strings`), options class `SplitPatternOptions`.
2230 const FunctionDoc
split_pattern_doc(
2231 "Split string according to separator",
2232 ("Split each string according to the exact `pattern` defined in\n"
2233 "SplitPatternOptions. The output for each string input is a list\n"
2236 "The maximum number of splits and direction of splitting\n"
2237 "(forward, reverse) can optionally be defined in SplitPatternOptions."),
2238 {"strings"}, "SplitPatternOptions");
// User-facing documentation for the "ascii_split_whitespace" function: one
// argument (`strings`), options class `SplitOptions`.
2240 const FunctionDoc
ascii_split_whitespace_doc(
2241 "Split string according to any ASCII whitespace",
2242 ("Split each string according any non-zero length sequence of ASCII\n"
2243 "whitespace characters. The output for each string input is a list\n"
2246 "The maximum number of splits and direction of splitting\n"
2247 "(forward, reverse) can optionally be defined in SplitOptions."),
2248 {"strings"}, "SplitOptions");
// User-facing documentation for the "utf8_split_whitespace" function: one
// argument (`strings`), options class `SplitOptions`.
2250 const FunctionDoc
utf8_split_whitespace_doc(
2251 "Split string according to any Unicode whitespace",
2252 ("Split each string according any non-zero length sequence of Unicode\n"
2253 "whitespace characters. The output for each string input is a list\n"
2256 "The maximum number of splits and direction of splitting\n"
2257 "(forward, reverse) can optionally be defined in SplitOptions."),
2258 {"strings"}, "SplitOptions");
// Registers the unary scalar function "split_pattern" in `registry`, with
// kernels utf8 -> list(utf8) and large_utf8 -> list(large_utf8). Both kernels
// use ListType for the output list.
2260 void AddSplitPattern(FunctionRegistry
* registry
) {
2261 auto func
= std::make_shared
<ScalarFunction
>("split_pattern", Arity::Unary(),
2262 &split_pattern_doc
);
2263 using t32
= SplitPatternExec
<StringType
, ListType
>;
2264 using t64
= SplitPatternExec
<LargeStringType
, ListType
>;
2265 DCHECK_OK(func
->AddKernel({utf8()}, {list(utf8())}, t32::Exec
, t32::State::Init
));
2267 func
->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec
, t64::State::Init
));
2268 DCHECK_OK(registry
->AddFunction(std::move(func
)));
2271 struct SplitWhitespaceAsciiFinder
: public SplitFinderBase
<SplitOptions
> {
2272 using Options
= SplitOptions
;
2274 static bool Find(const uint8_t* begin
, const uint8_t* end
,
2275 const uint8_t** separator_begin
, const uint8_t** separator_end
,
2276 const SplitOptions
& options
) {
2277 const uint8_t* i
= begin
;
2279 if (IsSpaceCharacterAscii(*i
)) {
2280 *separator_begin
= i
;
2283 } while (IsSpaceCharacterAscii(*i
) && i
< end
);
2292 static bool FindReverse(const uint8_t* begin
, const uint8_t* end
,
2293 const uint8_t** separator_begin
, const uint8_t** separator_end
,
2294 const SplitOptions
& options
) {
2295 const uint8_t* i
= end
- 1;
2296 while ((i
>= begin
)) {
2297 if (IsSpaceCharacterAscii(*i
)) {
2298 *separator_end
= i
+ 1;
2301 } while (IsSpaceCharacterAscii(*i
) && i
>= begin
);
2302 *separator_begin
= i
+ 1;
// Split executor that uses SplitWhitespaceAsciiFinder to locate separators.
2311 template <typename Type
, typename ListType
>
2312 using SplitWhitespaceAsciiExec
= SplitExec
<Type
, ListType
, SplitWhitespaceAsciiFinder
>;
// Registers the unary scalar function "ascii_split_whitespace" in `registry`,
// with a default-constructed SplitOptions as default options and kernels
// utf8 -> list(utf8) and large_utf8 -> list(large_utf8).
2314 void AddSplitWhitespaceAscii(FunctionRegistry
* registry
) {
// static: the function stores a pointer to these default options
2315 static const SplitOptions default_options
{};
2317 std::make_shared
<ScalarFunction
>("ascii_split_whitespace", Arity::Unary(),
2318 &ascii_split_whitespace_doc
, &default_options
);
2319 using t32
= SplitWhitespaceAsciiExec
<StringType
, ListType
>;
2320 using t64
= SplitWhitespaceAsciiExec
<LargeStringType
, ListType
>;
2321 DCHECK_OK(func
->AddKernel({utf8()}, {list(utf8())}, t32::Exec
, t32::State::Init
));
2323 func
->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec
, t64::State::Init
));
2324 DCHECK_OK(registry
->AddFunction(std::move(func
)));
2327 #ifdef ARROW_WITH_UTF8PROC
// Finder that treats runs of Unicode whitespace codepoints as separators.
// Codepoints are decoded with UTF8Decode / UTF8DecodeReverse and classified
// with IsSpaceCharacterUnicode; PreExec calls EnsureLookupTablesFilled() before
// any classification takes place.
2328 struct SplitWhitespaceUtf8Finder
: public SplitFinderBase
<SplitOptions
> {
2329 using Options
= SplitOptions
;
2331 Status
PreExec(const SplitOptions
& options
) override
{
2332 EnsureLookupTablesFilled();
2333 return Status::OK();
// Forward search in [begin, end), decoding one codepoint at a time.
2336 bool Find(const uint8_t* begin
, const uint8_t* end
, const uint8_t** separator_begin
,
2337 const uint8_t** separator_end
, const SplitOptions
& options
) {
2338 const uint8_t* i
= begin
;
2340 uint32_t codepoint
= 0;
// remember where this codepoint starts: UTF8Decode advances i
2341 *separator_begin
= i
;
2342 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i
, &codepoint
))) {
2345 if (IsSpaceCharacterUnicode(codepoint
)) {
2348 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i
, &codepoint
))) {
2351 } while (IsSpaceCharacterUnicode(codepoint
) && i
< end
);
// Backward search in [begin, end), decoding one codepoint at a time from the end.
2358 bool FindReverse(const uint8_t* begin
, const uint8_t* end
,
2359 const uint8_t** separator_begin
, const uint8_t** separator_end
,
2360 const SplitOptions
& options
) {
2361 const uint8_t* i
= end
- 1;
2362 while ((i
>= begin
)) {
2363 uint32_t codepoint
= 0;
// remember one-past this codepoint: UTF8DecodeReverse moves i backwards
2364 *separator_end
= i
+ 1;
2365 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i
, &codepoint
))) {
2368 if (IsSpaceCharacterUnicode(codepoint
)) {
2370 *separator_begin
= i
+ 1;
2371 if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i
, &codepoint
))) {
2374 } while (IsSpaceCharacterUnicode(codepoint
) && i
>= begin
);
// Split executor that uses SplitWhitespaceUtf8Finder to locate separators.
2382 template <typename Type
, typename ListType
>
2383 using SplitWhitespaceUtf8Exec
= SplitExec
<Type
, ListType
, SplitWhitespaceUtf8Finder
>;
// Registers the unary scalar function "utf8_split_whitespace" in `registry`,
// with a default-constructed SplitOptions as default options and kernels
// utf8 -> list(utf8) and large_utf8 -> list(large_utf8).
2385 void AddSplitWhitespaceUTF8(FunctionRegistry
* registry
) {
// static: the function stores a pointer to these default options
2386 static const SplitOptions default_options
{};
2388 std::make_shared
<ScalarFunction
>("utf8_split_whitespace", Arity::Unary(),
2389 &utf8_split_whitespace_doc
, &default_options
);
2390 using t32
= SplitWhitespaceUtf8Exec
<StringType
, ListType
>;
2391 using t64
= SplitWhitespaceUtf8Exec
<LargeStringType
, ListType
>;
2392 DCHECK_OK(func
->AddKernel({utf8()}, {list(utf8())}, t32::Exec
, t32::State::Init
));
2394 func
->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec
, t64::State::Init
));
2395 DCHECK_OK(registry
->AddFunction(std::move(func
)));
2397 #endif // ARROW_WITH_UTF8PROC
2399 #ifdef ARROW_WITH_RE2
// Finder that splits on matches of a regular expression (RE2).
// PreExec refuses reverse splitting, then compiles the user pattern wrapped in
// a capture group into `regex_split`. Find uses RE2::PartialMatch and reads the
// separator bounds from the captured piece.
2400 struct SplitRegexFinder
: public SplitFinderBase
<SplitPatternOptions
> {
2401 using Options
= SplitPatternOptions
;
// compiled lazily in PreExec, hence optional
2403 util::optional
<RE2
> regex_split
;
2405 Status
PreExec(const SplitPatternOptions
& options
) override
{
2406 if (options
.reverse
) {
2407 return Status::NotImplemented("Cannot split in reverse with regex");
2409 // RE2 does *not* give you the full match! Must wrap the regex in a capture group
2410 // There is FindAndConsume, but it would give only the end of the separator
2411 std::string pattern
= "(";
// + 2 accounts for the surrounding parentheses
2412 pattern
.reserve(options
.pattern
.size() + 2);
2413 pattern
+= options
.pattern
;
2415 regex_split
.emplace(std::move(pattern
));
// reports an invalid pattern as Status::Invalid
2416 return RegexStatus(*regex_split
);
// Forward search in [begin, end) for the first regex match.
2419 bool Find(const uint8_t* begin
, const uint8_t* end
, const uint8_t** separator_begin
,
2420 const uint8_t** separator_end
, const SplitPatternOptions
& options
) {
2421 re2::StringPiece
piece(reinterpret_cast<const char*>(begin
),
2422 std::distance(begin
, end
));
2423 // "StringPiece is mutated to point to matched piece"
2424 re2::StringPiece result
;
2425 if (!re2::RE2::PartialMatch(piece
, *regex_split
, &result
)) {
// `result` is the captured group, i.e. the matched separator
2428 *separator_begin
= reinterpret_cast<const uint8_t*>(result
.data());
2429 *separator_end
= reinterpret_cast<const uint8_t*>(result
.data() + result
.size());
2433 bool FindReverse(const uint8_t* begin
, const uint8_t* end
,
2434 const uint8_t** separator_begin
, const uint8_t** separator_end
,
2435 const SplitPatternOptions
& options
) {
2436 // Unsupported (see PreExec)
// Split executor that uses SplitRegexFinder to locate separators.
2441 template <typename Type
, typename ListType
>
2442 using SplitRegexExec
= SplitExec
<Type
, ListType
, SplitRegexFinder
>;
// User-facing documentation for the "split_pattern_regex" function: one
// argument (`strings`), options class `SplitPatternOptions`.
// NOTE(review): the text mentions a "(forward, reverse)" direction, while
// SplitRegexFinder::PreExec returns NotImplemented for reverse splitting.
2444 const FunctionDoc
split_pattern_regex_doc(
2445 "Split string according to regex pattern",
2446 ("Split each string according to the regex `pattern` defined in\n"
2447 "SplitPatternOptions. The output for each string input is a list\n"
2450 "The maximum number of splits and direction of splitting\n"
2451 "(forward, reverse) can optionally be defined in SplitPatternOptions."),
2452 {"strings"}, "SplitPatternOptions");
// Registers the unary scalar function "split_pattern_regex" in `registry`, with
// kernels utf8 -> list(utf8) and large_utf8 -> list(large_utf8).
2454 void AddSplitRegex(FunctionRegistry
* registry
) {
2455 auto func
= std::make_shared
<ScalarFunction
>("split_pattern_regex", Arity::Unary(),
2456 &split_pattern_regex_doc
);
2457 using t32
= SplitRegexExec
<StringType
, ListType
>;
2458 using t64
= SplitRegexExec
<LargeStringType
, ListType
>;
2459 DCHECK_OK(func
->AddKernel({utf8()}, {list(utf8())}, t32::Exec
, t32::State::Init
));
2461 func
->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec
, t64::State::Init
));
2462 DCHECK_OK(registry
->AddFunction(std::move(func
)));
2464 #endif // ARROW_WITH_RE2
// Registers all split functions. The exact-pattern and ASCII-whitespace splits
// are always registered; the Unicode-whitespace and regex splits are guarded by
// ARROW_WITH_UTF8PROC and ARROW_WITH_RE2 respectively.
2466 void AddSplit(FunctionRegistry
* registry
) {
2467 AddSplitPattern(registry
);
2468 AddSplitWhitespaceAscii(registry
);
2469 #ifdef ARROW_WITH_UTF8PROC
2470 AddSplitWhitespaceUTF8(registry
);
2472 #ifdef ARROW_WITH_RE2
2473 AddSplitRegex(registry
);
2477 // ----------------------------------------------------------------------
2478 // Replace substring (plain, regex)
// Generic kernel for substring replacement.
//
// Template parameters:
//   Type     - string type of input and output (its offset_type sizes the offsets)
//   Replacer - strategy type providing `Make(options)` and
//              `ReplaceString(string_view, TypedBufferBuilder<uint8_t>*)`
//
// For array input the output value data and offsets are built from scratch with
// two buffer builders; for scalar input a new scalar of the same type is
// produced.
2480 template <typename Type
, typename Replacer
>
2481 struct ReplaceSubString
{
2482 using ScalarType
= typename TypeTraits
<Type
>::ScalarType
;
2483 using offset_type
= typename
Type::offset_type
;
2484 using ValueDataBuilder
= TypedBufferBuilder
<uint8_t>;
2485 using OffsetBuilder
= TypedBufferBuilder
<offset_type
>;
2486 using State
= OptionsWrapper
<ReplaceSubstringOptions
>;
// Kernel entry point: creates a replacer from the kernel options (propagating a
// creation error) and delegates to Replace.
2488 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
2489 // TODO Cache replacer across invocations (for regex compilation)
2490 ARROW_ASSIGN_OR_RAISE(auto replacer
, Replacer::Make(State::Get(ctx
)));
2491 return Replace(ctx
, batch
, *replacer
, out
);
// Applies `replacer` to every value of the first batch argument and writes the
// result into `out`.
2494 static Status
Replace(KernelContext
* ctx
, const ExecBatch
& batch
,
2495 const Replacer
& replacer
, Datum
* out
) {
2496 ValueDataBuilder
value_data_builder(ctx
->memory_pool());
2497 OffsetBuilder
offset_builder(ctx
->memory_pool());
2499 if (batch
[0].kind() == Datum::ARRAY
) {
2500 // We already know how many strings we have, so we can use Reserve/UnsafeAppend
2501 RETURN_NOT_OK(offset_builder
.Reserve(batch
[0].array()->length
+ 1));
2502 offset_builder
.UnsafeAppend(0); // offsets start at 0
2504 const ArrayData
& input
= *batch
[0].array();
2505 RETURN_NOT_OK(VisitArrayDataInline
<Type
>(
// valid value: append the replaced string, then record the new end offset
2507 [&](util::string_view s
) {
2508 RETURN_NOT_OK(replacer
.ReplaceString(s
, &value_data_builder
));
2509 offset_builder
.UnsafeAppend(
2510 static_cast<offset_type
>(value_data_builder
.length()));
2511 return Status::OK();
2514 // offset for null value
// (no data is appended, so the null slot has zero length)
2515 offset_builder
.UnsafeAppend(
2516 static_cast<offset_type
>(value_data_builder
.length()));
2517 return Status::OK();
// buffers[1] holds the offsets and buffers[2] the value data of the output
2519 ArrayData
* output
= out
->mutable_array();
2520 RETURN_NOT_OK(value_data_builder
.Finish(&output
->buffers
[2]));
2521 RETURN_NOT_OK(offset_builder
.Finish(&output
->buffers
[1]));
// scalar input: build a fresh scalar; only a valid input yields a valid result
2523 const auto& input
= checked_cast
<const ScalarType
&>(*batch
[0].scalar());
2524 auto result
= std::make_shared
<ScalarType
>();
2525 if (input
.is_valid
) {
2526 util::string_view s
= static_cast<util::string_view
>(*input
.value
);
2527 RETURN_NOT_OK(replacer
.ReplaceString(s
, &value_data_builder
));
2528 RETURN_NOT_OK(value_data_builder
.Finish(&result
->value
));
2529 result
->is_valid
= true;
2531 out
->value
= result
;
2534 return Status::OK();
// Replacer that substitutes exact (non-regex) occurrences of `options.pattern`
// with `options.replacement`.
// The options are held by reference, so they must outlive the replacer.
2538 struct PlainSubStringReplacer
{
2539 const ReplaceSubstringOptions
& options_
;
// Factory used by ReplaceSubString::Exec; wraps the constructor in a unique_ptr.
2541 static Result
<std::unique_ptr
<PlainSubStringReplacer
>> Make(
2542 const ReplaceSubstringOptions
& options
) {
2543 return arrow::internal::make_unique
<PlainSubStringReplacer
>(options
);
2546 explicit PlainSubStringReplacer(const ReplaceSubstringOptions
& options
)
2547 : options_(options
) {}
// Appends to `builder` a copy of `s` with occurrences of the pattern replaced.
// The loop runs while input remains and `max_replacements != 0`; matches are
// located left-to-right with std::search.
2549 Status
ReplaceString(util::string_view s
, TypedBufferBuilder
<uint8_t>* builder
) const {
2550 const char* i
= s
.begin();
2551 const char* end
= s
.end();
2552 int64_t max_replacements
= options_
.max_replacements
;
2553 while ((i
< end
) && (max_replacements
!= 0)) {
2555 std::search(i
, end
, options_
.pattern
.begin(), options_
.pattern
.end());
// copy of the remaining input [i, end)
2557 RETURN_NOT_OK(builder
->Append(reinterpret_cast<const uint8_t*>(i
),
2558 static_cast<int64_t>(end
- i
)));
2561 // the string before the pattern
2562 RETURN_NOT_OK(builder
->Append(reinterpret_cast<const uint8_t*>(i
),
2563 static_cast<int64_t>(pos
- i
)));
// the replacement text itself
2566 builder
->Append(reinterpret_cast<const uint8_t*>(options_
.replacement
.data()),
2567 options_
.replacement
.length()));
// skip over the matched pattern
2569 i
= pos
+ options_
.pattern
.length();
2573 // if we exited early due to max_replacements, add the trailing part
2574 return builder
->Append(reinterpret_cast<const uint8_t*>(i
),
2575 static_cast<int64_t>(end
- i
));
2579 #ifdef ARROW_WITH_RE2
// Replacer that substitutes regex matches (RE2) of `options.pattern` with
// `options.replacement`.
//
// Two regexes are compiled from the same pattern, both with RE2::Quiet:
//   regex_find_        - the pattern wrapped in a capture group, used with
//                        RE2::FindAndConsume
//   regex_replacement_ - the bare pattern, used with RE2::GlobalReplace,
//                        RE2::Replace and CheckRewriteString
// The options are held by reference, so they must outlive the replacer.
2580 struct RegexSubStringReplacer
{
2581 const ReplaceSubstringOptions
& options_
;
2582 const RE2 regex_find_
;
2583 const RE2 regex_replacement_
;
// Factory: constructs the replacer, then validates both compiled regexes and
// the replacement (rewrite) string, returning Status::Invalid on failure.
2585 static Result
<std::unique_ptr
<RegexSubStringReplacer
>> Make(
2586 const ReplaceSubstringOptions
& options
) {
2587 auto replacer
= arrow::internal::make_unique
<RegexSubStringReplacer
>(options
);
2589 RETURN_NOT_OK(RegexStatus(replacer
->regex_find_
));
2590 RETURN_NOT_OK(RegexStatus(replacer
->regex_replacement_
));
2592 std::string replacement_error
;
2593 if (!replacer
->regex_replacement_
.CheckRewriteString(replacer
->options_
.replacement
,
2594 &replacement_error
)) {
2595 return Status::Invalid("Invalid replacement string: ",
2596 std::move(replacement_error
));
2599 return std::move(replacer
);
2602 // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore
2603 // we have 2 regexes, one with () around it, one without.
2604 explicit RegexSubStringReplacer(const ReplaceSubstringOptions
& options
)
2605 : options_(options
),
2606 regex_find_("(" + options_
.pattern
+ ")", RE2::Quiet
),
2607 regex_replacement_(options_
.pattern
, RE2::Quiet
) {}
// Appends to `builder` a copy of `s` with regex matches replaced.
2609 Status
ReplaceString(util::string_view s
, TypedBufferBuilder
<uint8_t>* builder
) const {
2610 re2::StringPiece
replacement(options_
.replacement
);
// unlimited replacements: let RE2 rewrite a mutable copy of the input
2612 if (options_
.max_replacements
== -1) {
2613 std::string
s_copy(s
.to_string());
2614 re2::RE2::GlobalReplace(&s_copy
, regex_replacement_
, replacement
);
2615 return builder
->Append(reinterpret_cast<const uint8_t*>(s_copy
.data()),
2619 // Since RE2 does not have the concept of max_replacements, we have to do some work
2621 // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite
2622 const char* i
= s
.begin();
2623 const char* end
= s
.end();
2624 re2::StringPiece
piece(s
.data(), s
.length());
2626 int64_t max_replacements
= options_
.max_replacements
;
2627 while ((i
< end
) && (max_replacements
!= 0)) {
// no further match: copy the remaining input [i, end)
2629 if (!re2::RE2::FindAndConsume(&piece
, regex_find_
, &found
)) {
2630 RETURN_NOT_OK(builder
->Append(reinterpret_cast<const uint8_t*>(i
),
2631 static_cast<int64_t>(end
- i
)));
2634 // wind back to the beginning of the match
2635 const char* pos
= piece
.begin() - found
.length();
2636 // the string before the pattern
2637 RETURN_NOT_OK(builder
->Append(reinterpret_cast<const uint8_t*>(i
),
2638 static_cast<int64_t>(pos
- i
)));
2639 // replace the pattern in what we found
2640 if (!re2::RE2::Replace(&found
, regex_replacement_
, replacement
)) {
2641 return Status::Invalid("Regex found, but replacement failed");
2643 RETURN_NOT_OK(builder
->Append(reinterpret_cast<const uint8_t*>(found
.data()),
2644 static_cast<int64_t>(found
.length())));
2650 // If we exited early due to max_replacements, add the trailing part
2651 return builder
->Append(reinterpret_cast<const uint8_t*>(i
),
2652 static_cast<int64_t>(end
- i
));
// Substring replacement kernel using exact (non-regex) pattern matching.
2657 template <typename Type
>
2658 using ReplaceSubStringPlain
= ReplaceSubString
<Type
, PlainSubStringReplacer
>;
// User-facing documentation for substring replacement configured through
// ReplaceSubstringOptions (`pattern`, `replacement`, `max_replacements`).
2660 const FunctionDoc
replace_substring_doc(
2661 "Replace non-overlapping substrings that match pattern by replacement",
2662 ("For each string in `strings`, replace non-overlapping substrings that match\n"
2663 "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
2664 "maximum amount of replacements made, counting from the left. Null values emit\n"
2666 {"strings"}, "ReplaceSubstringOptions");
2668 #ifdef ARROW_WITH_RE2
2669 template <typename Type
>
2670 using ReplaceSubStringRegex
= ReplaceSubString
<Type
, RegexSubStringReplacer
>;
2672 const FunctionDoc
replace_substring_regex_doc(
2673 "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
2674 ("For each string in `strings`, replace non-overlapping substrings that match the\n"
2675 "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
2676 "If `max_replacements != -1`, it determines the maximum amount of replacements\n"
2677 "made, counting from the left. Note that if the pattern contains groups,\n"
2678 "backreferencing macan be used. Null values emit null."),
2679 {"strings"}, "ReplaceSubstringOptions");
2682 // ----------------------------------------------------------------------
2685 struct ReplaceSliceTransformBase
: public StringTransformBase
{
2686 using State
= OptionsWrapper
<ReplaceSliceOptions
>;
2688 const ReplaceSliceOptions
* options
;
2690 explicit ReplaceSliceTransformBase(const ReplaceSliceOptions
& options
)
2691 : options
{&options
} {}
2693 int64_t MaxCodeunits(int64_t ninputs
, int64_t input_ncodeunits
) override
{
2694 return ninputs
* options
->replacement
.size() + input_ncodeunits
;
2698 struct BinaryReplaceSliceTransform
: ReplaceSliceTransformBase
{
2699 using ReplaceSliceTransformBase::ReplaceSliceTransformBase
;
2700 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
2702 const auto& opts
= *options
;
2703 int64_t before_slice
= 0;
2704 int64_t after_slice
= 0;
2705 uint8_t* output_start
= output
;
2707 if (opts
.start
>= 0) {
2709 before_slice
= std::min
<int64_t>(input_string_ncodeunits
, opts
.start
);
2712 before_slice
= std::max
<int64_t>(0, input_string_ncodeunits
+ opts
.start
);
2714 // Mimic Pandas: if stop would be before start, treat as 0-length slice
2715 if (opts
.stop
>= 0) {
2718 std::min
<int64_t>(input_string_ncodeunits
, std::max(before_slice
, opts
.stop
));
2721 after_slice
= std::max
<int64_t>(before_slice
, input_string_ncodeunits
+ opts
.stop
);
2723 output
= std::copy(input
, input
+ before_slice
, output
);
2724 output
= std::copy(opts
.replacement
.begin(), opts
.replacement
.end(), output
);
2725 output
= std::copy(input
+ after_slice
, input
+ input_string_ncodeunits
, output
);
2726 return output
- output_start
;
2729 static int32_t FixedOutputSize(const ReplaceSliceOptions
& opts
, int32_t input_width
) {
2730 int32_t before_slice
= 0;
2731 int32_t after_slice
= 0;
2732 const int32_t start
= static_cast<int32_t>(opts
.start
);
2733 const int32_t stop
= static_cast<int32_t>(opts
.stop
);
2734 if (opts
.start
>= 0) {
2736 before_slice
= std::min
<int32_t>(input_width
, start
);
2739 before_slice
= std::max
<int32_t>(0, input_width
+ start
);
2741 if (opts
.stop
>= 0) {
2743 after_slice
= std::min
<int32_t>(input_width
, std::max
<int32_t>(before_slice
, stop
));
2746 after_slice
= std::max
<int32_t>(before_slice
, input_width
+ stop
);
2748 return static_cast<int32_t>(before_slice
+ opts
.replacement
.size() +
2749 (input_width
- after_slice
));
// Replaces a slice of a UTF8 string with `options->replacement`, with `start` and
// `stop` measured in codepoints. Slice boundaries are located with
// UTF8AdvanceCodepoints / UTF8AdvanceCodepointsReverse; if any of those calls
// fails, Transform returns kTransformError. Otherwise it writes the prefix, the
// replacement and the suffix to `output` and returns the number of bytes written.
2753 struct Utf8ReplaceSliceTransform
: ReplaceSliceTransformBase
{
2754 using ReplaceSliceTransformBase::ReplaceSliceTransformBase
;
2755 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
2757 const auto& opts
= *options
;
2758 const uint8_t* begin
= input
;
2759 const uint8_t* end
= input
+ input_string_ncodeunits
;
2760 const uint8_t *begin_sliced
, *end_sliced
;
2761 uint8_t* output_start
= output
;
2763 // Mimic Pandas: if stop would be before start, treat as 0-length slice
2764 if (opts
.start
>= 0) {
2766 if (!arrow::util::UTF8AdvanceCodepoints(begin
, end
, &begin_sliced
, opts
.start
)) {
2767 return kTransformError
;
2769 if (opts
.stop
> options
->start
) {
2770 // Continue counting from left
2771 const int64_t length
= opts
.stop
- options
->start
;
2772 if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced
, end
, &end_sliced
, length
)) {
2773 return kTransformError
;
2775 } else if (opts
.stop
< 0) {
2777 if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced
, end
, &end_sliced
,
2779 return kTransformError
;
2782 // Zero-length slice
2783 end_sliced
= begin_sliced
;
2787 if (!arrow::util::UTF8AdvanceCodepointsReverse(begin
, end
, &begin_sliced
,
2789 return kTransformError
;
2791 if (opts
.stop
>= 0) {
2792 // Restart counting from left
2793 if (!arrow::util::UTF8AdvanceCodepoints(begin
, end
, &end_sliced
, opts
.stop
)) {
2794 return kTransformError
;
2796 if (end_sliced
<= begin_sliced
) {
2797 // Zero-length slice
2798 end_sliced
= begin_sliced
;
2800 } else if ((opts
.stop
< 0) && (options
->stop
> options
->start
)) {
2802 if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced
, end
, &end_sliced
,
2804 return kTransformError
;
2807 // zero-length slice
2808 end_sliced
= begin_sliced
;
2811 output
= std::copy(begin
, begin_sliced
, output
);
2812 output
= std::copy(opts
.replacement
.begin(), options
->replacement
.end(), output
);
2813 output
= std::copy(end_sliced
, end
, output
);
2814 return output
- output_start
;
2818 template <typename Type
>
2819 using BinaryReplaceSlice
=
2820 StringTransformExecWithState
<Type
, BinaryReplaceSliceTransform
>;
2821 template <typename Type
>
2822 using Utf8ReplaceSlice
= StringTransformExecWithState
<Type
, Utf8ReplaceSliceTransform
>;
2824 const FunctionDoc
binary_replace_slice_doc(
2825 "Replace a slice of a binary string with `replacement`",
2826 ("For each string in `strings`, replace a slice of the string defined by `start`"
2827 "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
2828 "and both are measured in bytes.\n"
2829 "Null values emit null."),
2830 {"strings"}, "ReplaceSliceOptions");
2832 const FunctionDoc
utf8_replace_slice_doc(
2833 "Replace a slice of a string with `replacement`",
2834 ("For each string in `strings`, replace a slice of the string defined by `start`"
2835 "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
2836 "and both are measured in codeunits.\n"
2837 "Null values emit null."),
2838 {"strings"}, "ReplaceSliceOptions");
2840 void AddReplaceSlice(FunctionRegistry
* registry
) {
2842 auto func
= std::make_shared
<ScalarFunction
>("binary_replace_slice", Arity::Unary(),
2843 &binary_replace_slice_doc
);
2844 for (const auto& ty
: BaseBinaryTypes()) {
2845 DCHECK_OK(func
->AddKernel({ty
}, ty
,
2846 GenerateTypeAgnosticVarBinaryBase
<BinaryReplaceSlice
>(ty
),
2847 ReplaceSliceTransformBase::State::Init
));
2849 using TransformExec
=
2850 FixedSizeBinaryTransformExecWithState
<BinaryReplaceSliceTransform
>;
2851 DCHECK_OK(func
->AddKernel({InputType(Type::FIXED_SIZE_BINARY
)},
2852 OutputType(TransformExec::OutputType
), TransformExec::Exec
,
2853 ReplaceSliceTransformBase::State::Init
));
2854 DCHECK_OK(registry
->AddFunction(std::move(func
)));
2858 auto func
= std::make_shared
<ScalarFunction
>("utf8_replace_slice", Arity::Unary(),
2859 &utf8_replace_slice_doc
);
2860 DCHECK_OK(func
->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice
<StringType
>::Exec
,
2861 ReplaceSliceTransformBase::State::Init
));
2862 DCHECK_OK(func
->AddKernel({large_utf8()}, large_utf8(),
2863 Utf8ReplaceSlice
<LargeStringType
>::Exec
,
2864 ReplaceSliceTransformBase::State::Init
));
2865 DCHECK_OK(registry
->AddFunction(std::move(func
)));
2869 // ----------------------------------------------------------------------
2870 // Extract with regex
2872 #ifdef ARROW_WITH_RE2
2874 // TODO cache this once per ExtractRegexOptions
// A compiled regex together with the names of its capture groups, in group order.
// Instances are built through Make(), which returns Invalid for a pattern RE2
// rejects (via RegexStatus) or for a pattern containing an unnamed capture group.
// ResolveOutputType() builds a struct type with one field per group name, each
// field having the input type.
2875 struct ExtractRegexData
{
2876 // Use unique_ptr<> because RE2 is non-movable
2877 std::unique_ptr
<RE2
> regex
;
2878 std::vector
<std::string
> group_names
;
2880 static Result
<ExtractRegexData
> Make(const ExtractRegexOptions
& options
) {
2881 ExtractRegexData
data(options
.pattern
);
2882 RETURN_NOT_OK(RegexStatus(*data
.regex
));
2884 const int group_count
= data
.regex
->NumberOfCapturingGroups();
2885 const auto& name_map
= data
.regex
->CapturingGroupNames();
2886 data
.group_names
.reserve(group_count
);
2888 for (int i
= 0; i
< group_count
; i
++) {
2889 auto item
= name_map
.find(i
+ 1); // re2 starts counting from 1
2890 if (item
== name_map
.end()) {
2891 // XXX should we instead just create fields with an empty name?
2892 return Status::Invalid("Regular expression contains unnamed groups");
2894 data
.group_names
.emplace_back(item
->second
);
2896 return std::move(data
);
2899 Result
<ValueDescr
> ResolveOutputType(const std::vector
<ValueDescr
>& args
) const {
2900 const auto& input_type
= args
[0].type
;
2901 if (input_type
== nullptr) {
2902 // No input type specified => propagate shape
2905 // Input type is either String or LargeString and is also the type of each
2906 // field in the output struct type.
2907 DCHECK(input_type
->id() == Type::STRING
|| input_type
->id() == Type::LARGE_STRING
);
2909 fields
.reserve(group_names
.size());
2910 std::transform(group_names
.begin(), group_names
.end(), std::back_inserter(fields
),
2911 [&](const std::string
& name
) { return field(name
, input_type
); });
2912 return struct_(std::move(fields
));
2916 explicit ExtractRegexData(const std::string
& pattern
)
2917 : regex(new RE2(pattern
, RE2::Quiet
)) {}
2920 Result
<ValueDescr
> ResolveExtractRegexOutput(KernelContext
* ctx
,
2921 const std::vector
<ValueDescr
>& args
) {
2922 using State
= OptionsWrapper
<ExtractRegexOptions
>;
2923 ExtractRegexOptions options
= State::Get(ctx
);
2924 ARROW_ASSIGN_OR_RAISE(auto data
, ExtractRegexData::Make(options
));
2925 return data
.ResolveOutputType(args
);
// Per-execution scratch state for matching with RE2: one StringPiece per capture
// group (`found_values`), each wrapped in an RE2::Arg (`args`), with the addresses
// of those Args collected in `args_pointers`. `args_pointers_start` is the pointer
// handed to RE2 and falls back to `&null_arg` when there are no groups.
2928 struct ExtractRegexBase
{
2929 const ExtractRegexData
& data
;
2930 const int group_count
;
2931 std::vector
<re2::StringPiece
> found_values
;
2932 std::vector
<re2::RE2::Arg
> args
;
2933 std::vector
<const re2::RE2::Arg
*> args_pointers
;
2934 const re2::RE2::Arg
** args_pointers_start
;
2935 const re2::RE2::Arg
* null_arg
= nullptr;
2937 explicit ExtractRegexBase(const ExtractRegexData
& data
)
2939 group_count(static_cast<int>(data
.group_names
.size())),
2940 found_values(group_count
) {
2941 args
.reserve(group_count
);
2942 args_pointers
.reserve(group_count
);
2944 for (int i
= 0; i
< group_count
; i
++) {
2945 args
.emplace_back(&found_values
[i
]);
2946 // Since we reserved capacity, we're guaranteed the pointer remains valid
2947 args_pointers
.push_back(&args
[i
]);
2949 // Avoid null pointer if there is no capture group
2950 args_pointers_start
= (group_count
> 0) ? args_pointers
.data() : &null_arg
;
// Runs RE2::PartialMatchN on `s`; the captured groups are written through the Args
// into `found_values`.
2953 bool Match(util::string_view s
) {
2954 return re2::RE2::PartialMatchN(ToStringPiece(s
), *data
.regex
, args_pointers_start
,
2959 template <typename Type
>
2960 struct ExtractRegex
: public ExtractRegexBase
{
2961 using ArrayType
= typename TypeTraits
<Type
>::ArrayType
;
2962 using ScalarType
= typename TypeTraits
<Type
>::ScalarType
;
2963 using BuilderType
= typename TypeTraits
<Type
>::BuilderType
;
2964 using State
= OptionsWrapper
<ExtractRegexOptions
>;
2966 using ExtractRegexBase::ExtractRegexBase
;
2968 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
2969 ExtractRegexOptions options
= State::Get(ctx
);
2970 ARROW_ASSIGN_OR_RAISE(auto data
, ExtractRegexData::Make(options
));
2971 return ExtractRegex
{data
}.Extract(ctx
, batch
, out
);
2974 Status
Extract(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
2975 ARROW_ASSIGN_OR_RAISE(auto descr
, data
.ResolveOutputType(batch
.GetDescriptors()));
2976 DCHECK_NE(descr
.type
, nullptr);
2977 const auto& type
= descr
.type
;
2979 if (batch
[0].kind() == Datum::ARRAY
) {
2980 std::unique_ptr
<ArrayBuilder
> array_builder
;
2981 RETURN_NOT_OK(MakeBuilder(ctx
->memory_pool(), type
, &array_builder
));
2982 StructBuilder
* struct_builder
= checked_cast
<StructBuilder
*>(array_builder
.get());
2984 std::vector
<BuilderType
*> field_builders
;
2985 field_builders
.reserve(group_count
);
2986 for (int i
= 0; i
< group_count
; i
++) {
2987 field_builders
.push_back(
2988 checked_cast
<BuilderType
*>(struct_builder
->field_builder(i
)));
2991 auto visit_null
= [&]() { return struct_builder
->AppendNull(); };
2992 auto visit_value
= [&](util::string_view s
) {
2994 for (int i
= 0; i
< group_count
; i
++) {
2995 RETURN_NOT_OK(field_builders
[i
]->Append(ToStringView(found_values
[i
])));
2997 return struct_builder
->Append();
2999 return struct_builder
->AppendNull();
3002 const ArrayData
& input
= *batch
[0].array();
3003 RETURN_NOT_OK(VisitArrayDataInline
<Type
>(input
, visit_value
, visit_null
));
3005 std::shared_ptr
<Array
> out_array
;
3006 RETURN_NOT_OK(struct_builder
->Finish(&out_array
));
3007 *out
= std::move(out_array
);
3009 const auto& input
= checked_cast
<const ScalarType
&>(*batch
[0].scalar());
3010 auto result
= std::make_shared
<StructScalar
>(type
);
3011 if (input
.is_valid
&& Match(util::string_view(*input
.value
))) {
3012 result
->value
.reserve(group_count
);
3013 for (int i
= 0; i
< group_count
; i
++) {
3014 result
->value
.push_back(
3015 std::make_shared
<ScalarType
>(found_values
[i
].as_string()));
3017 result
->is_valid
= true;
3019 result
->is_valid
= false;
3021 out
->value
= std::move(result
);
3024 return Status::OK();
// User-facing documentation for regex extraction into a struct of named capture
// groups, configured through ExtractRegexOptions.
3028 const FunctionDoc
extract_regex_doc(
3029 "Extract substrings captured by a regex pattern",
3030 ("For each string in `strings`, match the regular expression and, if\n"
3031 "successful, emit a struct with field names and values coming from the\n"
3032 "regular expression's named capture groups. If the input is null or the\n"
3033 "regular expression fails matching, a null output value is emitted.\n"
3035 "Regular expression matching is done using the Google RE2 library."),
3036 {"strings"}, "ExtractRegexOptions");
3038 void AddExtractRegex(FunctionRegistry
* registry
) {
3039 auto func
= std::make_shared
<ScalarFunction
>("extract_regex", Arity::Unary(),
3040 &extract_regex_doc
);
3041 using t32
= ExtractRegex
<StringType
>;
3042 using t64
= ExtractRegex
<LargeStringType
>;
3043 OutputType
out_ty(ResolveExtractRegexOutput
);
3044 ScalarKernel kernel
;
3046 // Null values will be computed based on regex match or not
3047 kernel
.null_handling
= NullHandling::COMPUTED_NO_PREALLOCATE
;
3048 kernel
.mem_allocation
= MemAllocation::NO_PREALLOCATE
;
3049 kernel
.signature
.reset(new KernelSignature({utf8()}, out_ty
));
3050 kernel
.exec
= t32::Exec
;
3051 kernel
.init
= t32::State::Init
;
3052 DCHECK_OK(func
->AddKernel(kernel
));
3053 kernel
.signature
.reset(new KernelSignature({large_utf8()}, out_ty
));
3054 kernel
.exec
= t64::Exec
;
3055 kernel
.init
= t64::State::Init
;
3056 DCHECK_OK(func
->AddKernel(kernel
));
3058 DCHECK_OK(registry
->AddFunction(std::move(func
)));
3060 #endif // ARROW_WITH_RE2
3062 // ----------------------------------------------------------------------
3063 // strptime string parsing
3065 using StrptimeState
= OptionsWrapper
<StrptimeOptions
>;
// Stateful functor used by the strptime kernels: holds a TimestampParser built from
// `options.format` and the target time unit. Call() runs the parser on one string
// and, when parsing fails, stores an Invalid status naming the offending string and
// the target timestamp type in `*st`.
3067 struct ParseStrptime
{
3068 explicit ParseStrptime(const StrptimeOptions
& options
)
3069 : parser(TimestampParser::MakeStrptime(options
.format
)), unit(options
.unit
) {}
3071 template <typename
... Ignored
>
3072 int64_t Call(KernelContext
*, util::string_view val
, Status
* st
) const {
3074 if (!(*parser
)(val
.data(), val
.size(), unit
, &result
)) {
3075 *st
= Status::Invalid("Failed to parse string: '", val
, "' as a scalar of type ",
3076 TimestampType(unit
).ToString());
3081 std::shared_ptr
<TimestampParser
> parser
;
3082 TimeUnit::type unit
;
3085 template <typename InputType
>
3086 Status
StrptimeExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
3087 applicator::ScalarUnaryNotNullStateful
<TimestampType
, InputType
, ParseStrptime
> kernel
{
3088 ParseStrptime(StrptimeState::Get(ctx
))};
3089 return kernel
.Exec(ctx
, batch
, out
);
// Output type resolver for strptime: a timestamp type with the unit taken from the
// kernel's StrptimeOptions. The Invalid status below reports that no options were
// supplied (strptime has no default options).
// NOTE(review): the condition selecting between the two returns is not visible in
// this excerpt -- confirm against the full file.
3092 Result
<ValueDescr
> StrptimeResolve(KernelContext
* ctx
, const std::vector
<ValueDescr
>&) {
3094 return ::arrow::timestamp(StrptimeState::Get(ctx
).unit
);
3097 return Status::Invalid("strptime does not provide default StrptimeOptions");
3100 // ----------------------------------------------------------------------
// Pads strings to `options_.width` bytes using the single byte in
// `options_.padding`. PadLeft / PadRight select which side(s) receive padding; when
// both are set, an odd amount of padding puts the extra byte on the right.
// PreExec() rejects a padding that is not exactly one byte long. Strings that are
// already at least `width` bytes long are copied unchanged.
3103 template <bool PadLeft
, bool PadRight
>
3104 struct AsciiPadTransform
: public StringTransformBase
{
3105 using State
= OptionsWrapper
<PadOptions
>;
3107 const PadOptions
& options_
;
3109 explicit AsciiPadTransform(const PadOptions
& options
) : options_(options
) {}
3111 Status
PreExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) override
{
3112 if (options_
.padding
.size() != 1) {
3113 return Status::Invalid("Padding must be one byte, got '", options_
.padding
, "'");
3115 return Status::OK();
3118 int64_t MaxCodeunits(int64_t ninputs
, int64_t input_ncodeunits
) override
{
3119 // This is likely very overallocated but hard to do better without
3120 // actually looking at each string (because of strings that may be
3121 // longer than the given width)
3122 return input_ncodeunits
+ ninputs
* options_
.width
;
3125 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
3127 if (input_string_ncodeunits
>= options_
.width
) {
3128 std::copy(input
, input
+ input_string_ncodeunits
, output
);
3129 return input_string_ncodeunits
;
3131 const int64_t spaces
= options_
.width
- input_string_ncodeunits
;
3134 if (PadLeft
&& PadRight
) {
3135 // If odd number of spaces, put the extra space on the right
3137 right
= spaces
- left
;
3138 } else if (PadLeft
) {
3140 } else if (PadRight
) {
3143 DCHECK(false) << "unreachable";
3146 std::fill(output
, output
+ left
, options_
.padding
[0]);
3148 output
= std::copy(input
, input
+ input_string_ncodeunits
, output
);
3149 std::fill(output
, output
+ right
, options_
.padding
[0]);
3150 return options_
.width
;
// Pads strings to `options_.width` codepoints using the single codepoint in
// `options_.padding`. PadLeft / PadRight select which side(s) receive padding; when
// both are set, an odd amount of padding puts the extra codepoint on the right.
// PreExec() rejects a padding whose UTF8 length is not exactly one codepoint.
// Strings that already have at least `width` codepoints are copied unchanged.
// Transform() returns the number of bytes written to `output`.
3154 template <bool PadLeft
, bool PadRight
>
3155 struct Utf8PadTransform
: public StringTransformBase
{
3156 using State
= OptionsWrapper
<PadOptions
>;
3158 const PadOptions
& options_
;
3160 explicit Utf8PadTransform(const PadOptions
& options
) : options_(options
) {}
3162 Status
PreExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) override
{
3163 auto str
= reinterpret_cast<const uint8_t*>(options_
.padding
.data());
3164 auto strlen
= options_
.padding
.size();
3165 if (util::UTF8Length(str
, str
+ strlen
) != 1) {
3166 return Status::Invalid("Padding must be one codepoint, got '", options_
.padding
,
3169 return Status::OK();
3172 int64_t MaxCodeunits(int64_t ninputs
, int64_t input_ncodeunits
) override
{
3173 // This is likely very overallocated but hard to do better without
3174 // actually looking at each string (because of strings that may be
3175 // longer than the given width)
3176 // One codepoint may be up to 4 bytes
3177 return input_ncodeunits
+ 4 * ninputs
* options_
.width
;
3180 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
3182 const int64_t input_width
= util::UTF8Length(input
, input
+ input_string_ncodeunits
);
3183 if (input_width
>= options_
.width
) {
3184 std::copy(input
, input
+ input_string_ncodeunits
, output
);
3185 return input_string_ncodeunits
;
3187 const int64_t spaces
= options_
.width
- input_width
;
3190 if (PadLeft
&& PadRight
) {
3191 // If odd number of spaces, put the extra space on the right
3193 right
= spaces
- left
;
3194 } else if (PadLeft
) {
3196 } else if (PadRight
) {
3199 DCHECK(false) << "unreachable";
3202 uint8_t* start
= output
;
3204 output
= std::copy(options_
.padding
.begin(), options_
.padding
.end(), output
);
3207 output
= std::copy(input
, input
+ input_string_ncodeunits
, output
);
3209 output
= std::copy(options_
.padding
.begin(), options_
.padding
.end(), output
);
3212 return output
- start
;
3216 template <typename Type
>
3217 using AsciiLPad
= StringTransformExecWithState
<Type
, AsciiPadTransform
<true, false>>;
3218 template <typename Type
>
3219 using AsciiRPad
= StringTransformExecWithState
<Type
, AsciiPadTransform
<false, true>>;
3220 template <typename Type
>
3221 using AsciiCenter
= StringTransformExecWithState
<Type
, AsciiPadTransform
<true, true>>;
3222 template <typename Type
>
3223 using Utf8LPad
= StringTransformExecWithState
<Type
, Utf8PadTransform
<true, false>>;
3224 template <typename Type
>
3225 using Utf8RPad
= StringTransformExecWithState
<Type
, Utf8PadTransform
<false, true>>;
3226 template <typename Type
>
3227 using Utf8Center
= StringTransformExecWithState
<Type
, Utf8PadTransform
<true, true>>;
3229 // ----------------------------------------------------------------------
3232 #ifdef ARROW_WITH_UTF8PROC
3234 template <bool TrimLeft
, bool TrimRight
>
3235 struct UTF8TrimWhitespaceTransform
: public StringTransformBase
{
3236 Status
PreExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) override
{
3237 EnsureLookupTablesFilled();
3238 return Status::OK();
3241 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
3243 const uint8_t* begin
= input
;
3244 const uint8_t* end
= input
+ input_string_ncodeunits
;
3245 const uint8_t* end_trimmed
= end
;
3246 const uint8_t* begin_trimmed
= begin
;
3248 auto predicate
= [](uint32_t c
) { return !IsSpaceCharacterUnicode(c
); };
3249 if (TrimLeft
&& !ARROW_PREDICT_TRUE(
3250 arrow::util::UTF8FindIf(begin
, end
, predicate
, &begin_trimmed
))) {
3251 return kTransformError
;
3253 if (TrimRight
&& begin_trimmed
< end
) {
3254 if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed
, end
,
3255 predicate
, &end_trimmed
))) {
3256 return kTransformError
;
3259 std::copy(begin_trimmed
, end_trimmed
, output
);
3260 return end_trimmed
- begin_trimmed
;
3264 template <typename Type
>
3265 using UTF8TrimWhitespace
=
3266 StringTransformExec
<Type
, UTF8TrimWhitespaceTransform
<true, true>>;
3268 template <typename Type
>
3269 using UTF8LTrimWhitespace
=
3270 StringTransformExec
<Type
, UTF8TrimWhitespaceTransform
<true, false>>;
3272 template <typename Type
>
3273 using UTF8RTrimWhitespace
=
3274 StringTransformExec
<Type
, UTF8TrimWhitespaceTransform
<false, true>>;
// Kernel state for the UTF8 trim kernels: `codepoints_` is a lookup indexed by
// codepoint in which every codepoint of `options_.characters` is set to true.
// If iterating over `options_.characters` with UTF8ForEach fails, `status_` is set
// to Invalid instead of being reported from the constructor.
3276 struct UTF8TrimState
{
3277 TrimOptions options_
;
3278 std::vector
<bool> codepoints_
;
3279 Status status_
= Status::OK();
3281 explicit UTF8TrimState(KernelContext
* ctx
, TrimOptions options
)
3282 : options_(std::move(options
)) {
3283 if (!ARROW_PREDICT_TRUE(
3284 arrow::util::UTF8ForEach(options_
.characters
, [&](uint32_t c
) {
3286 std::max(c
+ 1, static_cast<uint32_t>(codepoints_
.size())));
3287 codepoints_
.at(c
) = true;
3289 status_
= Status::Invalid("Invalid UTF8 sequence in input");
3294 template <bool TrimLeft
, bool TrimRight
>
3295 struct UTF8TrimTransform
: public StringTransformBase
{
3296 using State
= KernelStateFromFunctionOptions
<UTF8TrimState
, TrimOptions
>;
3298 const UTF8TrimState
& state_
;
3300 explicit UTF8TrimTransform(const UTF8TrimState
& state
) : state_(state
) {}
3302 Status
PreExec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) override
{
3303 return state_
.status_
;
3306 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
3308 const uint8_t* begin
= input
;
3309 const uint8_t* end
= input
+ input_string_ncodeunits
;
3310 const uint8_t* end_trimmed
= end
;
3311 const uint8_t* begin_trimmed
= begin
;
3312 const auto& codepoints
= state_
.codepoints_
;
3314 auto predicate
= [&](uint32_t c
) { return c
>= codepoints
.size() || !codepoints
[c
]; };
3315 if (TrimLeft
&& !ARROW_PREDICT_TRUE(
3316 arrow::util::UTF8FindIf(begin
, end
, predicate
, &begin_trimmed
))) {
3317 return kTransformError
;
3319 if (TrimRight
&& begin_trimmed
< end
) {
3320 if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed
, end
,
3321 predicate
, &end_trimmed
))) {
3322 return kTransformError
;
3325 std::copy(begin_trimmed
, end_trimmed
, output
);
3326 return end_trimmed
- begin_trimmed
;
3330 template <typename Type
>
3331 using UTF8Trim
= StringTransformExecWithState
<Type
, UTF8TrimTransform
<true, true>>;
3333 template <typename Type
>
3334 using UTF8LTrim
= StringTransformExecWithState
<Type
, UTF8TrimTransform
<true, false>>;
3336 template <typename Type
>
3337 using UTF8RTrim
= StringTransformExecWithState
<Type
, UTF8TrimTransform
<false, true>>;
3341 template <bool TrimLeft
, bool TrimRight
>
3342 struct AsciiTrimWhitespaceTransform
: public StringTransformBase
{
3343 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
3345 const uint8_t* begin
= input
;
3346 const uint8_t* end
= input
+ input_string_ncodeunits
;
3347 const uint8_t* end_trimmed
= end
;
3348 const uint8_t* begin_trimmed
= begin
;
3350 auto predicate
= [](unsigned char c
) { return !IsSpaceCharacterAscii(c
); };
3352 begin_trimmed
= std::find_if(begin
, end
, predicate
);
3354 if (TrimRight
&& begin_trimmed
< end
) {
3355 std::reverse_iterator
<const uint8_t*> rbegin(end
);
3356 std::reverse_iterator
<const uint8_t*> rend(begin_trimmed
);
3357 end_trimmed
= std::find_if(rbegin
, rend
, predicate
).base();
3359 std::copy(begin_trimmed
, end_trimmed
, output
);
3360 return end_trimmed
- begin_trimmed
;
3364 template <typename Type
>
3365 using AsciiTrimWhitespace
=
3366 StringTransformExec
<Type
, AsciiTrimWhitespaceTransform
<true, true>>;
3368 template <typename Type
>
3369 using AsciiLTrimWhitespace
=
3370 StringTransformExec
<Type
, AsciiTrimWhitespaceTransform
<true, false>>;
3372 template <typename Type
>
3373 using AsciiRTrimWhitespace
=
3374 StringTransformExec
<Type
, AsciiTrimWhitespaceTransform
<false, true>>;
3376 struct AsciiTrimState
{
3377 TrimOptions options_
;
3378 std::vector
<bool> characters_
;
3380 explicit AsciiTrimState(KernelContext
* ctx
, TrimOptions options
)
3381 : options_(std::move(options
)), characters_(256) {
3382 for (const auto c
: options_
.characters
) {
3383 characters_
[static_cast<unsigned char>(c
)] = true;
3388 template <bool TrimLeft
, bool TrimRight
>
3389 struct AsciiTrimTransform
: public StringTransformBase
{
3390 using State
= KernelStateFromFunctionOptions
<AsciiTrimState
, TrimOptions
>;
3392 const AsciiTrimState
& state_
;
3394 explicit AsciiTrimTransform(const AsciiTrimState
& state
) : state_(state
) {}
3396 int64_t Transform(const uint8_t* input
, int64_t input_string_ncodeunits
,
3398 const uint8_t* begin
= input
;
3399 const uint8_t* end
= input
+ input_string_ncodeunits
;
3400 const uint8_t* end_trimmed
= end
;
3401 const uint8_t* begin_trimmed
= begin
;
3402 const auto& characters
= state_
.characters_
;
3404 auto predicate
= [&](uint8_t c
) { return !characters
[c
]; };
3406 begin_trimmed
= std::find_if(begin
, end
, predicate
);
3408 if (TrimRight
&& begin_trimmed
< end
) {
3409 std::reverse_iterator
<const uint8_t*> rbegin(end
);
3410 std::reverse_iterator
<const uint8_t*> rend(begin_trimmed
);
3411 end_trimmed
= std::find_if(rbegin
, rend
, predicate
).base();
3413 std::copy(begin_trimmed
, end_trimmed
, output
);
3414 return end_trimmed
- begin_trimmed
;
3418 template <typename Type
>
3419 using AsciiTrim
= StringTransformExecWithState
<Type
, AsciiTrimTransform
<true, true>>;
3421 template <typename Type
>
3422 using AsciiLTrim
= StringTransformExecWithState
<Type
, AsciiTrimTransform
<true, false>>;
3424 template <typename Type
>
3425 using AsciiRTrim
= StringTransformExecWithState
<Type
, AsciiTrimTransform
<false, true>>;
3427 const FunctionDoc
utf8_center_doc(
3428 "Center strings by padding with a given character",
3429 ("For each string in `strings`, emit a centered string by padding both sides \n"
3430 "with the given UTF8 codeunit.\nNull values emit null."),
3431 {"strings"}, "PadOptions");
3433 const FunctionDoc
utf8_lpad_doc(
3434 "Right-align strings by padding with a given character",
3435 ("For each string in `strings`, emit a right-aligned string by prepending \n"
3436 "the given UTF8 codeunit.\nNull values emit null."),
3437 {"strings"}, "PadOptions");
3439 const FunctionDoc
utf8_rpad_doc(
3440 "Left-align strings by padding with a given character",
3441 ("For each string in `strings`, emit a left-aligned string by appending \n"
3442 "the given UTF8 codeunit.\nNull values emit null."),
3443 {"strings"}, "PadOptions");
3445 const FunctionDoc
ascii_center_doc(
3446 utf8_center_doc
.description
+ "",
3447 ("For each string in `strings`, emit a centered string by padding both sides \n"
3448 "with the given ASCII character.\nNull values emit null."),
3449 {"strings"}, "PadOptions");
3451 const FunctionDoc
ascii_lpad_doc(
3452 utf8_lpad_doc
.description
+ "",
3453 ("For each string in `strings`, emit a right-aligned string by prepending \n"
3454 "the given ASCII character.\nNull values emit null."),
3455 {"strings"}, "PadOptions");
3457 const FunctionDoc
ascii_rpad_doc(
3458 utf8_rpad_doc
.description
+ "",
3459 ("For each string in `strings`, emit a left-aligned string by appending \n"
3460 "the given ASCII character.\nNull values emit null."),
3461 {"strings"}, "PadOptions");
// User-facing documentation for trimming Unicode whitespace on both sides.
3463 const FunctionDoc
utf8_trim_whitespace_doc(
3464 "Trim leading and trailing whitespace characters",
3465 ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
3466 "characters removed, where whitespace characters are defined by the Unicode\n"
3467 "standard. Null values emit null."),
// User-facing documentation for trimming leading Unicode whitespace.
3470 const FunctionDoc
utf8_ltrim_whitespace_doc(
3471 "Trim leading whitespace characters",
3472 ("For each string in `strings`, emit a string with leading whitespace\n"
3473 "characters removed, where whitespace characters are defined by the Unicode\n"
3474 "standard. Null values emit null."),
// User-facing documentation for trimming trailing Unicode whitespace.
3477 const FunctionDoc
utf8_rtrim_whitespace_doc(
3478 "Trim trailing whitespace characters",
3479 ("For each string in `strings`, emit a string with trailing whitespace\n"
3480 "characters removed, where whitespace characters are defined by the Unicode\n"
3481 "standard. Null values emit null."),
// User-facing documentation for trimming ASCII whitespace on both sides.
3484 const FunctionDoc
ascii_trim_whitespace_doc(
3485 "Trim leading and trailing ASCII whitespace characters",
3486 ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
3487 "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
3488 "whitespace characters. Null values emit null."),
// User-facing documentation for trimming leading ASCII whitespace.
3491 const FunctionDoc
ascii_ltrim_whitespace_doc(
3492 "Trim leading ASCII whitespace characters",
3493 ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
3494 "characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
3495 "whitespace characters. Null values emit null."),
// User-facing documentation for trimming trailing ASCII whitespace.
3498 const FunctionDoc
ascii_rtrim_whitespace_doc(
3499 "Trim trailing ASCII whitespace characters",
3500 ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
3501 "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
3502 "whitespace characters. Null values emit null."),
// User-facing documentation for trimming, on both sides, the characters listed in
// TrimOptions.
3505 const FunctionDoc
utf8_trim_doc(
3506 "Trim leading and trailing characters present in the `characters` arguments",
3507 ("For each string in `strings`, emit a string with leading and trailing\n"
3508 "characters removed that are present in the `characters` argument. Null values\n"
3510 {"strings"}, "TrimOptions");
// User-facing documentation for trimming leading characters listed in TrimOptions.
3512 const FunctionDoc
utf8_ltrim_doc(
3513 "Trim leading characters present in the `characters` arguments",
3514 ("For each string in `strings`, emit a string with leading\n"
3515 "characters removed that are present in the `characters` argument. Null values\n"
3517 {"strings"}, "TrimOptions");
3519 const FunctionDoc
utf8_rtrim_doc(
3520 "Trim trailing characters present in the `characters` arguments",
3521 ("For each string in `strings`, emit a string with leading "
3522 "characters removed that are present in the `characters` argument. Null values\n"
3524 {"strings"}, "TrimOptions");
3526 const FunctionDoc
ascii_trim_doc(
3527 utf8_trim_doc
.summary
+ "",
3528 utf8_trim_doc
.description
+
3529 ("\nBoth the input string as the `characters` argument are interepreted as\n"
3530 "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3531 {"strings"}, "TrimOptions");
3533 const FunctionDoc
ascii_ltrim_doc(
3534 utf8_ltrim_doc
.summary
+ "",
3535 utf8_ltrim_doc
.description
+
3536 ("\nBoth the input string as the `characters` argument are interepreted as\n"
3537 "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3538 {"strings"}, "TrimOptions");
3540 const FunctionDoc
ascii_rtrim_doc(
3541 utf8_rtrim_doc
.summary
+ "",
3542 utf8_rtrim_doc
.description
+
3543 ("\nBoth the input string as the `characters` argument are interepreted as\n"
3544 "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3545 {"strings"}, "TrimOptions");
3547 const FunctionDoc
strptime_doc(
3549 ("For each string in `strings`, parse it as a timestamp.\n"
3550 "The timestamp unit and the expected string pattern must be given\n"
3551 "in StrptimeOptions. Null inputs emit null. If a non-null string\n"
3552 "fails parsing, an error is returned."),
3553 {"strings"}, "StrptimeOptions");
3555 const FunctionDoc
binary_length_doc(
3556 "Compute string lengths",
3557 ("For each string in `strings`, emit the number of bytes. Null values emit null."),
3560 const FunctionDoc
utf8_length_doc("Compute UTF8 string lengths",
3561 ("For each string in `strings`, emit the number of "
3562 "UTF8 characters. Null values emit null."),
3565 void AddStrptime(FunctionRegistry
* registry
) {
3566 auto func
= std::make_shared
<ScalarFunction
>("strptime", Arity::Unary(), &strptime_doc
);
3567 DCHECK_OK(func
->AddKernel({utf8()}, OutputType(StrptimeResolve
),
3568 StrptimeExec
<StringType
>, StrptimeState::Init
));
3569 DCHECK_OK(func
->AddKernel({large_utf8()}, OutputType(StrptimeResolve
),
3570 StrptimeExec
<LargeStringType
>, StrptimeState::Init
));
3571 DCHECK_OK(registry
->AddFunction(std::move(func
)));
3574 void AddBinaryLength(FunctionRegistry
* registry
) {
3575 auto func
= std::make_shared
<ScalarFunction
>("binary_length", Arity::Unary(),
3576 &binary_length_doc
);
3577 ArrayKernelExec exec_offset_32
=
3578 applicator::ScalarUnaryNotNull
<Int32Type
, StringType
, BinaryLength
>::Exec
;
3579 ArrayKernelExec exec_offset_64
=
3580 applicator::ScalarUnaryNotNull
<Int64Type
, LargeStringType
, BinaryLength
>::Exec
;
3581 for (const auto& input_type
: {binary(), utf8()}) {
3582 DCHECK_OK(func
->AddKernel({input_type
}, int32(), exec_offset_32
));
3584 for (const auto& input_type
: {large_binary(), large_utf8()}) {
3585 DCHECK_OK(func
->AddKernel({input_type
}, int64(), exec_offset_64
));
3587 DCHECK_OK(func
->AddKernel({InputType(Type::FIXED_SIZE_BINARY
)}, int32(),
3588 BinaryLength::FixedSizeExec
));
3589 DCHECK_OK(registry
->AddFunction(std::move(func
)));
3592 void AddUtf8Length(FunctionRegistry
* registry
) {
3594 std::make_shared
<ScalarFunction
>("utf8_length", Arity::Unary(), &utf8_length_doc
);
3596 ArrayKernelExec exec_offset_32
=
3597 applicator::ScalarUnaryNotNull
<Int32Type
, StringType
, Utf8Length
>::Exec
;
3598 DCHECK_OK(func
->AddKernel({utf8()}, int32(), std::move(exec_offset_32
)));
3600 ArrayKernelExec exec_offset_64
=
3601 applicator::ScalarUnaryNotNull
<Int64Type
, LargeStringType
, Utf8Length
>::Exec
;
3602 DCHECK_OK(func
->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64
)));
3604 DCHECK_OK(registry
->AddFunction(std::move(func
)));
3607 template <typename BinaryType
, typename ListType
>
3609 using ArrayType
= typename TypeTraits
<BinaryType
>::ArrayType
;
3610 using ListArrayType
= typename TypeTraits
<ListType
>::ArrayType
;
3611 using ListScalarType
= typename TypeTraits
<ListType
>::ScalarType
;
3612 using ListOffsetType
= typename
ListArrayType::offset_type
;
3613 using BuilderType
= typename TypeTraits
<BinaryType
>::BuilderType
;
3615 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
3616 if (batch
[0].kind() == Datum::SCALAR
) {
3617 if (batch
[1].kind() == Datum::SCALAR
) {
3618 return ExecScalarScalar(ctx
, *batch
[0].scalar(), *batch
[1].scalar(), out
);
3620 DCHECK_EQ(batch
[1].kind(), Datum::ARRAY
);
3621 return ExecScalarArray(ctx
, *batch
[0].scalar(), batch
[1].array(), out
);
3623 DCHECK_EQ(batch
[0].kind(), Datum::ARRAY
);
3624 if (batch
[1].kind() == Datum::SCALAR
) {
3625 return ExecArrayScalar(ctx
, batch
[0].array(), *batch
[1].scalar(), out
);
3627 DCHECK_EQ(batch
[1].kind(), Datum::ARRAY
);
3628 return ExecArrayArray(ctx
, batch
[0].array(), batch
[1].array(), out
);
3631 struct ListScalarOffsetLookup
{
3632 const ArrayType
& values
;
3634 int64_t GetStart(int64_t i
) { return 0; }
3635 int64_t GetStop(int64_t i
) { return values
.length(); }
3636 bool IsNull(int64_t i
) { return false; }
3639 struct ListArrayOffsetLookup
{
3640 explicit ListArrayOffsetLookup(const ListArrayType
& lists
)
3641 : lists_(lists
), offsets_(lists
.raw_value_offsets()) {}
3643 int64_t GetStart(int64_t i
) { return offsets_
[i
]; }
3644 int64_t GetStop(int64_t i
) { return offsets_
[i
+ 1]; }
3645 bool IsNull(int64_t i
) { return lists_
.IsNull(i
); }
3648 const ListArrayType
& lists_
;
3649 const ListOffsetType
* offsets_
;
3652 struct SeparatorScalarLookup
{
3653 const util::string_view separator
;
3655 bool IsNull(int64_t i
) { return false; }
3656 util::string_view
GetView(int64_t i
) { return separator
; }
3659 struct SeparatorArrayLookup
{
3660 const ArrayType
& separators
;
3662 bool IsNull(int64_t i
) { return separators
.IsNull(i
); }
3663 util::string_view
GetView(int64_t i
) { return separators
.GetView(i
); }
3666 // Scalar, scalar -> scalar
3667 static Status
ExecScalarScalar(KernelContext
* ctx
, const Scalar
& left
,
3668 const Scalar
& right
, Datum
* out
) {
3669 const auto& list
= checked_cast
<const ListScalarType
&>(left
);
3670 const auto& separator_scalar
= checked_cast
<const BaseBinaryScalar
&>(right
);
3671 if (!list
.is_valid
|| !separator_scalar
.is_valid
) {
3672 return Status::OK();
3674 util::string_view
separator(*separator_scalar
.value
);
3676 const auto& strings
= checked_cast
<const ArrayType
&>(*list
.value
);
3677 if (strings
.null_count() > 0) {
3678 out
->scalar()->is_valid
= false;
3679 return Status::OK();
3682 TypedBufferBuilder
<uint8_t> builder(ctx
->memory_pool());
3683 auto Append
= [&](util::string_view value
) {
3684 return builder
.Append(reinterpret_cast<const uint8_t*>(value
.data()),
3685 static_cast<int64_t>(value
.size()));
3687 if (strings
.length() > 0) {
3689 strings
.total_values_length() + (strings
.length() - 1) * separator
.length();
3690 RETURN_NOT_OK(builder
.Reserve(data_length
));
3691 RETURN_NOT_OK(Append(strings
.GetView(0)));
3692 for (int64_t j
= 1; j
< strings
.length(); j
++) {
3693 RETURN_NOT_OK(Append(separator
));
3694 RETURN_NOT_OK(Append(strings
.GetView(j
)));
3697 auto out_scalar
= checked_cast
<BaseBinaryScalar
*>(out
->scalar().get());
3698 return builder
.Finish(&out_scalar
->value
);
3701 // Scalar, array -> array
3702 static Status
ExecScalarArray(KernelContext
* ctx
, const Scalar
& left
,
3703 const std::shared_ptr
<ArrayData
>& right
, Datum
* out
) {
3704 const auto& list_scalar
= checked_cast
<const BaseListScalar
&>(left
);
3705 if (!list_scalar
.is_valid
) {
3706 ARROW_ASSIGN_OR_RAISE(
3707 auto nulls
, MakeArrayOfNull(right
->type
, right
->length
, ctx
->memory_pool()));
3708 *out
= *nulls
->data();
3709 return Status::OK();
3711 const auto& strings
= checked_cast
<const ArrayType
&>(*list_scalar
.value
);
3712 if (strings
.null_count() != 0) {
3713 ARROW_ASSIGN_OR_RAISE(
3714 auto nulls
, MakeArrayOfNull(right
->type
, right
->length
, ctx
->memory_pool()));
3715 *out
= *nulls
->data();
3716 return Status::OK();
3718 const ArrayType
separators(right
);
3720 BuilderType
builder(ctx
->memory_pool());
3721 RETURN_NOT_OK(builder
.Reserve(separators
.length()));
3723 // Presize data to avoid multiple reallocations when joining strings
3724 int64_t total_data_length
= 0;
3725 const int64_t list_length
= strings
.length();
3727 const int64_t string_length
= strings
.total_values_length();
3728 total_data_length
+=
3729 string_length
* (separators
.length() - separators
.null_count());
3730 for (int64_t i
= 0; i
< separators
.length(); ++i
) {
3731 if (separators
.IsNull(i
)) {
3734 total_data_length
+= (list_length
- 1) * separators
.value_length(i
);
3737 RETURN_NOT_OK(builder
.ReserveData(total_data_length
));
3739 return JoinStrings(separators
.length(), strings
, ListScalarOffsetLookup
{strings
},
3740 SeparatorArrayLookup
{separators
}, &builder
, out
);
3743 // Array, scalar -> array
3744 static Status
ExecArrayScalar(KernelContext
* ctx
,
3745 const std::shared_ptr
<ArrayData
>& left
,
3746 const Scalar
& right
, Datum
* out
) {
3747 const ListArrayType
lists(left
);
3748 const auto& separator_scalar
= checked_cast
<const BaseBinaryScalar
&>(right
);
3750 if (!separator_scalar
.is_valid
) {
3751 ARROW_ASSIGN_OR_RAISE(
3753 MakeArrayOfNull(lists
.value_type(), lists
.length(), ctx
->memory_pool()));
3754 *out
= *nulls
->data();
3755 return Status::OK();
3758 util::string_view
separator(*separator_scalar
.value
);
3759 const auto& strings
= checked_cast
<const ArrayType
&>(*lists
.values());
3760 const auto list_offsets
= lists
.raw_value_offsets();
3762 BuilderType
builder(ctx
->memory_pool());
3763 RETURN_NOT_OK(builder
.Reserve(lists
.length()));
3765 // Presize data to avoid multiple reallocations when joining strings
3766 int64_t total_data_length
= strings
.total_values_length();
3767 for (int64_t i
= 0; i
< lists
.length(); ++i
) {
3768 const auto start
= list_offsets
[i
], end
= list_offsets
[i
+ 1];
3769 if (end
> start
&& !ValuesContainNull(strings
, start
, end
)) {
3770 total_data_length
+= (end
- start
- 1) * separator
.length();
3773 RETURN_NOT_OK(builder
.ReserveData(total_data_length
));
3775 return JoinStrings(lists
.length(), strings
, ListArrayOffsetLookup
{lists
},
3776 SeparatorScalarLookup
{separator
}, &builder
, out
);
3779 // Array, array -> array
3780 static Status
ExecArrayArray(KernelContext
* ctx
, const std::shared_ptr
<ArrayData
>& left
,
3781 const std::shared_ptr
<ArrayData
>& right
, Datum
* out
) {
3782 const ListArrayType
lists(left
);
3783 const auto& strings
= checked_cast
<const ArrayType
&>(*lists
.values());
3784 const auto list_offsets
= lists
.raw_value_offsets();
3785 const auto string_offsets
= strings
.raw_value_offsets();
3786 const ArrayType
separators(right
);
3788 BuilderType
builder(ctx
->memory_pool());
3789 RETURN_NOT_OK(builder
.Reserve(lists
.length()));
3791 // Presize data to avoid multiple reallocations when joining strings
3792 int64_t total_data_length
= 0;
3793 for (int64_t i
= 0; i
< lists
.length(); ++i
) {
3794 if (separators
.IsNull(i
)) {
3797 const auto start
= list_offsets
[i
], end
= list_offsets
[i
+ 1];
3798 if (end
> start
&& !ValuesContainNull(strings
, start
, end
)) {
3799 total_data_length
+= string_offsets
[end
] - string_offsets
[start
];
3800 total_data_length
+= (end
- start
- 1) * separators
.value_length(i
);
3803 RETURN_NOT_OK(builder
.ReserveData(total_data_length
));
3805 struct SeparatorLookup
{
3806 const ArrayType
& separators
;
3808 bool IsNull(int64_t i
) { return separators
.IsNull(i
); }
3809 util::string_view
GetView(int64_t i
) { return separators
.GetView(i
); }
3811 return JoinStrings(lists
.length(), strings
, ListArrayOffsetLookup
{lists
},
3812 SeparatorArrayLookup
{separators
}, &builder
, out
);
3815 template <typename ListOffsetLookup
, typename SeparatorLookup
>
3816 static Status
JoinStrings(int64_t length
, const ArrayType
& strings
,
3817 ListOffsetLookup
&& list_offsets
, SeparatorLookup
&& separators
,
3818 BuilderType
* builder
, Datum
* out
) {
3819 for (int64_t i
= 0; i
< length
; ++i
) {
3820 if (list_offsets
.IsNull(i
) || separators
.IsNull(i
)) {
3821 builder
->UnsafeAppendNull();
3824 const auto j_start
= list_offsets
.GetStart(i
), j_end
= list_offsets
.GetStop(i
);
3825 if (j_start
== j_end
) {
3826 builder
->UnsafeAppendEmptyValue();
3829 if (ValuesContainNull(strings
, j_start
, j_end
)) {
3830 builder
->UnsafeAppendNull();
3833 builder
->UnsafeAppend(strings
.GetView(j_start
));
3834 for (int64_t j
= j_start
+ 1; j
< j_end
; ++j
) {
3835 builder
->UnsafeExtendCurrent(separators
.GetView(i
));
3836 builder
->UnsafeExtendCurrent(strings
.GetView(j
));
3840 std::shared_ptr
<Array
> string_array
;
3841 RETURN_NOT_OK(builder
->Finish(&string_array
));
3842 *out
= *string_array
->data();
3843 // Correct the output type based on the input
3844 out
->mutable_array()->type
= strings
.type();
3845 return Status::OK();
3848 static bool ValuesContainNull(const ArrayType
& values
, int64_t start
, int64_t end
) {
3849 if (values
.null_count() == 0) {
3852 for (int64_t i
= start
; i
< end
; ++i
) {
3853 if (values
.IsNull(i
)) {
3861 using BinaryJoinElementWiseState
= OptionsWrapper
<JoinOptions
>;
3863 template <typename Type
>
3864 struct BinaryJoinElementWise
{
3865 using ArrayType
= typename TypeTraits
<Type
>::ArrayType
;
3866 using BuilderType
= typename TypeTraits
<Type
>::BuilderType
;
3867 using offset_type
= typename
Type::offset_type
;
3869 static Status
Exec(KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
3870 JoinOptions options
= BinaryJoinElementWiseState::Get(ctx
);
3871 // Last argument is the separator (for consistency with binary_join)
3872 if (std::all_of(batch
.values
.begin(), batch
.values
.end(),
3873 [](const Datum
& d
) { return d
.is_scalar(); })) {
3874 return ExecOnlyScalar(ctx
, options
, batch
, out
);
3876 return ExecContainingArrays(ctx
, options
, batch
, out
);
3879 static Status
ExecOnlyScalar(KernelContext
* ctx
, const JoinOptions
& options
,
3880 const ExecBatch
& batch
, Datum
* out
) {
3881 BaseBinaryScalar
* output
= checked_cast
<BaseBinaryScalar
*>(out
->scalar().get());
3882 const size_t num_args
= batch
.values
.size();
3883 if (num_args
== 1) {
3884 // Only separator, no values
3885 output
->is_valid
= batch
.values
[0].scalar()->is_valid
;
3886 if (output
->is_valid
) {
3887 ARROW_ASSIGN_OR_RAISE(output
->value
, ctx
->Allocate(0));
3889 return Status::OK();
3892 int64_t final_size
= CalculateRowSize(options
, batch
, 0);
3893 if (final_size
< 0) {
3894 output
->is_valid
= false;
3895 return Status::OK();
3897 ARROW_ASSIGN_OR_RAISE(output
->value
, ctx
->Allocate(final_size
));
3898 const auto separator
= UnboxScalar
<Type
>::Unbox(*batch
.values
.back().scalar());
3899 uint8_t* buf
= output
->value
->mutable_data();
3901 for (size_t i
= 0; i
< num_args
- 1; i
++) {
3902 const Scalar
& scalar
= *batch
[i
].scalar();
3903 util::string_view s
;
3904 if (scalar
.is_valid
) {
3905 s
= UnboxScalar
<Type
>::Unbox(scalar
);
3907 switch (options
.null_handling
) {
3908 case JoinOptions::EMIT_NULL
:
3909 // Handled by CalculateRowSize
3910 DCHECK(false) << "unreachable";
3912 case JoinOptions::SKIP
:
3914 case JoinOptions::REPLACE
:
3915 s
= options
.null_replacement
;
3920 buf
= std::copy(separator
.begin(), separator
.end(), buf
);
3923 buf
= std::copy(s
.begin(), s
.end(), buf
);
3925 output
->is_valid
= true;
3926 DCHECK_EQ(final_size
, buf
- output
->value
->mutable_data());
3927 return Status::OK();
3930 static Status
ExecContainingArrays(KernelContext
* ctx
, const JoinOptions
& options
,
3931 const ExecBatch
& batch
, Datum
* out
) {
3932 // Presize data to avoid reallocations
3933 int64_t final_size
= 0;
3934 for (int64_t i
= 0; i
< batch
.length
; i
++) {
3935 auto size
= CalculateRowSize(options
, batch
, i
);
3936 if (size
> 0) final_size
+= size
;
3938 BuilderType
builder(ctx
->memory_pool());
3939 RETURN_NOT_OK(builder
.Reserve(batch
.length
));
3940 RETURN_NOT_OK(builder
.ReserveData(final_size
));
3942 std::vector
<util::string_view
> valid_cols(batch
.values
.size());
3943 for (size_t row
= 0; row
< static_cast<size_t>(batch
.length
); row
++) {
3944 size_t num_valid
= 0; // Not counting separator
3945 for (size_t col
= 0; col
< batch
.values
.size(); col
++) {
3946 if (batch
[col
].is_scalar()) {
3947 const auto& scalar
= *batch
[col
].scalar();
3948 if (scalar
.is_valid
) {
3949 valid_cols
[col
] = UnboxScalar
<Type
>::Unbox(scalar
);
3950 if (col
< batch
.values
.size() - 1) num_valid
++;
3952 valid_cols
[col
] = util::string_view();
3955 const ArrayData
& array
= *batch
[col
].array();
3956 if (!array
.MayHaveNulls() ||
3957 BitUtil::GetBit(array
.buffers
[0]->data(), array
.offset
+ row
)) {
3958 const offset_type
* offsets
= array
.GetValues
<offset_type
>(1);
3959 const uint8_t* data
= array
.GetValues
<uint8_t>(2, /*absolute_offset=*/0);
3960 const int64_t length
= offsets
[row
+ 1] - offsets
[row
];
3961 valid_cols
[col
] = util::string_view(
3962 reinterpret_cast<const char*>(data
+ offsets
[row
]), length
);
3963 if (col
< batch
.values
.size() - 1) num_valid
++;
3965 valid_cols
[col
] = util::string_view();
3970 if (!valid_cols
.back().data()) {
3971 // Separator is null
3972 builder
.UnsafeAppendNull();
3974 } else if (batch
.values
.size() == 1) {
3975 // Only given separator
3976 builder
.UnsafeAppendEmptyValue();
3978 } else if (num_valid
< batch
.values
.size() - 1) {
3979 // We had some nulls
3980 if (options
.null_handling
== JoinOptions::EMIT_NULL
) {
3981 builder
.UnsafeAppendNull();
3985 const auto separator
= valid_cols
.back();
3987 for (size_t col
= 0; col
< batch
.values
.size() - 1; col
++) {
3988 util::string_view value
= valid_cols
[col
];
3989 if (!value
.data()) {
3990 switch (options
.null_handling
) {
3991 case JoinOptions::EMIT_NULL
:
3992 DCHECK(false) << "unreachable";
3994 case JoinOptions::SKIP
:
3996 case JoinOptions::REPLACE
:
3997 value
= options
.null_replacement
;
4002 builder
.UnsafeAppend(value
);
4006 builder
.UnsafeExtendCurrent(separator
);
4007 builder
.UnsafeExtendCurrent(value
);
4011 std::shared_ptr
<Array
> string_array
;
4012 RETURN_NOT_OK(builder
.Finish(&string_array
));
4013 *out
= *string_array
->data();
4014 out
->mutable_array()->type
= batch
[0].type();
4015 DCHECK_EQ(batch
.length
, out
->array()->length
);
4016 DCHECK_EQ(final_size
,
4017 checked_cast
<const ArrayType
&>(*string_array
).total_values_length());
4018 return Status::OK();
4021 // Compute the length of the output for the given position, or -1 if it would be null.
4022 static int64_t CalculateRowSize(const JoinOptions
& options
, const ExecBatch
& batch
,
4023 const int64_t index
) {
4024 const auto num_args
= batch
.values
.size();
4025 int64_t final_size
= 0;
4026 int64_t num_non_null_args
= 0;
4027 for (size_t i
= 0; i
< num_args
; i
++) {
4028 int64_t element_size
= 0;
4030 if (batch
[i
].is_scalar()) {
4031 const Scalar
& scalar
= *batch
[i
].scalar();
4032 valid
= scalar
.is_valid
;
4033 element_size
= UnboxScalar
<Type
>::Unbox(scalar
).size();
4035 const ArrayData
& array
= *batch
[i
].array();
4036 valid
= !array
.MayHaveNulls() ||
4037 BitUtil::GetBit(array
.buffers
[0]->data(), array
.offset
+ index
);
4038 const offset_type
* offsets
= array
.GetValues
<offset_type
>(1);
4039 element_size
= offsets
[index
+ 1] - offsets
[index
];
4041 if (i
== num_args
- 1) {
4042 if (!valid
) return -1;
4043 if (num_non_null_args
> 1) {
4044 // Add separator size (only if there were values to join)
4045 final_size
+= (num_non_null_args
- 1) * element_size
;
4050 switch (options
.null_handling
) {
4051 case JoinOptions::EMIT_NULL
:
4053 case JoinOptions::SKIP
:
4055 case JoinOptions::REPLACE
:
4056 element_size
= options
.null_replacement
.size();
4060 num_non_null_args
++;
4061 final_size
+= element_size
;
4067 const FunctionDoc
binary_join_doc(
4068 "Join a list of strings together with a `separator` to form a single string",
4069 ("Insert `separator` between `list` elements, and concatenate them.\n"
4070 "Any null input and any null `list` element emits a null output.\n"),
4071 {"list", "separator"});
4073 const FunctionDoc
binary_join_element_wise_doc(
4074 "Join string arguments into one, using the last argument as the separator",
4075 ("Insert the last argument of `strings` between the rest of the elements, "
4076 "and concatenate them.\n"
4077 "Any null separator element emits a null output. Null elements either "
4078 "emit a null (the default), are skipped, or replaced with a given string.\n"),
4079 {"*strings"}, "JoinOptions");
4081 const auto kDefaultJoinOptions
= JoinOptions::Defaults();
4083 template <typename ListType
>
4084 void AddBinaryJoinForListType(ScalarFunction
* func
) {
4085 for (const std::shared_ptr
<DataType
>& ty
: BaseBinaryTypes()) {
4086 auto exec
= GenerateTypeAgnosticVarBinaryBase
<BinaryJoin
, ListType
>(*ty
);
4087 auto list_ty
= std::make_shared
<ListType
>(ty
);
4088 DCHECK_OK(func
->AddKernel({InputType(list_ty
), InputType(ty
)}, ty
, exec
));
4092 void AddBinaryJoin(FunctionRegistry
* registry
) {
4094 auto func
= std::make_shared
<ScalarFunction
>("binary_join", Arity::Binary(),
4096 AddBinaryJoinForListType
<ListType
>(func
.get());
4097 AddBinaryJoinForListType
<LargeListType
>(func
.get());
4098 DCHECK_OK(registry
->AddFunction(std::move(func
)));
4101 auto func
= std::make_shared
<ScalarFunction
>(
4102 "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
4103 &binary_join_element_wise_doc
, &kDefaultJoinOptions
);
4104 for (const auto& ty
: BaseBinaryTypes()) {
4105 ScalarKernel kernel
{KernelSignature::Make({InputType(ty
)}, ty
, /*is_varargs=*/true),
4106 GenerateTypeAgnosticVarBinaryBase
<BinaryJoinElementWise
>(ty
),
4107 BinaryJoinElementWiseState::Init
};
4108 kernel
.null_handling
= NullHandling::COMPUTED_NO_PREALLOCATE
;
4109 kernel
.mem_allocation
= MemAllocation::NO_PREALLOCATE
;
4110 DCHECK_OK(func
->AddKernel(std::move(kernel
)));
4112 DCHECK_OK(registry
->AddFunction(std::move(func
)));
4116 template <template <typename
> class ExecFunctor
>
4117 void MakeUnaryStringBatchKernel(
4118 std::string name
, FunctionRegistry
* registry
, const FunctionDoc
* doc
,
4119 MemAllocation::type mem_allocation
= MemAllocation::PREALLOCATE
) {
4120 auto func
= std::make_shared
<ScalarFunction
>(name
, Arity::Unary(), doc
);
4122 auto exec_32
= ExecFunctor
<StringType
>::Exec
;
4123 ScalarKernel kernel
{{utf8()}, utf8(), exec_32
};
4124 kernel
.mem_allocation
= mem_allocation
;
4125 DCHECK_OK(func
->AddKernel(std::move(kernel
)));
4128 auto exec_64
= ExecFunctor
<LargeStringType
>::Exec
;
4129 ScalarKernel kernel
{{large_utf8()}, large_utf8(), exec_64
};
4130 kernel
.mem_allocation
= mem_allocation
;
4131 DCHECK_OK(func
->AddKernel(std::move(kernel
)));
4133 DCHECK_OK(registry
->AddFunction(std::move(func
)));
4136 template <template <typename
> class ExecFunctor
>
4137 void MakeUnaryStringBatchKernelWithState(
4138 std::string name
, FunctionRegistry
* registry
, const FunctionDoc
* doc
,
4139 MemAllocation::type mem_allocation
= MemAllocation::PREALLOCATE
) {
4140 auto func
= std::make_shared
<ScalarFunction
>(name
, Arity::Unary(), doc
);
4142 using t32
= ExecFunctor
<StringType
>;
4143 ScalarKernel kernel
{{utf8()}, utf8(), t32::Exec
, t32::State::Init
};
4144 kernel
.mem_allocation
= mem_allocation
;
4145 DCHECK_OK(func
->AddKernel(std::move(kernel
)));
4148 using t64
= ExecFunctor
<LargeStringType
>;
4149 ScalarKernel kernel
{{large_utf8()}, large_utf8(), t64::Exec
, t64::State::Init
};
4150 kernel
.mem_allocation
= mem_allocation
;
4151 DCHECK_OK(func
->AddKernel(std::move(kernel
)));
4153 DCHECK_OK(registry
->AddFunction(std::move(func
)));
#ifdef ARROW_WITH_UTF8PROC

// Registers a unary string -> string function named `name`, using
// Transformer<StringType>::Exec for utf8 input and
// Transformer<LargeStringType>::Exec for large_utf8 input.
template <template <typename> class Transformer>
void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
                                        const FunctionDoc* doc) {
  // `name` is a by-value sink parameter: move it instead of copying it again.
  auto func = std::make_shared<ScalarFunction>(std::move(name), Arity::Unary(), doc);
  ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
  ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
  DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
  DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_64));
  DCHECK_OK(registry->AddFunction(std::move(func)));
}

#endif
4171 // NOTE: Predicate should only populate 'status' with errors,
4172 // leave it unmodified to indicate Status::OK()
4173 using StringPredicate
=
4174 std::function
<bool(KernelContext
*, const uint8_t*, size_t, Status
*)>;
4176 template <typename Type
>
4177 Status
ApplyPredicate(KernelContext
* ctx
, const ExecBatch
& batch
,
4178 StringPredicate predicate
, Datum
* out
) {
4179 Status st
= Status::OK();
4180 EnsureLookupTablesFilled();
4181 if (batch
[0].kind() == Datum::ARRAY
) {
4182 const ArrayData
& input
= *batch
[0].array();
4183 ArrayIterator
<Type
> input_it(input
);
4184 ArrayData
* out_arr
= out
->mutable_array();
4185 ::arrow::internal::GenerateBitsUnrolled(
4186 out_arr
->buffers
[1]->mutable_data(), out_arr
->offset
, input
.length
,
4188 util::string_view val
= input_it();
4189 return predicate(ctx
, reinterpret_cast<const uint8_t*>(val
.data()), val
.size(),
4193 const auto& input
= checked_cast
<const BaseBinaryScalar
&>(*batch
[0].scalar());
4194 if (input
.is_valid
) {
4195 bool boolean_result
= predicate(ctx
, input
.value
->data(),
4196 static_cast<size_t>(input
.value
->size()), &st
);
4197 // UTF decoding can lead to issues
4199 out
->value
= std::make_shared
<BooleanScalar
>(boolean_result
);
4206 template <typename Predicate
>
4207 void AddUnaryStringPredicate(std::string name
, FunctionRegistry
* registry
,
4208 const FunctionDoc
* doc
) {
4209 auto func
= std::make_shared
<ScalarFunction
>(name
, Arity::Unary(), doc
);
4210 auto exec_32
= [](KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
4211 return ApplyPredicate
<StringType
>(ctx
, batch
, Predicate::Call
, out
);
4213 auto exec_64
= [](KernelContext
* ctx
, const ExecBatch
& batch
, Datum
* out
) {
4214 return ApplyPredicate
<LargeStringType
>(ctx
, batch
, Predicate::Call
, out
);
4216 DCHECK_OK(func
->AddKernel({utf8()}, boolean(), std::move(exec_32
)));
4217 DCHECK_OK(func
->AddKernel({large_utf8()}, boolean(), std::move(exec_64
)));
4218 DCHECK_OK(registry
->AddFunction(std::move(func
)));
4221 FunctionDoc
StringPredicateDoc(std::string summary
, std::string description
) {
4222 return FunctionDoc
{std::move(summary
), std::move(description
), {"strings"}};
4225 FunctionDoc
StringClassifyDoc(std::string class_summary
, std::string class_desc
,
4227 std::string summary
, description
;
4229 std::stringstream ss
;
4230 ss
<< "Classify strings as " << class_summary
;
4234 std::stringstream ss
;
4237 << ("For each string in `strings`, emit true iff the string is non-empty\n"
4238 "and consists only of ");
4241 << ("For each string in `strings`, emit true iff the string consists only\n"
4244 ss
<< class_desc
<< ". Null strings emit null.";
4245 description
= ss
.str();
4247 return StringPredicateDoc(std::move(summary
), std::move(description
));
4250 const auto string_is_ascii_doc
= StringClassifyDoc("ASCII", "ASCII characters", false);
4252 const auto ascii_is_alnum_doc
=
4253 StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
4254 const auto ascii_is_alpha_doc
=
4255 StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
4256 const auto ascii_is_decimal_doc
=
4257 StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
4258 const auto ascii_is_lower_doc
=
4259 StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
4260 const auto ascii_is_printable_doc
=
4261 StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
4262 const auto ascii_is_space_doc
=
4263 StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
4264 const auto ascii_is_upper_doc
=
4265 StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
4267 const auto ascii_is_title_doc
= StringPredicateDoc(
4268 "Classify strings as ASCII titlecase",
4269 ("For each string in `strings`, emit true iff the string is title-cased,\n"
4270 "i.e. it has at least one cased character, each uppercase character\n"
4271 "follows an uncased character, and each lowercase character follows\n"
4272 "an uppercase character.\n"));
4274 const auto utf8_is_alnum_doc
=
4275 StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
4276 const auto utf8_is_alpha_doc
=
4277 StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
4278 const auto utf8_is_decimal_doc
=
4279 StringClassifyDoc("decimal", "decimal Unicode characters", true);
4280 const auto utf8_is_digit_doc
= StringClassifyDoc("digits", "Unicode digits", true);
4281 const auto utf8_is_lower_doc
=
4282 StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
4283 const auto utf8_is_numeric_doc
=
4284 StringClassifyDoc("numeric", "numeric Unicode characters", true);
4285 const auto utf8_is_printable_doc
=
4286 StringClassifyDoc("printable", "printable Unicode characters", true);
4287 const auto utf8_is_space_doc
=
4288 StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
4289 const auto utf8_is_upper_doc
=
4290 StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
4292 const auto utf8_is_title_doc
= StringPredicateDoc(
4293 "Classify strings as titlecase",
4294 ("For each string in `strings`, emit true iff the string is title-cased,\n"
4295 "i.e. it has at least one cased character, each uppercase character\n"
4296 "follows an uncased character, and each lowercase character follows\n"
4297 "an uppercase character.\n"));
4299 const FunctionDoc
ascii_upper_doc(
4300 "Transform ASCII input to uppercase",
4301 ("For each string in `strings`, return an uppercase version.\n\n"
4302 "This function assumes the input is fully ASCII. It it may contain\n"
4303 "non-ASCII characters, use \"utf8_upper\" instead."),
4306 const FunctionDoc
ascii_lower_doc(
4307 "Transform ASCII input to lowercase",
4308 ("For each string in `strings`, return a lowercase version.\n\n"
4309 "This function assumes the input is fully ASCII. If it may contain\n"
4310 "non-ASCII characters, use \"utf8_lower\" instead."),
4313 const FunctionDoc
ascii_swapcase_doc(
4314 "Transform ASCII input lowercase characters to uppercase and uppercase characters to "
4316 ("For each string in `strings`, return a string with opposite casing.\n\n"
4317 "This function assumes the input is fully ASCII. If it may contain\n"
4318 "non-ASCII characters, use \"utf8_swapcase\" instead."),
4321 const FunctionDoc
ascii_capitalize_doc(
4322 "Capitalize the first character of ASCII input",
4323 ("For each string in `strings`, return a capitalized version.\n\n"
4324 "This function assumes the input is fully ASCII. If it may contain\n"
4325 "non-ASCII characters, use \"utf8_capitalize\" instead."),
4328 const FunctionDoc
ascii_title_doc(
4329 "Titlecase each word of ASCII input",
4330 ("For each string in `strings`, return a titlecased version.\n"
4331 "Each word in the output will start with an uppercase character and its\n"
4332 "remaining characters will be lowercase.\n\n"
4333 "This function assumes the input is fully ASCII. If it may contain\n"
4334 "non-ASCII characters, use \"utf8_title\" instead."),
4337 const FunctionDoc
ascii_reverse_doc(
4338 "Reverse ASCII input",
4339 ("For each ASCII string in `strings`, return a reversed version.\n\n"
4340 "This function assumes the input is fully ASCII. If it may contain\n"
4341 "non-ASCII characters, use \"utf8_reverse\" instead."),
4344 const FunctionDoc
utf8_upper_doc(
4345 "Transform input to uppercase",
4346 ("For each string in `strings`, return an uppercase version."), {"strings"});
4348 const FunctionDoc
utf8_lower_doc(
4349 "Transform input to lowercase",
4350 ("For each string in `strings`, return a lowercase version."), {"strings"});
4352 const FunctionDoc
utf8_swapcase_doc(
4353 "Transform input lowercase characters to uppercase and uppercase characters to "
4355 ("For each string in `strings`, return an opposite case version."), {"strings"});
4357 const FunctionDoc
utf8_capitalize_doc(
4358 "Capitalize the first character of input",
4359 ("For each string in `strings`, return a capitalized version,\n"
4360 "with the first character uppercased and the others lowercased."),
4363 const FunctionDoc
utf8_title_doc(
4364 "Titlecase each word of input",
4365 ("For each string in `strings`, return a titlecased version.\n"
4366 "Each word in the output will start with an uppercase character and its\n"
4367 "remaining characters will be lowercase."),
4370 const FunctionDoc
utf8_reverse_doc(
4372 ("For each string in `strings`, return a reversed version.\n\n"
4373 "This function operates on Unicode codepoints, not grapheme\n"
4374 "clusters. Hence, it will not correctly reverse grapheme clusters\n"
4375 "composed of multiple codepoints."),
4380 void RegisterScalarStringAscii(FunctionRegistry
* registry
) {
4381 // Some kernels are able to reuse the original offsets buffer, so don't
4382 // preallocate them in the output. Only kernels that invoke
4383 // "StringDataTransform" support no preallocation.
4384 MakeUnaryStringBatchKernel
<AsciiUpper
>("ascii_upper", registry
, &ascii_upper_doc
,
4385 MemAllocation::NO_PREALLOCATE
);
4386 MakeUnaryStringBatchKernel
<AsciiLower
>("ascii_lower", registry
, &ascii_lower_doc
,
4387 MemAllocation::NO_PREALLOCATE
);
4388 MakeUnaryStringBatchKernel
<AsciiSwapCase
>(
4389 "ascii_swapcase", registry
, &ascii_swapcase_doc
, MemAllocation::NO_PREALLOCATE
);
4390 MakeUnaryStringBatchKernel
<AsciiCapitalize
>("ascii_capitalize", registry
,
4391 &ascii_capitalize_doc
);
4392 MakeUnaryStringBatchKernel
<AsciiTitle
>("ascii_title", registry
, &ascii_title_doc
);
4393 MakeUnaryStringBatchKernel
<AsciiTrimWhitespace
>("ascii_trim_whitespace", registry
,
4394 &ascii_trim_whitespace_doc
);
4395 MakeUnaryStringBatchKernel
<AsciiLTrimWhitespace
>("ascii_ltrim_whitespace", registry
,
4396 &ascii_ltrim_whitespace_doc
);
4397 MakeUnaryStringBatchKernel
<AsciiRTrimWhitespace
>("ascii_rtrim_whitespace", registry
,
4398 &ascii_rtrim_whitespace_doc
);
4399 MakeUnaryStringBatchKernel
<AsciiReverse
>("ascii_reverse", registry
, &ascii_reverse_doc
);
4400 MakeUnaryStringBatchKernel
<Utf8Reverse
>("utf8_reverse", registry
, &utf8_reverse_doc
);
4402 MakeUnaryStringBatchKernelWithState
<AsciiCenter
>("ascii_center", registry
,
4404 MakeUnaryStringBatchKernelWithState
<AsciiLPad
>("ascii_lpad", registry
, &ascii_lpad_doc
);
4405 MakeUnaryStringBatchKernelWithState
<AsciiRPad
>("ascii_rpad", registry
, &ascii_rpad_doc
);
4406 MakeUnaryStringBatchKernelWithState
<Utf8Center
>("utf8_center", registry
,
4408 MakeUnaryStringBatchKernelWithState
<Utf8LPad
>("utf8_lpad", registry
, &utf8_lpad_doc
);
4409 MakeUnaryStringBatchKernelWithState
<Utf8RPad
>("utf8_rpad", registry
, &utf8_rpad_doc
);
4411 MakeUnaryStringBatchKernelWithState
<AsciiTrim
>("ascii_trim", registry
, &ascii_trim_doc
);
4412 MakeUnaryStringBatchKernelWithState
<AsciiLTrim
>("ascii_ltrim", registry
,
4414 MakeUnaryStringBatchKernelWithState
<AsciiRTrim
>("ascii_rtrim", registry
,
4417 AddUnaryStringPredicate
<IsAscii
>("string_is_ascii", registry
, &string_is_ascii_doc
);
4419 AddUnaryStringPredicate
<IsAlphaNumericAscii
>("ascii_is_alnum", registry
,
4420 &ascii_is_alnum_doc
);
4421 AddUnaryStringPredicate
<IsAlphaAscii
>("ascii_is_alpha", registry
, &ascii_is_alpha_doc
);
4422 AddUnaryStringPredicate
<IsDecimalAscii
>("ascii_is_decimal", registry
,
4423 &ascii_is_decimal_doc
);
4424 // no is_digit for ascii, since it is the same as is_decimal
4425 AddUnaryStringPredicate
<IsLowerAscii
>("ascii_is_lower", registry
, &ascii_is_lower_doc
);
4426 // no is_numeric for ascii, since it is the same as is_decimal
4427 AddUnaryStringPredicate
<IsPrintableAscii
>("ascii_is_printable", registry
,
4428 &ascii_is_printable_doc
);
4429 AddUnaryStringPredicate
<IsSpaceAscii
>("ascii_is_space", registry
, &ascii_is_space_doc
);
4430 AddUnaryStringPredicate
<IsTitleAscii
>("ascii_is_title", registry
, &ascii_is_title_doc
);
4431 AddUnaryStringPredicate
<IsUpperAscii
>("ascii_is_upper", registry
, &ascii_is_upper_doc
);
4433 #ifdef ARROW_WITH_UTF8PROC
4434 MakeUnaryStringUTF8TransformKernel
<UTF8Upper
>("utf8_upper", registry
, &utf8_upper_doc
);
4435 MakeUnaryStringUTF8TransformKernel
<UTF8Lower
>("utf8_lower", registry
, &utf8_lower_doc
);
4436 MakeUnaryStringUTF8TransformKernel
<UTF8SwapCase
>("utf8_swapcase", registry
,
4437 &utf8_swapcase_doc
);
4438 MakeUnaryStringBatchKernel
<Utf8Capitalize
>("utf8_capitalize", registry
,
4439 &utf8_capitalize_doc
);
4440 MakeUnaryStringBatchKernel
<Utf8Title
>("utf8_title", registry
, &utf8_title_doc
);
4441 MakeUnaryStringBatchKernel
<UTF8TrimWhitespace
>("utf8_trim_whitespace", registry
,
4442 &utf8_trim_whitespace_doc
);
4443 MakeUnaryStringBatchKernel
<UTF8LTrimWhitespace
>("utf8_ltrim_whitespace", registry
,
4444 &utf8_ltrim_whitespace_doc
);
4445 MakeUnaryStringBatchKernel
<UTF8RTrimWhitespace
>("utf8_rtrim_whitespace", registry
,
4446 &utf8_rtrim_whitespace_doc
);
4447 MakeUnaryStringBatchKernelWithState
<UTF8Trim
>("utf8_trim", registry
, &utf8_trim_doc
);
4448 MakeUnaryStringBatchKernelWithState
<UTF8LTrim
>("utf8_ltrim", registry
, &utf8_ltrim_doc
);
4449 MakeUnaryStringBatchKernelWithState
<UTF8RTrim
>("utf8_rtrim", registry
, &utf8_rtrim_doc
);
4451 AddUnaryStringPredicate
<IsAlphaNumericUnicode
>("utf8_is_alnum", registry
,
4452 &utf8_is_alnum_doc
);
4453 AddUnaryStringPredicate
<IsAlphaUnicode
>("utf8_is_alpha", registry
, &utf8_is_alpha_doc
);
4454 AddUnaryStringPredicate
<IsDecimalUnicode
>("utf8_is_decimal", registry
,
4455 &utf8_is_decimal_doc
);
4456 AddUnaryStringPredicate
<IsDigitUnicode
>("utf8_is_digit", registry
, &utf8_is_digit_doc
);
4457 AddUnaryStringPredicate
<IsLowerUnicode
>("utf8_is_lower", registry
, &utf8_is_lower_doc
);
4458 AddUnaryStringPredicate
<IsNumericUnicode
>("utf8_is_numeric", registry
,
4459 &utf8_is_numeric_doc
);
4460 AddUnaryStringPredicate
<IsPrintableUnicode
>("utf8_is_printable", registry
,
4461 &utf8_is_printable_doc
);
4462 AddUnaryStringPredicate
<IsSpaceUnicode
>("utf8_is_space", registry
, &utf8_is_space_doc
);
4463 AddUnaryStringPredicate
<IsTitleUnicode
>("utf8_is_title", registry
, &utf8_is_title_doc
);
4464 AddUnaryStringPredicate
<IsUpperUnicode
>("utf8_is_upper", registry
, &utf8_is_upper_doc
);
4467 AddBinaryLength(registry
);
4468 AddUtf8Length(registry
);
4469 AddMatchSubstring(registry
);
4470 AddFindSubstring(registry
);
4471 AddCountSubstring(registry
);
4472 MakeUnaryStringBatchKernelWithState
<ReplaceSubStringPlain
>(
4473 "replace_substring", registry
, &replace_substring_doc
,
4474 MemAllocation::NO_PREALLOCATE
);
4475 #ifdef ARROW_WITH_RE2
4476 MakeUnaryStringBatchKernelWithState
<ReplaceSubStringRegex
>(
4477 "replace_substring_regex", registry
, &replace_substring_regex_doc
,
4478 MemAllocation::NO_PREALLOCATE
);
4479 AddExtractRegex(registry
);
4481 AddReplaceSlice(registry
);
4484 AddStrptime(registry
);
4485 AddBinaryJoin(registry
);
4488 } // namespace internal
4489 } // namespace compute
4490 } // namespace arrow