ceph/src/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc

   1 // Licensed to the Apache Software Foundation (ASF) under one
   2 // or more contributor license agreements.  See the NOTICE file
   3 // distributed with this work for additional information
   4 // regarding copyright ownership.  The ASF licenses this file
   5 // to you under the Apache License, Version 2.0 (the
   6 // "License"); you may not use this file except in compliance
   7 // with the License.  You may obtain a copy of the License at
   8 //
   9 //   http://www.apache.org/licenses/LICENSE-2.0
  10 //
  11 // Unless required by applicable law or agreed to in writing,
  12 // software distributed under the License is distributed on an
  13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14 // KIND, either express or implied.  See the License for the
  15 // specific language governing permissions and limitations
  16 // under the License.
  17
  18 #include <algorithm>
  19 #include <cctype>
  20 #include <iterator>
  21 #include <string>
  22
  23 #ifdef ARROW_WITH_UTF8PROC
  24 #include <utf8proc.h>
  25 #endif
  26
  27 #ifdef ARROW_WITH_RE2
  28 #include <re2/re2.h>
  29 #endif
  30
  31 #include "arrow/array/builder_binary.h"
  32 #include "arrow/array/builder_nested.h"
  33 #include "arrow/buffer_builder.h"
  34
  35 #include "arrow/builder.h"
  36 #include "arrow/compute/api_scalar.h"
  37 #include "arrow/compute/kernels/common.h"
  38 #include "arrow/util/checked_cast.h"
  39 #include "arrow/util/utf8.h"
  40 #include "arrow/util/value_parsing.h"
  41 #include "arrow/visitor_inline.h"
  42
  43 namespace arrow {
  44
  45 using internal::checked_cast;
  46
  47 namespace compute {
  48 namespace internal {
  49
  50 namespace {
  51
  52 #ifdef ARROW_WITH_RE2
  53 util::string_view ToStringView(re2::StringPiece piece) {
  54   return {piece.data(), piece.length()};
  55 }
  56
  57 re2::StringPiece ToStringPiece(util::string_view view) {
  58   return {view.data(), view.length()};
  59 }
  60
  61 Status RegexStatus(const RE2& regex) {
  62   if (!regex.ok()) {
  63     return Status::Invalid("Invalid regular expression: ", regex.error());
  64   }
  65   return Status::OK();
  66 }
  67 #endif
  68
  69 // Code units in the range [a-z] can only be an encoding of an ASCII
  70 // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different
  71 // codepoint. This is guaranteed by the non-overlap design of the Unicode
  72 // standard. (see section 2.5 of Unicode Standard Core Specification v13.0)
  73
  74 // IsAlpha/Digit etc
  75
  76 static inline bool IsAsciiCharacter(uint8_t character) { return character < 128; }
  77
  78 static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
  79   return (ascii_character >= 'a') && (ascii_character <= 'z');
  80 }
  81
  82 static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
  83   return (ascii_character >= 'A') && (ascii_character <= 'Z');
  84 }
  85
  86 static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
  87   // Note: Non-ASCII characters are seen as uncased.
  88   return IsLowerCaseCharacterAscii(ascii_character) ||
  89          IsUpperCaseCharacterAscii(ascii_character);
  90 }
  91
  92 static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
  93   return IsCasedCharacterAscii(ascii_character);
  94 }
  95
  96 static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) {
  97   return ((ascii_character >= '0') && (ascii_character <= '9')) ||
  98          ((ascii_character >= 'a') && (ascii_character <= 'z')) ||
  99          ((ascii_character >= 'A') && (ascii_character <= 'Z'));
 100 }
 101
 102 static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
 103   return ((ascii_character >= '0') && (ascii_character <= '9'));
 104 }
 105
 106 static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
 107   return ((ascii_character >= 9) && (ascii_character <= 13)) || (ascii_character == ' ');
 108 }
 109
 110 static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
 111   return ((ascii_character >= ' ') && (ascii_character <= '~'));
 112 }
 113
 114 struct BinaryLength {
 115   template <typename OutValue, typename Arg0Value = util::string_view>
 116   static OutValue Call(KernelContext*, Arg0Value val, Status*) {
 117     return static_cast<OutValue>(val.size());
 118   }
 119
 120   static Status FixedSizeExec(KernelContext*, const ExecBatch& batch, Datum* out) {
 121     // Output is preallocated and validity buffer is precomputed
 122     const int32_t width =
 123         checked_cast<const FixedSizeBinaryType&>(*batch[0].type()).byte_width();
 124     if (batch.values[0].is_array()) {
 125       int32_t* buffer = out->mutable_array()->GetMutableValues<int32_t>(1);
 126       std::fill(buffer, buffer + batch.length, width);
 127     } else {
 128       checked_cast<Int32Scalar*>(out->scalar().get())->value = width;
 129     }
 130     return Status::OK();
 131   }
 132 };
 133
 134 struct Utf8Length {
 135   template <typename OutValue, typename Arg0Value = util::string_view>
 136   static OutValue Call(KernelContext*, Arg0Value val, Status*) {
 137     auto str = reinterpret_cast<const uint8_t*>(val.data());
 138     auto strlen = val.size();
 139     return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
 140   }
 141 };
 142
 143 static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
 144   return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32)
 145                                                               : utf8_code_unit;
 146 }
 147
 148 static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
 149   return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32)
 150                                                               : utf8_code_unit;
 151 }
 152
 153 static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) {
 154   if (IsLowerCaseCharacterAscii(utf8_code_unit)) {
 155     utf8_code_unit -= 32;
 156   } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) {
 157     utf8_code_unit += 32;
 158   }
 159   return utf8_code_unit;
 160 }
 161
 162 #ifdef ARROW_WITH_UTF8PROC
 163
 164 // Direct lookup tables for unicode properties
 165 constexpr uint32_t kMaxCodepointLookup =
 166     0xffff;  // up to this codepoint is in a lookup table
 167 std::vector<uint32_t> lut_upper_codepoint;
 168 std::vector<uint32_t> lut_lower_codepoint;
 169 std::vector<uint32_t> lut_swapcase_codepoint;
 170 std::vector<utf8proc_category_t> lut_category;
 171 std::once_flag flag_case_luts;
 172
 173 // IsAlpha/Digit etc
 174
 175 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
 176   utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup
 177                                              ? lut_category[codepoint]
 178                                              : utf8proc_category(codepoint);
 179   uint32_t general_category_bit = 1 << general_category;
 180   // for e.g. undefined (but valid) codepoints, general_category == 0 ==
 181   // UTF8PROC_CATEGORY_CN
 182   return (general_category != UTF8PROC_CATEGORY_CN) &&
 183          ((general_category_bit & mask) != 0);
 184 }
 185
 186 template <typename... Categories>
 187 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask,
 188                                                 utf8proc_category_t category,
 189                                                 Categories... categories) {
 190   return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...);
 191 }
 192
 193 template <typename... Categories>
 194 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint,
 195                                                 utf8proc_category_t category,
 196                                                 Categories... categories) {
 197   return HasAnyUnicodeGeneralCategory(codepoint, static_cast<uint32_t>(1u << category),
 198                                       categories...);
 199 }
 200
 201 static inline bool IsCasedCharacterUnicode(uint32_t codepoint) {
 202   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
 203                                       UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) ||
 204          ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) ||
 205           (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint));
 206 }
 207
 208 static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) {
 209   // although this trick seems to work for upper case, this is not enough for lower case
 210   // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the
 211   // best we can do
 212   return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) ||
 213           ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) &&
 214            (static_cast<uint32_t>(utf8proc_tolower(codepoint)) == codepoint))) &&
 215          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
 216 }
 217
 218 static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) {
 219   // this seems to be a good workaround for utf8proc not having case information
 220   // https://github.com/JuliaStrings/utf8proc/issues/195
 221   return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) ||
 222           ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) == codepoint) &&
 223            (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint))) &&
 224          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
 225 }
 226
 227 static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) {
 228   return HasAnyUnicodeGeneralCategory(
 229       codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
 230       UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND,
 231       UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
 232 }
 233
 234 static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
 235   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
 236                                       UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
 237                                       UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO);
 238 }
 239
 240 static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
 241   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
 242 }
 243
 244 static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
 245   // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
 246   // utf8proc has no support for this, this is the best we can do:
 247   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
 248 }
 249
 250 static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {
 251   // Formally this is not correct, but utf8proc does not allow us to query for Numerical
 252   // properties, e.g. Numeric_Value and Numeric_Type
 253   // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or
 254   // Numeric_Type=Numeric.
 255   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND,
 256                                       UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
 257 }
 258
 259 static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) {
 260   auto property = utf8proc_get_property(codepoint);
 261   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) ||
 262          property->bidi_class == UTF8PROC_BIDI_CLASS_WS ||
 263          property->bidi_class == UTF8PROC_BIDI_CLASS_B ||
 264          property->bidi_class == UTF8PROC_BIDI_CLASS_S;
 265 }
 266
 267 static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
 268   uint32_t general_category = utf8proc_category(codepoint);
 269   return (general_category != UTF8PROC_CATEGORY_CN) &&
 270          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC,
 271                                        UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS,
 272                                        UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS,
 273                                        UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP);
 274 }
 275
 276 void EnsureLookupTablesFilled() {
 277   std::call_once(flag_case_luts, []() {
 278     lut_upper_codepoint.reserve(kMaxCodepointLookup + 1);
 279     lut_lower_codepoint.reserve(kMaxCodepointLookup + 1);
 280     lut_swapcase_codepoint.reserve(kMaxCodepointLookup + 1);
 281     for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) {
 282       lut_upper_codepoint.push_back(utf8proc_toupper(i));
 283       lut_lower_codepoint.push_back(utf8proc_tolower(i));
 284       lut_category.push_back(utf8proc_category(i));
 285
 286       if (IsLowerCaseCharacterUnicode(i)) {
 287         lut_swapcase_codepoint.push_back(utf8proc_toupper(i));
 288       } else if (IsUpperCaseCharacterUnicode(i)) {
 289         lut_swapcase_codepoint.push_back(utf8proc_tolower(i));
 290       } else {
 291         lut_swapcase_codepoint.push_back(i);
 292       }
 293     }
 294   });
 295 }
 296
 297 #else
 298
 299 void EnsureLookupTablesFilled() {}
 300
 301 #endif  // ARROW_WITH_UTF8PROC
 302
 303 constexpr int64_t kTransformError = -1;
 304
 305 struct StringTransformBase {
 306   virtual ~StringTransformBase() = default;
 307   virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
 308     return Status::OK();
 309   }
 310
 311   // Return the maximum total size of the output in codeunits (i.e. bytes)
 312   // given input characteristics.
 313   virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
 314     return input_ncodeunits;
 315   }
 316
 317   virtual Status InvalidStatus() {
 318     return Status::Invalid("Invalid UTF8 sequence in input");
 319   }
 320
 321   // Derived classes should also define this method:
 322   //   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 323   //                     uint8_t* output);
 324 };
 325
 326 template <typename Type, typename StringTransform>
 327 struct StringTransformExecBase {
 328   using offset_type = typename Type::offset_type;
 329   using ArrayType = typename TypeTraits<Type>::ArrayType;
 330
 331   static Status Execute(KernelContext* ctx, StringTransform* transform,
 332                         const ExecBatch& batch, Datum* out) {
 333     if (batch[0].kind() == Datum::ARRAY) {
 334       return ExecArray(ctx, transform, batch[0].array(), out);
 335     }
 336     DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
 337     return ExecScalar(ctx, transform, batch[0].scalar(), out);
 338   }
 339
 340   static Status ExecArray(KernelContext* ctx, StringTransform* transform,
 341                           const std::shared_ptr<ArrayData>& data, Datum* out) {
 342     ArrayType input(data);
 343     ArrayData* output = out->mutable_array();
 344
 345     const int64_t input_ncodeunits = input.total_values_length();
 346     const int64_t input_nstrings = input.length();
 347
 348     const int64_t output_ncodeunits_max =
 349         transform->MaxCodeunits(input_nstrings, input_ncodeunits);
 350     if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
 351       return Status::CapacityError(
 352           "Result might not fit in a 32bit utf8 array, convert to large_utf8");
 353     }
 354
 355     ARROW_ASSIGN_OR_RAISE(auto values_buffer, ctx->Allocate(output_ncodeunits_max));
 356     output->buffers[2] = values_buffer;
 357
 358     // String offsets are preallocated
 359     offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
 360     uint8_t* output_str = output->buffers[2]->mutable_data();
 361     offset_type output_ncodeunits = 0;
 362
 363     output_string_offsets[0] = 0;
 364     for (int64_t i = 0; i < input_nstrings; i++) {
 365       if (!input.IsNull(i)) {
 366         offset_type input_string_ncodeunits;
 367         const uint8_t* input_string = input.GetValue(i, &input_string_ncodeunits);
 368         auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
 369             input_string, input_string_ncodeunits, output_str + output_ncodeunits));
 370         if (encoded_nbytes < 0) {
 371           return transform->InvalidStatus();
 372         }
 373         output_ncodeunits += encoded_nbytes;
 374       }
 375       output_string_offsets[i + 1] = output_ncodeunits;
 376     }
 377     DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
 378
 379     // Trim the codepoint buffer, since we allocated too much
 380     return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
 381   }
 382
 383   static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
 384                            const std::shared_ptr<Scalar>& scalar, Datum* out) {
 385     const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
 386     if (!input.is_valid) {
 387       return Status::OK();
 388     }
 389     auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
 390     result->is_valid = true;
 391     const int64_t data_nbytes = static_cast<int64_t>(input.value->size());
 392
 393     const int64_t output_ncodeunits_max = transform->MaxCodeunits(1, data_nbytes);
 394     if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
 395       return Status::CapacityError(
 396           "Result might not fit in a 32bit utf8 array, convert to large_utf8");
 397     }
 398     ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max));
 399     result->value = value_buffer;
 400     auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
 401         input.value->data(), data_nbytes, value_buffer->mutable_data()));
 402     if (encoded_nbytes < 0) {
 403       return transform->InvalidStatus();
 404     }
 405     DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
 406     return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
 407   }
 408 };
 409
 410 template <typename Type, typename StringTransform>
 411 struct StringTransformExec : public StringTransformExecBase<Type, StringTransform> {
 412   using StringTransformExecBase<Type, StringTransform>::Execute;
 413
 414   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
 415     StringTransform transform;
 416     RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
 417     return Execute(ctx, &transform, batch, out);
 418   }
 419 };
 420
 421 template <typename Type, typename StringTransform>
 422 struct StringTransformExecWithState
 423     : public StringTransformExecBase<Type, StringTransform> {
 424   using State = typename StringTransform::State;
 425   using StringTransformExecBase<Type, StringTransform>::Execute;
 426
 427   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
 428     StringTransform transform(State::Get(ctx));
 429     RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
 430     return Execute(ctx, &transform, batch, out);
 431   }
 432 };
 433
 434 template <typename StringTransform>
 435 struct FixedSizeBinaryTransformExecBase {
 436   static Status Execute(KernelContext* ctx, StringTransform* transform,
 437                         const ExecBatch& batch, Datum* out) {
 438     if (batch[0].kind() == Datum::ARRAY) {
 439       return ExecArray(ctx, transform, batch[0].array(), out);
 440     }
 441     DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
 442     return ExecScalar(ctx, transform, batch[0].scalar(), out);
 443   }
 444
 445   static Status ExecArray(KernelContext* ctx, StringTransform* transform,
 446                           const std::shared_ptr<ArrayData>& data, Datum* out) {
 447     FixedSizeBinaryArray input(data);
 448     ArrayData* output = out->mutable_array();
 449
 450     const int32_t input_width =
 451         checked_cast<const FixedSizeBinaryType&>(*data->type).byte_width();
 452     const int32_t output_width =
 453         checked_cast<const FixedSizeBinaryType&>(*out->type()).byte_width();
 454     const int64_t input_nstrings = input.length();
 455     ARROW_ASSIGN_OR_RAISE(auto values_buffer,
 456                           ctx->Allocate(output_width * input_nstrings));
 457     uint8_t* output_str = values_buffer->mutable_data();
 458
 459     for (int64_t i = 0; i < input_nstrings; i++) {
 460       if (!input.IsNull(i)) {
 461         const uint8_t* input_string = input.GetValue(i);
 462         auto encoded_nbytes = static_cast<int32_t>(
 463             transform->Transform(input_string, input_width, output_str));
 464         if (encoded_nbytes != output_width) {
 465           return transform->InvalidStatus();
 466         }
 467       } else {
 468         std::memset(output_str, 0x00, output_width);
 469       }
 470       output_str += output_width;
 471     }
 472
 473     output->buffers[1] = std::move(values_buffer);
 474     return Status::OK();
 475   }
 476
 477   static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
 478                            const std::shared_ptr<Scalar>& scalar, Datum* out) {
 479     const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
 480     if (!input.is_valid) {
 481       return Status::OK();
 482     }
 483     const int32_t out_width =
 484         checked_cast<const FixedSizeBinaryType&>(*out->type()).byte_width();
 485     auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
 486
 487     const int32_t data_nbytes = static_cast<int32_t>(input.value->size());
 488     ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(out_width));
 489     auto encoded_nbytes = static_cast<int32_t>(transform->Transform(
 490         input.value->data(), data_nbytes, value_buffer->mutable_data()));
 491     if (encoded_nbytes != out_width) {
 492       return transform->InvalidStatus();
 493     }
 494
 495     result->is_valid = true;
 496     result->value = std::move(value_buffer);
 497     return Status::OK();
 498   }
 499 };
 500
 501 template <typename StringTransform>
 502 struct FixedSizeBinaryTransformExecWithState
 503     : public FixedSizeBinaryTransformExecBase<StringTransform> {
 504   using State = typename StringTransform::State;
 505   using FixedSizeBinaryTransformExecBase<StringTransform>::Execute;
 506
 507   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
 508     StringTransform transform(State::Get(ctx));
 509     RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
 510     return Execute(ctx, &transform, batch, out);
 511   }
 512
 513   static Result<ValueDescr> OutputType(KernelContext* ctx,
 514                                        const std::vector<ValueDescr>& descrs) {
 515     DCHECK_EQ(1, descrs.size());
 516     const auto& options = State::Get(ctx);
 517     const int32_t input_width =
 518         checked_cast<const FixedSizeBinaryType&>(*descrs[0].type).byte_width();
 519     const int32_t output_width = StringTransform::FixedOutputSize(options, input_width);
 520     return ValueDescr(fixed_size_binary(output_width), descrs[0].shape);
 521   }
 522 };
 523
 524 #ifdef ARROW_WITH_UTF8PROC
 525
 526 struct FunctionalCaseMappingTransform : public StringTransformBase {
 527   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
 528     EnsureLookupTablesFilled();
 529     return Status::OK();
 530   }
 531
 532   int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
 533     // Section 5.18 of the Unicode spec claims that the number of codepoints for case
 534     // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
 535     // However, since we don't support all casings (SpecialCasing.txt) the growth
 536     // in bytes is actually only at max 3/2 (as covered by the unittest).
 537     // Note that rounding down the 3/2 is ok, since only codepoints encoded by
 538     // two code units (even) can grow to 3 code units.
 539     return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
 540   }
 541 };
 542
 543 template <typename CodepointTransform>
 544 struct StringTransformCodepoint : public FunctionalCaseMappingTransform {
 545   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 546                     uint8_t* output) {
 547     uint8_t* output_start = output;
 548     if (ARROW_PREDICT_FALSE(
 549             !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
 550                                         CodepointTransform::TransformCodepoint))) {
 551       return kTransformError;
 552     }
 553     return output - output_start;
 554   }
 555 };
 556
 557 struct UTF8UpperTransform : public FunctionalCaseMappingTransform {
 558   static uint32_t TransformCodepoint(uint32_t codepoint) {
 559     return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
 560                                             : utf8proc_toupper(codepoint);
 561   }
 562 };
 563
 564 template <typename Type>
 565 using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
 566
 567 struct UTF8LowerTransform : public FunctionalCaseMappingTransform {
 568   static uint32_t TransformCodepoint(uint32_t codepoint) {
 569     return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
 570                                             : utf8proc_tolower(codepoint);
 571   }
 572 };
 573
 574 template <typename Type>
 575 using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
 576
 577 struct UTF8SwapCaseTransform : public FunctionalCaseMappingTransform {
 578   static uint32_t TransformCodepoint(uint32_t codepoint) {
 579     if (codepoint <= kMaxCodepointLookup) {
 580       return lut_swapcase_codepoint[codepoint];
 581     } else {
 582       if (IsLowerCaseCharacterUnicode(codepoint)) {
 583         return utf8proc_toupper(codepoint);
 584       } else if (IsUpperCaseCharacterUnicode(codepoint)) {
 585         return utf8proc_tolower(codepoint);
 586       }
 587     }
 588
 589     return codepoint;
 590   }
 591 };
 592
 593 template <typename Type>
 594 using UTF8SwapCase =
 595     StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;
 596
 597 struct Utf8CapitalizeTransform : public FunctionalCaseMappingTransform {
 598   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 599                     uint8_t* output) {
 600     uint8_t* output_start = output;
 601     const uint8_t* end = input + input_string_ncodeunits;
 602     const uint8_t* next = input;
 603     if (input_string_ncodeunits > 0) {
 604       if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) {
 605         return kTransformError;
 606       }
 607       if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
 608               input, next, &output, UTF8UpperTransform::TransformCodepoint))) {
 609         return kTransformError;
 610       }
 611       if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
 612               next, end, &output, UTF8LowerTransform::TransformCodepoint))) {
 613         return kTransformError;
 614       }
 615     }
 616     return output - output_start;
 617   }
 618 };
 619
 620 template <typename Type>
 621 using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 622
 623 struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
 624   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 625                     uint8_t* output) {
 626     uint8_t* output_start = output;
 627     const uint8_t* end = input + input_string_ncodeunits;
 628     const uint8_t* next = input;
 629     bool is_next_upper = true;
 630     while ((input = next) < end) {
 631       uint32_t codepoint;
 632       if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) {
 633         return kTransformError;
 634       }
 635       if (IsCasedCharacterUnicode(codepoint)) {
 636         // Lower/uppercase current codepoint and
 637         // prepare to lowercase next consecutive cased codepoints
 638         output = is_next_upper
 639                      ? util::UTF8Encode(output,
 640                                         UTF8UpperTransform::TransformCodepoint(codepoint))
 641                      : util::UTF8Encode(
 642                            output, UTF8LowerTransform::TransformCodepoint(codepoint));
 643         is_next_upper = false;
 644       } else {
 645         // Copy current uncased codepoint and
 646         // prepare to uppercase next cased codepoint
 647         std::memcpy(output, input, next - input);
 648         output += next - input;
 649         is_next_upper = true;
 650       }
 651     }
 652     return output - output_start;
 653   }
 654 };
 655
 656 template <typename Type>
 657 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
 658
 659 #endif  // ARROW_WITH_UTF8PROC
 660
 661 struct AsciiReverseTransform : public StringTransformBase {
 662   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 663                     uint8_t* output) {
 664     uint8_t utf8_char_found = 0;
 665     for (int64_t i = 0; i < input_string_ncodeunits; i++) {
 666       // if a utf8 char is found, report to utf8_char_found
 667       utf8_char_found |= input[i] & 0x80;
 668       output[input_string_ncodeunits - i - 1] = input[i];
 669     }
 670     return utf8_char_found ? kTransformError : input_string_ncodeunits;
 671   }
 672
 673   Status InvalidStatus() override {
 674     return Status::Invalid("Non-ASCII sequence in input");
 675   }
 676 };
 677
 678 template <typename Type>
 679 using AsciiReverse = StringTransformExec<Type, AsciiReverseTransform>;
 680
 681 struct Utf8ReverseTransform : public StringTransformBase {
 682   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 683                     uint8_t* output) {
 684     int64_t i = 0;
 685     while (i < input_string_ncodeunits) {
 686       int64_t char_end = std::min(i + util::ValidUtf8CodepointByteSize(input + i),
 687                                   input_string_ncodeunits);
 688       std::copy(input + i, input + char_end, output + input_string_ncodeunits - char_end);
 689       i = char_end;
 690     }
 691     return input_string_ncodeunits;
 692   }
 693 };
 694
 695 template <typename Type>
 696 using Utf8Reverse = StringTransformExec<Type, Utf8ReverseTransform>;
 697
 698 using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
 699
 700 // Transform a buffer of offsets to one which begins with 0 and has same
 701 // value lengths.
 702 template <typename T>
 703 Status GetShiftedOffsets(KernelContext* ctx, const Buffer& input_buffer, int64_t offset,
 704                          int64_t length, std::shared_ptr<Buffer>* out) {
 705   ARROW_ASSIGN_OR_RAISE(*out, ctx->Allocate((length + 1) * sizeof(T)));
 706   const T* input_offsets = reinterpret_cast<const T*>(input_buffer.data()) + offset;
 707   T* out_offsets = reinterpret_cast<T*>((*out)->mutable_data());
 708   T first_offset = *input_offsets;
 709   for (int64_t i = 0; i < length; ++i) {
 710     *out_offsets++ = input_offsets[i] - first_offset;
 711   }
 712   *out_offsets = input_offsets[length] - first_offset;
 713   return Status::OK();
 714 }
 715
 716 // Apply `transform` to input character data- this function cannot change the
 717 // length
 718 template <typename Type>
 719 Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
 720                            TransformFunc transform, Datum* out) {
 721   using ArrayType = typename TypeTraits<Type>::ArrayType;
 722   using offset_type = typename Type::offset_type;
 723
 724   if (batch[0].kind() == Datum::ARRAY) {
 725     const ArrayData& input = *batch[0].array();
 726     ArrayType input_boxed(batch[0].array());
 727
 728     ArrayData* out_arr = out->mutable_array();
 729
 730     if (input.offset == 0) {
 731       // We can reuse offsets from input
 732       out_arr->buffers[1] = input.buffers[1];
 733     } else {
 734       DCHECK(input.buffers[1]);
 735       // We must allocate new space for the offsets and shift the existing offsets
 736       RETURN_NOT_OK(GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
 737                                                    input.length, &out_arr->buffers[1]));
 738     }
 739
 740     // Allocate space for output data
 741     int64_t data_nbytes = input_boxed.total_values_length();
 742     RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
 743     if (input.length > 0) {
 744       transform(input.buffers[2]->data() + input_boxed.value_offset(0), data_nbytes,
 745                 out_arr->buffers[2]->mutable_data());
 746     }
 747   } else {
 748     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
 749     auto result = checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
 750     if (input.is_valid) {
 751       result->is_valid = true;
 752       int64_t data_nbytes = input.value->size();
 753       RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&result->value));
 754       transform(input.value->data(), data_nbytes, result->value->mutable_data());
 755     }
 756     out->value = result;
 757   }
 758
 759   return Status::OK();
 760 }
 761
 762 void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
 763   std::transform(input, input + length, output, ascii_toupper);
 764 }
 765
 766 template <typename Type>
 767 struct AsciiUpper {
 768   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
 769     return StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
 770   }
 771 };
 772
 773 void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) {
 774   std::transform(input, input + length, output, ascii_tolower);
 775 }
 776
 777 template <typename Type>
 778 struct AsciiLower {
 779   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
 780     return StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
 781   }
 782 };
 783
 784 void TransformAsciiSwapCase(const uint8_t* input, int64_t length, uint8_t* output) {
 785   std::transform(input, input + length, output, ascii_swapcase);
 786 }
 787
 788 template <typename Type>
 789 struct AsciiSwapCase {
 790   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
 791     return StringDataTransform<Type>(ctx, batch, TransformAsciiSwapCase, out);
 792   }
 793 };
 794
 795 struct AsciiCapitalizeTransform : public StringTransformBase {
 796   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 797                     uint8_t* output) {
 798     if (input_string_ncodeunits > 0) {
 799       *output++ = ascii_toupper(*input++);
 800       TransformAsciiLower(input, input_string_ncodeunits - 1, output);
 801     }
 802     return input_string_ncodeunits;
 803   }
 804 };
 805
 806 template <typename Type>
 807 using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 808
 809 struct AsciiTitleTransform : public StringTransformBase {
 810   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
 811                     uint8_t* output) {
 812     const uint8_t* end = input + input_string_ncodeunits;
 813     const uint8_t* next = input;
 814     bool is_next_upper = true;
 815     while ((input = next++) < end) {
 816       if (IsCasedCharacterAscii(*input)) {
 817         // Lower/uppercase current character and
 818         // prepare to lowercase next consecutive cased characters
 819         *output++ = is_next_upper ? ascii_toupper(*input) : ascii_tolower(*input);
 820         is_next_upper = false;
 821       } else {
 822         // Copy current uncased character and
 823         // prepare to uppercase next cased character
 824         *output++ = *input;
 825         is_next_upper = true;
 826       }
 827     }
 828     return input_string_ncodeunits;
 829   }
 830 };
 831
 832 template <typename Type>
 833 using AsciiTitle = StringTransformExec<Type, AsciiTitleTransform>;
 834
 835 // ----------------------------------------------------------------------
 836 // exact pattern detection
 837
 838 using StrToBoolTransformFunc =
 839     std::function<void(const void*, const uint8_t*, int64_t, int64_t, uint8_t*)>;
 840
 841 // Apply `transform` to input character data- this function cannot change the
 842 // length
 843 template <typename Type>
 844 void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
 845                          StrToBoolTransformFunc transform, Datum* out) {
 846   using offset_type = typename Type::offset_type;
 847
 848   if (batch[0].kind() == Datum::ARRAY) {
 849     const ArrayData& input = *batch[0].array();
 850     ArrayData* out_arr = out->mutable_array();
 851     if (input.length > 0) {
 852       transform(
 853           reinterpret_cast<const offset_type*>(input.buffers[1]->data()) + input.offset,
 854           input.buffers[2]->data(), input.length, out_arr->offset,
 855           out_arr->buffers[1]->mutable_data());
 856     }
 857   } else {
 858     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
 859     if (input.is_valid) {
 860       uint8_t result_value = 0;
 861       std::array<offset_type, 2> offsets{0,
 862                                          static_cast<offset_type>(input.value->size())};
 863       transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
 864                 &result_value);
 865       out->value = std::make_shared<BooleanScalar>(result_value > 0);
 866     }
 867   }
 868 }
 869
 870 using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
 871
 872 // This is an implementation of the Knuth-Morris-Pratt algorithm
 873 struct PlainSubstringMatcher {
 874   const MatchSubstringOptions& options_;
 875   std::vector<int64_t> prefix_table;
 876
 877   static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
 878       const MatchSubstringOptions& options) {
 879     // Should be handled by partial template specialization below
 880     DCHECK(!options.ignore_case);
 881     return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
 882   }
 883
 884   explicit PlainSubstringMatcher(const MatchSubstringOptions& options)
 885       : options_(options) {
 886     // Phase 1: Build the prefix table
 887     const auto pattern_length = options_.pattern.size();
 888     prefix_table.resize(pattern_length + 1, /*value=*/0);
 889     int64_t prefix_length = -1;
 890     prefix_table[0] = -1;
 891     for (size_t pos = 0; pos < pattern_length; ++pos) {
 892       // The prefix cannot be expanded, reset.
 893       while (prefix_length >= 0 &&
 894              options_.pattern[pos] != options_.pattern[prefix_length]) {
 895         prefix_length = prefix_table[prefix_length];
 896       }
 897       prefix_length++;
 898       prefix_table[pos + 1] = prefix_length;
 899     }
 900   }
 901
 902   int64_t Find(util::string_view current) const {
 903     // Phase 2: Find the prefix in the data
 904     const auto pattern_length = options_.pattern.size();
 905     int64_t pattern_pos = 0;
 906     int64_t pos = 0;
 907     if (pattern_length == 0) return 0;
 908     for (const auto c : current) {
 909       while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
 910         pattern_pos = prefix_table[pattern_pos];
 911       }
 912       pattern_pos++;
 913       if (static_cast<size_t>(pattern_pos) == pattern_length) {
 914         return pos + 1 - pattern_length;
 915       }
 916       pos++;
 917     }
 918     return -1;
 919   }
 920
 921   bool Match(util::string_view current) const { return Find(current) >= 0; }
 922 };
 923
 924 struct PlainStartsWithMatcher {
 925   const MatchSubstringOptions& options_;
 926
 927   explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
 928       : options_(options) {}
 929
 930   static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
 931       const MatchSubstringOptions& options) {
 932     // Should be handled by partial template specialization below
 933     DCHECK(!options.ignore_case);
 934     return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
 935   }
 936
 937   bool Match(util::string_view current) const {
 938     // string_view::starts_with is C++20
 939     return current.substr(0, options_.pattern.size()) == options_.pattern;
 940   }
 941 };
 942
 943 struct PlainEndsWithMatcher {
 944   const MatchSubstringOptions& options_;
 945
 946   explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
 947       : options_(options) {}
 948
 949   static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
 950       const MatchSubstringOptions& options) {
 951     // Should be handled by partial template specialization below
 952     DCHECK(!options.ignore_case);
 953     return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
 954   }
 955
 956   bool Match(util::string_view current) const {
 957     // string_view::ends_with is C++20
 958     return current.size() >= options_.pattern.size() &&
 959            current.substr(current.size() - options_.pattern.size(),
 960                           options_.pattern.size()) == options_.pattern;
 961   }
 962 };
 963
 964 #ifdef ARROW_WITH_RE2
 965 struct RegexSubstringMatcher {
 966   const MatchSubstringOptions& options_;
 967   const RE2 regex_match_;
 968
 969   static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
 970       const MatchSubstringOptions& options, bool literal = false) {
 971     auto matcher =
 972         ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
 973     RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
 974     return std::move(matcher);
 975   }
 976
 977   explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
 978                                  bool literal = false)
 979       : options_(options),
 980         regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
 981
 982   bool Match(util::string_view current) const {
 983     auto piece = re2::StringPiece(current.data(), current.length());
 984     return re2::RE2::PartialMatch(piece, regex_match_);
 985   }
 986
 987   static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
 988                                           bool literal) {
 989     RE2::RE2::Options re2_options(RE2::Quiet);
 990     re2_options.set_case_sensitive(!options.ignore_case);
 991     re2_options.set_literal(literal);
 992     return re2_options;
 993   }
 994 };
 995 #endif
 996
 997 template <typename Type, typename Matcher>
 998 struct MatchSubstringImpl {
 999   using offset_type = typename Type::offset_type;
1000
1001   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
1002                      const Matcher* matcher) {
1003     StringBoolTransform<Type>(
1004         ctx, batch,
1005         [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
1006                    int64_t output_offset, uint8_t* output) {
1007           const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
1008           FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
1009           for (int64_t i = 0; i < length; ++i) {
1010             const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
1011             int64_t current_length = offsets[i + 1] - offsets[i];
1012             if (matcher->Match(util::string_view(current_data, current_length))) {
1013               bitmap_writer.Set();
1014             }
1015             bitmap_writer.Next();
1016           }
1017           bitmap_writer.Finish();
1018         },
1019         out);
1020     return Status::OK();
1021   }
1022 };
1023
1024 template <typename Type, typename Matcher>
1025 struct MatchSubstring {
1026   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1027     // TODO Cache matcher across invocations (for regex compilation)
1028     ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
1029     return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
1030   }
1031 };
1032
1033 template <typename Type>
1034 struct MatchSubstring<Type, PlainSubstringMatcher> {
1035   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1036     auto options = MatchSubstringState::Get(ctx);
1037     if (options.ignore_case) {
1038 #ifdef ARROW_WITH_RE2
1039       ARROW_ASSIGN_OR_RAISE(auto matcher,
1040                             RegexSubstringMatcher::Make(options, /*literal=*/true));
1041       return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
1042                                                                    matcher.get());
1043 #else
1044       return Status::NotImplemented("ignore_case requires RE2");
1045 #endif
1046     }
1047     ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
1048     return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
1049                                                                  matcher.get());
1050   }
1051 };
1052
1053 template <typename Type>
1054 struct MatchSubstring<Type, PlainStartsWithMatcher> {
1055   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1056     auto options = MatchSubstringState::Get(ctx);
1057     if (options.ignore_case) {
1058 #ifdef ARROW_WITH_RE2
1059       MatchSubstringOptions converted_options = options;
1060       converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
1061       ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
1062       return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
1063                                                                    matcher.get());
1064 #else
1065       return Status::NotImplemented("ignore_case requires RE2");
1066 #endif
1067     }
1068     ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
1069     return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
1070                                                                   matcher.get());
1071   }
1072 };
1073
1074 template <typename Type>
1075 struct MatchSubstring<Type, PlainEndsWithMatcher> {
1076   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1077     auto options = MatchSubstringState::Get(ctx);
1078     if (options.ignore_case) {
1079 #ifdef ARROW_WITH_RE2
1080       MatchSubstringOptions converted_options = options;
1081       converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
1082       ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
1083       return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
1084                                                                    matcher.get());
1085 #else
1086       return Status::NotImplemented("ignore_case requires RE2");
1087 #endif
1088     }
1089     ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
1090     return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
1091                                                                 matcher.get());
1092   }
1093 };
1094
1095 const FunctionDoc match_substring_doc(
1096     "Match strings against literal pattern",
1097     ("For each string in `strings`, emit true iff it contains a given pattern.\n"
1098      "Null inputs emit null.  The pattern must be given in MatchSubstringOptions. "
1099      "If ignore_case is set, only simple case folding is performed."),
1100     {"strings"}, "MatchSubstringOptions");
1101
1102 const FunctionDoc starts_with_doc(
1103     "Check if strings start with a literal pattern",
1104     ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
1105      "Null inputs emit null.  The pattern must be given in MatchSubstringOptions. "
1106      "If ignore_case is set, only simple case folding is performed."),
1107     {"strings"}, "MatchSubstringOptions");
1108
1109 const FunctionDoc ends_with_doc(
1110     "Check if strings end with a literal pattern",
1111     ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
1112      "Null inputs emit null.  The pattern must be given in MatchSubstringOptions. "
1113      "If ignore_case is set, only simple case folding is performed."),
1114     {"strings"}, "MatchSubstringOptions");
1115
1116 #ifdef ARROW_WITH_RE2
1117 const FunctionDoc match_substring_regex_doc(
1118     "Match strings against regex pattern",
1119     ("For each string in `strings`, emit true iff it matches a given pattern at any "
1120      "position.\n"
1121      "Null inputs emit null.  The pattern must be given in MatchSubstringOptions. "
1122      "If ignore_case is set, only simple case folding is performed."),
1123     {"strings"}, "MatchSubstringOptions");
1124
1125 // SQL LIKE match
1126
1127 /// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
1128 std::string MakeLikeRegex(const MatchSubstringOptions& options) {
1129   // Allow . to match \n
1130   std::string like_pattern = "(?s:^";
1131   like_pattern.reserve(options.pattern.size() + 7);
1132   bool escaped = false;
1133   for (const char c : options.pattern) {
1134     if (!escaped && c == '%') {
1135       like_pattern.append(".*");
1136     } else if (!escaped && c == '_') {
1137       like_pattern.append(".");
1138     } else if (!escaped && c == '\\') {
1139       escaped = true;
1140     } else {
1141       switch (c) {
1142         case '.':
1143         case '?':
1144         case '+':
1145         case '*':
1146         case '^':
1147         case '$':
1148         case '\\':
1149         case '[':
1150         case '{':
1151         case '(':
1152         case ')':
1153         case '|': {
1154           like_pattern.push_back('\\');
1155           like_pattern.push_back(c);
1156           escaped = false;
1157           break;
1158         }
1159         default: {
1160           like_pattern.push_back(c);
1161           escaped = false;
1162           break;
1163         }
1164       }
1165     }
1166   }
1167   like_pattern.append("$)");
1168   return like_pattern;
1169 }
1170
1171 // Evaluate a SQL-like LIKE pattern by translating it to a regexp or
1172 // substring search as appropriate. See what Apache Impala does:
1173 // https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
1174 template <typename StringType>
1175 struct MatchLike {
1176   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1177     // NOTE: avoid making those constants global to avoid compiling regexes at startup
1178     // A LIKE pattern matching this regex can be translated into a substring search.
1179     static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
1180     // A LIKE pattern matching this regex can be translated into a prefix search.
1181     static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
1182     // A LIKE pattern matching this regex can be translated into a suffix search.
1183     static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
1184
1185     auto original_options = MatchSubstringState::Get(ctx);
1186     auto original_state = ctx->state();
1187
1188     Status status;
1189     std::string pattern;
1190     if (!original_options.ignore_case &&
1191         re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
1192                             &pattern)) {
1193       MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
1194       MatchSubstringState converted_state(converted_options);
1195       ctx->SetState(&converted_state);
1196       status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
1197     } else if (!original_options.ignore_case &&
1198                re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
1199                                    &pattern)) {
1200       MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
1201       MatchSubstringState converted_state(converted_options);
1202       ctx->SetState(&converted_state);
1203       status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
1204     } else if (!original_options.ignore_case &&
1205                re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
1206                                    &pattern)) {
1207       MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
1208       MatchSubstringState converted_state(converted_options);
1209       ctx->SetState(&converted_state);
1210       status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
1211     } else {
1212       MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
1213                                               original_options.ignore_case};
1214       MatchSubstringState converted_state(converted_options);
1215       ctx->SetState(&converted_state);
1216       status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
1217     }
1218     ctx->SetState(original_state);
1219     return status;
1220   }
1221 };
1222
1223 const FunctionDoc match_like_doc(
1224     "Match strings against SQL-style LIKE pattern",
1225     ("For each string in `strings`, emit true iff it fully matches a given pattern "
1226      "at any position. That is, '%' will match any number of characters, '_' will "
1227      "match exactly one character, and any other character matches itself. To "
1228      "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
1229      "Null inputs emit null.  The pattern must be given in MatchSubstringOptions."),
1230     {"strings"}, "MatchSubstringOptions");
1231
1232 #endif
1233
1234 void AddMatchSubstring(FunctionRegistry* registry) {
1235   {
1236     auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
1237                                                  &match_substring_doc);
1238     auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
1239     auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
1240     DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1241     DCHECK_OK(
1242         func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1243     DCHECK_OK(registry->AddFunction(std::move(func)));
1244   }
1245   {
1246     auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
1247                                                  &match_substring_doc);
1248     auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
1249     auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
1250     DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1251     DCHECK_OK(
1252         func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1253     DCHECK_OK(registry->AddFunction(std::move(func)));
1254   }
1255   {
1256     auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
1257                                                  &match_substring_doc);
1258     auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
1259     auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
1260     DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1261     DCHECK_OK(
1262         func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1263     DCHECK_OK(registry->AddFunction(std::move(func)));
1264   }
1265 #ifdef ARROW_WITH_RE2
1266   {
1267     auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
1268                                                  &match_substring_regex_doc);
1269     auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
1270     auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
1271     DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1272     DCHECK_OK(
1273         func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1274     DCHECK_OK(registry->AddFunction(std::move(func)));
1275   }
1276   {
1277     auto func =
1278         std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
1279     auto exec_32 = MatchLike<StringType>::Exec;
1280     auto exec_64 = MatchLike<LargeStringType>::Exec;
1281     DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
1282     DCHECK_OK(
1283         func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
1284     DCHECK_OK(registry->AddFunction(std::move(func)));
1285   }
1286 #endif
1287 }
1288
1289 // Substring find - lfind/index/etc.
1290
1291 struct FindSubstring {
1292   const PlainSubstringMatcher matcher_;
1293
1294   explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
1295
1296   template <typename OutValue, typename... Ignored>
1297   OutValue Call(KernelContext*, util::string_view val, Status*) const {
1298     return static_cast<OutValue>(matcher_.Find(val));
1299   }
1300 };
1301
1302 #ifdef ARROW_WITH_RE2
1303 struct FindSubstringRegex {
1304   std::unique_ptr<RE2> regex_match_;
1305
1306   explicit FindSubstringRegex(const MatchSubstringOptions& options,
1307                               bool literal = false) {
1308     std::string regex = "(";
1309     regex.reserve(options.pattern.length() + 2);
1310     regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
1311     regex += ")";
1312     regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
1313                                                      options, /*literal=*/false)));
1314   }
1315
1316   template <typename OutValue, typename... Ignored>
1317   OutValue Call(KernelContext*, util::string_view val, Status*) const {
1318     re2::StringPiece piece(val.data(), val.length());
1319     re2::StringPiece match;
1320     if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
1321       return static_cast<OutValue>(match.data() - piece.data());
1322     }
1323     return -1;
1324   }
1325 };
1326 #endif
1327
1328 template <typename InputType>
1329 struct FindSubstringExec {
1330   using OffsetType = typename TypeTraits<InputType>::OffsetType;
1331   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1332     const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1333     if (options.ignore_case) {
1334 #ifdef ARROW_WITH_RE2
1335       applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
1336           kernel{FindSubstringRegex(options, /*literal=*/true)};
1337       return kernel.Exec(ctx, batch, out);
1338 #endif
1339       return Status::NotImplemented("ignore_case requires RE2");
1340     }
1341     applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
1342         FindSubstring(PlainSubstringMatcher(options))};
1343     return kernel.Exec(ctx, batch, out);
1344   }
1345 };
1346
1347 const FunctionDoc find_substring_doc(
1348     "Find first occurrence of substring",
1349     ("For each string in `strings`, emit the index of the first occurrence of the given "
1350      "pattern, or -1 if not found.\n"
1351      "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1352     {"strings"}, "MatchSubstringOptions");
1353
1354 #ifdef ARROW_WITH_RE2
1355 template <typename InputType>
1356 struct FindSubstringRegexExec {
1357   using OffsetType = typename TypeTraits<InputType>::OffsetType;
1358   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1359     const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1360     applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
1361         kernel{FindSubstringRegex(options, /*literal=*/false)};
1362     return kernel.Exec(ctx, batch, out);
1363   }
1364 };
1365
1366 const FunctionDoc find_substring_regex_doc(
1367     "Find location of first match of regex pattern",
1368     ("For each string in `strings`, emit the index of the first match of the given "
1369      "pattern, or -1 if not found.\n"
1370      "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1371     {"strings"}, "MatchSubstringOptions");
1372 #endif
1373
1374 void AddFindSubstring(FunctionRegistry* registry) {
1375   {
1376     auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
1377                                                  &find_substring_doc);
1378     for (const auto& ty : BaseBinaryTypes()) {
1379       auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1380       DCHECK_OK(func->AddKernel({ty}, offset_type,
1381                                 GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
1382                                 MatchSubstringState::Init));
1383     }
1384     DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1385                               FindSubstringExec<FixedSizeBinaryType>::Exec,
1386                               MatchSubstringState::Init));
1387     DCHECK_OK(registry->AddFunction(std::move(func)));
1388   }
1389 #ifdef ARROW_WITH_RE2
1390   {
1391     auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
1392                                                  &find_substring_regex_doc);
1393     for (const auto& ty : BaseBinaryTypes()) {
1394       auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1395       DCHECK_OK(
1396           func->AddKernel({ty}, offset_type,
1397                           GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
1398                           MatchSubstringState::Init));
1399     }
1400     DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1401                               FindSubstringRegexExec<FixedSizeBinaryType>::Exec,
1402                               MatchSubstringState::Init));
1403     DCHECK_OK(registry->AddFunction(std::move(func)));
1404   }
1405 #endif
1406 }
1407
1408 // Substring count
1409
1410 struct CountSubstring {
1411   const PlainSubstringMatcher matcher_;
1412
1413   explicit CountSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
1414
1415   template <typename OutValue, typename... Ignored>
1416   OutValue Call(KernelContext*, util::string_view val, Status*) const {
1417     OutValue count = 0;
1418     uint64_t start = 0;
1419     const auto pattern_size = std::max<uint64_t>(1, matcher_.options_.pattern.size());
1420     while (start <= val.size()) {
1421       const int64_t index = matcher_.Find(val.substr(start));
1422       if (index >= 0) {
1423         count++;
1424         start += index + pattern_size;
1425       } else {
1426         break;
1427       }
1428     }
1429     return count;
1430   }
1431 };
1432
1433 #ifdef ARROW_WITH_RE2
1434 struct CountSubstringRegex {
1435   std::unique_ptr<RE2> regex_match_;
1436
1437   explicit CountSubstringRegex(const MatchSubstringOptions& options, bool literal = false)
1438       : regex_match_(new RE2(options.pattern,
1439                              RegexSubstringMatcher::MakeRE2Options(options, literal))) {}
1440
1441   static Result<CountSubstringRegex> Make(const MatchSubstringOptions& options,
1442                                           bool literal = false) {
1443     CountSubstringRegex counter(options, literal);
1444     RETURN_NOT_OK(RegexStatus(*counter.regex_match_));
1445     return std::move(counter);
1446   }
1447
1448   template <typename OutValue, typename... Ignored>
1449   OutValue Call(KernelContext*, util::string_view val, Status*) const {
1450     OutValue count = 0;
1451     re2::StringPiece input(val.data(), val.size());
1452     auto last_size = input.size();
1453     while (re2::RE2::FindAndConsume(&input, *regex_match_)) {
1454       count++;
1455       if (last_size == input.size()) {
1456         // 0-length match
1457         if (input.size() > 0) {
1458           input.remove_prefix(1);
1459         } else {
1460           break;
1461         }
1462       }
1463       last_size = input.size();
1464     }
1465     return count;
1466   }
1467 };
1468
1469 template <typename InputType>
1470 struct CountSubstringRegexExec {
1471   using OffsetType = typename TypeTraits<InputType>::OffsetType;
1472   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1473     const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1474     ARROW_ASSIGN_OR_RAISE(auto counter, CountSubstringRegex::Make(options));
1475     applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
1476         kernel{std::move(counter)};
1477     return kernel.Exec(ctx, batch, out);
1478   }
1479 };
1480 #endif
1481
1482 template <typename InputType>
1483 struct CountSubstringExec {
1484   using OffsetType = typename TypeTraits<InputType>::OffsetType;
1485   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
1486     const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
1487     if (options.ignore_case) {
1488 #ifdef ARROW_WITH_RE2
1489       ARROW_ASSIGN_OR_RAISE(auto counter,
1490                             CountSubstringRegex::Make(options, /*literal=*/true));
1491       applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
1492           kernel{std::move(counter)};
1493       return kernel.Exec(ctx, batch, out);
1494 #else
1495       return Status::NotImplemented("ignore_case requires RE2");
1496 #endif
1497     }
1498     applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstring> kernel{
1499         CountSubstring(PlainSubstringMatcher(options))};
1500     return kernel.Exec(ctx, batch, out);
1501   }
1502 };
1503
1504 const FunctionDoc count_substring_doc(
1505     "Count occurrences of substring",
1506     ("For each string in `strings`, emit the number of occurrences of the given "
1507      "pattern.\n"
1508      "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1509     {"strings"}, "MatchSubstringOptions");
1510
1511 #ifdef ARROW_WITH_RE2
1512 const FunctionDoc count_substring_regex_doc(
1513     "Count occurrences of substring",
1514     ("For each string in `strings`, emit the number of occurrences of the given "
1515      "regex pattern.\n"
1516      "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
1517     {"strings"}, "MatchSubstringOptions");
1518 #endif
1519
1520 void AddCountSubstring(FunctionRegistry* registry) {
1521   {
1522     auto func = std::make_shared<ScalarFunction>("count_substring", Arity::Unary(),
1523                                                  &count_substring_doc);
1524     for (const auto& ty : BaseBinaryTypes()) {
1525       auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1526       DCHECK_OK(func->AddKernel({ty}, offset_type,
1527                                 GenerateTypeAgnosticVarBinaryBase<CountSubstringExec>(ty),
1528                                 MatchSubstringState::Init));
1529     }
1530     DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1531                               CountSubstringExec<FixedSizeBinaryType>::Exec,
1532                               MatchSubstringState::Init));
1533     DCHECK_OK(registry->AddFunction(std::move(func)));
1534   }
1535 #ifdef ARROW_WITH_RE2
1536   {
1537     auto func = std::make_shared<ScalarFunction>("count_substring_regex", Arity::Unary(),
1538                                                  &count_substring_regex_doc);
1539     for (const auto& ty : BaseBinaryTypes()) {
1540       auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
1541       DCHECK_OK(
1542           func->AddKernel({ty}, offset_type,
1543                           GenerateTypeAgnosticVarBinaryBase<CountSubstringRegexExec>(ty),
1544                           MatchSubstringState::Init));
1545     }
1546     DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
1547                               CountSubstringRegexExec<FixedSizeBinaryType>::Exec,
1548                               MatchSubstringState::Init));
1549     DCHECK_OK(registry->AddFunction(std::move(func)));
1550   }
1551 #endif
1552 }
1553
1554 // Slicing
1555
1556 struct SliceTransformBase : public StringTransformBase {
1557   using State = OptionsWrapper<SliceOptions>;
1558
1559   const SliceOptions* options;
1560
1561   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
1562     options = &State::Get(ctx);
1563     if (options->step == 0) {
1564       return Status::Invalid("Slice step cannot be zero");
1565     }
1566     return Status::OK();
1567   }
1568 };
1569
1570 struct SliceCodeunitsTransform : SliceTransformBase {
1571   int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
1572     const SliceOptions& opt = *this->options;
1573     if ((opt.start >= 0) != (opt.stop >= 0)) {
1574       // If start and stop don't have the same sign, we can't guess an upper bound
1575       // on the resulting slice lengths, so return a worst case estimate.
1576       return input_ncodeunits;
1577     }
1578     int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
1579     // The maximum UTF8 byte size of a codepoint is 4
1580     return std::min(input_ncodeunits,
1581                     4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
1582   }
1583
1584   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
1585                     uint8_t* output) {
1586     if (options->step >= 1) {
1587       return SliceForward(input, input_string_ncodeunits, output);
1588     }
1589     return SliceBackward(input, input_string_ncodeunits, output);
1590   }
1591
1592 #define RETURN_IF_UTF8_ERROR(expr)    \
1593   do {                                \
1594     if (ARROW_PREDICT_FALSE(!expr)) { \
1595       return kTransformError;         \
1596     }                                 \
1597   } while (0)
1598
1599   int64_t SliceForward(const uint8_t* input, int64_t input_string_ncodeunits,
1600                        uint8_t* output) {
1601     // Slice in forward order (step > 0)
1602     const SliceOptions& opt = *this->options;
1603     const uint8_t* begin = input;
1604     const uint8_t* end = input + input_string_ncodeunits;
1605     const uint8_t* begin_sliced = begin;
1606     const uint8_t* end_sliced = end;
1607
1608     // First, compute begin_sliced and end_sliced
1609     if (opt.start >= 0) {
1610       // start counting from the left
1611       RETURN_IF_UTF8_ERROR(
1612           arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start));
1613       if (opt.stop > opt.start) {
1614         // continue counting from begin_sliced
1615         const int64_t length = opt.stop - opt.start;
1616         RETURN_IF_UTF8_ERROR(
1617             arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length));
1618       } else if (opt.stop < 0) {
1619         // or from the end (but we will never need to < begin_sliced)
1620         RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1621             begin_sliced, end, &end_sliced, -opt.stop));
1622       } else {
1623         // zero length slice
1624         return 0;
1625       }
1626     } else {
1627       // start counting from the right
1628       RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1629           begin, end, &begin_sliced, -opt.start));
1630       if (opt.stop > 0) {
1631         // continue counting from the left, we cannot start from begin_sliced because we
1632         // don't know how many codepoints are between begin and begin_sliced
1633         RETURN_IF_UTF8_ERROR(
1634             arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop));
1635         // and therefore we also needs this
1636         if (end_sliced <= begin_sliced) {
1637           // zero length slice
1638           return 0;
1639         }
1640       } else if ((opt.stop < 0) && (opt.stop > opt.start)) {
1641         // stop is negative, but larger than start, so we count again from the right
1642         // in some cases we can optimize this, depending on the shortest path (from end
1643         // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
1644         // for instance when start=-100, when the string length is only 10.
1645         RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1646             begin_sliced, end, &end_sliced, -opt.stop));
1647       } else {
1648         // zero length slice
1649         return 0;
1650       }
1651     }
1652
1653     // Second, copy computed slice to output
1654     DCHECK(begin_sliced <= end_sliced);
1655     if (opt.step == 1) {
1656       // fast case, where we simply can finish with a memcpy
1657       std::copy(begin_sliced, end_sliced, output);
1658       return end_sliced - begin_sliced;
1659     }
1660     uint8_t* dest = output;
1661     const uint8_t* i = begin_sliced;
1662
1663     while (i < end_sliced) {
1664       uint32_t codepoint = 0;
1665       // write a single codepoint
1666       RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
1667       dest = arrow::util::UTF8Encode(dest, codepoint);
1668       // and skip the remainder
1669       int64_t skips = opt.step - 1;
1670       while ((skips--) && (i < end_sliced)) {
1671         RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
1672       }
1673     }
1674     return dest - output;
1675   }
1676
1677   int64_t SliceBackward(const uint8_t* input, int64_t input_string_ncodeunits,
1678                         uint8_t* output) {
1679     // Slice in reverse order (step < 0)
1680     const SliceOptions& opt = *this->options;
1681     const uint8_t* begin = input;
1682     const uint8_t* end = input + input_string_ncodeunits;
1683     const uint8_t* begin_sliced = begin;
1684     const uint8_t* end_sliced = end;
1685
1686     // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
1687     // reverse iterators.
1688     if (opt.start >= 0) {
1689       // +1 because begin_sliced acts as as the end of a reverse iterator
1690       RETURN_IF_UTF8_ERROR(
1691           arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start + 1));
1692     } else {
1693       // -1 because start=-1 means the last codeunit, which is 0 advances
1694       RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1695           begin, end, &begin_sliced, -opt.start - 1));
1696     }
1697     // make it point at the last codeunit of the previous codeunit
1698     begin_sliced--;
1699
1700     // similar to opt.start
1701     if (opt.stop >= 0) {
1702       RETURN_IF_UTF8_ERROR(
1703           arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1));
1704     } else {
1705       RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
1706           begin, end, &end_sliced, -opt.stop - 1));
1707     }
1708     end_sliced--;
1709
1710     // Copy computed slice to output
1711     uint8_t* dest = output;
1712     const uint8_t* i = begin_sliced;
1713     while (i > end_sliced) {
1714       uint32_t codepoint = 0;
1715       // write a single codepoint
1716       RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
1717       dest = arrow::util::UTF8Encode(dest, codepoint);
1718       // and skip the remainder
1719       int64_t skips = -opt.step - 1;
1720       while ((skips--) && (i > end_sliced)) {
1721         RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
1722       }
1723     }
1724     return dest - output;
1725   }
1726
1727 #undef RETURN_IF_UTF8_ERROR
1728 };
1729
1730 template <typename Type>
1731 using SliceCodeunits = StringTransformExec<Type, SliceCodeunitsTransform>;
1732
1733 const FunctionDoc utf8_slice_codeunits_doc(
1734     "Slice string ",
1735     ("For each string in `strings`, slice into a substring defined by\n"
1736      "`start`, `stop`, `step`) as given by `SliceOptions` where `start` is inclusive\n"
1737      "and `stop` is exclusive and are measured in codeunits. If step is negative, the\n"
1738      "string will be advanced in reversed order. A `step` of zero is considered an\n"
1739      "error.\n"
1740      "Null inputs emit null."),
1741     {"strings"}, "SliceOptions");
1742
1743 void AddSlice(FunctionRegistry* registry) {
1744   auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
1745                                                &utf8_slice_codeunits_doc);
1746   using t32 = SliceCodeunits<StringType>;
1747   using t64 = SliceCodeunits<LargeStringType>;
1748   DCHECK_OK(
1749       func->AddKernel({utf8()}, utf8(), t32::Exec, SliceCodeunitsTransform::State::Init));
1750   DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
1751                             SliceCodeunitsTransform::State::Init));
1752   DCHECK_OK(registry->AddFunction(std::move(func)));
1753 }
1754
1755 template <typename Derived, bool allow_empty = false>
1756 struct CharacterPredicateUnicode {
1757   static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1758                    Status* st) {
1759     if (allow_empty && input_string_ncodeunits == 0) {
1760       return true;
1761     }
1762     bool all;
1763     bool any = false;
1764     if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
1765             input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
1766               any |= Derived::PredicateCharacterAny(codepoint);
1767               return Derived::PredicateCharacterAll(codepoint);
1768             }))) {
1769       *st = Status::Invalid("Invalid UTF8 sequence in input");
1770       return false;
1771     }
1772     return all & any;
1773   }
1774
1775   static inline bool PredicateCharacterAny(uint32_t) {
1776     return true;  // default condition make sure there is at least 1 charachter
1777   }
1778 };
1779
1780 template <typename Derived, bool allow_empty = false>
1781 struct CharacterPredicateAscii {
1782   static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1783                    Status*) {
1784     if (allow_empty && input_string_ncodeunits == 0) {
1785       return true;
1786     }
1787     bool any = false;
1788     // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
1789     // benchmark. I don't consider that worth it.
1790     bool all = std::all_of(input, input + input_string_ncodeunits,
1791                            [&any](uint8_t ascii_character) {
1792                              any |= Derived::PredicateCharacterAny(ascii_character);
1793                              return Derived::PredicateCharacterAll(ascii_character);
1794                            });
1795     return all & any;
1796   }
1797
1798   static inline bool PredicateCharacterAny(uint8_t) {
1799     return true;  // default condition make sure there is at least 1 charachter
1800   }
1801 };
1802
1803 #ifdef ARROW_WITH_UTF8PROC
1804 struct IsAlphaNumericUnicode : CharacterPredicateUnicode<IsAlphaNumericUnicode> {
1805   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1806     return IsAlphaNumericCharacterUnicode(codepoint);
1807   }
1808 };
1809 #endif
1810
1811 struct IsAlphaNumericAscii : CharacterPredicateAscii<IsAlphaNumericAscii> {
1812   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1813     return IsAlphaNumericCharacterAscii(ascii_character);
1814   }
1815 };
1816
1817 #ifdef ARROW_WITH_UTF8PROC
1818 struct IsAlphaUnicode : CharacterPredicateUnicode<IsAlphaUnicode> {
1819   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1820     return IsAlphaCharacterUnicode(codepoint);
1821   }
1822 };
1823 #endif
1824
1825 struct IsAlphaAscii : CharacterPredicateAscii<IsAlphaAscii> {
1826   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1827     return IsAlphaCharacterAscii(ascii_character);
1828   }
1829 };
1830
1831 #ifdef ARROW_WITH_UTF8PROC
1832 struct IsDecimalUnicode : CharacterPredicateUnicode<IsDecimalUnicode> {
1833   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1834     return IsDecimalCharacterUnicode(codepoint);
1835   }
1836 };
1837 #endif
1838
1839 struct IsDecimalAscii : CharacterPredicateAscii<IsDecimalAscii> {
1840   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1841     return IsDecimalCharacterAscii(ascii_character);
1842   }
1843 };
1844
1845 #ifdef ARROW_WITH_UTF8PROC
1846 struct IsDigitUnicode : CharacterPredicateUnicode<IsDigitUnicode> {
1847   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1848     return IsDigitCharacterUnicode(codepoint);
1849   }
1850 };
1851
1852 struct IsNumericUnicode : CharacterPredicateUnicode<IsNumericUnicode> {
1853   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1854     return IsNumericCharacterUnicode(codepoint);
1855   }
1856 };
1857 #endif
1858
1859 struct IsAscii {
1860   static bool Call(KernelContext*, const uint8_t* input,
1861                    size_t input_string_nascii_characters, Status*) {
1862     return std::all_of(input, input + input_string_nascii_characters, IsAsciiCharacter);
1863   }
1864 };
1865
1866 #ifdef ARROW_WITH_UTF8PROC
1867 struct IsLowerUnicode : CharacterPredicateUnicode<IsLowerUnicode> {
1868   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1869     // Only for cased character it needs to be lower case
1870     return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
1871   }
1872   static inline bool PredicateCharacterAny(uint32_t codepoint) {
1873     return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
1874   }
1875 };
1876 #endif
1877
1878 struct IsLowerAscii : CharacterPredicateAscii<IsLowerAscii> {
1879   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1880     // Only for cased character it needs to be lower case
1881     return !IsCasedCharacterAscii(ascii_character) ||
1882            IsLowerCaseCharacterAscii(ascii_character);
1883   }
1884   static inline bool PredicateCharacterAny(uint8_t ascii_character) {
1885     return IsCasedCharacterAscii(ascii_character);  // at least 1 cased character
1886   }
1887 };
1888
1889 #ifdef ARROW_WITH_UTF8PROC
1890 struct IsPrintableUnicode
1891     : CharacterPredicateUnicode<IsPrintableUnicode, /*allow_empty=*/true> {
1892   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1893     return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
1894   }
1895 };
1896 #endif
1897
1898 struct IsPrintableAscii
1899     : CharacterPredicateAscii<IsPrintableAscii, /*allow_empty=*/true> {
1900   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1901     return IsPrintableCharacterAscii(ascii_character);
1902   }
1903 };
1904
1905 #ifdef ARROW_WITH_UTF8PROC
1906 struct IsSpaceUnicode : CharacterPredicateUnicode<IsSpaceUnicode> {
1907   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1908     return IsSpaceCharacterUnicode(codepoint);
1909   }
1910 };
1911 #endif
1912
1913 struct IsSpaceAscii : CharacterPredicateAscii<IsSpaceAscii> {
1914   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
1915     return IsSpaceCharacterAscii(ascii_character);
1916   }
1917 };
1918
1919 #ifdef ARROW_WITH_UTF8PROC
1920 struct IsTitleUnicode {
1921   static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1922                    Status* st) {
1923     // rules:
1924     //   1. lower case follows cased
1925     //   2. upper case follows uncased
1926     //   3. at least 1 cased character (which logically should be upper/title)
1927     bool rules_1_and_2;
1928     bool previous_cased = false;  // in LL, LU or LT
1929     bool rule_3 = false;
1930     bool status =
1931         arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
1932                                [&previous_cased, &rule_3](uint32_t codepoint) {
1933                                  if (IsLowerCaseCharacterUnicode(codepoint)) {
1934                                    if (!previous_cased) return false;  // rule 1 broken
1935                                    // next should be more lower case or uncased
1936                                    previous_cased = true;
1937                                  } else if (IsCasedCharacterUnicode(codepoint)) {
1938                                    if (previous_cased) return false;  // rule 2 broken
1939                                    // next should be a lower case or uncased
1940                                    previous_cased = true;
1941                                    rule_3 = true;  // rule 3 obeyed
1942                                  } else {
1943                                    // an uncased char, like _ or 1
1944                                    // next should be upper case or more uncased
1945                                    previous_cased = false;
1946                                  }
1947                                  return true;
1948                                });
1949     if (!ARROW_PREDICT_TRUE(status)) {
1950       *st = Status::Invalid("Invalid UTF8 sequence in input");
1951       return false;
1952     }
1953     return rules_1_and_2 & rule_3;
1954   }
1955 };
1956 #endif
1957
1958 struct IsTitleAscii {
1959   static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
1960                    Status*) {
1961     // Rules:
1962     //   1. lower case follows cased
1963     //   2. upper case follows uncased
1964     //   3. at least 1 cased character (which logically should be upper/title)
1965     bool rules_1_and_2 = true;
1966     bool previous_cased = false;  // in LL, LU or LT
1967     bool rule_3 = false;
1968     for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
1969       if (IsLowerCaseCharacterAscii(*c)) {
1970         if (!previous_cased) {
1971           // rule 1 broken
1972           rules_1_and_2 = false;
1973           break;
1974         }
1975         // next should be more lower case or uncased
1976         previous_cased = true;
1977       } else if (IsCasedCharacterAscii(*c)) {
1978         if (previous_cased) {
1979           // rule 2 broken
1980           rules_1_and_2 = false;
1981           break;
1982         }
1983         // next should be a lower case or uncased
1984         previous_cased = true;
1985         rule_3 = true;  // rule 3 obeyed
1986       } else {
1987         // an uncased character, like _ or 1
1988         // next should be upper case or more uncased
1989         previous_cased = false;
1990       }
1991     }
1992     return rules_1_and_2 & rule_3;
1993   }
1994 };
1995
1996 #ifdef ARROW_WITH_UTF8PROC
1997 struct IsUpperUnicode : CharacterPredicateUnicode<IsUpperUnicode> {
1998   static inline bool PredicateCharacterAll(uint32_t codepoint) {
1999     // Only for cased character it needs to be lower case
2000     return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
2001   }
2002   static inline bool PredicateCharacterAny(uint32_t codepoint) {
2003     return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
2004   }
2005 };
2006 #endif
2007
2008 struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
2009   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
2010     // Only for cased character it needs to be lower case
2011     return !IsCasedCharacterAscii(ascii_character) ||
2012            IsUpperCaseCharacterAscii(ascii_character);
2013   }
2014   static inline bool PredicateCharacterAny(uint8_t ascii_character) {
2015     return IsCasedCharacterAscii(ascii_character);  // at least 1 cased character
2016   }
2017 };
2018
2019 // splitting
2020
2021 template <typename Options>
2022 struct SplitFinderBase {
2023   virtual ~SplitFinderBase() = default;
2024   virtual Status PreExec(const Options& options) { return Status::OK(); }
2025
2026   // Derived classes should also define these methods:
2027   //   static bool Find(const uint8_t* begin, const uint8_t* end,
2028   //                    const uint8_t** separator_begin,
2029   //                    const uint8_t** separator_end,
2030   //                    const SplitPatternOptions& options);
2031   //
2032   //   static bool FindReverse(const uint8_t* begin, const uint8_t* end,
2033   //                           const uint8_t** separator_begin,
2034   //                           const uint8_t** separator_end,
2035   //                           const SplitPatternOptions& options);
2036 };
2037
2038 template <typename Type, typename ListType, typename SplitFinder,
2039           typename Options = typename SplitFinder::Options>
2040 struct SplitExec {
2041   using string_offset_type = typename Type::offset_type;
2042   using list_offset_type = typename ListType::offset_type;
2043   using ArrayType = typename TypeTraits<Type>::ArrayType;
2044   using ArrayListType = typename TypeTraits<ListType>::ArrayType;
2045   using ListScalarType = typename TypeTraits<ListType>::ScalarType;
2046   using ScalarType = typename TypeTraits<Type>::ScalarType;
2047   using BuilderType = typename TypeTraits<Type>::BuilderType;
2048   using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
2049   using State = OptionsWrapper<Options>;
2050
2051   // Keep the temporary storage accross individual values, to minimize reallocations
2052   std::vector<util::string_view> parts;
2053   Options options;
2054
2055   explicit SplitExec(const Options& options) : options(options) {}
2056
2057   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2058     return SplitExec{State::Get(ctx)}.Execute(ctx, batch, out);
2059   }
2060
2061   Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2062     SplitFinder finder;
2063     RETURN_NOT_OK(finder.PreExec(options));
2064     if (batch[0].kind() == Datum::ARRAY) {
2065       return Execute(ctx, &finder, batch[0].array(), out);
2066     }
2067     DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
2068     return Execute(ctx, &finder, batch[0].scalar(), out);
2069   }
2070
2071   Status Execute(KernelContext* ctx, SplitFinder* finder,
2072                  const std::shared_ptr<ArrayData>& data, Datum* out) {
2073     const ArrayType input(data);
2074
2075     BuilderType builder(input.type(), ctx->memory_pool());
2076     // A slight overestimate of the data needed
2077     RETURN_NOT_OK(builder.ReserveData(input.total_values_length()));
2078     // The minimum amount of strings needed
2079     RETURN_NOT_OK(builder.Resize(input.length() - input.null_count()));
2080
2081     ArrayData* output_list = out->mutable_array();
2082     // List offsets were preallocated
2083     auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
2084     DCHECK_NE(list_offsets, nullptr);
2085     // Initial value
2086     *list_offsets++ = 0;
2087     for (int64_t i = 0; i < input.length(); ++i) {
2088       if (!input.IsNull(i)) {
2089         RETURN_NOT_OK(SplitString(input.GetView(i), finder, &builder));
2090         if (ARROW_PREDICT_FALSE(builder.length() >
2091                                 std::numeric_limits<list_offset_type>::max())) {
2092           return Status::CapacityError("List offset does not fit into 32 bit");
2093         }
2094       }
2095       *list_offsets++ = static_cast<list_offset_type>(builder.length());
2096     }
2097     // Assign string array to list child data
2098     std::shared_ptr<Array> string_array;
2099     RETURN_NOT_OK(builder.Finish(&string_array));
2100     output_list->child_data.push_back(string_array->data());
2101     return Status::OK();
2102   }
2103
2104   Status Execute(KernelContext* ctx, SplitFinder* finder,
2105                  const std::shared_ptr<Scalar>& scalar, Datum* out) {
2106     const auto& input = checked_cast<const ScalarType&>(*scalar);
2107     auto result = checked_cast<ListScalarType*>(out->scalar().get());
2108     if (input.is_valid) {
2109       result->is_valid = true;
2110       BuilderType builder(input.type, ctx->memory_pool());
2111       util::string_view s(*input.value);
2112       RETURN_NOT_OK(SplitString(s, finder, &builder));
2113       RETURN_NOT_OK(builder.Finish(&result->value));
2114     }
2115     return Status::OK();
2116   }
2117
2118   Status SplitString(const util::string_view& s, SplitFinder* finder,
2119                      BuilderType* builder) {
2120     const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
2121     const uint8_t* end = begin + s.length();
2122
2123     int64_t max_splits = options.max_splits;
2124     // if there is no max splits, reversing does not make sense (and is probably less
2125     // efficient), but is useful for testing
2126     if (options.reverse) {
2127       // note that i points 1 further than the 'current'
2128       const uint8_t* i = end;
2129       // we will record the parts in reverse order
2130       parts.clear();
2131       if (max_splits > -1) {
2132         parts.reserve(max_splits + 1);
2133       }
2134       while (max_splits != 0) {
2135         const uint8_t *separator_begin, *separator_end;
2136         // find with whatever algo the part we will 'cut out'
2137         if (finder->FindReverse(begin, i, &separator_begin, &separator_end, options)) {
2138           parts.emplace_back(reinterpret_cast<const char*>(separator_end),
2139                              i - separator_end);
2140           i = separator_begin;
2141           max_splits--;
2142         } else {
2143           // if we cannot find a separator, we're done
2144           break;
2145         }
2146       }
2147       parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
2148       // now we do the copying
2149       for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
2150         RETURN_NOT_OK(builder->Append(*it));
2151       }
2152     } else {
2153       const uint8_t* i = begin;
2154       while (max_splits != 0) {
2155         const uint8_t *separator_begin, *separator_end;
2156         // find with whatever algo the part we will 'cut out'
2157         if (finder->Find(i, end, &separator_begin, &separator_end, options)) {
2158           // the part till the beginning of the 'cut'
2159           RETURN_NOT_OK(
2160               builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
2161           i = separator_end;
2162           max_splits--;
2163         } else {
2164           // if we cannot find a separator, we're done
2165           break;
2166         }
2167       }
2168       // trailing part
2169       RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
2170     }
2171     return Status::OK();
2172   }
2173 };
2174
2175 struct SplitPatternFinder : public SplitFinderBase<SplitPatternOptions> {
2176   using Options = SplitPatternOptions;
2177
2178   Status PreExec(const SplitPatternOptions& options) override {
2179     if (options.pattern.length() == 0) {
2180       return Status::Invalid("Empty separator");
2181     }
2182     return Status::OK();
2183   }
2184
2185   static bool Find(const uint8_t* begin, const uint8_t* end,
2186                    const uint8_t** separator_begin, const uint8_t** separator_end,
2187                    const SplitPatternOptions& options) {
2188     const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
2189     const int64_t pattern_length = options.pattern.length();
2190     const uint8_t* i = begin;
2191     // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
2192     // the match kernel
2193     while ((i + pattern_length <= end)) {
2194       i = std::search(i, end, pattern, pattern + pattern_length);
2195       if (i != end) {
2196         *separator_begin = i;
2197         *separator_end = i + pattern_length;
2198         return true;
2199       }
2200     }
2201     return false;
2202   }
2203
2204   static bool FindReverse(const uint8_t* begin, const uint8_t* end,
2205                           const uint8_t** separator_begin, const uint8_t** separator_end,
2206                           const SplitPatternOptions& options) {
2207     const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
2208     const int64_t pattern_length = options.pattern.length();
2209     // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
2210     // the match kernel
2211     std::reverse_iterator<const uint8_t*> ri(end);
2212     std::reverse_iterator<const uint8_t*> rend(begin);
2213     std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
2214     std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
2215     while (begin <= ri.base() - pattern_length) {
2216       ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
2217       if (ri != rend) {
2218         *separator_begin = ri.base() - pattern_length;
2219         *separator_end = ri.base();
2220         return true;
2221       }
2222     }
2223     return false;
2224   }
2225 };
2226
2227 template <typename Type, typename ListType>
2228 using SplitPatternExec = SplitExec<Type, ListType, SplitPatternFinder>;
2229
2230 const FunctionDoc split_pattern_doc(
2231     "Split string according to separator",
2232     ("Split each string according to the exact `pattern` defined in\n"
2233      "SplitPatternOptions.  The output for each string input is a list\n"
2234      "of strings.\n"
2235      "\n"
2236      "The maximum number of splits and direction of splitting\n"
2237      "(forward, reverse) can optionally be defined in SplitPatternOptions."),
2238     {"strings"}, "SplitPatternOptions");
2239
2240 const FunctionDoc ascii_split_whitespace_doc(
2241     "Split string according to any ASCII whitespace",
2242     ("Split each string according any non-zero length sequence of ASCII\n"
2243      "whitespace characters.  The output for each string input is a list\n"
2244      "of strings.\n"
2245      "\n"
2246      "The maximum number of splits and direction of splitting\n"
2247      "(forward, reverse) can optionally be defined in SplitOptions."),
2248     {"strings"}, "SplitOptions");
2249
2250 const FunctionDoc utf8_split_whitespace_doc(
2251     "Split string according to any Unicode whitespace",
2252     ("Split each string according any non-zero length sequence of Unicode\n"
2253      "whitespace characters.  The output for each string input is a list\n"
2254      "of strings.\n"
2255      "\n"
2256      "The maximum number of splits and direction of splitting\n"
2257      "(forward, reverse) can optionally be defined in SplitOptions."),
2258     {"strings"}, "SplitOptions");
2259
2260 void AddSplitPattern(FunctionRegistry* registry) {
2261   auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
2262                                                &split_pattern_doc);
2263   using t32 = SplitPatternExec<StringType, ListType>;
2264   using t64 = SplitPatternExec<LargeStringType, ListType>;
2265   DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2266   DCHECK_OK(
2267       func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2268   DCHECK_OK(registry->AddFunction(std::move(func)));
2269 }
2270
2271 struct SplitWhitespaceAsciiFinder : public SplitFinderBase<SplitOptions> {
2272   using Options = SplitOptions;
2273
2274   static bool Find(const uint8_t* begin, const uint8_t* end,
2275                    const uint8_t** separator_begin, const uint8_t** separator_end,
2276                    const SplitOptions& options) {
2277     const uint8_t* i = begin;
2278     while (i < end) {
2279       if (IsSpaceCharacterAscii(*i)) {
2280         *separator_begin = i;
2281         do {
2282           i++;
2283         } while (IsSpaceCharacterAscii(*i) && i < end);
2284         *separator_end = i;
2285         return true;
2286       }
2287       i++;
2288     }
2289     return false;
2290   }
2291
2292   static bool FindReverse(const uint8_t* begin, const uint8_t* end,
2293                           const uint8_t** separator_begin, const uint8_t** separator_end,
2294                           const SplitOptions& options) {
2295     const uint8_t* i = end - 1;
2296     while ((i >= begin)) {
2297       if (IsSpaceCharacterAscii(*i)) {
2298         *separator_end = i + 1;
2299         do {
2300           i--;
2301         } while (IsSpaceCharacterAscii(*i) && i >= begin);
2302         *separator_begin = i + 1;
2303         return true;
2304       }
2305       i--;
2306     }
2307     return false;
2308   }
2309 };
2310
2311 template <typename Type, typename ListType>
2312 using SplitWhitespaceAsciiExec = SplitExec<Type, ListType, SplitWhitespaceAsciiFinder>;
2313
2314 void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
2315   static const SplitOptions default_options{};
2316   auto func =
2317       std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
2318                                        &ascii_split_whitespace_doc, &default_options);
2319   using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
2320   using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
2321   DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2322   DCHECK_OK(
2323       func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2324   DCHECK_OK(registry->AddFunction(std::move(func)));
2325 }
2326
2327 #ifdef ARROW_WITH_UTF8PROC
2328 struct SplitWhitespaceUtf8Finder : public SplitFinderBase<SplitOptions> {
2329   using Options = SplitOptions;
2330
2331   Status PreExec(const SplitOptions& options) override {
2332     EnsureLookupTablesFilled();
2333     return Status::OK();
2334   }
2335
2336   bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
2337             const uint8_t** separator_end, const SplitOptions& options) {
2338     const uint8_t* i = begin;
2339     while ((i < end)) {
2340       uint32_t codepoint = 0;
2341       *separator_begin = i;
2342       if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
2343         return false;
2344       }
2345       if (IsSpaceCharacterUnicode(codepoint)) {
2346         do {
2347           *separator_end = i;
2348           if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
2349             return false;
2350           }
2351         } while (IsSpaceCharacterUnicode(codepoint) && i < end);
2352         return true;
2353       }
2354     }
2355     return false;
2356   }
2357
2358   bool FindReverse(const uint8_t* begin, const uint8_t* end,
2359                    const uint8_t** separator_begin, const uint8_t** separator_end,
2360                    const SplitOptions& options) {
2361     const uint8_t* i = end - 1;
2362     while ((i >= begin)) {
2363       uint32_t codepoint = 0;
2364       *separator_end = i + 1;
2365       if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
2366         return false;
2367       }
2368       if (IsSpaceCharacterUnicode(codepoint)) {
2369         do {
2370           *separator_begin = i + 1;
2371           if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
2372             return false;
2373           }
2374         } while (IsSpaceCharacterUnicode(codepoint) && i >= begin);
2375         return true;
2376       }
2377     }
2378     return false;
2379   }
2380 };
2381
2382 template <typename Type, typename ListType>
2383 using SplitWhitespaceUtf8Exec = SplitExec<Type, ListType, SplitWhitespaceUtf8Finder>;
2384
2385 void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
2386   static const SplitOptions default_options{};
2387   auto func =
2388       std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
2389                                        &utf8_split_whitespace_doc, &default_options);
2390   using t32 = SplitWhitespaceUtf8Exec<StringType, ListType>;
2391   using t64 = SplitWhitespaceUtf8Exec<LargeStringType, ListType>;
2392   DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2393   DCHECK_OK(
2394       func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2395   DCHECK_OK(registry->AddFunction(std::move(func)));
2396 }
2397 #endif  // ARROW_WITH_UTF8PROC
2398
2399 #ifdef ARROW_WITH_RE2
2400 struct SplitRegexFinder : public SplitFinderBase<SplitPatternOptions> {
2401   using Options = SplitPatternOptions;
2402
2403   util::optional<RE2> regex_split;
2404
2405   Status PreExec(const SplitPatternOptions& options) override {
2406     if (options.reverse) {
2407       return Status::NotImplemented("Cannot split in reverse with regex");
2408     }
2409     // RE2 does *not* give you the full match! Must wrap the regex in a capture group
2410     // There is FindAndConsume, but it would give only the end of the separator
2411     std::string pattern = "(";
2412     pattern.reserve(options.pattern.size() + 2);
2413     pattern += options.pattern;
2414     pattern += ')';
2415     regex_split.emplace(std::move(pattern));
2416     return RegexStatus(*regex_split);
2417   }
2418
2419   bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
2420             const uint8_t** separator_end, const SplitPatternOptions& options) {
2421     re2::StringPiece piece(reinterpret_cast<const char*>(begin),
2422                            std::distance(begin, end));
2423     // "StringPiece is mutated to point to matched piece"
2424     re2::StringPiece result;
2425     if (!re2::RE2::PartialMatch(piece, *regex_split, &result)) {
2426       return false;
2427     }
2428     *separator_begin = reinterpret_cast<const uint8_t*>(result.data());
2429     *separator_end = reinterpret_cast<const uint8_t*>(result.data() + result.size());
2430     return true;
2431   }
2432
2433   bool FindReverse(const uint8_t* begin, const uint8_t* end,
2434                    const uint8_t** separator_begin, const uint8_t** separator_end,
2435                    const SplitPatternOptions& options) {
2436     // Unsupported (see PreExec)
2437     return false;
2438   }
2439 };
2440
2441 template <typename Type, typename ListType>
2442 using SplitRegexExec = SplitExec<Type, ListType, SplitRegexFinder>;
2443
2444 const FunctionDoc split_pattern_regex_doc(
2445     "Split string according to regex pattern",
2446     ("Split each string according to the regex `pattern` defined in\n"
2447      "SplitPatternOptions.  The output for each string input is a list\n"
2448      "of strings.\n"
2449      "\n"
2450      "The maximum number of splits and direction of splitting\n"
2451      "(forward, reverse) can optionally be defined in SplitPatternOptions."),
2452     {"strings"}, "SplitPatternOptions");
2453
2454 void AddSplitRegex(FunctionRegistry* registry) {
2455   auto func = std::make_shared<ScalarFunction>("split_pattern_regex", Arity::Unary(),
2456                                                &split_pattern_regex_doc);
2457   using t32 = SplitRegexExec<StringType, ListType>;
2458   using t64 = SplitRegexExec<LargeStringType, ListType>;
2459   DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
2460   DCHECK_OK(
2461       func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
2462   DCHECK_OK(registry->AddFunction(std::move(func)));
2463 }
2464 #endif  // ARROW_WITH_RE2
2465
2466 void AddSplit(FunctionRegistry* registry) {
2467   AddSplitPattern(registry);
2468   AddSplitWhitespaceAscii(registry);
2469 #ifdef ARROW_WITH_UTF8PROC
2470   AddSplitWhitespaceUTF8(registry);
2471 #endif
2472 #ifdef ARROW_WITH_RE2
2473   AddSplitRegex(registry);
2474 #endif
2475 }
2476
2477 // ----------------------------------------------------------------------
2478 // Replace substring (plain, regex)
2479
2480 template <typename Type, typename Replacer>
2481 struct ReplaceSubString {
2482   using ScalarType = typename TypeTraits<Type>::ScalarType;
2483   using offset_type = typename Type::offset_type;
2484   using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
2485   using OffsetBuilder = TypedBufferBuilder<offset_type>;
2486   using State = OptionsWrapper<ReplaceSubstringOptions>;
2487
2488   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2489     // TODO Cache replacer across invocations (for regex compilation)
2490     ARROW_ASSIGN_OR_RAISE(auto replacer, Replacer::Make(State::Get(ctx)));
2491     return Replace(ctx, batch, *replacer, out);
2492   }
2493
2494   static Status Replace(KernelContext* ctx, const ExecBatch& batch,
2495                         const Replacer& replacer, Datum* out) {
2496     ValueDataBuilder value_data_builder(ctx->memory_pool());
2497     OffsetBuilder offset_builder(ctx->memory_pool());
2498
2499     if (batch[0].kind() == Datum::ARRAY) {
2500       // We already know how many strings we have, so we can use Reserve/UnsafeAppend
2501       RETURN_NOT_OK(offset_builder.Reserve(batch[0].array()->length + 1));
2502       offset_builder.UnsafeAppend(0);  // offsets start at 0
2503
2504       const ArrayData& input = *batch[0].array();
2505       RETURN_NOT_OK(VisitArrayDataInline<Type>(
2506           input,
2507           [&](util::string_view s) {
2508             RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
2509             offset_builder.UnsafeAppend(
2510                 static_cast<offset_type>(value_data_builder.length()));
2511             return Status::OK();
2512           },
2513           [&]() {
2514             // offset for null value
2515             offset_builder.UnsafeAppend(
2516                 static_cast<offset_type>(value_data_builder.length()));
2517             return Status::OK();
2518           }));
2519       ArrayData* output = out->mutable_array();
2520       RETURN_NOT_OK(value_data_builder.Finish(&output->buffers[2]));
2521       RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1]));
2522     } else {
2523       const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
2524       auto result = std::make_shared<ScalarType>();
2525       if (input.is_valid) {
2526         util::string_view s = static_cast<util::string_view>(*input.value);
2527         RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
2528         RETURN_NOT_OK(value_data_builder.Finish(&result->value));
2529         result->is_valid = true;
2530       }
2531       out->value = result;
2532     }
2533
2534     return Status::OK();
2535   }
2536 };
2537
2538 struct PlainSubStringReplacer {
2539   const ReplaceSubstringOptions& options_;
2540
2541   static Result<std::unique_ptr<PlainSubStringReplacer>> Make(
2542       const ReplaceSubstringOptions& options) {
2543     return arrow::internal::make_unique<PlainSubStringReplacer>(options);
2544   }
2545
2546   explicit PlainSubStringReplacer(const ReplaceSubstringOptions& options)
2547       : options_(options) {}
2548
2549   Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
2550     const char* i = s.begin();
2551     const char* end = s.end();
2552     int64_t max_replacements = options_.max_replacements;
2553     while ((i < end) && (max_replacements != 0)) {
2554       const char* pos =
2555           std::search(i, end, options_.pattern.begin(), options_.pattern.end());
2556       if (pos == end) {
2557         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2558                                       static_cast<int64_t>(end - i)));
2559         i = end;
2560       } else {
2561         // the string before the pattern
2562         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2563                                       static_cast<int64_t>(pos - i)));
2564         // the replacement
2565         RETURN_NOT_OK(
2566             builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
2567                             options_.replacement.length()));
2568         // skip pattern
2569         i = pos + options_.pattern.length();
2570         max_replacements--;
2571       }
2572     }
2573     // if we exited early due to max_replacements, add the trailing part
2574     return builder->Append(reinterpret_cast<const uint8_t*>(i),
2575                            static_cast<int64_t>(end - i));
2576   }
2577 };
2578
2579 #ifdef ARROW_WITH_RE2
2580 struct RegexSubStringReplacer {
2581   const ReplaceSubstringOptions& options_;
2582   const RE2 regex_find_;
2583   const RE2 regex_replacement_;
2584
2585   static Result<std::unique_ptr<RegexSubStringReplacer>> Make(
2586       const ReplaceSubstringOptions& options) {
2587     auto replacer = arrow::internal::make_unique<RegexSubStringReplacer>(options);
2588
2589     RETURN_NOT_OK(RegexStatus(replacer->regex_find_));
2590     RETURN_NOT_OK(RegexStatus(replacer->regex_replacement_));
2591
2592     std::string replacement_error;
2593     if (!replacer->regex_replacement_.CheckRewriteString(replacer->options_.replacement,
2594                                                          &replacement_error)) {
2595       return Status::Invalid("Invalid replacement string: ",
2596                              std::move(replacement_error));
2597     }
2598
2599     return std::move(replacer);
2600   }
2601
2602   // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore
2603   // we have 2 regexes, one with () around it, one without.
2604   explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
2605       : options_(options),
2606         regex_find_("(" + options_.pattern + ")", RE2::Quiet),
2607         regex_replacement_(options_.pattern, RE2::Quiet) {}
2608
2609   Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
2610     re2::StringPiece replacement(options_.replacement);
2611
2612     if (options_.max_replacements == -1) {
2613       std::string s_copy(s.to_string());
2614       re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
2615       return builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
2616                              s_copy.length());
2617     }
2618
2619     // Since RE2 does not have the concept of max_replacements, we have to do some work
2620     // ourselves.
2621     // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite
2622     const char* i = s.begin();
2623     const char* end = s.end();
2624     re2::StringPiece piece(s.data(), s.length());
2625
2626     int64_t max_replacements = options_.max_replacements;
2627     while ((i < end) && (max_replacements != 0)) {
2628       std::string found;
2629       if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
2630         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2631                                       static_cast<int64_t>(end - i)));
2632         i = end;
2633       } else {
2634         // wind back to the beginning of the match
2635         const char* pos = piece.begin() - found.length();
2636         // the string before the pattern
2637         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
2638                                       static_cast<int64_t>(pos - i)));
2639         // replace the pattern in what we found
2640         if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
2641           return Status::Invalid("Regex found, but replacement failed");
2642         }
2643         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
2644                                       static_cast<int64_t>(found.length())));
2645         // skip pattern
2646         i = piece.begin();
2647         max_replacements--;
2648       }
2649     }
2650     // If we exited early due to max_replacements, add the trailing part
2651     return builder->Append(reinterpret_cast<const uint8_t*>(i),
2652                            static_cast<int64_t>(end - i));
2653   }
2654 };
2655 #endif
2656
2657 template <typename Type>
2658 using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
2659
2660 const FunctionDoc replace_substring_doc(
2661     "Replace non-overlapping substrings that match pattern by replacement",
2662     ("For each string in `strings`, replace non-overlapping substrings that match\n"
2663      "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
2664      "maximum amount of replacements made, counting from the left. Null values emit\n"
2665      "null."),
2666     {"strings"}, "ReplaceSubstringOptions");
2667
2668 #ifdef ARROW_WITH_RE2
2669 template <typename Type>
2670 using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
2671
2672 const FunctionDoc replace_substring_regex_doc(
2673     "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
2674     ("For each string in `strings`, replace non-overlapping substrings that match the\n"
2675      "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
2676      "If `max_replacements != -1`, it determines the maximum amount of replacements\n"
2677      "made, counting from the left. Note that if the pattern contains groups,\n"
2678      "backreferencing macan be used. Null values emit null."),
2679     {"strings"}, "ReplaceSubstringOptions");
2680 #endif
2681
2682 // ----------------------------------------------------------------------
2683 // Replace slice
2684
2685 struct ReplaceSliceTransformBase : public StringTransformBase {
2686   using State = OptionsWrapper<ReplaceSliceOptions>;
2687
2688   const ReplaceSliceOptions* options;
2689
2690   explicit ReplaceSliceTransformBase(const ReplaceSliceOptions& options)
2691       : options{&options} {}
2692
2693   int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
2694     return ninputs * options->replacement.size() + input_ncodeunits;
2695   }
2696 };
2697
2698 struct BinaryReplaceSliceTransform : ReplaceSliceTransformBase {
2699   using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
2700   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
2701                     uint8_t* output) {
2702     const auto& opts = *options;
2703     int64_t before_slice = 0;
2704     int64_t after_slice = 0;
2705     uint8_t* output_start = output;
2706
2707     if (opts.start >= 0) {
2708       // Count from left
2709       before_slice = std::min<int64_t>(input_string_ncodeunits, opts.start);
2710     } else {
2711       // Count from right
2712       before_slice = std::max<int64_t>(0, input_string_ncodeunits + opts.start);
2713     }
2714     // Mimic Pandas: if stop would be before start, treat as 0-length slice
2715     if (opts.stop >= 0) {
2716       // Count from left
2717       after_slice =
2718           std::min<int64_t>(input_string_ncodeunits, std::max(before_slice, opts.stop));
2719     } else {
2720       // Count from right
2721       after_slice = std::max<int64_t>(before_slice, input_string_ncodeunits + opts.stop);
2722     }
2723     output = std::copy(input, input + before_slice, output);
2724     output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
2725     output = std::copy(input + after_slice, input + input_string_ncodeunits, output);
2726     return output - output_start;
2727   }
2728
2729   static int32_t FixedOutputSize(const ReplaceSliceOptions& opts, int32_t input_width) {
2730     int32_t before_slice = 0;
2731     int32_t after_slice = 0;
2732     const int32_t start = static_cast<int32_t>(opts.start);
2733     const int32_t stop = static_cast<int32_t>(opts.stop);
2734     if (opts.start >= 0) {
2735       // Count from left
2736       before_slice = std::min<int32_t>(input_width, start);
2737     } else {
2738       // Count from right
2739       before_slice = std::max<int32_t>(0, input_width + start);
2740     }
2741     if (opts.stop >= 0) {
2742       // Count from left
2743       after_slice = std::min<int32_t>(input_width, std::max<int32_t>(before_slice, stop));
2744     } else {
2745       // Count from right
2746       after_slice = std::max<int32_t>(before_slice, input_width + stop);
2747     }
2748     return static_cast<int32_t>(before_slice + opts.replacement.size() +
2749                                 (input_width - after_slice));
2750   }
2751 };
2752
2753 struct Utf8ReplaceSliceTransform : ReplaceSliceTransformBase {
2754   using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
2755   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
2756                     uint8_t* output) {
2757     const auto& opts = *options;
2758     const uint8_t* begin = input;
2759     const uint8_t* end = input + input_string_ncodeunits;
2760     const uint8_t *begin_sliced, *end_sliced;
2761     uint8_t* output_start = output;
2762
2763     // Mimic Pandas: if stop would be before start, treat as 0-length slice
2764     if (opts.start >= 0) {
2765       // Count from left
2766       if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opts.start)) {
2767         return kTransformError;
2768       }
2769       if (opts.stop > options->start) {
2770         // Continue counting from left
2771         const int64_t length = opts.stop - options->start;
2772         if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)) {
2773           return kTransformError;
2774         }
2775       } else if (opts.stop < 0) {
2776         // Count from right
2777         if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
2778                                                        -opts.stop)) {
2779           return kTransformError;
2780         }
2781       } else {
2782         // Zero-length slice
2783         end_sliced = begin_sliced;
2784       }
2785     } else {
2786       // Count from right
2787       if (!arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
2788                                                      -opts.start)) {
2789         return kTransformError;
2790       }
2791       if (opts.stop >= 0) {
2792         // Restart counting from left
2793         if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opts.stop)) {
2794           return kTransformError;
2795         }
2796         if (end_sliced <= begin_sliced) {
2797           // Zero-length slice
2798           end_sliced = begin_sliced;
2799         }
2800       } else if ((opts.stop < 0) && (options->stop > options->start)) {
2801         // Count from right
2802         if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
2803                                                        -opts.stop)) {
2804           return kTransformError;
2805         }
2806       } else {
2807         // zero-length slice
2808         end_sliced = begin_sliced;
2809       }
2810     }
2811     output = std::copy(begin, begin_sliced, output);
2812     output = std::copy(opts.replacement.begin(), options->replacement.end(), output);
2813     output = std::copy(end_sliced, end, output);
2814     return output - output_start;
2815   }
2816 };
2817
2818 template <typename Type>
2819 using BinaryReplaceSlice =
2820     StringTransformExecWithState<Type, BinaryReplaceSliceTransform>;
2821 template <typename Type>
2822 using Utf8ReplaceSlice = StringTransformExecWithState<Type, Utf8ReplaceSliceTransform>;
2823
2824 const FunctionDoc binary_replace_slice_doc(
2825     "Replace a slice of a binary string with `replacement`",
2826     ("For each string in `strings`, replace a slice of the string defined by `start`"
2827      "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
2828      "and both are measured in bytes.\n"
2829      "Null values emit null."),
2830     {"strings"}, "ReplaceSliceOptions");
2831
2832 const FunctionDoc utf8_replace_slice_doc(
2833     "Replace a slice of a string with `replacement`",
2834     ("For each string in `strings`, replace a slice of the string defined by `start`"
2835      "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
2836      "and both are measured in codeunits.\n"
2837      "Null values emit null."),
2838     {"strings"}, "ReplaceSliceOptions");
2839
2840 void AddReplaceSlice(FunctionRegistry* registry) {
2841   {
2842     auto func = std::make_shared<ScalarFunction>("binary_replace_slice", Arity::Unary(),
2843                                                  &binary_replace_slice_doc);
2844     for (const auto& ty : BaseBinaryTypes()) {
2845       DCHECK_OK(func->AddKernel({ty}, ty,
2846                                 GenerateTypeAgnosticVarBinaryBase<BinaryReplaceSlice>(ty),
2847                                 ReplaceSliceTransformBase::State::Init));
2848     }
2849     using TransformExec =
2850         FixedSizeBinaryTransformExecWithState<BinaryReplaceSliceTransform>;
2851     DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)},
2852                               OutputType(TransformExec::OutputType), TransformExec::Exec,
2853                               ReplaceSliceTransformBase::State::Init));
2854     DCHECK_OK(registry->AddFunction(std::move(func)));
2855   }
2856
2857   {
2858     auto func = std::make_shared<ScalarFunction>("utf8_replace_slice", Arity::Unary(),
2859                                                  &utf8_replace_slice_doc);
2860     DCHECK_OK(func->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice<StringType>::Exec,
2861                               ReplaceSliceTransformBase::State::Init));
2862     DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(),
2863                               Utf8ReplaceSlice<LargeStringType>::Exec,
2864                               ReplaceSliceTransformBase::State::Init));
2865     DCHECK_OK(registry->AddFunction(std::move(func)));
2866   }
2867 }
2868
2869 // ----------------------------------------------------------------------
2870 // Extract with regex
2871
2872 #ifdef ARROW_WITH_RE2
2873
2874 // TODO cache this once per ExtractRegexOptions
2875 struct ExtractRegexData {
2876   // Use unique_ptr<> because RE2 is non-movable
2877   std::unique_ptr<RE2> regex;
2878   std::vector<std::string> group_names;
2879
2880   static Result<ExtractRegexData> Make(const ExtractRegexOptions& options) {
2881     ExtractRegexData data(options.pattern);
2882     RETURN_NOT_OK(RegexStatus(*data.regex));
2883
2884     const int group_count = data.regex->NumberOfCapturingGroups();
2885     const auto& name_map = data.regex->CapturingGroupNames();
2886     data.group_names.reserve(group_count);
2887
2888     for (int i = 0; i < group_count; i++) {
2889       auto item = name_map.find(i + 1);  // re2 starts counting from 1
2890       if (item == name_map.end()) {
2891         // XXX should we instead just create fields with an empty name?
2892         return Status::Invalid("Regular expression contains unnamed groups");
2893       }
2894       data.group_names.emplace_back(item->second);
2895     }
2896     return std::move(data);
2897   }
2898
2899   Result<ValueDescr> ResolveOutputType(const std::vector<ValueDescr>& args) const {
2900     const auto& input_type = args[0].type;
2901     if (input_type == nullptr) {
2902       // No input type specified => propagate shape
2903       return args[0];
2904     }
2905     // Input type is either String or LargeString and is also the type of each
2906     // field in the output struct type.
2907     DCHECK(input_type->id() == Type::STRING || input_type->id() == Type::LARGE_STRING);
2908     FieldVector fields;
2909     fields.reserve(group_names.size());
2910     std::transform(group_names.begin(), group_names.end(), std::back_inserter(fields),
2911                    [&](const std::string& name) { return field(name, input_type); });
2912     return struct_(std::move(fields));
2913   }
2914
2915  private:
2916   explicit ExtractRegexData(const std::string& pattern)
2917       : regex(new RE2(pattern, RE2::Quiet)) {}
2918 };
2919
2920 Result<ValueDescr> ResolveExtractRegexOutput(KernelContext* ctx,
2921                                              const std::vector<ValueDescr>& args) {
2922   using State = OptionsWrapper<ExtractRegexOptions>;
2923   ExtractRegexOptions options = State::Get(ctx);
2924   ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
2925   return data.ResolveOutputType(args);
2926 }
2927
2928 struct ExtractRegexBase {
2929   const ExtractRegexData& data;
2930   const int group_count;
2931   std::vector<re2::StringPiece> found_values;
2932   std::vector<re2::RE2::Arg> args;
2933   std::vector<const re2::RE2::Arg*> args_pointers;
2934   const re2::RE2::Arg** args_pointers_start;
2935   const re2::RE2::Arg* null_arg = nullptr;
2936
2937   explicit ExtractRegexBase(const ExtractRegexData& data)
2938       : data(data),
2939         group_count(static_cast<int>(data.group_names.size())),
2940         found_values(group_count) {
2941     args.reserve(group_count);
2942     args_pointers.reserve(group_count);
2943
2944     for (int i = 0; i < group_count; i++) {
2945       args.emplace_back(&found_values[i]);
2946       // Since we reserved capacity, we're guaranteed the pointer remains valid
2947       args_pointers.push_back(&args[i]);
2948     }
2949     // Avoid null pointer if there is no capture group
2950     args_pointers_start = (group_count > 0) ? args_pointers.data() : &null_arg;
2951   }
2952
2953   bool Match(util::string_view s) {
2954     return re2::RE2::PartialMatchN(ToStringPiece(s), *data.regex, args_pointers_start,
2955                                    group_count);
2956   }
2957 };
2958
2959 template <typename Type>
2960 struct ExtractRegex : public ExtractRegexBase {
2961   using ArrayType = typename TypeTraits<Type>::ArrayType;
2962   using ScalarType = typename TypeTraits<Type>::ScalarType;
2963   using BuilderType = typename TypeTraits<Type>::BuilderType;
2964   using State = OptionsWrapper<ExtractRegexOptions>;
2965
2966   using ExtractRegexBase::ExtractRegexBase;
2967
2968   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2969     ExtractRegexOptions options = State::Get(ctx);
2970     ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
2971     return ExtractRegex{data}.Extract(ctx, batch, out);
2972   }
2973
2974   Status Extract(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
2975     ARROW_ASSIGN_OR_RAISE(auto descr, data.ResolveOutputType(batch.GetDescriptors()));
2976     DCHECK_NE(descr.type, nullptr);
2977     const auto& type = descr.type;
2978
2979     if (batch[0].kind() == Datum::ARRAY) {
2980       std::unique_ptr<ArrayBuilder> array_builder;
2981       RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type, &array_builder));
2982       StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
2983
2984       std::vector<BuilderType*> field_builders;
2985       field_builders.reserve(group_count);
2986       for (int i = 0; i < group_count; i++) {
2987         field_builders.push_back(
2988             checked_cast<BuilderType*>(struct_builder->field_builder(i)));
2989       }
2990
2991       auto visit_null = [&]() { return struct_builder->AppendNull(); };
2992       auto visit_value = [&](util::string_view s) {
2993         if (Match(s)) {
2994           for (int i = 0; i < group_count; i++) {
2995             RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
2996           }
2997           return struct_builder->Append();
2998         } else {
2999           return struct_builder->AppendNull();
3000         }
3001       };
3002       const ArrayData& input = *batch[0].array();
3003       RETURN_NOT_OK(VisitArrayDataInline<Type>(input, visit_value, visit_null));
3004
3005       std::shared_ptr<Array> out_array;
3006       RETURN_NOT_OK(struct_builder->Finish(&out_array));
3007       *out = std::move(out_array);
3008     } else {
3009       const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
3010       auto result = std::make_shared<StructScalar>(type);
3011       if (input.is_valid && Match(util::string_view(*input.value))) {
3012         result->value.reserve(group_count);
3013         for (int i = 0; i < group_count; i++) {
3014           result->value.push_back(
3015               std::make_shared<ScalarType>(found_values[i].as_string()));
3016         }
3017         result->is_valid = true;
3018       } else {
3019         result->is_valid = false;
3020       }
3021       out->value = std::move(result);
3022     }
3023
3024     return Status::OK();
3025   }
3026 };
3027
3028 const FunctionDoc extract_regex_doc(
3029     "Extract substrings captured by a regex pattern",
3030     ("For each string in `strings`, match the regular expression and, if\n"
3031      "successful, emit a struct with field names and values coming from the\n"
3032      "regular expression's named capture groups. If the input is null or the\n"
3033      "regular expression fails matching, a null output value is emitted.\n"
3034      "\n"
3035      "Regular expression matching is done using the Google RE2 library."),
3036     {"strings"}, "ExtractRegexOptions");
3037
3038 void AddExtractRegex(FunctionRegistry* registry) {
3039   auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
3040                                                &extract_regex_doc);
3041   using t32 = ExtractRegex<StringType>;
3042   using t64 = ExtractRegex<LargeStringType>;
3043   OutputType out_ty(ResolveExtractRegexOutput);
3044   ScalarKernel kernel;
3045
3046   // Null values will be computed based on regex match or not
3047   kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
3048   kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
3049   kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
3050   kernel.exec = t32::Exec;
3051   kernel.init = t32::State::Init;
3052   DCHECK_OK(func->AddKernel(kernel));
3053   kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
3054   kernel.exec = t64::Exec;
3055   kernel.init = t64::State::Init;
3056   DCHECK_OK(func->AddKernel(kernel));
3057
3058   DCHECK_OK(registry->AddFunction(std::move(func)));
3059 }
3060 #endif  // ARROW_WITH_RE2
3061
3062 // ----------------------------------------------------------------------
3063 // strptime string parsing
3064
3065 using StrptimeState = OptionsWrapper<StrptimeOptions>;
3066
3067 struct ParseStrptime {
3068   explicit ParseStrptime(const StrptimeOptions& options)
3069       : parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {}
3070
3071   template <typename... Ignored>
3072   int64_t Call(KernelContext*, util::string_view val, Status* st) const {
3073     int64_t result = 0;
3074     if (!(*parser)(val.data(), val.size(), unit, &result)) {
3075       *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
3076                             TimestampType(unit).ToString());
3077     }
3078     return result;
3079   }
3080
3081   std::shared_ptr<TimestampParser> parser;
3082   TimeUnit::type unit;
3083 };
3084
3085 template <typename InputType>
3086 Status StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
3087   applicator::ScalarUnaryNotNullStateful<TimestampType, InputType, ParseStrptime> kernel{
3088       ParseStrptime(StrptimeState::Get(ctx))};
3089   return kernel.Exec(ctx, batch, out);
3090 }
3091
3092 Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDescr>&) {
3093   if (ctx->state()) {
3094     return ::arrow::timestamp(StrptimeState::Get(ctx).unit);
3095   }
3096
3097   return Status::Invalid("strptime does not provide default StrptimeOptions");
3098 }
3099
3100 // ----------------------------------------------------------------------
3101 // string padding
3102
3103 template <bool PadLeft, bool PadRight>
3104 struct AsciiPadTransform : public StringTransformBase {
3105   using State = OptionsWrapper<PadOptions>;
3106
3107   const PadOptions& options_;
3108
3109   explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}
3110
3111   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3112     if (options_.padding.size() != 1) {
3113       return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
3114     }
3115     return Status::OK();
3116   }
3117
3118   int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
3119     // This is likely very overallocated but hard to do better without
3120     // actually looking at each string (because of strings that may be
3121     // longer than the given width)
3122     return input_ncodeunits + ninputs * options_.width;
3123   }
3124
3125   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3126                     uint8_t* output) {
3127     if (input_string_ncodeunits >= options_.width) {
3128       std::copy(input, input + input_string_ncodeunits, output);
3129       return input_string_ncodeunits;
3130     }
3131     const int64_t spaces = options_.width - input_string_ncodeunits;
3132     int64_t left = 0;
3133     int64_t right = 0;
3134     if (PadLeft && PadRight) {
3135       // If odd number of spaces, put the extra space on the right
3136       left = spaces / 2;
3137       right = spaces - left;
3138     } else if (PadLeft) {
3139       left = spaces;
3140     } else if (PadRight) {
3141       right = spaces;
3142     } else {
3143       DCHECK(false) << "unreachable";
3144       return 0;
3145     }
3146     std::fill(output, output + left, options_.padding[0]);
3147     output += left;
3148     output = std::copy(input, input + input_string_ncodeunits, output);
3149     std::fill(output, output + right, options_.padding[0]);
3150     return options_.width;
3151   }
3152 };
3153
3154 template <bool PadLeft, bool PadRight>
3155 struct Utf8PadTransform : public StringTransformBase {
3156   using State = OptionsWrapper<PadOptions>;
3157
3158   const PadOptions& options_;
3159
3160   explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}
3161
3162   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3163     auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
3164     auto strlen = options_.padding.size();
3165     if (util::UTF8Length(str, str + strlen) != 1) {
3166       return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
3167                              "'");
3168     }
3169     return Status::OK();
3170   }
3171
3172   int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
3173     // This is likely very overallocated but hard to do better without
3174     // actually looking at each string (because of strings that may be
3175     // longer than the given width)
3176     // One codepoint may be up to 4 bytes
3177     return input_ncodeunits + 4 * ninputs * options_.width;
3178   }
3179
3180   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3181                     uint8_t* output) {
3182     const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
3183     if (input_width >= options_.width) {
3184       std::copy(input, input + input_string_ncodeunits, output);
3185       return input_string_ncodeunits;
3186     }
3187     const int64_t spaces = options_.width - input_width;
3188     int64_t left = 0;
3189     int64_t right = 0;
3190     if (PadLeft && PadRight) {
3191       // If odd number of spaces, put the extra space on the right
3192       left = spaces / 2;
3193       right = spaces - left;
3194     } else if (PadLeft) {
3195       left = spaces;
3196     } else if (PadRight) {
3197       right = spaces;
3198     } else {
3199       DCHECK(false) << "unreachable";
3200       return 0;
3201     }
3202     uint8_t* start = output;
3203     while (left) {
3204       output = std::copy(options_.padding.begin(), options_.padding.end(), output);
3205       left--;
3206     }
3207     output = std::copy(input, input + input_string_ncodeunits, output);
3208     while (right) {
3209       output = std::copy(options_.padding.begin(), options_.padding.end(), output);
3210       right--;
3211     }
3212     return output - start;
3213   }
3214 };
3215
3216 template <typename Type>
3217 using AsciiLPad = StringTransformExecWithState<Type, AsciiPadTransform<true, false>>;
3218 template <typename Type>
3219 using AsciiRPad = StringTransformExecWithState<Type, AsciiPadTransform<false, true>>;
3220 template <typename Type>
3221 using AsciiCenter = StringTransformExecWithState<Type, AsciiPadTransform<true, true>>;
3222 template <typename Type>
3223 using Utf8LPad = StringTransformExecWithState<Type, Utf8PadTransform<true, false>>;
3224 template <typename Type>
3225 using Utf8RPad = StringTransformExecWithState<Type, Utf8PadTransform<false, true>>;
3226 template <typename Type>
3227 using Utf8Center = StringTransformExecWithState<Type, Utf8PadTransform<true, true>>;
3228
3229 // ----------------------------------------------------------------------
3230 // string trimming
3231
3232 #ifdef ARROW_WITH_UTF8PROC
3233
3234 template <bool TrimLeft, bool TrimRight>
3235 struct UTF8TrimWhitespaceTransform : public StringTransformBase {
3236   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3237     EnsureLookupTablesFilled();
3238     return Status::OK();
3239   }
3240
3241   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3242                     uint8_t* output) {
3243     const uint8_t* begin = input;
3244     const uint8_t* end = input + input_string_ncodeunits;
3245     const uint8_t* end_trimmed = end;
3246     const uint8_t* begin_trimmed = begin;
3247
3248     auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
3249     if (TrimLeft && !ARROW_PREDICT_TRUE(
3250                         arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
3251       return kTransformError;
3252     }
3253     if (TrimRight && begin_trimmed < end) {
3254       if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
3255                                                              predicate, &end_trimmed))) {
3256         return kTransformError;
3257       }
3258     }
3259     std::copy(begin_trimmed, end_trimmed, output);
3260     return end_trimmed - begin_trimmed;
3261   }
3262 };
3263
3264 template <typename Type>
3265 using UTF8TrimWhitespace =
3266     StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, true>>;
3267
3268 template <typename Type>
3269 using UTF8LTrimWhitespace =
3270     StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, false>>;
3271
3272 template <typename Type>
3273 using UTF8RTrimWhitespace =
3274     StringTransformExec<Type, UTF8TrimWhitespaceTransform<false, true>>;
3275
3276 struct UTF8TrimState {
3277   TrimOptions options_;
3278   std::vector<bool> codepoints_;
3279   Status status_ = Status::OK();
3280
3281   explicit UTF8TrimState(KernelContext* ctx, TrimOptions options)
3282       : options_(std::move(options)) {
3283     if (!ARROW_PREDICT_TRUE(
3284             arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
3285               codepoints_.resize(
3286                   std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
3287               codepoints_.at(c) = true;
3288             }))) {
3289       status_ = Status::Invalid("Invalid UTF8 sequence in input");
3290     }
3291   }
3292 };
3293
3294 template <bool TrimLeft, bool TrimRight>
3295 struct UTF8TrimTransform : public StringTransformBase {
3296   using State = KernelStateFromFunctionOptions<UTF8TrimState, TrimOptions>;
3297
3298   const UTF8TrimState& state_;
3299
3300   explicit UTF8TrimTransform(const UTF8TrimState& state) : state_(state) {}
3301
3302   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
3303     return state_.status_;
3304   }
3305
3306   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3307                     uint8_t* output) {
3308     const uint8_t* begin = input;
3309     const uint8_t* end = input + input_string_ncodeunits;
3310     const uint8_t* end_trimmed = end;
3311     const uint8_t* begin_trimmed = begin;
3312     const auto& codepoints = state_.codepoints_;
3313
3314     auto predicate = [&](uint32_t c) { return c >= codepoints.size() || !codepoints[c]; };
3315     if (TrimLeft && !ARROW_PREDICT_TRUE(
3316                         arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
3317       return kTransformError;
3318     }
3319     if (TrimRight && begin_trimmed < end) {
3320       if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
3321                                                              predicate, &end_trimmed))) {
3322         return kTransformError;
3323       }
3324     }
3325     std::copy(begin_trimmed, end_trimmed, output);
3326     return end_trimmed - begin_trimmed;
3327   }
3328 };
3329
3330 template <typename Type>
3331 using UTF8Trim = StringTransformExecWithState<Type, UTF8TrimTransform<true, true>>;
3332
3333 template <typename Type>
3334 using UTF8LTrim = StringTransformExecWithState<Type, UTF8TrimTransform<true, false>>;
3335
3336 template <typename Type>
3337 using UTF8RTrim = StringTransformExecWithState<Type, UTF8TrimTransform<false, true>>;
3338
3339 #endif
3340
3341 template <bool TrimLeft, bool TrimRight>
3342 struct AsciiTrimWhitespaceTransform : public StringTransformBase {
3343   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3344                     uint8_t* output) {
3345     const uint8_t* begin = input;
3346     const uint8_t* end = input + input_string_ncodeunits;
3347     const uint8_t* end_trimmed = end;
3348     const uint8_t* begin_trimmed = begin;
3349
3350     auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
3351     if (TrimLeft) {
3352       begin_trimmed = std::find_if(begin, end, predicate);
3353     }
3354     if (TrimRight && begin_trimmed < end) {
3355       std::reverse_iterator<const uint8_t*> rbegin(end);
3356       std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
3357       end_trimmed = std::find_if(rbegin, rend, predicate).base();
3358     }
3359     std::copy(begin_trimmed, end_trimmed, output);
3360     return end_trimmed - begin_trimmed;
3361   }
3362 };
3363
3364 template <typename Type>
3365 using AsciiTrimWhitespace =
3366     StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, true>>;
3367
3368 template <typename Type>
3369 using AsciiLTrimWhitespace =
3370     StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, false>>;
3371
3372 template <typename Type>
3373 using AsciiRTrimWhitespace =
3374     StringTransformExec<Type, AsciiTrimWhitespaceTransform<false, true>>;
3375
3376 struct AsciiTrimState {
3377   TrimOptions options_;
3378   std::vector<bool> characters_;
3379
3380   explicit AsciiTrimState(KernelContext* ctx, TrimOptions options)
3381       : options_(std::move(options)), characters_(256) {
3382     for (const auto c : options_.characters) {
3383       characters_[static_cast<unsigned char>(c)] = true;
3384     }
3385   }
3386 };
3387
3388 template <bool TrimLeft, bool TrimRight>
3389 struct AsciiTrimTransform : public StringTransformBase {
3390   using State = KernelStateFromFunctionOptions<AsciiTrimState, TrimOptions>;
3391
3392   const AsciiTrimState& state_;
3393
3394   explicit AsciiTrimTransform(const AsciiTrimState& state) : state_(state) {}
3395
3396   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
3397                     uint8_t* output) {
3398     const uint8_t* begin = input;
3399     const uint8_t* end = input + input_string_ncodeunits;
3400     const uint8_t* end_trimmed = end;
3401     const uint8_t* begin_trimmed = begin;
3402     const auto& characters = state_.characters_;
3403
3404     auto predicate = [&](uint8_t c) { return !characters[c]; };
3405     if (TrimLeft) {
3406       begin_trimmed = std::find_if(begin, end, predicate);
3407     }
3408     if (TrimRight && begin_trimmed < end) {
3409       std::reverse_iterator<const uint8_t*> rbegin(end);
3410       std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
3411       end_trimmed = std::find_if(rbegin, rend, predicate).base();
3412     }
3413     std::copy(begin_trimmed, end_trimmed, output);
3414     return end_trimmed - begin_trimmed;
3415   }
3416 };
3417
3418 template <typename Type>
3419 using AsciiTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, true>>;
3420
3421 template <typename Type>
3422 using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, false>>;
3423
3424 template <typename Type>
3425 using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;
3426
3427 const FunctionDoc utf8_center_doc(
3428     "Center strings by padding with a given character",
3429     ("For each string in `strings`, emit a centered string by padding both sides \n"
3430      "with the given UTF8 codeunit.\nNull values emit null."),
3431     {"strings"}, "PadOptions");
3432
3433 const FunctionDoc utf8_lpad_doc(
3434     "Right-align strings by padding with a given character",
3435     ("For each string in `strings`, emit a right-aligned string by prepending \n"
3436      "the given UTF8 codeunit.\nNull values emit null."),
3437     {"strings"}, "PadOptions");
3438
3439 const FunctionDoc utf8_rpad_doc(
3440     "Left-align strings by padding with a given character",
3441     ("For each string in `strings`, emit a left-aligned string by appending \n"
3442      "the given UTF8 codeunit.\nNull values emit null."),
3443     {"strings"}, "PadOptions");
3444
3445 const FunctionDoc ascii_center_doc(
3446     utf8_center_doc.description + "",
3447     ("For each string in `strings`, emit a centered string by padding both sides \n"
3448      "with the given ASCII character.\nNull values emit null."),
3449     {"strings"}, "PadOptions");
3450
3451 const FunctionDoc ascii_lpad_doc(
3452     utf8_lpad_doc.description + "",
3453     ("For each string in `strings`, emit a right-aligned string by prepending \n"
3454      "the given ASCII character.\nNull values emit null."),
3455     {"strings"}, "PadOptions");
3456
3457 const FunctionDoc ascii_rpad_doc(
3458     utf8_rpad_doc.description + "",
3459     ("For each string in `strings`, emit a left-aligned string by appending \n"
3460      "the given ASCII character.\nNull values emit null."),
3461     {"strings"}, "PadOptions");
3462
3463 const FunctionDoc utf8_trim_whitespace_doc(
3464     "Trim leading and trailing whitespace characters",
3465     ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
3466      "characters removed, where whitespace characters are defined by the Unicode\n"
3467      "standard.  Null values emit null."),
3468     {"strings"});
3469
3470 const FunctionDoc utf8_ltrim_whitespace_doc(
3471     "Trim leading whitespace characters",
3472     ("For each string in `strings`, emit a string with leading whitespace\n"
3473      "characters removed, where whitespace characters are defined by the Unicode\n"
3474      "standard.  Null values emit null."),
3475     {"strings"});
3476
3477 const FunctionDoc utf8_rtrim_whitespace_doc(
3478     "Trim trailing whitespace characters",
3479     ("For each string in `strings`, emit a string with trailing whitespace\n"
3480      "characters removed, where whitespace characters are defined by the Unicode\n"
3481      "standard.  Null values emit null."),
3482     {"strings"});
3483
3484 const FunctionDoc ascii_trim_whitespace_doc(
3485     "Trim leading and trailing ASCII whitespace characters",
3486     ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
3487      "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
3488      "whitespace characters. Null values emit null."),
3489     {"strings"});
3490
3491 const FunctionDoc ascii_ltrim_whitespace_doc(
3492     "Trim leading ASCII whitespace characters",
3493     ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
3494      "characters removed.  Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
3495      "whitespace characters. Null values emit null."),
3496     {"strings"});
3497
3498 const FunctionDoc ascii_rtrim_whitespace_doc(
3499     "Trim trailing ASCII whitespace characters",
3500     ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
3501      "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
3502      "whitespace characters. Null values emit null."),
3503     {"strings"});
3504
3505 const FunctionDoc utf8_trim_doc(
3506     "Trim leading and trailing characters present in the `characters` arguments",
3507     ("For each string in `strings`, emit a string with leading and trailing\n"
3508      "characters removed that are present in the `characters` argument.  Null values\n"
3509      "emit null."),
3510     {"strings"}, "TrimOptions");
3511
3512 const FunctionDoc utf8_ltrim_doc(
3513     "Trim leading characters present in the `characters` arguments",
3514     ("For each string in `strings`, emit a string with leading\n"
3515      "characters removed that are present in the `characters` argument.  Null values\n"
3516      "emit null."),
3517     {"strings"}, "TrimOptions");
3518
3519 const FunctionDoc utf8_rtrim_doc(
3520     "Trim trailing characters present in the `characters` arguments",
3521     ("For each string in `strings`, emit a string with leading "
3522      "characters removed that are present in the `characters` argument.  Null values\n"
3523      "emit null."),
3524     {"strings"}, "TrimOptions");
3525
3526 const FunctionDoc ascii_trim_doc(
3527     utf8_trim_doc.summary + "",
3528     utf8_trim_doc.description +
3529         ("\nBoth the input string as the `characters` argument are interepreted as\n"
3530          "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3531     {"strings"}, "TrimOptions");
3532
3533 const FunctionDoc ascii_ltrim_doc(
3534     utf8_ltrim_doc.summary + "",
3535     utf8_ltrim_doc.description +
3536         ("\nBoth the input string as the `characters` argument are interepreted as\n"
3537          "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3538     {"strings"}, "TrimOptions");
3539
3540 const FunctionDoc ascii_rtrim_doc(
3541     utf8_rtrim_doc.summary + "",
3542     utf8_rtrim_doc.description +
3543         ("\nBoth the input string as the `characters` argument are interepreted as\n"
3544          "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
3545     {"strings"}, "TrimOptions");
3546
3547 const FunctionDoc strptime_doc(
3548     "Parse timestamps",
3549     ("For each string in `strings`, parse it as a timestamp.\n"
3550      "The timestamp unit and the expected string pattern must be given\n"
3551      "in StrptimeOptions.  Null inputs emit null.  If a non-null string\n"
3552      "fails parsing, an error is returned."),
3553     {"strings"}, "StrptimeOptions");
3554
3555 const FunctionDoc binary_length_doc(
3556     "Compute string lengths",
3557     ("For each string in `strings`, emit the number of bytes.  Null values emit null."),
3558     {"strings"});
3559
3560 const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
3561                                   ("For each string in `strings`, emit the number of "
3562                                    "UTF8 characters.  Null values emit null."),
3563                                   {"strings"});
3564
3565 void AddStrptime(FunctionRegistry* registry) {
3566   auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
3567   DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
3568                             StrptimeExec<StringType>, StrptimeState::Init));
3569   DCHECK_OK(func->AddKernel({large_utf8()}, OutputType(StrptimeResolve),
3570                             StrptimeExec<LargeStringType>, StrptimeState::Init));
3571   DCHECK_OK(registry->AddFunction(std::move(func)));
3572 }
3573
3574 void AddBinaryLength(FunctionRegistry* registry) {
3575   auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
3576                                                &binary_length_doc);
3577   ArrayKernelExec exec_offset_32 =
3578       applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
3579   ArrayKernelExec exec_offset_64 =
3580       applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, BinaryLength>::Exec;
3581   for (const auto& input_type : {binary(), utf8()}) {
3582     DCHECK_OK(func->AddKernel({input_type}, int32(), exec_offset_32));
3583   }
3584   for (const auto& input_type : {large_binary(), large_utf8()}) {
3585     DCHECK_OK(func->AddKernel({input_type}, int64(), exec_offset_64));
3586   }
3587   DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_BINARY)}, int32(),
3588                             BinaryLength::FixedSizeExec));
3589   DCHECK_OK(registry->AddFunction(std::move(func)));
3590 }
3591
3592 void AddUtf8Length(FunctionRegistry* registry) {
3593   auto func =
3594       std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
3595
3596   ArrayKernelExec exec_offset_32 =
3597       applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
3598   DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
3599
3600   ArrayKernelExec exec_offset_64 =
3601       applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
3602   DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
3603
3604   DCHECK_OK(registry->AddFunction(std::move(func)));
3605 }
3606
3607 template <typename BinaryType, typename ListType>
3608 struct BinaryJoin {
3609   using ArrayType = typename TypeTraits<BinaryType>::ArrayType;
3610   using ListArrayType = typename TypeTraits<ListType>::ArrayType;
3611   using ListScalarType = typename TypeTraits<ListType>::ScalarType;
3612   using ListOffsetType = typename ListArrayType::offset_type;
3613   using BuilderType = typename TypeTraits<BinaryType>::BuilderType;
3614
3615   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
3616     if (batch[0].kind() == Datum::SCALAR) {
3617       if (batch[1].kind() == Datum::SCALAR) {
3618         return ExecScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
3619       }
3620       DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
3621       return ExecScalarArray(ctx, *batch[0].scalar(), batch[1].array(), out);
3622     }
3623     DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
3624     if (batch[1].kind() == Datum::SCALAR) {
3625       return ExecArrayScalar(ctx, batch[0].array(), *batch[1].scalar(), out);
3626     }
3627     DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
3628     return ExecArrayArray(ctx, batch[0].array(), batch[1].array(), out);
3629   }
3630
3631   struct ListScalarOffsetLookup {
3632     const ArrayType& values;
3633
3634     int64_t GetStart(int64_t i) { return 0; }
3635     int64_t GetStop(int64_t i) { return values.length(); }
3636     bool IsNull(int64_t i) { return false; }
3637   };
3638
3639   struct ListArrayOffsetLookup {
3640     explicit ListArrayOffsetLookup(const ListArrayType& lists)
3641         : lists_(lists), offsets_(lists.raw_value_offsets()) {}
3642
3643     int64_t GetStart(int64_t i) { return offsets_[i]; }
3644     int64_t GetStop(int64_t i) { return offsets_[i + 1]; }
3645     bool IsNull(int64_t i) { return lists_.IsNull(i); }
3646
3647    private:
3648     const ListArrayType& lists_;
3649     const ListOffsetType* offsets_;
3650   };
3651
3652   struct SeparatorScalarLookup {
3653     const util::string_view separator;
3654
3655     bool IsNull(int64_t i) { return false; }
3656     util::string_view GetView(int64_t i) { return separator; }
3657   };
3658
3659   struct SeparatorArrayLookup {
3660     const ArrayType& separators;
3661
3662     bool IsNull(int64_t i) { return separators.IsNull(i); }
3663     util::string_view GetView(int64_t i) { return separators.GetView(i); }
3664   };
3665
3666   // Scalar, scalar -> scalar
3667   static Status ExecScalarScalar(KernelContext* ctx, const Scalar& left,
3668                                  const Scalar& right, Datum* out) {
3669     const auto& list = checked_cast<const ListScalarType&>(left);
3670     const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
3671     if (!list.is_valid || !separator_scalar.is_valid) {
3672       return Status::OK();
3673     }
3674     util::string_view separator(*separator_scalar.value);
3675
3676     const auto& strings = checked_cast<const ArrayType&>(*list.value);
3677     if (strings.null_count() > 0) {
3678       out->scalar()->is_valid = false;
3679       return Status::OK();
3680     }
3681
3682     TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
3683     auto Append = [&](util::string_view value) {
3684       return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
3685                             static_cast<int64_t>(value.size()));
3686     };
3687     if (strings.length() > 0) {
3688       auto data_length =
3689           strings.total_values_length() + (strings.length() - 1) * separator.length();
3690       RETURN_NOT_OK(builder.Reserve(data_length));
3691       RETURN_NOT_OK(Append(strings.GetView(0)));
3692       for (int64_t j = 1; j < strings.length(); j++) {
3693         RETURN_NOT_OK(Append(separator));
3694         RETURN_NOT_OK(Append(strings.GetView(j)));
3695       }
3696     }
3697     auto out_scalar = checked_cast<BaseBinaryScalar*>(out->scalar().get());
3698     return builder.Finish(&out_scalar->value);
3699   }
3700
3701   // Scalar, array -> array
3702   static Status ExecScalarArray(KernelContext* ctx, const Scalar& left,
3703                                 const std::shared_ptr<ArrayData>& right, Datum* out) {
3704     const auto& list_scalar = checked_cast<const BaseListScalar&>(left);
3705     if (!list_scalar.is_valid) {
3706       ARROW_ASSIGN_OR_RAISE(
3707           auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
3708       *out = *nulls->data();
3709       return Status::OK();
3710     }
3711     const auto& strings = checked_cast<const ArrayType&>(*list_scalar.value);
3712     if (strings.null_count() != 0) {
3713       ARROW_ASSIGN_OR_RAISE(
3714           auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
3715       *out = *nulls->data();
3716       return Status::OK();
3717     }
3718     const ArrayType separators(right);
3719
3720     BuilderType builder(ctx->memory_pool());
3721     RETURN_NOT_OK(builder.Reserve(separators.length()));
3722
3723     // Presize data to avoid multiple reallocations when joining strings
3724     int64_t total_data_length = 0;
3725     const int64_t list_length = strings.length();
3726     if (list_length) {
3727       const int64_t string_length = strings.total_values_length();
3728       total_data_length +=
3729           string_length * (separators.length() - separators.null_count());
3730       for (int64_t i = 0; i < separators.length(); ++i) {
3731         if (separators.IsNull(i)) {
3732           continue;
3733         }
3734         total_data_length += (list_length - 1) * separators.value_length(i);
3735       }
3736     }
3737     RETURN_NOT_OK(builder.ReserveData(total_data_length));
3738
3739     return JoinStrings(separators.length(), strings, ListScalarOffsetLookup{strings},
3740                        SeparatorArrayLookup{separators}, &builder, out);
3741   }
3742
3743   // Array, scalar -> array
3744   static Status ExecArrayScalar(KernelContext* ctx,
3745                                 const std::shared_ptr<ArrayData>& left,
3746                                 const Scalar& right, Datum* out) {
3747     const ListArrayType lists(left);
3748     const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
3749
3750     if (!separator_scalar.is_valid) {
3751       ARROW_ASSIGN_OR_RAISE(
3752           auto nulls,
3753           MakeArrayOfNull(lists.value_type(), lists.length(), ctx->memory_pool()));
3754       *out = *nulls->data();
3755       return Status::OK();
3756     }
3757
3758     util::string_view separator(*separator_scalar.value);
3759     const auto& strings = checked_cast<const ArrayType&>(*lists.values());
3760     const auto list_offsets = lists.raw_value_offsets();
3761
3762     BuilderType builder(ctx->memory_pool());
3763     RETURN_NOT_OK(builder.Reserve(lists.length()));
3764
3765     // Presize data to avoid multiple reallocations when joining strings
3766     int64_t total_data_length = strings.total_values_length();
3767     for (int64_t i = 0; i < lists.length(); ++i) {
3768       const auto start = list_offsets[i], end = list_offsets[i + 1];
3769       if (end > start && !ValuesContainNull(strings, start, end)) {
3770         total_data_length += (end - start - 1) * separator.length();
3771       }
3772     }
3773     RETURN_NOT_OK(builder.ReserveData(total_data_length));
3774
3775     return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
3776                        SeparatorScalarLookup{separator}, &builder, out);
3777   }
3778
3779   // Array, array -> array
3780   static Status ExecArrayArray(KernelContext* ctx, const std::shared_ptr<ArrayData>& left,
3781                                const std::shared_ptr<ArrayData>& right, Datum* out) {
3782     const ListArrayType lists(left);
3783     const auto& strings = checked_cast<const ArrayType&>(*lists.values());
3784     const auto list_offsets = lists.raw_value_offsets();
3785     const auto string_offsets = strings.raw_value_offsets();
3786     const ArrayType separators(right);
3787
3788     BuilderType builder(ctx->memory_pool());
3789     RETURN_NOT_OK(builder.Reserve(lists.length()));
3790
3791     // Presize data to avoid multiple reallocations when joining strings
3792     int64_t total_data_length = 0;
3793     for (int64_t i = 0; i < lists.length(); ++i) {
3794       if (separators.IsNull(i)) {
3795         continue;
3796       }
3797       const auto start = list_offsets[i], end = list_offsets[i + 1];
3798       if (end > start && !ValuesContainNull(strings, start, end)) {
3799         total_data_length += string_offsets[end] - string_offsets[start];
3800         total_data_length += (end - start - 1) * separators.value_length(i);
3801       }
3802     }
3803     RETURN_NOT_OK(builder.ReserveData(total_data_length));
3804
3805     struct SeparatorLookup {
3806       const ArrayType& separators;
3807
3808       bool IsNull(int64_t i) { return separators.IsNull(i); }
3809       util::string_view GetView(int64_t i) { return separators.GetView(i); }
3810     };
3811     return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
3812                        SeparatorArrayLookup{separators}, &builder, out);
3813   }
3814
3815   template <typename ListOffsetLookup, typename SeparatorLookup>
3816   static Status JoinStrings(int64_t length, const ArrayType& strings,
3817                             ListOffsetLookup&& list_offsets, SeparatorLookup&& separators,
3818                             BuilderType* builder, Datum* out) {
3819     for (int64_t i = 0; i < length; ++i) {
3820       if (list_offsets.IsNull(i) || separators.IsNull(i)) {
3821         builder->UnsafeAppendNull();
3822         continue;
3823       }
3824       const auto j_start = list_offsets.GetStart(i), j_end = list_offsets.GetStop(i);
3825       if (j_start == j_end) {
3826         builder->UnsafeAppendEmptyValue();
3827         continue;
3828       }
3829       if (ValuesContainNull(strings, j_start, j_end)) {
3830         builder->UnsafeAppendNull();
3831         continue;
3832       }
3833       builder->UnsafeAppend(strings.GetView(j_start));
3834       for (int64_t j = j_start + 1; j < j_end; ++j) {
3835         builder->UnsafeExtendCurrent(separators.GetView(i));
3836         builder->UnsafeExtendCurrent(strings.GetView(j));
3837       }
3838     }
3839
3840     std::shared_ptr<Array> string_array;
3841     RETURN_NOT_OK(builder->Finish(&string_array));
3842     *out = *string_array->data();
3843     // Correct the output type based on the input
3844     out->mutable_array()->type = strings.type();
3845     return Status::OK();
3846   }
3847
3848   static bool ValuesContainNull(const ArrayType& values, int64_t start, int64_t end) {
3849     if (values.null_count() == 0) {
3850       return false;
3851     }
3852     for (int64_t i = start; i < end; ++i) {
3853       if (values.IsNull(i)) {
3854         return true;
3855       }
3856     }
3857     return false;
3858   }
3859 };
3860
3861 using BinaryJoinElementWiseState = OptionsWrapper<JoinOptions>;
3862
3863 template <typename Type>
3864 struct BinaryJoinElementWise {
3865   using ArrayType = typename TypeTraits<Type>::ArrayType;
3866   using BuilderType = typename TypeTraits<Type>::BuilderType;
3867   using offset_type = typename Type::offset_type;
3868
3869   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
3870     JoinOptions options = BinaryJoinElementWiseState::Get(ctx);
3871     // Last argument is the separator (for consistency with binary_join)
3872     if (std::all_of(batch.values.begin(), batch.values.end(),
3873                     [](const Datum& d) { return d.is_scalar(); })) {
3874       return ExecOnlyScalar(ctx, options, batch, out);
3875     }
3876     return ExecContainingArrays(ctx, options, batch, out);
3877   }
3878
3879   static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options,
3880                                const ExecBatch& batch, Datum* out) {
3881     BaseBinaryScalar* output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
3882     const size_t num_args = batch.values.size();
3883     if (num_args == 1) {
3884       // Only separator, no values
3885       output->is_valid = batch.values[0].scalar()->is_valid;
3886       if (output->is_valid) {
3887         ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
3888       }
3889       return Status::OK();
3890     }
3891
3892     int64_t final_size = CalculateRowSize(options, batch, 0);
3893     if (final_size < 0) {
3894       output->is_valid = false;
3895       return Status::OK();
3896     }
3897     ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
3898     const auto separator = UnboxScalar<Type>::Unbox(*batch.values.back().scalar());
3899     uint8_t* buf = output->value->mutable_data();
3900     bool first = true;
3901     for (size_t i = 0; i < num_args - 1; i++) {
3902       const Scalar& scalar = *batch[i].scalar();
3903       util::string_view s;
3904       if (scalar.is_valid) {
3905         s = UnboxScalar<Type>::Unbox(scalar);
3906       } else {
3907         switch (options.null_handling) {
3908           case JoinOptions::EMIT_NULL:
3909             // Handled by CalculateRowSize
3910             DCHECK(false) << "unreachable";
3911             break;
3912           case JoinOptions::SKIP:
3913             continue;
3914           case JoinOptions::REPLACE:
3915             s = options.null_replacement;
3916             break;
3917         }
3918       }
3919       if (!first) {
3920         buf = std::copy(separator.begin(), separator.end(), buf);
3921       }
3922       first = false;
3923       buf = std::copy(s.begin(), s.end(), buf);
3924     }
3925     output->is_valid = true;
3926     DCHECK_EQ(final_size, buf - output->value->mutable_data());
3927     return Status::OK();
3928   }
3929
3930   static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options,
3931                                      const ExecBatch& batch, Datum* out) {
3932     // Presize data to avoid reallocations
3933     int64_t final_size = 0;
3934     for (int64_t i = 0; i < batch.length; i++) {
3935       auto size = CalculateRowSize(options, batch, i);
3936       if (size > 0) final_size += size;
3937     }
3938     BuilderType builder(ctx->memory_pool());
3939     RETURN_NOT_OK(builder.Reserve(batch.length));
3940     RETURN_NOT_OK(builder.ReserveData(final_size));
3941
3942     std::vector<util::string_view> valid_cols(batch.values.size());
3943     for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
3944       size_t num_valid = 0;  // Not counting separator
3945       for (size_t col = 0; col < batch.values.size(); col++) {
3946         if (batch[col].is_scalar()) {
3947           const auto& scalar = *batch[col].scalar();
3948           if (scalar.is_valid) {
3949             valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
3950             if (col < batch.values.size() - 1) num_valid++;
3951           } else {
3952             valid_cols[col] = util::string_view();
3953           }
3954         } else {
3955           const ArrayData& array = *batch[col].array();
3956           if (!array.MayHaveNulls() ||
3957               BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) {
3958             const offset_type* offsets = array.GetValues<offset_type>(1);
3959             const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0);
3960             const int64_t length = offsets[row + 1] - offsets[row];
3961             valid_cols[col] = util::string_view(
3962                 reinterpret_cast<const char*>(data + offsets[row]), length);
3963             if (col < batch.values.size() - 1) num_valid++;
3964           } else {
3965             valid_cols[col] = util::string_view();
3966           }
3967         }
3968       }
3969
3970       if (!valid_cols.back().data()) {
3971         // Separator is null
3972         builder.UnsafeAppendNull();
3973         continue;
3974       } else if (batch.values.size() == 1) {
3975         // Only given separator
3976         builder.UnsafeAppendEmptyValue();
3977         continue;
3978       } else if (num_valid < batch.values.size() - 1) {
3979         // We had some nulls
3980         if (options.null_handling == JoinOptions::EMIT_NULL) {
3981           builder.UnsafeAppendNull();
3982           continue;
3983         }
3984       }
3985       const auto separator = valid_cols.back();
3986       bool first = true;
3987       for (size_t col = 0; col < batch.values.size() - 1; col++) {
3988         util::string_view value = valid_cols[col];
3989         if (!value.data()) {
3990           switch (options.null_handling) {
3991             case JoinOptions::EMIT_NULL:
3992               DCHECK(false) << "unreachable";
3993               break;
3994             case JoinOptions::SKIP:
3995               continue;
3996             case JoinOptions::REPLACE:
3997               value = options.null_replacement;
3998               break;
3999           }
4000         }
4001         if (first) {
4002           builder.UnsafeAppend(value);
4003           first = false;
4004           continue;
4005         }
4006         builder.UnsafeExtendCurrent(separator);
4007         builder.UnsafeExtendCurrent(value);
4008       }
4009     }
4010
4011     std::shared_ptr<Array> string_array;
4012     RETURN_NOT_OK(builder.Finish(&string_array));
4013     *out = *string_array->data();
4014     out->mutable_array()->type = batch[0].type();
4015     DCHECK_EQ(batch.length, out->array()->length);
4016     DCHECK_EQ(final_size,
4017               checked_cast<const ArrayType&>(*string_array).total_values_length());
4018     return Status::OK();
4019   }
4020
4021   // Compute the length of the output for the given position, or -1 if it would be null.
4022   static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch,
4023                                   const int64_t index) {
4024     const auto num_args = batch.values.size();
4025     int64_t final_size = 0;
4026     int64_t num_non_null_args = 0;
4027     for (size_t i = 0; i < num_args; i++) {
4028       int64_t element_size = 0;
4029       bool valid = true;
4030       if (batch[i].is_scalar()) {
4031         const Scalar& scalar = *batch[i].scalar();
4032         valid = scalar.is_valid;
4033         element_size = UnboxScalar<Type>::Unbox(scalar).size();
4034       } else {
4035         const ArrayData& array = *batch[i].array();
4036         valid = !array.MayHaveNulls() ||
4037                 BitUtil::GetBit(array.buffers[0]->data(), array.offset + index);
4038         const offset_type* offsets = array.GetValues<offset_type>(1);
4039         element_size = offsets[index + 1] - offsets[index];
4040       }
4041       if (i == num_args - 1) {
4042         if (!valid) return -1;
4043         if (num_non_null_args > 1) {
4044           // Add separator size (only if there were values to join)
4045           final_size += (num_non_null_args - 1) * element_size;
4046         }
4047         break;
4048       }
4049       if (!valid) {
4050         switch (options.null_handling) {
4051           case JoinOptions::EMIT_NULL:
4052             return -1;
4053           case JoinOptions::SKIP:
4054             continue;
4055           case JoinOptions::REPLACE:
4056             element_size = options.null_replacement.size();
4057             break;
4058         }
4059       }
4060       num_non_null_args++;
4061       final_size += element_size;
4062     }
4063     return final_size;
4064   }
4065 };
4066
4067 const FunctionDoc binary_join_doc(
4068     "Join a list of strings together with a `separator` to form a single string",
4069     ("Insert `separator` between `list` elements, and concatenate them.\n"
4070      "Any null input and any null `list` element emits a null output.\n"),
4071     {"list", "separator"});
4072
4073 const FunctionDoc binary_join_element_wise_doc(
4074     "Join string arguments into one, using the last argument as the separator",
4075     ("Insert the last argument of `strings` between the rest of the elements, "
4076      "and concatenate them.\n"
4077      "Any null separator element emits a null output. Null elements either "
4078      "emit a null (the default), are skipped, or replaced with a given string.\n"),
4079     {"*strings"}, "JoinOptions");
4080
4081 const auto kDefaultJoinOptions = JoinOptions::Defaults();
4082
4083 template <typename ListType>
4084 void AddBinaryJoinForListType(ScalarFunction* func) {
4085   for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
4086     auto exec = GenerateTypeAgnosticVarBinaryBase<BinaryJoin, ListType>(*ty);
4087     auto list_ty = std::make_shared<ListType>(ty);
4088     DCHECK_OK(func->AddKernel({InputType(list_ty), InputType(ty)}, ty, exec));
4089   }
4090 }
4091
4092 void AddBinaryJoin(FunctionRegistry* registry) {
4093   {
4094     auto func = std::make_shared<ScalarFunction>("binary_join", Arity::Binary(),
4095                                                  &binary_join_doc);
4096     AddBinaryJoinForListType<ListType>(func.get());
4097     AddBinaryJoinForListType<LargeListType>(func.get());
4098     DCHECK_OK(registry->AddFunction(std::move(func)));
4099   }
4100   {
4101     auto func = std::make_shared<ScalarFunction>(
4102         "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
4103         &binary_join_element_wise_doc, &kDefaultJoinOptions);
4104     for (const auto& ty : BaseBinaryTypes()) {
4105       ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
4106                           GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
4107                           BinaryJoinElementWiseState::Init};
4108       kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
4109       kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
4110       DCHECK_OK(func->AddKernel(std::move(kernel)));
4111     }
4112     DCHECK_OK(registry->AddFunction(std::move(func)));
4113   }
4114 }
4115
4116 template <template <typename> class ExecFunctor>
4117 void MakeUnaryStringBatchKernel(
4118     std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
4119     MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
4120   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4121   {
4122     auto exec_32 = ExecFunctor<StringType>::Exec;
4123     ScalarKernel kernel{{utf8()}, utf8(), exec_32};
4124     kernel.mem_allocation = mem_allocation;
4125     DCHECK_OK(func->AddKernel(std::move(kernel)));
4126   }
4127   {
4128     auto exec_64 = ExecFunctor<LargeStringType>::Exec;
4129     ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
4130     kernel.mem_allocation = mem_allocation;
4131     DCHECK_OK(func->AddKernel(std::move(kernel)));
4132   }
4133   DCHECK_OK(registry->AddFunction(std::move(func)));
4134 }
4135
4136 template <template <typename> class ExecFunctor>
4137 void MakeUnaryStringBatchKernelWithState(
4138     std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
4139     MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
4140   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4141   {
4142     using t32 = ExecFunctor<StringType>;
4143     ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
4144     kernel.mem_allocation = mem_allocation;
4145     DCHECK_OK(func->AddKernel(std::move(kernel)));
4146   }
4147   {
4148     using t64 = ExecFunctor<LargeStringType>;
4149     ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
4150     kernel.mem_allocation = mem_allocation;
4151     DCHECK_OK(func->AddKernel(std::move(kernel)));
4152   }
4153   DCHECK_OK(registry->AddFunction(std::move(func)));
4154 }
4155
4156 #ifdef ARROW_WITH_UTF8PROC
4157
4158 template <template <typename> class Transformer>
4159 void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
4160                                         const FunctionDoc* doc) {
4161   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4162   ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
4163   ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
4164   DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
4165   DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_64));
4166   DCHECK_OK(registry->AddFunction(std::move(func)));
4167 }
4168
4169 #endif
4170
4171 // NOTE: Predicate should only populate 'status' with errors,
4172 //       leave it unmodified to indicate Status::OK()
4173 using StringPredicate =
4174     std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
4175
4176 template <typename Type>
4177 Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
4178                       StringPredicate predicate, Datum* out) {
4179   Status st = Status::OK();
4180   EnsureLookupTablesFilled();
4181   if (batch[0].kind() == Datum::ARRAY) {
4182     const ArrayData& input = *batch[0].array();
4183     ArrayIterator<Type> input_it(input);
4184     ArrayData* out_arr = out->mutable_array();
4185     ::arrow::internal::GenerateBitsUnrolled(
4186         out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
4187         [&]() -> bool {
4188           util::string_view val = input_it();
4189           return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size(),
4190                            &st);
4191         });
4192   } else {
4193     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
4194     if (input.is_valid) {
4195       bool boolean_result = predicate(ctx, input.value->data(),
4196                                       static_cast<size_t>(input.value->size()), &st);
4197       // UTF decoding can lead to issues
4198       if (st.ok()) {
4199         out->value = std::make_shared<BooleanScalar>(boolean_result);
4200       }
4201     }
4202   }
4203   return st;
4204 }
4205
4206 template <typename Predicate>
4207 void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
4208                              const FunctionDoc* doc) {
4209   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
4210   auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
4211     return ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
4212   };
4213   auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
4214     return ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
4215   };
4216   DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32)));
4217   DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64)));
4218   DCHECK_OK(registry->AddFunction(std::move(func)));
4219 }
4220
4221 FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
4222   return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
4223 }
4224
4225 FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
4226                               bool non_empty) {
4227   std::string summary, description;
4228   {
4229     std::stringstream ss;
4230     ss << "Classify strings as " << class_summary;
4231     summary = ss.str();
4232   }
4233   {
4234     std::stringstream ss;
4235     if (non_empty) {
4236       ss
4237           << ("For each string in `strings`, emit true iff the string is non-empty\n"
4238               "and consists only of ");
4239     } else {
4240       ss
4241           << ("For each string in `strings`, emit true iff the string consists only\n"
4242               "of ");
4243     }
4244     ss << class_desc << ".  Null strings emit null.";
4245     description = ss.str();
4246   }
4247   return StringPredicateDoc(std::move(summary), std::move(description));
4248 }
4249
4250 const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);
4251
4252 const auto ascii_is_alnum_doc =
4253     StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
4254 const auto ascii_is_alpha_doc =
4255     StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
4256 const auto ascii_is_decimal_doc =
4257     StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
4258 const auto ascii_is_lower_doc =
4259     StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
4260 const auto ascii_is_printable_doc =
4261     StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
4262 const auto ascii_is_space_doc =
4263     StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
4264 const auto ascii_is_upper_doc =
4265     StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
4266
4267 const auto ascii_is_title_doc = StringPredicateDoc(
4268     "Classify strings as ASCII titlecase",
4269     ("For each string in `strings`, emit true iff the string is title-cased,\n"
4270      "i.e. it has at least one cased character, each uppercase character\n"
4271      "follows an uncased character, and each lowercase character follows\n"
4272      "an uppercase character.\n"));
4273
4274 const auto utf8_is_alnum_doc =
4275     StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
4276 const auto utf8_is_alpha_doc =
4277     StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
4278 const auto utf8_is_decimal_doc =
4279     StringClassifyDoc("decimal", "decimal Unicode characters", true);
4280 const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
4281 const auto utf8_is_lower_doc =
4282     StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
4283 const auto utf8_is_numeric_doc =
4284     StringClassifyDoc("numeric", "numeric Unicode characters", true);
4285 const auto utf8_is_printable_doc =
4286     StringClassifyDoc("printable", "printable Unicode characters", true);
4287 const auto utf8_is_space_doc =
4288     StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
4289 const auto utf8_is_upper_doc =
4290     StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
4291
4292 const auto utf8_is_title_doc = StringPredicateDoc(
4293     "Classify strings as titlecase",
4294     ("For each string in `strings`, emit true iff the string is title-cased,\n"
4295      "i.e. it has at least one cased character, each uppercase character\n"
4296      "follows an uncased character, and each lowercase character follows\n"
4297      "an uppercase character.\n"));
4298
4299 const FunctionDoc ascii_upper_doc(
4300     "Transform ASCII input to uppercase",
4301     ("For each string in `strings`, return an uppercase version.\n\n"
4302      "This function assumes the input is fully ASCII.  It it may contain\n"
4303      "non-ASCII characters, use \"utf8_upper\" instead."),
4304     {"strings"});
4305
4306 const FunctionDoc ascii_lower_doc(
4307     "Transform ASCII input to lowercase",
4308     ("For each string in `strings`, return a lowercase version.\n\n"
4309      "This function assumes the input is fully ASCII.  If it may contain\n"
4310      "non-ASCII characters, use \"utf8_lower\" instead."),
4311     {"strings"});
4312
4313 const FunctionDoc ascii_swapcase_doc(
4314     "Transform ASCII input lowercase characters to uppercase and uppercase characters to "
4315     "lowercase",
4316     ("For each string in `strings`, return a string with opposite casing.\n\n"
4317      "This function assumes the input is fully ASCII.  If it may contain\n"
4318      "non-ASCII characters, use \"utf8_swapcase\" instead."),
4319     {"strings"});
4320
4321 const FunctionDoc ascii_capitalize_doc(
4322     "Capitalize the first character of ASCII input",
4323     ("For each string in `strings`, return a capitalized version.\n\n"
4324      "This function assumes the input is fully ASCII.  If it may contain\n"
4325      "non-ASCII characters, use \"utf8_capitalize\" instead."),
4326     {"strings"});
4327
4328 const FunctionDoc ascii_title_doc(
4329     "Titlecase each word of ASCII input",
4330     ("For each string in `strings`, return a titlecased version.\n"
4331      "Each word in the output will start with an uppercase character and its\n"
4332      "remaining characters will be lowercase.\n\n"
4333      "This function assumes the input is fully ASCII.  If it may contain\n"
4334      "non-ASCII characters, use \"utf8_title\" instead."),
4335     {"strings"});
4336
4337 const FunctionDoc ascii_reverse_doc(
4338     "Reverse ASCII input",
4339     ("For each ASCII string in `strings`, return a reversed version.\n\n"
4340      "This function assumes the input is fully ASCII.  If it may contain\n"
4341      "non-ASCII characters, use \"utf8_reverse\" instead."),
4342     {"strings"});
4343
4344 const FunctionDoc utf8_upper_doc(
4345     "Transform input to uppercase",
4346     ("For each string in `strings`, return an uppercase version."), {"strings"});
4347
4348 const FunctionDoc utf8_lower_doc(
4349     "Transform input to lowercase",
4350     ("For each string in `strings`, return a lowercase version."), {"strings"});
4351
4352 const FunctionDoc utf8_swapcase_doc(
4353     "Transform input lowercase characters to uppercase and uppercase characters to "
4354     "lowercase",
4355     ("For each string in `strings`, return an opposite case version."), {"strings"});
4356
4357 const FunctionDoc utf8_capitalize_doc(
4358     "Capitalize the first character of input",
4359     ("For each string in `strings`, return a capitalized version,\n"
4360      "with the first character uppercased and the others lowercased."),
4361     {"strings"});
4362
4363 const FunctionDoc utf8_title_doc(
4364     "Titlecase each word of input",
4365     ("For each string in `strings`, return a titlecased version.\n"
4366      "Each word in the output will start with an uppercase character and its\n"
4367      "remaining characters will be lowercase."),
4368     {"strings"});
4369
4370 const FunctionDoc utf8_reverse_doc(
4371     "Reverse input",
4372     ("For each string in `strings`, return a reversed version.\n\n"
4373      "This function operates on Unicode codepoints, not grapheme\n"
4374      "clusters. Hence, it will not correctly reverse grapheme clusters\n"
4375      "composed of multiple codepoints."),
4376     {"strings"});
4377
4378 }  // namespace
4379
4380 void RegisterScalarStringAscii(FunctionRegistry* registry) {
4381   // Some kernels are able to reuse the original offsets buffer, so don't
4382   // preallocate them in the output. Only kernels that invoke
4383   // "StringDataTransform" support no preallocation.
4384   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
4385                                          MemAllocation::NO_PREALLOCATE);
4386   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
4387                                          MemAllocation::NO_PREALLOCATE);
4388   MakeUnaryStringBatchKernel<AsciiSwapCase>(
4389       "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE);
4390   MakeUnaryStringBatchKernel<AsciiCapitalize>("ascii_capitalize", registry,
4391                                               &ascii_capitalize_doc);
4392   MakeUnaryStringBatchKernel<AsciiTitle>("ascii_title", registry, &ascii_title_doc);
4393   MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
4394                                                   &ascii_trim_whitespace_doc);
4395   MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
4396                                                    &ascii_ltrim_whitespace_doc);
4397   MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
4398                                                    &ascii_rtrim_whitespace_doc);
4399   MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
4400   MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
4401
4402   MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
4403                                                    &ascii_center_doc);
4404   MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
4405   MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
4406   MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
4407                                                   &utf8_center_doc);
4408   MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
4409   MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);
4410
4411   MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
4412   MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
4413                                                   &ascii_ltrim_doc);
4414   MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
4415                                                   &ascii_rtrim_doc);
4416
4417   AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);
4418
4419   AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
4420                                                &ascii_is_alnum_doc);
4421   AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
4422   AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
4423                                           &ascii_is_decimal_doc);
4424   // no is_digit for ascii, since it is the same as is_decimal
4425   AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
4426   // no is_numeric for ascii, since it is the same as is_decimal
4427   AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
4428                                             &ascii_is_printable_doc);
4429   AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
4430   AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
4431   AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);
4432
4433 #ifdef ARROW_WITH_UTF8PROC
4434   MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
4435   MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
4436   MakeUnaryStringUTF8TransformKernel<UTF8SwapCase>("utf8_swapcase", registry,
4437                                                    &utf8_swapcase_doc);
4438   MakeUnaryStringBatchKernel<Utf8Capitalize>("utf8_capitalize", registry,
4439                                              &utf8_capitalize_doc);
4440   MakeUnaryStringBatchKernel<Utf8Title>("utf8_title", registry, &utf8_title_doc);
4441   MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
4442                                                  &utf8_trim_whitespace_doc);
4443   MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
4444                                                   &utf8_ltrim_whitespace_doc);
4445   MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
4446                                                   &utf8_rtrim_whitespace_doc);
4447   MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
4448   MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
4449   MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);
4450
4451   AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
4452                                                  &utf8_is_alnum_doc);
4453   AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
4454   AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
4455                                             &utf8_is_decimal_doc);
4456   AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
4457   AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
4458   AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
4459                                             &utf8_is_numeric_doc);
4460   AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
4461                                               &utf8_is_printable_doc);
4462   AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
4463   AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
4464   AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
4465 #endif
4466
4467   AddBinaryLength(registry);
4468   AddUtf8Length(registry);
4469   AddMatchSubstring(registry);
4470   AddFindSubstring(registry);
4471   AddCountSubstring(registry);
4472   MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
4473       "replace_substring", registry, &replace_substring_doc,
4474       MemAllocation::NO_PREALLOCATE);
4475 #ifdef ARROW_WITH_RE2
4476   MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
4477       "replace_substring_regex", registry, &replace_substring_regex_doc,
4478       MemAllocation::NO_PREALLOCATE);
4479   AddExtractRegex(registry);
4480 #endif
4481   AddReplaceSlice(registry);
4482   AddSlice(registry);
4483   AddSplit(registry);
4484   AddStrptime(registry);
4485   AddBinaryJoin(registry);
4486 }
4487
4488 }  // namespace internal
4489 }  // namespace compute
4490 }  // namespace arrow