[ceph.git] / ceph / src / arrow / cpp / src / gandiva / decimal_ir.cc

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <sstream>
#include <unordered_set>
#include <utility>

#include "arrow/status.h"
#include "gandiva/decimal_ir.h"
#include "gandiva/decimal_type_util.h"

// Algorithms adapted from Apache Impala

namespace gandiva {

#define ADD_TRACE_32(msg, value) \
  if (enable_ir_traces_) {       \
    AddTrace32(msg, value);      \
  }
#define ADD_TRACE_128(msg, value) \
  if (enable_ir_traces_) {        \
    AddTrace128(msg, value);      \
  }

// These are the functions defined in this file. The rest are in precompiled folder,
// and the i128 needs to be dis-assembled for those.
static const char* kAddFunction = "add_decimal128_decimal128";
static const char* kSubtractFunction = "subtract_decimal128_decimal128";
static const char* kEQFunction = "equal_decimal128_decimal128";
static const char* kNEFunction = "not_equal_decimal128_decimal128";
static const char* kLTFunction = "less_than_decimal128_decimal128";
static const char* kLEFunction = "less_than_or_equal_to_decimal128_decimal128";
static const char* kGTFunction = "greater_than_decimal128_decimal128";
static const char* kGEFunction = "greater_than_or_equal_to_decimal128_decimal128";

static const std::unordered_set<std::string> kDecimalIRBuilderFunctions{
    kAddFunction, kSubtractFunction, kEQFunction, kNEFunction,
    kLTFunction,  kLEFunction,       kGTFunction, kGEFunction};

const char* DecimalIR::kScaleMultipliersName = "gandivaScaleMultipliers";

/// Populate globals required by decimal IR.
/// TODO: can this be done just once ?
void DecimalIR::AddGlobals(Engine* engine) {
  auto types = engine->types();

  // populate vector : [ 1, 10, 100, 1000, ..]
  std::string value = "1";
  std::vector<llvm::Constant*> scale_multipliers;
  for (int i = 0; i < DecimalTypeUtil::kMaxPrecision + 1; ++i) {
    auto multiplier =
        llvm::ConstantInt::get(llvm::Type::getInt128Ty(*engine->context()), value, 10);
    scale_multipliers.push_back(multiplier);
    value.append("0");
  }

  auto array_type =
      llvm::ArrayType::get(types->i128_type(), DecimalTypeUtil::kMaxPrecision + 1);
  auto initializer = llvm::ConstantArray::get(
      array_type, llvm::ArrayRef<llvm::Constant*>(scale_multipliers));

  auto globalScaleMultipliers = new llvm::GlobalVariable(
      *engine->module(), array_type, true /*constant*/,
      llvm::GlobalValue::LinkOnceAnyLinkage, initializer, kScaleMultipliersName);
  globalScaleMultipliers->setAlignment(LLVM_ALIGN(16));
}

// Lookup intrinsic functions
void DecimalIR::InitializeIntrinsics() {
  sadd_with_overflow_fn_ = llvm::Intrinsic::getDeclaration(
      module(), llvm::Intrinsic::sadd_with_overflow, types()->i128_type());
  DCHECK_NE(sadd_with_overflow_fn_, nullptr);

  smul_with_overflow_fn_ = llvm::Intrinsic::getDeclaration(
      module(), llvm::Intrinsic::smul_with_overflow, types()->i128_type());
  DCHECK_NE(smul_with_overflow_fn_, nullptr);

  i128_with_overflow_struct_type_ =
      sadd_with_overflow_fn_->getFunctionType()->getReturnType();
}

// CPP:  return kScaleMultipliers[scale]
llvm::Value* DecimalIR::GetScaleMultiplier(llvm::Value* scale) {
  auto const_array = module()->getGlobalVariable(kScaleMultipliersName);
  auto ptr = CreateGEP(ir_builder(), const_array, {types()->i32_constant(0), scale});
  return CreateLoad(ir_builder(), ptr);
}

// CPP:  x <= y ? y : x
llvm::Value* DecimalIR::GetHigherScale(llvm::Value* x_scale, llvm::Value* y_scale) {
  llvm::Value* le = ir_builder()->CreateICmpSLE(x_scale, y_scale);
  return ir_builder()->CreateSelect(le, y_scale, x_scale);
}

// CPP: return (increase_scale_by <= 0)  ?
//              in_value : in_value * GetScaleMultiplier(increase_scale_by)
llvm::Value* DecimalIR::IncreaseScale(llvm::Value* in_value,
                                      llvm::Value* increase_scale_by) {
  llvm::Value* le_zero =
      ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0));
  // then block
  auto then_lambda = [&] { return in_value; };

  // else block
  auto else_lambda = [&] {
    llvm::Value* multiplier = GetScaleMultiplier(increase_scale_by);
    return ir_builder()->CreateMul(in_value, multiplier);
  };

  return BuildIfElse(le_zero, types()->i128_type(), then_lambda, else_lambda);
}

// CPP: return (increase_scale_by <= 0)  ?
//              {in_value,false} : {in_value * GetScaleMultiplier(increase_scale_by),true}
//
// The return value also indicates if there was an overflow while increasing the scale.
DecimalIR::ValueWithOverflow DecimalIR::IncreaseScaleWithOverflowCheck(
    llvm::Value* in_value, llvm::Value* increase_scale_by) {
  llvm::Value* le_zero =
      ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0));

  // then block
  auto then_lambda = [&] {
    ValueWithOverflow ret{in_value, types()->false_constant()};
    return ret.AsStruct(this);
  };

  // else block
  auto else_lambda = [&] {
    llvm::Value* multiplier = GetScaleMultiplier(increase_scale_by);
    return ir_builder()->CreateCall(smul_with_overflow_fn_, {in_value, multiplier});
  };

  auto ir_struct =
      BuildIfElse(le_zero, i128_with_overflow_struct_type_, then_lambda, else_lambda);
  return ValueWithOverflow::MakeFromStruct(this, ir_struct);
}

// CPP: return (reduce_scale_by <= 0)  ?
//              in_value : in_value / GetScaleMultiplier(reduce_scale_by)
//
// ReduceScale cannot cause an overflow.
llvm::Value* DecimalIR::ReduceScale(llvm::Value* in_value, llvm::Value* reduce_scale_by) {
  auto le_zero = ir_builder()->CreateICmpSLE(reduce_scale_by, types()->i32_constant(0));
  // then block
  auto then_lambda = [&] { return in_value; };

  // else block
  auto else_lambda = [&] {
    // TODO : handle rounding.
    llvm::Value* multiplier = GetScaleMultiplier(reduce_scale_by);
    return ir_builder()->CreateSDiv(in_value, multiplier);
  };

  return BuildIfElse(le_zero, types()->i128_type(), then_lambda, else_lambda);
}

/// @brief Fast-path for add
/// Adjust x and y to the same scale, and add them.
llvm::Value* DecimalIR::AddFastPath(const ValueFull& x, const ValueFull& y) {
  auto higher_scale = GetHigherScale(x.scale(), y.scale());
  ADD_TRACE_32("AddFastPath : higher_scale", higher_scale);

  // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x_scale)
  auto x_delta = ir_builder()->CreateSub(higher_scale, x.scale());
  auto x_scaled = IncreaseScale(x.value(), x_delta);
  ADD_TRACE_128("AddFastPath : x_scaled", x_scaled);

  // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale)
  auto y_delta = ir_builder()->CreateSub(higher_scale, y.scale());
  auto y_scaled = IncreaseScale(y.value(), y_delta);
  ADD_TRACE_128("AddFastPath : y_scaled", y_scaled);

  auto sum = ir_builder()->CreateAdd(x_scaled, y_scaled);
  ADD_TRACE_128("AddFastPath : sum", sum);
  return sum;
}

// @brief Add with overflow check.
/// Adjust x and y to the same scale, add them, and reduce sum to output scale.
/// If there is an overflow, the sum is set to 0.
DecimalIR::ValueWithOverflow DecimalIR::AddWithOverflowCheck(const ValueFull& x,
                                                             const ValueFull& y,
                                                             const ValueFull& out) {
  auto higher_scale = GetHigherScale(x.scale(), y.scale());
  ADD_TRACE_32("AddWithOverflowCheck : higher_scale", higher_scale);

  // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x.scale())
  auto x_delta = ir_builder()->CreateSub(higher_scale, x.scale());
  auto x_scaled = IncreaseScaleWithOverflowCheck(x.value(), x_delta);
  ADD_TRACE_128("AddWithOverflowCheck : x_scaled", x_scaled.value());

  // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale)
  auto y_delta = ir_builder()->CreateSub(higher_scale, y.scale());
  auto y_scaled = IncreaseScaleWithOverflowCheck(y.value(), y_delta);
  ADD_TRACE_128("AddWithOverflowCheck : y_scaled", y_scaled.value());

  // CPP : sum = x_scaled + y_scaled
  auto sum_ir_struct = ir_builder()->CreateCall(sadd_with_overflow_fn_,
                                                {x_scaled.value(), y_scaled.value()});
  auto sum = ValueWithOverflow::MakeFromStruct(this, sum_ir_struct);
  ADD_TRACE_128("AddWithOverflowCheck : sum", sum.value());

  // CPP : overflow ? 0 : sum / GetScaleMultiplier(max_scale - out_scale)
  auto overflow = GetCombinedOverflow({x_scaled, y_scaled, sum});
  ADD_TRACE_32("AddWithOverflowCheck : overflow", overflow);
  auto then_lambda = [&] {
    // if there is an overflow, the value returned won't be used. so, save the division.
    return types()->i128_constant(0);
  };
  auto else_lambda = [&] {
    auto reduce_scale_by = ir_builder()->CreateSub(higher_scale, out.scale());
    return ReduceScale(sum.value(), reduce_scale_by);
  };
  auto sum_descaled =
      BuildIfElse(overflow, types()->i128_type(), then_lambda, else_lambda);
  return ValueWithOverflow(sum_descaled, overflow);
}

// This is pretty complex, so use CPP fns.
llvm::Value* DecimalIR::AddLarge(const ValueFull& x, const ValueFull& y,
                                 const ValueFull& out) {
  auto block = ir_builder()->GetInsertBlock();
  auto out_high_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_hi", block);
  auto out_low_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_low", block);
  auto x_split = ValueSplit::MakeFromInt128(this, x.value());
  auto y_split = ValueSplit::MakeFromInt128(this, y.value());

  std::vector<llvm::Value*> args = {
      x_split.high(),  x_split.low(), x.precision(), x.scale(),
      y_split.high(),  y_split.low(), y.precision(), y.scale(),
      out.precision(), out.scale(),   out_high_ptr,  out_low_ptr,
  };
  ir_builder()->CreateCall(module()->getFunction("add_large_decimal128_decimal128"),
                           args);

  auto out_high = CreateLoad(ir_builder(), out_high_ptr);
  auto out_low = CreateLoad(ir_builder(), out_low_ptr);
  auto sum = ValueSplit(out_high, out_low).AsInt128(this);
  ADD_TRACE_128("AddLarge : sum", sum);
  return sum;
}

/// The output scale/precision cannot be arbitrary values. The algo here depends on them
/// to be the same as computed in DecimalTypeSql.
/// TODO: enforce this.
Status DecimalIR::BuildAdd() {
  // Create fn prototype :
  // int128_t
  // add_decimal128_decimal128(int128_t x_value, int32_t x_precision, int32_t x_scale,
  //                           int128_t y_value, int32_t y_precision, int32_t y_scale
  //                           int32_t out_precision, int32_t out_scale)
  auto i32 = types()->i32_type();
  auto i128 = types()->i128_type();
  auto function = BuildFunction(kAddFunction, i128,
                                {
                                    {"x_value", i128},
                                    {"x_precision", i32},
                                    {"x_scale", i32},
                                    {"y_value", i128},
                                    {"y_precision", i32},
                                    {"y_scale", i32},
                                    {"out_precision", i32},
                                    {"out_scale", i32},
                                });

  auto arg_iter = function->arg_begin();
  ValueFull x(&arg_iter[0], &arg_iter[1], &arg_iter[2]);
  ValueFull y(&arg_iter[3], &arg_iter[4], &arg_iter[5]);
  ValueFull out(nullptr, &arg_iter[6], &arg_iter[7]);

  auto entry = llvm::BasicBlock::Create(*context(), "entry", function);
  ir_builder()->SetInsertPoint(entry);

  // CPP :
  // if (out_precision < 38) {
  //   return AddFastPath(x, y)
  // } else {
  //   ret = AddWithOverflowCheck(x, y)
  //   if (ret.overflow)
  //     return AddLarge(x, y)
  //   else
  //     return ret.value;
  // }
  llvm::Value* lt_max_precision = ir_builder()->CreateICmpSLT(
      out.precision(), types()->i32_constant(DecimalTypeUtil::kMaxPrecision));
  auto then_lambda = [&] {
    // fast-path add
    return AddFastPath(x, y);
  };
  auto else_lambda = [&] {
    if (kUseOverflowIntrinsics) {
      // do the add and check if there was overflow
      auto ret = AddWithOverflowCheck(x, y, out);

      // if there is an overflow, switch to the AddLarge codepath.
      return BuildIfElse(
          ret.overflow(), types()->i128_type(), [&] { return AddLarge(x, y, out); },
          [&] { return ret.value(); });
    } else {
      return AddLarge(x, y, out);
    }
  };
  auto value =
      BuildIfElse(lt_max_precision, types()->i128_type(), then_lambda, else_lambda);

  // store result to out
  ir_builder()->CreateRet(value);
  return Status::OK();
}

Status DecimalIR::BuildSubtract() {
  // Create fn prototype :
  // int128_t
  // subtract_decimal128_decimal128(int128_t x_value, int32_t x_precision, int32_t
  // x_scale,
  //                           int128_t y_value, int32_t y_precision, int32_t y_scale
  //                           int32_t out_precision, int32_t out_scale)
  auto i32 = types()->i32_type();
  auto i128 = types()->i128_type();
  auto function = BuildFunction(kSubtractFunction, i128,
                                {
                                    {"x_value", i128},
                                    {"x_precision", i32},
                                    {"x_scale", i32},
                                    {"y_value", i128},
                                    {"y_precision", i32},
                                    {"y_scale", i32},
                                    {"out_precision", i32},
                                    {"out_scale", i32},
                                });

  auto entry = llvm::BasicBlock::Create(*context(), "entry", function);
  ir_builder()->SetInsertPoint(entry);

  // reuse add function after negating y_value. i.e
  //   add(x_value, x_precision, x_scale, -y_value, y_precision, y_scale,
  //       out_precision, out_scale)
  std::vector<llvm::Value*> args;
  int i = 0;
  for (auto& in_arg : function->args()) {
    if (i == 3) {
      auto y_neg_value = ir_builder()->CreateNeg(&in_arg);
      args.push_back(y_neg_value);
    } else {
      args.push_back(&in_arg);
    }
    ++i;
  }
  auto value = ir_builder()->CreateCall(module()->getFunction(kAddFunction), args);

  // store result to out
  ir_builder()->CreateRet(value);
  return Status::OK();
}

Status DecimalIR::BuildCompare(const std::string& function_name,
                               llvm::ICmpInst::Predicate cmp_instruction) {
  // Create fn prototype :
  // bool
  // function_name(int128_t x_value, int32_t x_precision, int32_t x_scale,
  //               int128_t y_value, int32_t y_precision, int32_t y_scale)

  auto i32 = types()->i32_type();
  auto i128 = types()->i128_type();
  auto function = BuildFunction(function_name, types()->i1_type(),
                                {
                                    {"x_value", i128},
                                    {"x_precision", i32},
                                    {"x_scale", i32},
                                    {"y_value", i128},
                                    {"y_precision", i32},
                                    {"y_scale", i32},
                                });

  auto arg_iter = function->arg_begin();
  ValueFull x(&arg_iter[0], &arg_iter[1], &arg_iter[2]);
  ValueFull y(&arg_iter[3], &arg_iter[4], &arg_iter[5]);

  auto entry = llvm::BasicBlock::Create(*context(), "entry", function);
  ir_builder()->SetInsertPoint(entry);

  // Make call to pre-compiled IR function.
  auto x_split = ValueSplit::MakeFromInt128(this, x.value());
  auto y_split = ValueSplit::MakeFromInt128(this, y.value());

  std::vector<llvm::Value*> args = {
      x_split.high(), x_split.low(), x.precision(), x.scale(),
      y_split.high(), y_split.low(), y.precision(), y.scale(),
  };
  auto cmp_value = ir_builder()->CreateCall(
      module()->getFunction("compare_decimal128_decimal128_internal"), args);
  auto result =
      ir_builder()->CreateICmp(cmp_instruction, cmp_value, types()->i32_constant(0));
  ir_builder()->CreateRet(result);
  return Status::OK();
}

llvm::Value* DecimalIR::CallDecimalFunction(const std::string& function_name,
                                            llvm::Type* return_type,
                                            const std::vector<llvm::Value*>& params) {
  if (kDecimalIRBuilderFunctions.count(function_name) != 0) {
    // this is fn built with the irbuilder.
    return ir_builder()->CreateCall(module()->getFunction(function_name), params);
  }

  // ppre-compiler fn : disassemble i128 to two i64s and re-assemble.
  auto i128 = types()->i128_type();
  auto i64 = types()->i64_type();
  std::vector<llvm::Value*> dis_assembled_args;
  for (auto& arg : params) {
    if (arg->getType() == i128) {
      // split i128 arg into two int64s.
      auto split = ValueSplit::MakeFromInt128(this, arg);
      dis_assembled_args.push_back(split.high());
      dis_assembled_args.push_back(split.low());
    } else {
      dis_assembled_args.push_back(arg);
    }
  }

  llvm::Value* result = nullptr;
  if (return_type == i128) {
    // for i128 ret, replace with two int64* args, and join them.
    auto block = ir_builder()->GetInsertBlock();
    auto out_high_ptr = new llvm::AllocaInst(i64, 0, "out_hi", block);
    auto out_low_ptr = new llvm::AllocaInst(i64, 0, "out_low", block);
    dis_assembled_args.push_back(out_high_ptr);
    dis_assembled_args.push_back(out_low_ptr);

    // Make call to pre-compiled IR function.
    ir_builder()->CreateCall(module()->getFunction(function_name), dis_assembled_args);

    auto out_high = CreateLoad(ir_builder(), out_high_ptr);
    auto out_low = CreateLoad(ir_builder(), out_low_ptr);
    result = ValueSplit(out_high, out_low).AsInt128(this);
  } else {
    DCHECK_NE(return_type, types()->void_type());

    // Make call to pre-compiled IR function.
    result = ir_builder()->CreateCall(module()->getFunction(function_name),
                                      dis_assembled_args);
  }
  return result;
}

Status DecimalIR::AddFunctions(Engine* engine) {
  auto decimal_ir = std::make_shared<DecimalIR>(engine);

  // Populate global variables used by decimal operations.
  decimal_ir->AddGlobals(engine);

  // Lookup intrinsic functions
  decimal_ir->InitializeIntrinsics();

  ARROW_RETURN_NOT_OK(decimal_ir->BuildAdd());
  ARROW_RETURN_NOT_OK(decimal_ir->BuildSubtract());
  ARROW_RETURN_NOT_OK(decimal_ir->BuildCompare(kEQFunction, llvm::ICmpInst::ICMP_EQ));
  ARROW_RETURN_NOT_OK(decimal_ir->BuildCompare(kNEFunction, llvm::ICmpInst::ICMP_NE));
  ARROW_RETURN_NOT_OK(decimal_ir->BuildCompare(kLTFunction, llvm::ICmpInst::ICMP_SLT));
  ARROW_RETURN_NOT_OK(decimal_ir->BuildCompare(kLEFunction, llvm::ICmpInst::ICMP_SLE));
  ARROW_RETURN_NOT_OK(decimal_ir->BuildCompare(kGTFunction, llvm::ICmpInst::ICMP_SGT));
  ARROW_RETURN_NOT_OK(decimal_ir->BuildCompare(kGEFunction, llvm::ICmpInst::ICMP_SGE));
  return Status::OK();
}

// Do an bitwise-or of all the overflow bits.
llvm::Value* DecimalIR::GetCombinedOverflow(
    std::vector<DecimalIR::ValueWithOverflow> vec) {
  llvm::Value* res = types()->false_constant();
  for (auto& val : vec) {
    res = ir_builder()->CreateOr(res, val.overflow());
  }
  return res;
}

DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromInt128(DecimalIR* decimal_ir,
                                                            llvm::Value* in) {
  auto builder = decimal_ir->ir_builder();
  auto types = decimal_ir->types();

  auto high = builder->CreateLShr(in, types->i128_constant(64));
  high = builder->CreateTrunc(high, types->i64_type());
  auto low = builder->CreateTrunc(in, types->i64_type());
  return ValueSplit(high, low);
}

/// Convert IR struct {%i64, %i64} to cpp class ValueSplit
DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromStruct(DecimalIR* decimal_ir,
                                                            llvm::Value* dstruct) {
  auto builder = decimal_ir->ir_builder();
  auto high = builder->CreateExtractValue(dstruct, 0);
  auto low = builder->CreateExtractValue(dstruct, 1);
  return DecimalIR::ValueSplit(high, low);
}

llvm::Value* DecimalIR::ValueSplit::AsInt128(DecimalIR* decimal_ir) const {
  auto builder = decimal_ir->ir_builder();
  auto types = decimal_ir->types();

  auto value = builder->CreateSExt(high_, types->i128_type());
  value = builder->CreateShl(value, types->i128_constant(64));
  value = builder->CreateAdd(value, builder->CreateZExt(low_, types->i128_type()));
  return value;
}

/// Convert IR struct {%i128, %i1} to cpp class ValueWithOverflow
DecimalIR::ValueWithOverflow DecimalIR::ValueWithOverflow::MakeFromStruct(
    DecimalIR* decimal_ir, llvm::Value* dstruct) {
  auto builder = decimal_ir->ir_builder();
  auto value = builder->CreateExtractValue(dstruct, 0);
  auto overflow = builder->CreateExtractValue(dstruct, 1);
  return DecimalIR::ValueWithOverflow(value, overflow);
}

/// Convert to IR struct {%i128, %i1}
llvm::Value* DecimalIR::ValueWithOverflow::AsStruct(DecimalIR* decimal_ir) const {
  auto builder = decimal_ir->ir_builder();

  auto undef = llvm::UndefValue::get(decimal_ir->i128_with_overflow_struct_type_);
  auto struct_val = builder->CreateInsertValue(undef, value(), 0);
  return builder->CreateInsertValue(struct_val, overflow(), 1);
}

/// debug traces
void DecimalIR::AddTrace(const std::string& fmt, std::vector<llvm::Value*> args) {
  DCHECK(enable_ir_traces_);

  auto ir_str = ir_builder()->CreateGlobalStringPtr(fmt);
  args.insert(args.begin(), ir_str);
  ir_builder()->CreateCall(module()->getFunction("printf"), args, "trace");
}

void DecimalIR::AddTrace32(const std::string& msg, llvm::Value* value) {
  AddTrace("DECIMAL_IR_TRACE:: " + msg + " %d\n", {value});
}

void DecimalIR::AddTrace128(const std::string& msg, llvm::Value* value) {
  // convert i128 into two i64s for printing
  auto split = ValueSplit::MakeFromInt128(this, value);
  AddTrace("DECIMAL_IR_TRACE:: " + msg + " %llx:%llx (%lld:%llu)\n",
           {split.high(), split.low(), split.high(), split.low()});
}

}  // namespace gandiva