Skip to content
This repository was archived by the owner on May 6, 2024. It is now read-only.

Commit e699dc1

Browse files
authored
[POAE7-2415] codegen string using Arrow format (#95)
* gen string ir * format * string codegen * format * fix * format * use TwoValueColValues replace MultipleValueColValues. use temp check for string type(checker/generator for string type is not ready) * VarcharBatch * VarcharBatch test * address comments
1 parent 787bdc5 commit e699dc1

24 files changed

+573
-35
lines changed

cider/exec/module/batch/CiderArrowBufferHolder.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,14 @@ CiderArrowArrayBufferHolder::CiderArrowArrayBufferHolder(
4646

4747
CiderArrowArrayBufferHolder::~CiderArrowArrayBufferHolder() {
4848
for (size_t i = 0; i < buffers_.size(); ++i) {
49-
relaseBuffer(i);
49+
releaseBuffer(i);
5050
}
5151
}
5252

53+
size_t CiderArrowArrayBufferHolder::getBufferSizeAt(size_t index) {
54+
return buffers_bytes_[index];
55+
}
56+
5357
void CiderArrowArrayBufferHolder::allocBuffer(size_t index, size_t bytes) {
5458
if (buffers_[index]) {
5559
buffers_[index] = allocator_->reallocate(
@@ -61,7 +65,7 @@ void CiderArrowArrayBufferHolder::allocBuffer(size_t index, size_t bytes) {
6165
}
6266
}
6367

64-
void CiderArrowArrayBufferHolder::relaseBuffer(size_t index) {
68+
void CiderArrowArrayBufferHolder::releaseBuffer(size_t index) {
6569
if (buffers_[index]) {
6670
allocator_->deallocate(reinterpret_cast<int8_t*>(buffers_[index]),
6771
buffers_bytes_[index]);

cider/exec/module/batch/CiderArrowBufferHolder.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@ class CiderArrowArrayBufferHolder {
5050

5151
ArrowArray* getDictPtr();
5252

53+
size_t getBufferSizeAt(size_t index);
54+
5355
private:
54-
void relaseBuffer(size_t index);
56+
void releaseBuffer(size_t index);
5557

5658
std::vector<void*> buffers_;
5759
std::vector<size_t> buffers_bytes_; // Used for allocator.

cider/exec/module/batch/CiderBatch.cpp

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ CiderBatch::CiderBatch(ArrowSchema* schema,
5858

5959
CiderBatch::~CiderBatch() {
6060
releaseArrowEntries();
61+
#ifdef CIDER_BATCH_CIDER_IMPL
6162
destroy(); // TODO: Remove
63+
#endif
6264
}
6365

6466
CiderBatch::CiderBatch(const CiderBatch& rh) {
@@ -95,8 +97,9 @@ CiderBatch::CiderBatch(CiderBatch&& rh) noexcept {
9597
rh.arrow_schema_ = nullptr;
9698
rh.ownership_ = false;
9799
rh.reallocate_ = false;
98-
100+
#ifdef CIDER_BATCH_CIDER_IMPL
99101
moveFrom(&rh); // TODO: Remove
102+
#endif
100103
}
101104

102105
CiderBatch& CiderBatch::operator=(CiderBatch&& rh) noexcept {
@@ -115,8 +118,9 @@ CiderBatch& CiderBatch::operator=(CiderBatch&& rh) noexcept {
115118
rh.ownership_ = false;
116119
rh.reallocate_ = false;
117120

121+
#ifdef CIDER_BATCH_CIDER_IMPL
118122
moveFrom(&rh); // TODO: Remove
119-
123+
#endif
120124
return *this;
121125
}
122126

@@ -283,13 +287,8 @@ void CiderBatch::convertToArrowRepresentation() {
283287
arrow_array_->children[i] = new ArrowArray();
284288
arrow_array_->children[i]->length = row_num();
285289
arrow_array_->children[i]->n_children = 0;
286-
arrow_array_->children[i]->buffers = (const void**)std::malloc(sizeof(void*) * 2);
287-
// FIXME: fill actual null
288290
void* null_buf = std::malloc(row_num() / 8 + 1);
289291
std::memset(null_buf, 0xFF, row_num() / 8 + 1);
290-
arrow_array_->children[i]->buffers[0] = null_buf;
291-
arrow_array_->children[i]->buffers[1] = table_ptr_[i];
292-
arrow_array_->children[i]->n_buffers = 2;
293292
arrow_array_->children[i]->private_data = nullptr;
294293
arrow_array_->children[i]->dictionary = nullptr;
295294
arrow_array_->children[i]->release = CiderBatchUtils::ciderEmptyArrowArrayReleaser;
@@ -300,6 +299,29 @@ void CiderBatch::convertToArrowRepresentation() {
300299
arrow_schema_->children[i]->n_children = 0;
301300
arrow_schema_->children[i]->children = nullptr;
302301
arrow_schema_->children[i]->release = CiderBatchUtils::ciderEmptyArrowSchemaReleaser;
302+
303+
// (Kunshang)To be removed. temp code to pass ut.
304+
// CiderStringTest::CiderStringTestArrow
305+
if (schema_->getColumnTypeById(i).has_varchar()) {
306+
arrow_array_->children[i]->n_buffers = 3;
307+
arrow_array_->children[i]->buffers = (const void**)std::malloc(sizeof(void*) * 3);
308+
arrow_array_->children[i]->buffers[0] = null_buf;
309+
310+
arrow_schema_->children[i]->format = "";
311+
// 10 string row 0-9
312+
int32_t* offset_buf = new int[11]{0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100};
313+
char* data_buf(
314+
"000000000011111111112222222222333333333344444444445555555555666666666677777777"
315+
"7788888888889999999999");
316+
arrow_array_->children[i]->buffers[1] = offset_buf;
317+
arrow_array_->children[i]->buffers[2] = data_buf;
318+
} else {
319+
arrow_array_->children[i]->buffers = (const void**)std::malloc(sizeof(void*) * 2);
320+
// FIXME: fill actual null
321+
arrow_array_->children[i]->buffers[0] = null_buf;
322+
arrow_array_->children[i]->buffers[1] = table_ptr_[i];
323+
arrow_array_->children[i]->n_buffers = 2;
324+
}
303325
}
304326
}
305327

cider/exec/module/batch/CiderBatchUtils.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ int64_t getBufferNum(const ArrowSchema* schema) {
154154
if (!strcmp(type, "tdm")) {
155155
return 2;
156156
}
157+
case 'u':
158+
return 3;
157159
default:
158160
CIDER_THROW(CiderException,
159161
std::string("Unsupported data type to CiderBatch: ") + type);
@@ -185,6 +187,8 @@ SQLTypes convertArrowTypeToCiderType(const char* format) {
185187
case 's':
186188
return kSTRUCT;
187189
}
190+
case 'u':
191+
return kVARCHAR;
188192
default:
189193
CIDER_THROW(CiderCompileException,
190194
std::string("Unsupported data type to CiderBatch: ") + format);
@@ -209,6 +213,8 @@ const char* convertCiderTypeToArrowType(SQLTypes type) {
209213
return "g";
210214
case kSTRUCT:
211215
return "+s";
216+
case kVARCHAR:
217+
return "u";
212218
default:
213219
CIDER_THROW(CiderCompileException,
214220
std::string("Unsupported to convert type ") + toString(type) +
@@ -264,6 +270,8 @@ const char* convertSubstraitTypeToArrowType(const substrait::Type& type) {
264270
return "+s";
265271
case Type::kDate:
266272
return "tdm";
273+
case Type::kVarchar:
274+
return "u";
267275
default:
268276
CIDER_THROW(CiderRuntimeException,
269277
std::string("Unsupported to convert type ") + type.GetTypeName() +
@@ -334,6 +342,8 @@ std::unique_ptr<CiderBatch> createCiderBatch(std::shared_ptr<CiderAllocator> all
334342
if (!strcmp(format, "tdm")) {
335343
return ScalarBatch<int64_t>::Create(schema, allocator, array);
336344
}
345+
case 'u':
346+
return VarcharBatch::Create(schema, allocator, array);
337347
default:
338348
CIDER_THROW(CiderCompileException,
339349
std::string("Unsupported data type to create CiderBatch: ") + format);

cider/exec/plan/parser/TypeUtils.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ class TypeUtils {
148148
return getIsNullable(type.time().nullability());
149149
case substrait::Type::kTimestamp:
150150
return getIsNullable(type.timestamp().nullability());
151+
case substrait::Type::kVarchar:
152+
return getIsNullable(type.varchar().nullability());
153+
case substrait::Type::kFixedChar:
154+
return getIsNullable(type.fixed_char().nullability());
151155
default:
152156
return true;
153157
}

cider/exec/template/CodeGenerator.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,12 @@ class CodeGenerator {
162162
CodegenColValues* rhs,
163163
llvm::Value* null);
164164

165+
std::unique_ptr<CodegenColValues> codegenVarcharCmpFun(
166+
const Analyzer::BinOper* bin_oper,
167+
CodegenColValues* lhs,
168+
CodegenColValues* rhs,
169+
llvm::Value* null);
170+
165171
llvm::Value* codegenCmp(const SQLOps,
166172
const SQLQualifier,
167173
std::vector<llvm::Value*>,
@@ -333,6 +339,12 @@ class CodeGenerator {
333339
llvm::Value* pos_arg,
334340
const CompilationOptions& co);
335341

342+
std::unique_ptr<CodegenColValues> codegenVarCharColVar(
343+
const Analyzer::ColumnVar* col_var,
344+
llvm::Value* col_byte_stream,
345+
llvm::Value* pos_arg,
346+
const CompilationOptions& co);
347+
336348
llvm::Value* codegenFixedLengthColVar(const Analyzer::ColumnVar* col_var,
337349
llvm::Value* col_byte_stream,
338350
llvm::Value* pos_arg);
@@ -350,6 +362,10 @@ class CodeGenerator {
350362
llvm::Value* col_byte_stream,
351363
llvm::Value* pos_arg);
352364

365+
std::vector<llvm::Value*> codegenVariableLengthStringColVarArrow(
366+
llvm::Value* col_byte_stream,
367+
llvm::Value* pos_arg);
368+
353369
llvm::Value* codegenRowId(const Analyzer::ColumnVar* col_var,
354370
const CompilationOptions& co);
355371

cider/exec/template/Codec.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,3 +296,35 @@ std::vector<llvm::Instruction*> FixedWidthSmallDate::codegenDecode(
296296
return {llvm::CallInst::Create(f, args), nullptr};
297297
}
298298
}
299+
300+
VarcharDecoder::VarcharDecoder(const size_t byte_width,
301+
llvm::IRBuilder<>* ir_builder,
302+
bool nullable)
303+
: Decoder(ir_builder, nullable), byte_width_{byte_width} {}
304+
305+
llvm::Instruction* VarcharDecoder::codegenDecode(llvm::Value* byte_stream,
306+
llvm::Value* pos,
307+
llvm::Module* module) const {
308+
UNREACHABLE();
309+
}
310+
311+
std::vector<llvm::Instruction*> VarcharDecoder::codegenDecode(llvm::Module* module,
312+
llvm::Value* byte_stream,
313+
llvm::Value* pos) const {
314+
auto nulls = extractNullVector(module, byte_stream);
315+
auto offset_buffer = extractBufferAt(module, byte_stream, 1);
316+
auto data_buffer = extractBufferAt(module, byte_stream, 2);
317+
318+
llvm::Instruction* str_ptr = llvm::CallInst::Create(
319+
module->getFunction("extract_str_ptr_arrow"), {data_buffer, offset_buffer, pos});
320+
llvm::Instruction* str_len = llvm::CallInst::Create(
321+
module->getFunction("extract_str_len_arrow"), {offset_buffer, pos});
322+
323+
if (nulls) {
324+
auto get_is_null = module->getFunction("check_bit_vector_clear");
325+
CHECK(get_is_null);
326+
return {str_ptr, str_len, llvm::CallInst::Create(get_is_null, {nulls, pos})};
327+
} else {
328+
return {str_ptr, str_len};
329+
}
330+
}

cider/exec/template/Codec.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,4 +161,22 @@ class FixedWidthSmallDate : public Decoder {
161161
static constexpr int64_t ret_null_val_ = NULL_BIGINT;
162162
};
163163

164+
class VarcharDecoder : public Decoder {
165+
public:
166+
VarcharDecoder(const size_t byte_width,
167+
llvm::IRBuilder<>* ir_builder,
168+
bool nullable = false);
169+
170+
llvm::Instruction* codegenDecode(llvm::Value* byte_stream,
171+
llvm::Value* pos,
172+
llvm::Module* module) const override;
173+
174+
std::vector<llvm::Instruction*> codegenDecode(llvm::Module* module,
175+
llvm::Value* byte_stream,
176+
llvm::Value* pos) const override;
177+
178+
private:
179+
const size_t byte_width_;
180+
};
181+
164182
#endif // QUERYENGINE_CODEC_H

cider/exec/template/CodegenColValues.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,28 @@ class FixedSizeColValues : public NullableColValues {
7272
DEF_CODEGEN_COL_VALUES_MEMBER(Value, value_)
7373
};
7474

75+
class MultipleValueColValues : public NullableColValues {
76+
public:
77+
MultipleValueColValues(std::vector<llvm::Value*> values, llvm::Value* null = nullptr)
78+
: NullableColValues(null), values_(values) {}
79+
std::unique_ptr<CodegenColValues> copy() const override {
80+
return std::make_unique<MultipleValueColValues>(*this);
81+
}
82+
std::vector<llvm::Value*> getValues() { return values_; }
83+
const std::vector<llvm::Value*> getValues() const { return values_; }
84+
llvm::Value* getValueAt(int index) { return values_[index]; }
85+
86+
private:
87+
std::vector<llvm::Value*> values_;
88+
};
89+
90+
class TwoValueColValues : public MultipleValueColValues {
91+
public:
92+
TwoValueColValues(llvm::Value* value1, llvm::Value* value2, llvm::Value* null = nullptr)
93+
: MultipleValueColValues({value1, value2}, null) {}
94+
std::unique_ptr<CodegenColValues> copy() const override {
95+
return std::make_unique<TwoValueColValues>(*this);
96+
}
97+
};
98+
7599
#endif

cider/exec/template/ColumnIR.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,8 @@ std::unique_ptr<CodegenColValues> CodeGenerator::codegenColumnExpr(
169169
break;
170170
}
171171
case kVARCHAR:
172-
CIDER_THROW(CiderCompileException, "String type ColumnVar is not supported now.");
172+
col_values = codegenVarCharColVar(col_var, input_col_descriptor_ptr, pos_arg, co);
173+
break;
173174
case kARRAY:
174175
CIDER_THROW(CiderCompileException, "Array type ColumnVar is not supported now.");
175176
default:
@@ -234,6 +235,28 @@ std::unique_ptr<CodegenColValues> CodeGenerator::codegenFixedLengthColVar(
234235
return std::make_unique<FixedSizeColValues>(dec_val_cast, null);
235236
}
236237

238+
std::unique_ptr<CodegenColValues> CodeGenerator::codegenVarCharColVar(
239+
const Analyzer::ColumnVar* col_var,
240+
llvm::Value* col_byte_stream,
241+
llvm::Value* pos_arg,
242+
const CompilationOptions& co) {
243+
AUTOMATIC_IR_METADATA(cgen_state_);
244+
const size_t size = 8;
245+
VarcharDecoder decoder(
246+
size, &cgen_state_->ir_builder_, !col_var->get_type_info().get_notnull());
247+
std::vector<llvm::Instruction*> values =
248+
decoder.codegenDecode(cgen_state_->module_, col_byte_stream, pos_arg);
249+
for (auto v : values) {
250+
cgen_state_->ir_builder_.Insert(v);
251+
}
252+
llvm::Instruction* null = nullptr;
253+
if (values.size() == 3) {
254+
null = values[2];
255+
values.pop_back();
256+
}
257+
return std::make_unique<TwoValueColValues>(values[0], values[1], null);
258+
}
259+
237260
std::vector<llvm::Value*> CodeGenerator::codegenColVar(const Analyzer::ColumnVar* col_var,
238261
const bool fetch_column,
239262
const bool update_query_plan,

cider/exec/template/CompareIR.cpp

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -339,16 +339,23 @@ std::unique_ptr<CodegenColValues> CodeGenerator::codegenCmpFun(
339339

340340
if (lhs_nullable && rhs_nullable) {
341341
if (lhs_nullable->getNull() && rhs_nullable->getNull()) {
342-
null = cgen_state_->ir_builder_.CreateAnd(lhs_nullable->getNull(),
343-
rhs_nullable->getNull());
342+
null = cgen_state_->ir_builder_.CreateOr(lhs_nullable->getNull(),
343+
rhs_nullable->getNull());
344344
} else {
345345
null = lhs_nullable->getNull() ? lhs_nullable->getNull() : rhs_nullable->getNull();
346346
}
347347
} else if (lhs_nullable || rhs_nullable) {
348348
null = lhs_nullable ? lhs_nullable->getNull() : rhs_nullable->getNull();
349349
}
350350

351-
return codegenFixedSizeColCmpFun(bin_oper, lhs_lv.get(), rhs_lv.get(), null);
351+
switch (lhs_ti.get_type()) {
352+
case kVARCHAR:
353+
case kTEXT:
354+
case kCHAR:
355+
return codegenVarcharCmpFun(bin_oper, lhs_lv.get(), rhs_lv.get(), null);
356+
default:
357+
return codegenFixedSizeColCmpFun(bin_oper, lhs_lv.get(), rhs_lv.get(), null);
358+
}
352359
}
353360

354361
std::unique_ptr<CodegenColValues> CodeGenerator::codegenFixedSizeColCmpFun(
@@ -375,6 +382,25 @@ std::unique_ptr<CodegenColValues> CodeGenerator::codegenFixedSizeColCmpFun(
375382
return std::make_unique<FixedSizeColValues>(value, null);
376383
}
377384

385+
std::unique_ptr<CodegenColValues> CodeGenerator::codegenVarcharCmpFun(
386+
const Analyzer::BinOper* bin_oper,
387+
CodegenColValues* lhs,
388+
CodegenColValues* rhs,
389+
llvm::Value* null) {
390+
AUTOMATIC_IR_METADATA(cgen_state_);
391+
auto lhs_fixsize = dynamic_cast<TwoValueColValues*>(lhs);
392+
CHECK(lhs_fixsize);
393+
auto rhs_fixsize = dynamic_cast<TwoValueColValues*>(rhs);
394+
CHECK(rhs_fixsize);
395+
396+
llvm::Value* value = cgen_state_->emitCall("string_eq",
397+
{lhs_fixsize->getValueAt(0),
398+
lhs_fixsize->getValueAt(1),
399+
rhs_fixsize->getValueAt(0),
400+
rhs_fixsize->getValueAt(1)});
401+
return std::make_unique<FixedSizeColValues>(value, null);
402+
}
403+
378404
llvm::Value* CodeGenerator::codegenOverlaps(const SQLOps optype,
379405
const SQLQualifier qualifier,
380406
const std::shared_ptr<Analyzer::Expr> lhs,

cider/exec/template/IRCodegen.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,13 @@ std::unique_ptr<CodegenColValues> CodeGenerator::codegenConstantExpr(
240240

241241
switch (ti.get_type()) {
242242
case kVARCHAR:
243+
CHECK(constant_value.size() == 3);
244+
return std::make_unique<TwoValueColValues>(
245+
constant_value[1],
246+
constant_value[2],
247+
constant_expr->get_is_null()
248+
? llvm::ConstantInt::getTrue(cgen_state_->context_)
249+
: llvm::ConstantInt::getFalse(cgen_state_->context_));
243250
case kARRAY:
244251
UNREACHABLE();
245252
default:

0 commit comments

Comments
 (0)