Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement/8561507350/precompute output schema from processing #2233

Open
wants to merge 29 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
45d9a43
Implement modify_schema for passthrough, row range, and date range cl…
alexowens90 Mar 6, 2025
c4ff8e9
Implement and test FilterClause::modify_schema based only on input_co…
alexowens90 Mar 6, 2025
9772e0e
Implement and test PartitionClause::modify_schema
alexowens90 Mar 6, 2025
191a101
Implemented AggregationClause::modify_schema
alexowens90 Mar 6, 2025
912c059
Aggregation column names tested
alexowens90 Mar 6, 2025
4b1cf76
Implement aggregation output types properly, test for sum
alexowens90 Mar 7, 2025
5970065
Tests for min and max aggregators
alexowens90 Mar 7, 2025
738c9b3
Tests for mean aggregator
alexowens90 Mar 7, 2025
9938d77
Refactored tests
alexowens90 Mar 7, 2025
c68eda1
Implement modify_schema for ResampleClause
alexowens90 Mar 10, 2025
3923582
ResampleClause::modify_schema fully tested
alexowens90 Mar 10, 2025
0ff5086
Tests for RemoveColumnPartitioning, Split, Sort, and Merge clauses, a…
alexowens90 Mar 10, 2025
c103b92
Throw if modify_schema is called on ColumnStatsGenerationClause
alexowens90 Mar 10, 2025
a0b0266
Same level of checks for FilterClause added to ProjectionClause
alexowens90 Mar 10, 2025
c419245
Implemented type checking for ProjectClause (untested)
alexowens90 Mar 10, 2025
0d3f2d5
Implemented type checking for FilterClause (untested)
alexowens90 Mar 10, 2025
ac9e385
Fix filter and project tests
alexowens90 Mar 10, 2025
2073432
Clone StreamDescriptors
alexowens90 Mar 11, 2025
cb344ae
AST validity tests passing for projections
alexowens90 Mar 11, 2025
e401c17
AST validity tests passing for filters
alexowens90 Mar 12, 2025
fd5d85c
Remove unneeded computed_data
alexowens90 Mar 12, 2025
15ea08e
Keep cache of column types
alexowens90 Mar 12, 2025
1b63ab0
Make ExpressionContext const&
alexowens90 Mar 12, 2025
ed5e186
Implement PartitionClause::modify_schema in the same way as the others
alexowens90 Mar 12, 2025
9f0c941
Factor out input columns check to own method
alexowens90 Mar 12, 2025
eab0465
Factor out check that a stream descriptor represents a timeseries int…
alexowens90 Mar 12, 2025
1c7d237
Improve return type of ExpressionNode::compute
alexowens90 Mar 12, 2025
4029b74
Factor out child type calculation to own method
alexowens90 Mar 12, 2025
8ed9560
Uncomment test files
alexowens90 Mar 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
AST validity tests passing for filters
alexowens90 committed Mar 12, 2025
commit e401c17232a2319d7048152dcb87989ed9578b0b
6 changes: 3 additions & 3 deletions cpp/arcticdb/processing/expression_node.cpp
Original file line number Diff line number Diff line change
@@ -125,15 +125,15 @@ std::optional<DataType> ExpressionNode::compute(ExpressionContext& expression_co
break;
case OperationType::ISNULL:
case OperationType::NOTNULL:
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>(
user_input::check<ErrorCode::E_INVALID_USER_ARGUMENT>(
left_type.has_value() && (is_floating_point_type(*left_type) || is_sequence_type(*left_type) ||
is_time_type(*left_type)),
"Unexpected data type input to unary comparison operation {}",
left_type.has_value() ? *left_type : DataType::UNKNOWN);
break;
case OperationType::IDENTITY:
case OperationType::NOT:
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>(!left_type.has_value() || *left_type == DataType::BOOL8,
user_input::check<ErrorCode::E_INVALID_USER_ARGUMENT>(!left_type.has_value() || *left_type == DataType::BOOL8,
"Unexpected data type input to unary boolean operation {}",
left_type.has_value() ? *left_type : DataType::UNKNOWN);
break;
@@ -248,7 +248,7 @@ std::optional<DataType> ExpressionNode::compute(ExpressionContext& expression_co
case OperationType::AND:
case OperationType::OR:
case OperationType::XOR:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to my other comment about the difference between BitSetTag and DataType::BOOL8.

Shouldn't AND, OR, XOR, NOT operations return DataType::BOOL8, not a BitSetTag?

E.g. if we have a nested expression such as (something AND (NOT something_else)), either the NOT clause should return a bool or the AND clause should accept a BitSetTag.

user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>(
user_input::check<ErrorCode::E_INVALID_USER_ARGUMENT>(
(!left_type.has_value() || *left_type == DataType::BOOL8) && (!right_type.has_value() || *right_type == DataType::BOOL8),
"Unexpected data types input to binary boolean operation {} {}",
left_type.has_value() ? *left_type : DataType::UNKNOWN,
240 changes: 240 additions & 0 deletions cpp/arcticdb/processing/test/test_output_schema.cpp
Original file line number Diff line number Diff line change
@@ -177,6 +177,7 @@ class AstParsingOutputTypesTest : public testing::Test {
initial_stream_desc_.add_scalar_field(DataType::INT32, "int32");
initial_stream_desc_.add_scalar_field(DataType::UINT8, "uint8");
initial_stream_desc_.add_scalar_field(DataType::UTF_DYNAMIC64, "string");
initial_stream_desc_.add_scalar_field(DataType::BOOL8, "bool");
}

OutputSchema initial_schema() {
@@ -185,6 +186,245 @@ class AstParsingOutputTypesTest : public testing::Test {
StreamDescriptor initial_stream_desc_;
};

// NOT applied to the expression node registered as "bitset" (an EQ comparison of two numeric
// columns) is a valid filter; the stream descriptor must come back unchanged.
TEST_F(AstParsingOutputTypesTest, FilterNotBitset) {
    ExpressionContext context;
    auto comparison = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::EQ);
    auto negation = std::make_shared<ExpressionNode>(ExpressionName("bitset"), OperationType::NOT);
    context.add_expression_node("bitset", comparison);
    context.add_expression_node("root", negation);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// NOT applied directly to the "bool" column is accepted and leaves the stream descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterNotBoolColumn) {
    ExpressionContext context;
    auto root = std::make_shared<ExpressionNode>(ColumnName("bool"), OperationType::NOT);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"bool"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// NOT applied to the "int32" column must be rejected with a UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterNotNumericColumn) {
    ExpressionContext context;
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), OperationType::NOT);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// ISNULL on the "string" column is accepted and leaves the stream descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterIsNullStringColumn) {
    ExpressionContext context;
    auto root = std::make_shared<ExpressionNode>(ColumnName("string"), OperationType::ISNULL);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"string"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// ISNULL on the "int32" column must be rejected with a UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterIsNullNumericColumn) {
    ExpressionContext context;
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), OperationType::ISNULL);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// ISNULL applied to the expression node registered as "bitset" (an EQ comparison) must be
// rejected with a UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterIsNullBitset) {
    ExpressionContext context;
    auto comparison = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::EQ);
    auto null_check = std::make_shared<ExpressionNode>(ExpressionName("bitset"), OperationType::ISNULL);
    context.add_expression_node("bitset", comparison);
    context.add_expression_node("root", null_check);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// EQ between the "int32" and "uint8" columns is accepted and leaves the stream descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterEqNumericCols) {
    ExpressionContext context;
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::EQ);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// EQ between the "string" column and a string value is accepted and leaves the stream
// descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterEqStringColStringVal) {
    ExpressionContext context;
    auto literal = std::make_shared<Value>(construct_string_value("hello"));
    auto root = std::make_shared<ExpressionNode>(ColumnName("string"), ValueName("value"), OperationType::EQ);
    context.add_value("value", literal);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"string"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// EQ between the "int32" column and a string value must be rejected with a UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterEqNumericColStringVal) {
    ExpressionContext context;
    auto literal = std::make_shared<Value>(construct_string_value("hello"));
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ValueName("value"), OperationType::EQ);
    context.add_value("value", literal);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// EQ between a column and a value set (rather than a single value) must be rejected with a
// UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterEqColValueSet) {
    ExpressionContext context;
    auto members = std::make_shared<std::unordered_set<uint8_t>>(std::unordered_set<uint8_t>{1, 2, 3});
    auto set_operand = std::make_shared<ValueSet>(std::move(members));
    context.add_value_set("value_set", set_operand);
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ValueSetName("value_set"), OperationType::EQ);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// LT between the "int32" column and a uint16_t value is accepted and leaves the stream
// descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterLessThanNumericColNumericVal) {
    ExpressionContext context;
    auto literal = std::make_shared<Value>(construct_value<uint16_t>(5));
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ValueName("value"), OperationType::LT);
    context.add_value("value", literal);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// LT between the "string" column and a string value must be rejected with a UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterLessThanStringColStringVal) {
    ExpressionContext context;
    auto literal = std::make_shared<Value>(construct_string_value("hello"));
    auto root = std::make_shared<ExpressionNode>(ColumnName("string"), ValueName("value"), OperationType::LT);
    context.add_value("value", literal);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"string"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// LT between the "int32" column and the expression node registered as "bitset" (an EQ
// comparison) must be rejected with a UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterLessThanNumericColBitset) {
    ExpressionContext context;
    auto comparison = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::EQ);
    auto less_than = std::make_shared<ExpressionNode>(ColumnName("int32"), ExpressionName("bitset"), OperationType::LT);
    context.add_expression_node("bitset", comparison);
    context.add_expression_node("root", less_than);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// ISIN between the "int32" column and a uint8_t value set is accepted and leaves the stream
// descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterIsInNumericColNumericValueSet) {
    ExpressionContext context;
    auto members = std::make_shared<std::unordered_set<uint8_t>>(std::unordered_set<uint8_t>{1, 2, 3});
    auto set_operand = std::make_shared<ValueSet>(std::move(members));
    context.add_value_set("value_set", set_operand);
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ValueSetName("value_set"), OperationType::ISIN);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// ISIN between the "string" column and a string value set is accepted and leaves the stream
// descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterIsInStringColStringValueSet) {
    ExpressionContext context;
    auto set_operand = std::make_shared<ValueSet>(std::vector<std::string>{"hello", "goodbye"});
    context.add_value_set("value_set", set_operand);
    auto root = std::make_shared<ExpressionNode>(ColumnName("string"), ValueSetName("value_set"), OperationType::ISIN);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"string"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// ISIN between the "int32" column and an empty value set (built from an empty string vector)
// is accepted and leaves the stream descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterIsInNumericColEmptyValueSet) {
    ExpressionContext context;
    auto set_operand = std::make_shared<ValueSet>(std::vector<std::string>{});
    context.add_value_set("value_set", set_operand);
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ValueSetName("value_set"), OperationType::ISIN);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// ISIN with the expression node registered as "bitset" (an EQ comparison) on the left-hand
// side must be rejected with a UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterIsInBitsetValueSet) {
    ExpressionContext context;
    auto comparison = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::EQ);
    context.add_expression_node("bitset", comparison);
    auto members = std::make_shared<std::unordered_set<uint8_t>>(std::unordered_set<uint8_t>{1, 2, 3});
    auto set_operand = std::make_shared<ValueSet>(std::move(members));
    context.add_value_set("value_set", set_operand);
    auto membership = std::make_shared<ExpressionNode>(ExpressionName("bitset"), ValueSetName("value_set"), OperationType::ISIN);
    context.add_expression_node("root", membership);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// ISIN between a column and a single value (rather than a value set) must be rejected with a
// UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterIsInNumericColNumericValue) {
    ExpressionContext context;
    auto literal = std::make_shared<Value>(construct_value<uint16_t>(5));
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ValueName("value"), OperationType::ISIN);
    context.add_value("value", literal);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

// AND of two comparison expression nodes (EQ and LT) is accepted and leaves the stream
// descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterAndBitsetBitset) {
    ExpressionContext context;
    auto equal_to = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::EQ);
    auto less_than = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::LT);
    auto conjunction = std::make_shared<ExpressionNode>(ExpressionName("bitset_1"), ExpressionName("bitset_2"), OperationType::AND);
    context.add_expression_node("bitset_1", equal_to);
    context.add_expression_node("bitset_2", less_than);
    context.add_expression_node("root", conjunction);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// AND of a comparison expression node with the "bool" column is accepted and leaves the
// stream descriptor unchanged.
TEST_F(AstParsingOutputTypesTest, FilterAndBitsetBoolColumn) {
    ExpressionContext context;
    auto comparison = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::EQ);
    auto conjunction = std::make_shared<ExpressionNode>(ExpressionName("bitset"), ColumnName("bool"), OperationType::AND);
    context.add_expression_node("bitset", comparison);
    context.add_expression_node("root", conjunction);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8", "bool"}, context, {}};
    const auto result = clause.modify_schema(initial_schema());
    ASSERT_EQ(result.stream_descriptor_, initial_schema().stream_descriptor_);
}

// AND applied directly to the "int32" and "uint8" columns must be rejected with a
// UserInputException.
TEST_F(AstParsingOutputTypesTest, FilterAndNumericCols) {
    ExpressionContext context;
    auto root = std::make_shared<ExpressionNode>(ColumnName("int32"), ColumnName("uint8"), OperationType::AND);
    context.add_expression_node("root", root);
    context.root_node_name_ = ExpressionName("root");
    FilterClause clause{{"int32", "uint8"}, context, {}};
    EXPECT_THROW(clause.modify_schema(initial_schema()), UserInputException);
}

TEST_F(AstParsingOutputTypesTest, ProjectionAbsNumeric) {
ExpressionContext ec;
auto node = std::make_shared<ExpressionNode>(ColumnName("int32"), OperationType::ABS);