From 68c81aee9b55c21e460f6401632049faa92b66f7 Mon Sep 17 00:00:00 2001 From: kasakrisz Date: Fri, 7 Nov 2025 11:23:50 +0100 Subject: [PATCH] HIVE-29290: Enabling "hive.merge.nway.joins" returns wrong resutls --- .../hive/ql/exec/CommonJoinOperator.java | 8 +- .../hive/ql/exec/CommonMergeJoinOperator.java | 3 +- .../test/queries/clientpositive/antijoin3.q | 22 ++ .../clientpositive/llap/antijoin3.q.out | 250 ++++++++++++++++++ 4 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/antijoin3.q create mode 100644 ql/src/test/results/clientpositive/llap/antijoin3.q.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java index 58ae7f3ef42a..bb0e274212e7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java @@ -990,7 +990,13 @@ protected void checkAndGenObject() throws HiveException { } else { if (!alw.hasRows()) { hasEmpty = true; - alw.addRow(dummyObj[i]); + if (!isRightOfAntiJoin) { + alw.addRow(dummyObj[i]); + } + } else if (isRightOfAntiJoin && !needsPostEvaluation) { + // For anti join the right side should be empty. For needsPostEvaluation case we will + // wait till evaluation is done. For other cases we can directly return from here. + return; } else if (!hasEmpty && alw.isSingleRow()) { if (hasAnyFiltered(alias, alw.rowIter().first())) { hasEmpty = true; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java index c677796de73a..f9e7a40e10e7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java @@ -428,7 +428,8 @@ private List joinObject(int[] smallestPos, boolean clear) throws HiveExcep private void putDummyOrEmpty(Byte i) { // put a empty list or null - if (noOuterJoin) { + boolean isRightOfAntiJoin = (i != 0 && condn[i-1].getType() == JoinDesc.ANTI_JOIN); + if (noOuterJoin || isRightOfAntiJoin) { storage[i] = emptyList; } else { storage[i] = dummyObjVectors[i]; diff --git a/ql/src/test/queries/clientpositive/antijoin3.q b/ql/src/test/queries/clientpositive/antijoin3.q new file mode 100644 index 000000000000..b2ce2b0fc80a --- /dev/null +++ b/ql/src/test/queries/clientpositive/antijoin3.q @@ -0,0 +1,22 @@ +set hive.merge.nway.joins=true; + +create table taba(id string); +create table tabb(id string); +create table tabc(id string); + +INSERT INTO TABLE taba VALUES ('1'),('2'); +INSERT INTO TABLE tabc VALUES ('1'),('2'),('2'); + +explain +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null; +explain cbo +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null; +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null; + +INSERT INTO TABLE tabb VALUES ('1'); + +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null; + +INSERT INTO TABLE tabb VALUES ('2'); + +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null; diff --git a/ql/src/test/results/clientpositive/llap/antijoin3.q.out b/ql/src/test/results/clientpositive/llap/antijoin3.q.out new file mode 100644 index 000000000000..b12a4fe603ff --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/antijoin3.q.out @@ -0,0 +1,250 @@ +PREHOOK: query: create table taba(id string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@taba +POSTHOOK: query: create table taba(id string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@taba +PREHOOK: query: create table tabb(id string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tabb +POSTHOOK: query: create table tabb(id string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tabb +PREHOOK: query: create table tabc(id string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tabc +POSTHOOK: query: create table tabc(id string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tabc +PREHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@taba +POSTHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@taba +POSTHOOK: Lineage: taba.id SCRIPT [] +PREHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tabc +POSTHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tabc +POSTHOOK: Lineage: tabc.id SCRIPT [] +PREHOOK: query: explain +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +PREHOOK: type: QUERY +PREHOOK: Input: default@taba +PREHOOK: Input: default@tabb +PREHOOK: Input: default@tabc +#### A masked pattern was here #### +POSTHOOK: query: explain +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@taba +POSTHOOK: Input: default@tabb +POSTHOOK: Input: default@tabc +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: b + filterExpr: id is not null (type: boolean) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: c + filterExpr: id is not null (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Anti Join 0 to 1 + Left Outer Join 0 to 2 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + 2 _col0 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), null (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +PREHOOK: type: QUERY +PREHOOK: Input: default@taba +PREHOOK: Input: default@tabb +PREHOOK: Input: default@tabc +#### A masked pattern was here #### +POSTHOOK: query: explain cbo +select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@taba +POSTHOOK: Input: default@tabb +POSTHOOK: Input: default@tabc +#### A masked pattern was here #### +CBO PLAN: +HiveProject(id=[$1], id1=[null:VARCHAR(2147483647) CHARACTER SET "UTF-16LE"], id2=[$0]) + HiveJoin(condition=[=($0, $1)], joinType=[right], algorithm=[none], cost=[not available]) + HiveProject(id=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, tabc]], table:alias=[c]) + HiveAntiJoin(condition=[=($0, $1)], joinType=[anti]) + HiveProject(id=[$0]) + HiveTableScan(table=[[default, taba]], table:alias=[a]) + HiveProject(id=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, tabb]], table:alias=[b]) + +PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +PREHOOK: type: QUERY +PREHOOK: Input: default@taba +PREHOOK: Input: default@tabb +PREHOOK: Input: default@tabc +#### A masked pattern was here #### +POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@taba +POSTHOOK: Input: default@tabb +POSTHOOK: Input: default@tabc +#### A masked pattern was here #### +1 NULL 1 +2 NULL 2 +2 NULL 2 +PREHOOK: query: INSERT INTO TABLE tabb VALUES ('1') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tabb +POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('1') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tabb +POSTHOOK: Lineage: tabb.id SCRIPT [] +PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +PREHOOK: type: QUERY +PREHOOK: Input: default@taba +PREHOOK: Input: default@tabb +PREHOOK: Input: default@tabc +#### A masked pattern was here #### +POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@taba +POSTHOOK: Input: default@tabb +POSTHOOK: Input: default@tabc +#### A masked pattern was here #### +2 NULL 2 +2 NULL 2 +PREHOOK: query: INSERT INTO TABLE tabb VALUES ('2') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tabb +POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('2') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tabb +POSTHOOK: Lineage: tabb.id SCRIPT [] +PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +PREHOOK: type: QUERY +PREHOOK: Input: default@taba +PREHOOK: Input: default@tabb +PREHOOK: Input: default@tabc +#### A masked pattern was here #### +POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@taba +POSTHOOK: Input: default@tabb +POSTHOOK: Input: default@tabc +#### A masked pattern was here ####