From 6298b6f8a5662836c18b7e28fff6b0b6d1bf37d0 Mon Sep 17 00:00:00 2001 From: Johan Lasperas Date: Thu, 22 Jun 2023 12:13:09 +0200 Subject: [PATCH] Add read char padding test and reword comments --- .../scala/org/apache/spark/sql/delta/DeltaTable.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaTable.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaTable.scala index 8cb90e94b3c..252dd4b3438 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaTable.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaTable.scala @@ -358,6 +358,7 @@ object DeltaTableUtils extends PredicateHelper var hasChar = false var newTarget = target transformDown { case l @ LogicalRelation(hfsr: HadoopFsRelation, _, _, _) => + // Prune columns from the scan. val finalOutput = actualNewOutput.getOrElse(l.output).filterNot { col => columnsToDrop.exists(resolver(_, col.name)) } @@ -372,6 +373,9 @@ object DeltaTableUtils extends PredicateHelper l.copy(relation = newBaseRelation, output = finalOutput) case p @ Project(projectList, _) => + // Spark does char type read-side padding via an additional Project over the scan node. + // `newOutput` references the Project attributes; we need to translate their expression IDs + // so that `newOutput` references attributes from the LogicalRelation instead. def hasCharPadding(e: Expression): Boolean = e.exists { case s: StaticInvoke => s.staticObject == classOf[CharVarcharCodegenUtils] && s.functionName == "readSidePadding" @@ -392,12 +396,11 @@ object DeltaTableUtils extends PredicateHelper } if (hasChar) { + // When char type read-side padding is applied, we need to apply column pruning for the + // Project as well; otherwise the Project will contain missing attributes. 
newTarget = newTarget.transformUp { case p @ Project(projectList, child) => val newProjectList = projectList.filter { e => - // Spark does char type read-side padding via an additional Project over the scan node, - // and we need to apply column pruning for the Project as well, otherwise the Project - // will contain missing attributes. e.references.subsetOf(child.outputSet) } p.copy(projectList = newProjectList)