From 00588beeb8f545aee75beb01c356cf102487bd11 Mon Sep 17 00:00:00 2001 From: larsk-db Date: Tue, 11 Jul 2023 10:58:52 +0200 Subject: [PATCH 1/2] Fix streaming when the same file occurs with different DVs in the same batch --- .../spark/sql/delta/sources/DeltaSource.scala | 36 +- .../DeltaSourceDeletionVectorsSuite.scala | 415 ++++++++++++++++++ 2 files changed, 439 insertions(+), 12 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/sql/delta/DeltaSourceDeletionVectorsSuite.scala diff --git a/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala b/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala index 32677ee67a3..a412fbec16b 100644 --- a/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala +++ b/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala @@ -20,9 +20,7 @@ package org.apache.spark.sql.delta.sources import java.io.FileNotFoundException import java.sql.Timestamp -import scala.collection.mutable import scala.util.{Failure, Success, Try} -import scala.util.control.NonFatal import scala.util.matching.Regex import org.apache.spark.sql.delta._ @@ -316,16 +314,30 @@ trait DeltaSourceBase extends Source * @param indexedFiles actions iterator from which to generate the DataFrame. */ protected def createDataFrame(indexedFiles: Iterator[IndexedFile]): DataFrame = { - val addFilesList = indexedFiles - .map(_.getFileAction) - .filter(_.isInstanceOf[AddFile]) - .asInstanceOf[Iterator[AddFile]].toArray - - deltaLog.createDataFrame( - readSchemaSnapshotDescriptor, - addFilesList, - isStreaming = true - ) + // Read AddFiles from different versions in different scans. + // This avoids an issue where we might read the same file with different deletion vectors in + // the same scan, which we cannot support as long we broadcast a map of DVs for lookup. + // This code can be removed once we can pass the DVs into the scan directly together with the + // AddFile/PartitionedFile entry. + indexedFiles + .filter(_.getFileAction.isInstanceOf[AddFile]) + .toSeq + .groupBy(_.version) + .values + .map { addFilesList => + deltaLog.createDataFrame( + readSchemaSnapshotDescriptor, + addFilesList.map(_.getFileAction.asInstanceOf[AddFile]), + isStreaming = true + ) + }.reduceOption(_ union _).getOrElse { + // If we filtered out all the values before the groupBy, just return an empty DataFrame. + deltaLog.createDataFrame( + readSchemaSnapshotDescriptor, + Seq.empty[AddFile], + isStreaming = true + ) + } } /** diff --git a/core/src/test/scala/org/apache/spark/sql/delta/DeltaSourceDeletionVectorsSuite.scala b/core/src/test/scala/org/apache/spark/sql/delta/DeltaSourceDeletionVectorsSuite.scala new file mode 100644 index 00000000000..79ddd1263bc --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/delta/DeltaSourceDeletionVectorsSuite.scala @@ -0,0 +1,415 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.delta + +import java.io.File + +import scala.util.control.NonFatal + +import org.apache.hadoop.fs.Path +import org.scalatest.concurrent.Eventually +import org.scalatest.concurrent.PatienceConfiguration.Timeout + +import org.apache.spark.sql.delta.test.DeltaSQLCommandTest +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.streaming.{StreamTest, Trigger} +import org.apache.spark.sql.streaming.util.StreamManualClock + +trait DeltaSourceDeletionVectorTests extends StreamTest with DeletionVectorsTestUtils { + + import testImplicits._ + + test("allow to delete files before starting a streaming query") { + withTempDir { inputDir => + val deltaLog = DeltaLog.forTable(spark, new Path(inputDir.toURI)) + (0 until 5).foreach { i => + val v = Seq(i.toString).toDF + v.write.mode("append").format("delta").save(deltaLog.dataPath.toString) + } + sql(s"DELETE FROM delta.`$inputDir`") + (5 until 10).foreach { i => + val v = Seq(i.toString).toDF + v.write.mode("append").format("delta").save(deltaLog.dataPath.toString) + } + deltaLog.checkpoint() + assert(deltaLog.readLastCheckpointFile().nonEmpty, "this test requires a checkpoint") + + val df = spark.readStream + .format("delta") + .load(inputDir.getCanonicalPath) + + testStream(df)( + AssertOnQuery { q => + q.processAllAvailable() + true + }, + CheckAnswer((5 until 10).map(_.toString): _*)) + } + } + + test("allow to delete files before staring a streaming query without checkpoint") { + withTempDir { inputDir => + val deltaLog = DeltaLog.forTable(spark, new Path(inputDir.toURI)) + (0 until 5).foreach { i => + val v = Seq(i.toString).toDF + v.write.mode("append").format("delta").save(deltaLog.dataPath.toString) + } + sql(s"DELETE FROM delta.`$inputDir`") + (5 until 7).foreach { i => + val v = Seq(i.toString).toDF + v.write.mode("append").format("delta").save(deltaLog.dataPath.toString) + } + assert(deltaLog.readLastCheckpointFile().isEmpty, "this test requires no checkpoint") + + val df = spark.readStream + .format("delta") + .load(inputDir.getCanonicalPath) + + testStream(df)( + AssertOnQuery { q => + q.processAllAvailable() + true + }, + CheckAnswer((5 until 7).map(_.toString): _*)) + } + } + + /** + * If deletion vectors are expected here, return true if they are present. If none are expected, + * return true if none are present. 
+ */ + protected def deletionVectorsPresentIfExpected( + inputDir: String, + expectDVs: Boolean): Boolean = { + val deltaLog = DeltaLog.forTable(spark, inputDir) + val filesWithDVs = getFilesWithDeletionVectors(deltaLog) + logWarning(s"Expecting DVs=$expectDVs - found ${filesWithDVs.size}") + if (expectDVs) { + filesWithDVs.nonEmpty + } else { + filesWithDVs.isEmpty + } + } + + private def ignoreOperationsTest( + inputDir: String, + sourceOptions: Seq[(String, String)], + sqlCommand: String, + commandShouldProduceDVs: Option[Boolean] = None)(expectations: StreamAction*): Unit = { + (0 until 10 by 2).foreach { i => + Seq(i, i + 1).toDF().coalesce(1).write.format("delta").mode("append").save(inputDir) + } + + val df = spark.readStream.format("delta").options(sourceOptions.toMap).load(inputDir) + val expectDVs = commandShouldProduceDVs.getOrElse( + sqlCommand.toUpperCase().startsWith("DELETE")) + + val base = Seq( + AssertOnQuery { q => + q.processAllAvailable() + true + }, + CheckAnswer((0 until 10): _*), + AssertOnQuery { q => + sql(sqlCommand) + deletionVectorsPresentIfExpected(inputDir, expectDVs) + }) + + testStream(df)((base ++ expectations): _*) + } + + private def ignoreOperationsTestWithManualClock( + inputDir: String, + sourceOptions: Seq[(String, String)], + sqlCommand1: String, + sqlCommand2: String, + command1ShouldProduceDVs: Option[Boolean] = None, + command2ShouldProduceDVs: Option[Boolean] = None, + expectations: List[StreamAction]): Unit = { + + (0 until 15 by 3).foreach { i => + Seq(i, i + 1, i + 2).toDF().coalesce(1).write.format("delta").mode("append").save(inputDir) + } + val log = DeltaLog.forTable(spark, inputDir) + val commitVersionBeforeDML = log.update().version + val df = spark.readStream.format("delta").options(sourceOptions.toMap).load(inputDir) + def expectDVsInCommand(shouldProduceDVs: Option[Boolean], command: String): Boolean = { + shouldProduceDVs.getOrElse(command.toUpperCase().startsWith("DELETE")) + } + val expectDVsInCommand1 = expectDVsInCommand(command1ShouldProduceDVs, sqlCommand1) + val expectDVsInCommand2 = expectDVsInCommand(command2ShouldProduceDVs, sqlCommand2) + + // If it's expected to fail we must be sure not to actually process it in here, + // or it'll fail too early instead of being caught by ExpectFailure. + val shouldFailAfterCommands = expectations.exists(_.isInstanceOf[ExpectFailure[_]]) + + val baseActions: Seq[StreamAction] = Seq( + StartStream( + Trigger.ProcessingTime("10 seconds"), + new StreamManualClock(System.currentTimeMillis())), + AdvanceManualClock(10L * 1000L), + CheckAnswer((0 until 15): _*), + AssertOnQuery { q => + // Make sure we only processed a single batch since the initial data load. + q.commitLog.getLatestBatchId().get == 0 + }, + AssertOnQuery { q => + sql(sqlCommand1) + deletionVectorsPresentIfExpected(inputDir, expectDVsInCommand1) + }, + AssertOnQuery { q => + sql(sqlCommand2) + deletionVectorsPresentIfExpected(inputDir, expectDVsInCommand2) + }, + AdvanceManualClock(20L * 1000L)) ++ + (if (shouldFailAfterCommands) { + Seq.empty[StreamAction] + } else { + Seq( + // This makes it move to the next batch. + AssertOnQuery(waitUntilBatchProcessed(1, _)), + AssertOnQuery { q => + eventually("Next batch was never processed") { + // Make sure we only processed a single batch since the initial data load. 
+ assert(q.commitLog.getLatestBatchId().get === 1) + } + true + }) + }) + + testStream(df)((baseActions ++ expectations): _*) + } + + protected def waitUntilBatchProcessed(batchId: Int, currentStream: StreamExecution): Boolean = { + eventually("Next batch was never processed") { + if (!currentStream.exception.isDefined) { + assert(currentStream.commitLog.getLatestBatchId().get >= batchId) + } + } + if (currentStream.exception.isDefined) { + throw currentStream.exception.get + } + true + } + + protected def eventually[T](message: String)(func: => T): T = { + try { + Eventually.eventually(Timeout(streamingTimeout)) { + func + } + } catch { + case NonFatal(e) => + fail(message, e) + } + } + + testQuietly(s"deleting files fails query if ignoreDeletes = false") { + withTempDir { inputDir => + ignoreOperationsTest( + inputDir.getAbsolutePath, + sourceOptions = Nil, + sqlCommand = s"DELETE FROM delta.`$inputDir`", + // Whole table deletes do not produce DVs. + commandShouldProduceDVs = Some(false))(ExpectFailure[DeltaUnsupportedOperationException] { + e => + for (msg <- Seq("Detected deleted data", "not supported", "ignoreDeletes", "true")) { + assert(e.getMessage.contains(msg)) + } + }) + } + } + + Seq("ignoreFileDeletion", DeltaOptions.IGNORE_DELETES_OPTION).foreach { ignoreDeletes => + testQuietly( + s"allow to delete files after staring a streaming query when $ignoreDeletes is true") { + withTempDir { inputDir => + ignoreOperationsTest( + inputDir.getAbsolutePath, + sourceOptions = Seq(ignoreDeletes -> "true"), + sqlCommand = s"DELETE FROM delta.`$inputDir`", + // Whole table deletes do not produce DVs. + commandShouldProduceDVs = Some(false))( + AssertOnQuery { q => + Seq(10).toDF().write.format("delta").mode("append").save(inputDir.getAbsolutePath) + q.processAllAvailable() + true + }, + CheckAnswer((0 to 10): _*)) + } + } + } + + case class SourceChangeVariant( + label: String, + query: File => String, + answerWithIgnoreChanges: Seq[Int]) + + val sourceChangeVariants: Seq[SourceChangeVariant] = Seq( + // A partial file delete is treated like an update by the Source. + SourceChangeVariant( + label = "DELETE", + query = inputDir => s"DELETE FROM delta.`$inputDir` WHERE value = 3", + // 2 occurs in the same file as 3, so it gets duplicated during processing. 
+ answerWithIgnoreChanges = (0 to 10) :+ 2)) + + for (variant <- sourceChangeVariants) + testQuietly( + "updating the source table causes failure when ignoreChanges = false" + + s" - using ${variant.label}") { + withTempDir { inputDir => + ignoreOperationsTest( + inputDir.getAbsolutePath, + sourceOptions = Nil, + sqlCommand = variant.query(inputDir))( + ExpectFailure[DeltaUnsupportedOperationException] { e => + for (msg <- Seq("data update", "not supported", "skipChangeCommits", "true")) { + assert(e.getMessage.contains(msg)) + } + }) + } + } + + for (variant <- sourceChangeVariants) + testQuietly( + "allow to update the source table when ignoreChanges = true" + + s" - using ${variant.label}") { + withTempDir { inputDir => + ignoreOperationsTest( + inputDir.getAbsolutePath, + sourceOptions = Seq(DeltaOptions.IGNORE_CHANGES_OPTION -> "true"), + sqlCommand = variant.query(inputDir))( + AssertOnQuery { q => + Seq(10).toDF().write.format("delta").mode("append").save(inputDir.getAbsolutePath) + q.processAllAvailable() + true + }, + CheckAnswer(variant.answerWithIgnoreChanges: _*)) + } + } + + testQuietly("deleting files when ignoreChanges = true doesn't fail the query") { + withTempDir { inputDir => + ignoreOperationsTest( + inputDir.getAbsolutePath, + sourceOptions = Seq(DeltaOptions.IGNORE_CHANGES_OPTION -> "true"), + sqlCommand = s"DELETE FROM delta.`$inputDir`", + // Whole table deletes do not produce DVs. + commandShouldProduceDVs = Some(false))( + AssertOnQuery { q => + Seq(10).toDF().write.format("delta").mode("append").save(inputDir.getAbsolutePath) + q.processAllAvailable() + true + }, + CheckAnswer((0 to 10): _*)) + } + } + + for (variant <- sourceChangeVariants) + testQuietly("updating source table when ignoreDeletes = true fails the query" + + s" - using ${variant.label}") { + withTempDir { inputDir => + ignoreOperationsTest( + inputDir.getAbsolutePath, + sourceOptions = Seq(DeltaOptions.IGNORE_DELETES_OPTION -> "true"), + sqlCommand = variant.query(inputDir))( + ExpectFailure[DeltaUnsupportedOperationException] { e => + for (msg <- Seq("data update", "not supported", "skipChangeCommits", "true")) { + assert(e.getMessage.contains(msg)) + } + }) + } + } + + private val allSourceOptions = Seq( + Nil, + List(DeltaOptions.IGNORE_DELETES_OPTION), + List(DeltaOptions.IGNORE_CHANGES_OPTION), + List(DeltaOptions.SKIP_CHANGE_COMMITS_OPTION)) + .map { options => + options.map(key => key -> "true") + } + + for (sourceOption <- allSourceOptions) + testQuietly("subsequent DML commands are processed correctly in a batch - DELETE->DELETE" + + s" - $sourceOption") { + val expectations: List[StreamAction] = + sourceOption.map(_._1) match { + case List(DeltaOptions.IGNORE_DELETES_OPTION) | Nil => + // These two do not allow updates. + ExpectFailure[DeltaUnsupportedOperationException] { e => + for (msg <- Seq("data update", "not supported", "skipChangeCommits", "true")) { + assert(e.getMessage.contains(msg)) + } + } :: Nil + case List(DeltaOptions.IGNORE_CHANGES_OPTION) => + // The 4 and 5 are in the same file as 3, so the first DELETE is going to duplicate them. + // 5 is still in the same file as 4 after the first DELETE, so the second DELETE is going + // to duplicate it again. + CheckAnswer((0 until 15) ++ Seq(4, 5, 5): _*) :: Nil + case List(DeltaOptions.SKIP_CHANGE_COMMITS_OPTION) => + // This will completely ignore the DELETEs. 
+ CheckAnswer((0 until 15): _*) :: Nil + } + + withTempDir { inputDir => + ignoreOperationsTestWithManualClock( + inputDir.getAbsolutePath, + sourceOptions = sourceOption, + sqlCommand1 = s"DELETE FROM delta.`$inputDir` WHERE value == 3", + sqlCommand2 = s"DELETE FROM delta.`$inputDir` WHERE value == 4", + expectations = expectations) + } + } + + for (sourceOption <- allSourceOptions) + testQuietly("subsequent DML commands are processed correctly in a batch - INSERT->UPDATE" + + s" - $sourceOption") { + val expectations: List[StreamAction] = sourceOption.map(_._1) match { + case List(DeltaOptions.IGNORE_DELETES_OPTION) | Nil => + // These two do not allow updates. + ExpectFailure[DeltaUnsupportedOperationException] { e => + for (msg <- Seq("data update", "not supported", "skipChangeCommits", "true")) { + assert(e.getMessage.contains(msg)) + } + } :: Nil + case List(DeltaOptions.IGNORE_CHANGES_OPTION) => + // 15 and 16 are in the same file, so 16 will get duplicated by the DELETE. + CheckAnswer((0 to 16) ++ Seq(16): _*) :: Nil + case List(DeltaOptions.SKIP_CHANGE_COMMITS_OPTION) => + // This will completely ignore the DELETE. + CheckAnswer((0 to 16): _*) :: Nil + } + + withTempDir { inputDir => + ignoreOperationsTestWithManualClock( + inputDir.getAbsolutePath, + sourceOptions = sourceOption, + sqlCommand1 = + s"INSERT INTO delta.`$inputDir` SELECT /*+ COALESCE(1) */ * FROM VALUES 15, 16", + sqlCommand2 = s"DELETE FROM delta.`$inputDir` WHERE value == 15", + expectations = expectations) + } + } +} + +class DeltaSourceDeletionVectorsSuite extends DeltaSourceSuiteBase + with DeltaSQLCommandTest + with DeltaSourceDeletionVectorTests { + override def beforeAll(): Unit = { + super.beforeAll() + enableDeletionVectorsInNewTables(spark.conf) + } +} From c2d774ceb982441f301d00f23dda9ee41808d06f Mon Sep 17 00:00:00 2001 From: larsk-db Date: Thu, 20 Jul 2023 11:26:58 +0200 Subject: [PATCH 2/2] address comments --- .../spark/sql/delta/sources/DeltaSource.scala | 54 +++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala b/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala index a412fbec16b..60a96306b7a 100644 --- a/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala +++ b/core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSource.scala @@ -314,30 +314,40 @@ trait DeltaSourceBase extends Source * @param indexedFiles actions iterator from which to generate the DataFrame. */ protected def createDataFrame(indexedFiles: Iterator[IndexedFile]): DataFrame = { - // Read AddFiles from different versions in different scans. - // This avoids an issue where we might read the same file with different deletion vectors in - // the same scan, which we cannot support as long we broadcast a map of DVs for lookup. - // This code can be removed once we can pass the DVs into the scan directly together with the - // AddFile/PartitionedFile entry. - indexedFiles + val addFiles = indexedFiles .filter(_.getFileAction.isInstanceOf[AddFile]) .toSeq - .groupBy(_.version) - .values - .map { addFilesList => - deltaLog.createDataFrame( - readSchemaSnapshotDescriptor, - addFilesList.map(_.getFileAction.asInstanceOf[AddFile]), - isStreaming = true - ) - }.reduceOption(_ union _).getOrElse { - // If we filtered out all the values before the groupBy, just return an empty DataFrame. 
- deltaLog.createDataFrame( - readSchemaSnapshotDescriptor, - Seq.empty[AddFile], - isStreaming = true - ) - } + val hasDeletionVectors = + addFiles.exists(_.getFileAction.asInstanceOf[AddFile].deletionVector != null) + if (hasDeletionVectors) { + // Read AddFiles from different versions in different scans. + // This avoids an issue where we might read the same file with different deletion vectors in + // the same scan, which we cannot support as long we broadcast a map of DVs for lookup. + // This code can be removed once we can pass the DVs into the scan directly together with the + // AddFile/PartitionedFile entry. + addFiles + .groupBy(_.version) + .values + .map { addFilesList => + deltaLog.createDataFrame( + readSchemaSnapshotDescriptor, + addFilesList.map(_.getFileAction.asInstanceOf[AddFile]), + isStreaming = true) + } + .reduceOption(_ union _) + .getOrElse { + // If we filtered out all the values before the groupBy, just return an empty DataFrame. + deltaLog.createDataFrame( + readSchemaSnapshotDescriptor, + Seq.empty[AddFile], + isStreaming = true) + } + } else { + deltaLog.createDataFrame( + readSchemaSnapshotDescriptor, + addFiles.map(_.getFileAction.asInstanceOf[AddFile]), + isStreaming = true) + } } /**
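
Note (not part of the patch): the sketch below is a minimal, self-contained Scala illustration of the grouping idea the fix relies on. The types and the planScans helper are hypothetical stand-ins that only mirror the shape of the real IndexedFile/AddFile actions; the actual change lives in DeltaSource.createDataFrame above, which builds one deltaLog.createDataFrame scan per commit version and unions them whenever deletion vectors are present. Because a path can carry at most one deletion vector within a single commit version, splitting by version keeps any per-scan DV lookup map free of the same file appearing with two different DVs, while the no-DV case keeps the cheaper single combined scan (the else branch added in the second commit).

// Simplified stand-in types, not the real Delta Lake classes.
object PlanScansExample extends App {
  case class DeletionVectorDescriptor(id: String)
  case class AddFile(path: String, deletionVector: Option[DeletionVectorDescriptor])
  case class IndexedFile(version: Long, add: AddFile)

  // Hypothetical mirror of the per-version split: one scan per commit version when any
  // deletion vectors are present, a single combined scan otherwise. (The sort is only for
  // deterministic output here; the patch uses groupBy(_.version).values and unions DataFrames.)
  def planScans(files: Seq[IndexedFile]): Seq[Seq[AddFile]] = {
    val hasDeletionVectors = files.exists(_.add.deletionVector.isDefined)
    if (hasDeletionVectors) {
      files.groupBy(_.version).toSeq.sortBy(_._1).map { case (_, group) => group.map(_.add) }
    } else {
      Seq(files.map(_.add))
    }
  }

  val files = Seq(
    IndexedFile(1L, AddFile("part-0.parquet", None)),
    IndexedFile(2L, AddFile("part-0.parquet", Some(DeletionVectorDescriptor("dv-a")))),
    IndexedFile(3L, AddFile("part-0.parquet", Some(DeletionVectorDescriptor("dv-b")))))

  // Prints three scans; part-0.parquet appears exactly once per scan, each time with at
  // most one deletion vector -- the combination a single broadcast DV lookup map cannot represent.
  planScans(files).zipWithIndex.foreach { case (scan, i) =>
    println(s"scan $i -> " + scan.mkString(", "))
  }
}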