
Commit 94452d6

smurching authored and sueann committed
[ML-3035] Add persistence & test for DeepImageFeaturizer in Scala (#86)
Adds persistence for the Scala implementation of DeepImageFeaturizer. Since DeepImageFeaturizer stores all of its data as instances of org.apache.spark.ml.Param, all we need to do is extend Spark's default ML persistence traits (DefaultParamsReadable, DefaultParamsWritable). See MLlib's Binarizer and BinarizerSuite for an example of a similar Transformer/test suite pair. As is, this PR enables DeepImageFeaturizer persistence in Scala but not Python; to enable it in Python, we just need to inherit from JavaMLWriter & JavaMLReader, which define Python persistence methods (save, load) that call the corresponding Scala methods. However, we need to ensure that the behavior in Python and Scala is consistent before we support persistence read/write in Python.
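For reference, the pattern this commit relies on is small: a Transformer whose state lives entirely in Param instances mixes DefaultParamsWritable into the class and extends its companion object with DefaultParamsReadable, and Spark's default Params-based (de)serialization does the rest. A minimal sketch of the shape involved (the IdentityTransformer below is hypothetical, not part of this commit):

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

// Hypothetical Transformer whose entire state lives in Params, so the
// default Params-based persistence is sufficient.
class IdentityTransformer(override val uid: String)
    extends Transformer with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("identityTransformer"))

  override def transform(dataset: Dataset[_]): DataFrame = dataset.toDF()

  override def transformSchema(schema: StructType): StructType = schema

  override def copy(extra: ParamMap): IdentityTransformer = defaultCopy(extra)
}

// The companion picks up `read` and `load` from DefaultParamsReadable,
// mirroring the change made to object DeepImageFeaturizer below.
object IdentityTransformer extends DefaultParamsReadable[IdentityTransformer]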
1 parent 7fbabd0

File tree: 4 files changed (+144 additions, −4 deletions)

src/main/scala/com/databricks/sparkdl/DeepImageFeaturizer.scala

Lines changed: 2 additions & 3 deletions

@@ -23,11 +23,10 @@ import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
 import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.param.{Param, ParamMap}
-import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
+import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.StructType
-
 import org.tensorflow.framework.GraphDef
 import org.tensorframes.{Shape, ShapeDescription}
 import org.tensorframes.impl.DebugRowOps
@@ -126,7 +125,7 @@ class DeepImageFeaturizer(override val uid: String) extends Transformer with Def
   }
 }
 
-object DeepImageFeaturizer {
+object DeepImageFeaturizer extends DefaultParamsReadable[DeepImageFeaturizer] {
   /**
    * The deep image featurizer uses the information provided by named Image model to apply the
    * tensorflow graph, given in NamedImageModel.graph as a GraphDef, to an image column of a
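With the companion object now extending DefaultParamsReadable, a full save/load round trip works from Scala. A minimal usage sketch, assuming an illustrative output path and the InceptionV3 named model (both hypothetical values here, not taken from this commit):

import com.databricks.sparkdl.DeepImageFeaturizer

val featurizer = new DeepImageFeaturizer()
  .setModelName("InceptionV3")  // illustrative model name
  .setInputCol("image")
  .setOutputCol("features")

// DefaultParamsWritable serializes the transformer's Params as metadata.
featurizer.write.overwrite().save("/tmp/featurizer")  // hypothetical path

// DefaultParamsReadable supplies the matching load on the companion object.
val restored = DeepImageFeaturizer.load("/tmp/featurizer")
assert(restored.uid == featurizer.uid)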

src/test/scala/com/databricks/sparkdl/DeepImageFeaturizerSuite.scala

Lines changed: 9 additions & 1 deletion

@@ -23,7 +23,7 @@ import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.types.{StructField, StructType}
 import org.scalatest.FunSuite
 
-class DeepImageFeaturizerSuite extends FunSuite with TestSparkContext {
+class DeepImageFeaturizerSuite extends FunSuite with TestSparkContext with DefaultReadWriteTest {
 
   var data: DataFrame = _
 
@@ -118,4 +118,12 @@ class DeepImageFeaturizerSuite extends FunSuite with TestSparkContext {
       featurizer.setModelName("noSuchModel")
     }
   }
+
+  test("DeepImageFeaturizer persistence") {
+    val featurizer = new DeepImageFeaturizer()
+      .setModelName("_test")
+      .setInputCol("myInput")
+      .setOutputCol("myOutput")
+    testDefaultReadWrite(featurizer)
+  }
 }
src/test/scala/com/databricks/sparkdl/DefaultReadWriteTest.scala

Lines changed: 83 additions & 0 deletions (new file)

/*
 * Copyright 2017 Databricks, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.databricks.sparkdl

import java.io.{File, IOException}

import org.scalatest.Suite

import org.apache.spark.ml.param.Params
import org.apache.spark.ml.util.{Identifiable, MLReader, MLWritable, TempDirectory}

/**
 * Copied from Spark (https://github.com/apache/spark/blob/branch-2.2/mllib/src/test/scala/org/
 * apache/spark/ml/util/DefaultReadWriteTest.scala).
 *
 * Trait containing a default persistence test for Estimators/Transformers whose data is stored
 * entirely in [[org.apache.spark.ml.param.Param]] instances.
 */
trait DefaultReadWriteTest extends TempDirectory { self: Suite =>

  /**
   * Checks "overwrite" option and params.
   * This saves to and loads from [[tempDir]], but creates a subdirectory with a random name
   * in order to avoid conflicts from multiple calls to this method.
   *
   * @param instance ML instance to test saving/loading
   * @param testParams If true, then test values of Params. Otherwise, just test overwrite option.
   * @tparam T ML instance type
   * @return Instance loaded from file
   */
  def testDefaultReadWrite[T <: Params with MLWritable](
      instance: T,
      testParams: Boolean = true): T = {
    val uid = instance.uid
    val subdirName = Identifiable.randomUID("test")

    val subdir = new File(tempDir, subdirName)
    val path = new File(subdir, uid).getPath

    instance.save(path)
    intercept[IOException] {
      instance.save(path)
    }
    instance.write.overwrite().save(path)
    val loader = instance.getClass.getMethod("read").invoke(null).asInstanceOf[MLReader[T]]
    val newInstance = loader.load(path)
    assert(newInstance.uid === instance.uid)
    if (testParams) {
      instance.params.foreach { p =>
        if (instance.isDefined(p)) {
          (instance.getOrDefault(p), newInstance.getOrDefault(p)) match {
            case (Array(values), Array(newValues)) =>
              assert(values === newValues, s"Values do not match on param ${p.name}.")
            case (value, newValue) =>
              assert(value === newValue, s"Values do not match on param ${p.name}.")
          }
        } else {
          assert(!newInstance.isDefined(p), s"Param ${p.name} shouldn't be defined.")
        }
      }
    }

    val load = instance.getClass.getMethod("load", classOf[String])
    val another = load.invoke(instance, path).asInstanceOf[T]
    assert(another.uid === instance.uid)
    another
  }
}
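One detail of testDefaultReadWrite worth calling out: instance.getClass.getMethod("read").invoke(null) resolves because Scala emits static forwarders on a class for its companion object's methods, so the companion's read is reachable through the instance's Class. Specialized to DeepImageFeaturizer, the reflective lookup is equivalent to this direct sketch (the path value is hypothetical):

import org.apache.spark.ml.util.MLReader
import com.databricks.sparkdl.DeepImageFeaturizer

val path = "/tmp/featurizer"  // hypothetical location written by a prior save
// Non-reflective equivalent of getMethod("read").invoke(null) in the test:
val reader: MLReader[DeepImageFeaturizer] = DeepImageFeaturizer.read
val restored: DeepImageFeaturizer = reader.load(path)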
src/test/scala/org/apache/spark/ml/util/TempDirectory.scala

Lines changed: 50 additions & 0 deletions (new file)

/*
 * Copyright 2017 Databricks, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.util

import java.io.File

import org.scalatest.{BeforeAndAfterAll, Suite}

import org.apache.spark.util.Utils

/**
 * Trait that creates a temporary directory before all tests and deletes it after all.
 */
trait TempDirectory extends BeforeAndAfterAll { self: Suite =>

  private var _tempDir: File = _

  /**
   * Returns the temporary directory as a `File` instance.
   */
  protected def tempDir: File = _tempDir

  override def beforeAll(): Unit = {
    super.beforeAll()
    _tempDir = Utils.createTempDir(namePrefix = this.getClass.getName)
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(_tempDir)
    } finally {
      super.afterAll()
    }
  }
}
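A minimal sketch of consuming this trait (the suite below is hypothetical): because TempDirectory stacks on BeforeAndAfterAll, tempDir is created once before the first test and recursively deleted after the last, and the try/finally in afterAll guarantees super.afterAll() still runs if deletion throws.

import java.io.File

import org.scalatest.FunSuite

import org.apache.spark.ml.util.TempDirectory

// Hypothetical suite: each test may write freely under `tempDir`.
class ScratchSpaceSuite extends FunSuite with TempDirectory {
  test("writes land under the shared temp directory") {
    val scratch = new File(tempDir, "scratch")
    assert(scratch.mkdirs())
  }
}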
