diff --git a/pom.xml b/pom.xml
index 32784c3..280626b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -13,6 +13,7 @@
         <module>wordcount</module>
+        <module>sindy</module>
         <module>pagerank</module>
diff --git a/sindy/.gitignore b/sindy/.gitignore
new file mode 100644
index 0000000..a81a9ed
--- /dev/null
+++ b/sindy/.gitignore
@@ -0,0 +1,13 @@
+target/
+pom.xml.tag
+pom.xml.releaseBackup
+pom.xml.versionsBackup
+pom.xml.next
+release.properties
+dependency-reduced-pom.xml
+buildNumber.properties
+.mvn/timing.properties
+
+# IntelliJ
+.idea/
+*.iml
diff --git a/sindy/pom.xml b/sindy/pom.xml
new file mode 100644
index 0000000..c4c5dee
--- /dev/null
+++ b/sindy/pom.xml
@@ -0,0 +1,265 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>com.github.sekruse</groupId>
+        <artifactId>rheem-examples</artifactId>
+        <version>1.0</version>
+    </parent>
+
+    <artifactId>sindy</artifactId>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>3.2.2</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>compile</goal>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+    <reporting>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+            </plugin>
+        </plugins>
+    </reporting>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.qcri.rheem</groupId>
+            <artifactId>rheem-core</artifactId>
+            <version>${rheem.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.qcri.rheem</groupId>
+            <artifactId>rheem-basic</artifactId>
+            <version>${rheem.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.qcri.rheem</groupId>
+            <artifactId>rheem-java</artifactId>
+            <version>${rheem.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.qcri.rheem</groupId>
+            <artifactId>rheem-spark</artifactId>
+            <version>${rheem.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.qcri.rheem</groupId>
+            <artifactId>rheem-api</artifactId>
+            <version>${rheem.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.qcri.rheem</groupId>
+            <artifactId>rheem-sqlite3</artifactId>
+            <version>${rheem.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_${scala.compat.version}</artifactId>
+            <version>${spark.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+            <scope>${external.platforms.scope}</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-graphx_${scala.compat.version}</artifactId>
+            <version>${spark.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+            <scope>${external.platforms.scope}</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-hdfs</artifactId>
+            <scope>${external.platforms.scope}</scope>
+            <version>${hadoop.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>commons-daemon</groupId>
+                    <artifactId>commons-daemon</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.avro</groupId>
+                    <artifactId>avro</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.sun.jersey</groupId>
+                    <artifactId>jersey-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.sun.jersey</groupId>
+                    <artifactId>jersey-server</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>javax.servlet</groupId>
+                    <artifactId>servlet-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>javax.servlet.jsp</groupId>
+                    <artifactId>jsp-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>tomcat</groupId>
+                    <artifactId>jasper-runtime</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <scope>${external.platforms.scope}</scope>
+            <version>${hadoop.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>tomcat</groupId>
+                    <artifactId>jasper-compiler</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>tomcat</groupId>
+                    <artifactId>jasper-runtime</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>javax.servlet</groupId>
+                    <artifactId>servlet-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>javax.servlet.jsp</groupId>
+                    <artifactId>jsp-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>commons-logging</groupId>
+                    <artifactId>commons-logging-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>jetty</groupId>
+                    <artifactId>org.mortbay.jetty</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty-util</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jsp-api-2.1</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>servlet-api-2.5</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.sun.jersey</groupId>
+                    <artifactId>jersey-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.sun.jersey</groupId>
+                    <artifactId>jersey-json</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.sun.jersey</groupId>
+                    <artifactId>jersey-server</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.eclipse.jdt</groupId>
+                    <artifactId>core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.avro</groupId>
+                    <artifactId>avro-ipc</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>net.sf.kosmosfs</groupId>
+                    <artifactId>kfs</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>net.java.dev.jets3t</groupId>
+                    <artifactId>jets3t</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.jcraft</groupId>
+                    <artifactId>jsch</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>commons-el</groupId>
+                    <artifactId>commons-el</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-simple</artifactId>
+            <version>1.7.13</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>
diff --git a/sindy/src/main/scala/com/github/sekruse/sindy/Sindy.scala b/sindy/src/main/scala/com/github/sekruse/sindy/Sindy.scala
new file mode 100644
index 0000000..77d27f2
--- /dev/null
+++ b/sindy/src/main/scala/com/github/sekruse/sindy/Sindy.scala
@@ -0,0 +1,214 @@
+package com.github.sekruse.sindy
+
+import java.util
+
+import com.github.sekruse.sindy.Sindy._
+import org.qcri.rheem.api._
+import org.qcri.rheem.basic.data.Record
+import org.qcri.rheem.core.api.{Configuration, RheemContext}
+import org.qcri.rheem.core.function.FunctionDescriptor.{SerializableBinaryOperator, SerializableFunction}
+import org.qcri.rheem.core.plugin.Plugin
+import org.qcri.rheem.java.Java
+import org.qcri.rheem.spark.Spark
+import org.qcri.rheem.sqlite3.Sqlite3
+import org.qcri.rheem.sqlite3.operators.Sqlite3TableSource
+
+import scala.collection.mutable
+
+/**
+ * This is a Rheem-based implementation of the SINDY algorithm.
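+ *
+ * A rough usage sketch (hypothetical database, table, and column names):
+ * {{{
+ *   val schema = Map("customers" -> Seq("id", "name"))
+ *   val sindy = new Sindy(new Configuration, Java.basicPlugin, Sqlite3.plugin)
+ *   val inds = sindy("jdbc:sqlite:test.db", schema, ("customers", Seq("*")))
+ * }}}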
+ */
+class Sindy(configuration: Configuration, plugins: Plugin*) {
+
+  /**
+   * Execute the SINDY algorithm.
+   *
+   * @param jdbcUrl JDBC URL to the SQLite3 database
+   * @param schema  mapping of all tables to their columns
+   * @param tables  specification of which tables/columns to consider
+   * @return the INDs
+   */
+  def apply(jdbcUrl: String, schema: Map[String, Seq[String]], tables: (String, Seq[String])*) = {
+
+    // Prepare the RheemContext.
+    val rheemContext = new RheemContext(configuration)
+    plugins.foreach(rheemContext.register)
+    rheemContext.getConfiguration.setProperty("rheem.sqlite3.jdbc.url", jdbcUrl)
+    val planBuilder = new PlanBuilder(rheemContext)
+
+    // Create cells from the various tables.
+    var offset = 0
+    val columnsById = mutable.Map[Int, String]()
+    val allCells = tables
+      .map { case (table, columns) =>
+        // Handle the special case where columns == "*".
+        val resolvedColumns = if (columns == Seq("*")) schema(table) else columns
+
+        // Read the records from the specified table.
+        var records = planBuilder.readTable(new Sqlite3TableSource(table, schema(table): _*)).withName(s"Load $table")
+
+        // If requested, project to the relevant fields.
+        if (resolvedColumns.toSet != schema(table).toSet)
+          records = records.projectRecords(resolvedColumns).withName(s"Project $table")
+
+        // Create the cells, finally.
+        val cells = records.flatMapJava(new CellCreator(offset), selectivity = resolvedColumns.size.toDouble).withName(s"Create cells for $table")
+
+        // Maintain some helper data structures.
+        resolvedColumns.zipWithIndex.foreach { case (column, index) => columnsById(offset + index) = s"$table[$column]" }
+        offset += resolvedColumns.size
+
+        cells
+      }
+      .reduce(_ union _)
+
+    // Do the rest of the SINDY logic on the cells.
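+    // A cell is a (value, column ID) pair. Merging the cells per value yields the set of columns
+    // that contain this value; intersecting these sets per column then leaves exactly the columns
+    // whose value sets are supersets, i.e., the IND candidates.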
+    val rawInds = allCells
+      .map(cell => (cell._1, Array(cell._2))).withName("Prepare cell merging")
+      .reduceByKeyJava(toSerializableFunction(_._1), new CellMerger).withName("Merge cells")
+      .flatMapJava(new IndCandidateGenerator).withName("Generate IND candidate sets")
+      .reduceByKeyJava(toSerializableFunction(_._1), new IndCandidateMerger).withName("Merge IND candidate sets")
+      .filter(_._2.length > 0).withName("Filter empty candidate sets")
+      .collect()
+
+    // Make the results readable.
+    rawInds.map {
+      case (dep, refs) => (columnsById(dep), refs.map(columnsById).toSeq)
+    }
+  }
+
+}
+
+/**
+ * Companion object for [[Sindy]].
+ */
+object Sindy {
+
+  def main(args: Array[String]): Unit = {
+    // Parse parameters.
+    if (args.length < 3) {
+      sys.error("Usage: <plugin>(,<plugin>)* <JDBC URL> <schema file> [<table>[<column>(,<column>)*]]*")
+    }
+    val plugins = parsePlugins(args(0))
+    val inputUrl = args(1)
+    val schema = loadSchema(args(2))
+    val tables = if (args.length > 3) args.slice(3, args.length).map(parseTable).toSeq else schema.toSeq
+
+    // Prepare SINDY.
+    val configuration = new Configuration
+    val sindy = new Sindy(configuration, plugins: _*)
+
+    // Run SINDY.
+    val inds = sindy(inputUrl, schema, tables: _*)
+
+    // Print the result.
+    inds.foreach {
+      case (dep, refs) => println(s"$dep is included in: ${refs.mkString(", ")}")
+    }
+  }
+
+  /**
+   * Parse a comma-separated list of plugins.
+   *
+   * @param arg the list
+   * @return the [[Plugin]]s
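+   *
+   * For example, `parsePlugins("java,sqlite3")` yields the [[Java]] and [[Sqlite3]] plugins.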
+   */
+  def parsePlugins(arg: String) = arg.split(",").map {
+    case "java" => Java.basicPlugin
+    case "spark" => Spark.basicPlugin
+    case "sqlite3" => Sqlite3.plugin
+    case other: String => sys.error(s"Unknown plugin: $other")
+  }
+
+  private val tableRegex = """(\w+)\[((?:\w+(?:,\w+)*)|(?:\*))\]""".r
+
+  /**
+   * Parse a table/column argument of pattern `table[column,column,...]`.
+   *
+   * @param arg the pattern
+   * @return the table specification
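+   *
+   * For example, `parseTable("customers[id,name]")` yields `("customers", Seq("id", "name"))`
+   * (hypothetical table and column names).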
+   */
+  def parseTable(arg: String) = arg match {
+    case tableRegex(table, columns) => (table, columns.split(',').toSeq)
+    case other: String => sys.error(s"Illegal table specifier: $other")
+  }
+
+  /**
+   * Load the schema for the database.
+   *
+   * @param file path to the schema file
+   * @return a mapping from table names to their columns
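+   *
+   * Each line of the file must be a `table[column,column,...]` specifier as understood by
+   * [[parseTable]], e.g. (hypothetical) `customers[id,name,city]`.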
+   */
+  def loadSchema(file: String) = scala.io.Source.fromFile(file).getLines()
+    .map(parseTable)
+    .toMap
+
+  /**
+   * UDF to create cells from a [[Record]].
+   *
+   * @param offset the column ID offset for the input [[Record]]s
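+   *
+   * For instance, with `offset = 3`, a (hypothetical) record with the values "Alice" and
+   * "Berlin" becomes the cells `("Alice", 3)` and `("Berlin", 4)`.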
+   */
+  class CellCreator(val offset: Int) extends SerializableFunction[Record, java.lang.Iterable[(String, Int)]] {
+
+    override def apply(record: Record): java.lang.Iterable[(String, Int)] = {
+      val cells = new util.ArrayList[(String, Int)](record.size)
+      var columnId = offset
+      for (index <- 0 until record.size) {
+        cells.add((record.getString(index), columnId))
+        columnId += 1
+      }
+      cells
+    }
+  }
+
+  /**
+   * UDF to merge the column IDs of two cells.
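+   *
+   * For instance, merging the (hypothetical) cells `("a", [0])` and `("a", [2, 3])` yields
+   * `("a", [0, 2, 3])` (up to ordering).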
+   */
+  class CellMerger extends SerializableBinaryOperator[(String, Array[Int])] {
+
+    lazy val merger = mutable.Set[Int]()
+
+    override def apply(cell1: (String, Array[Int]), cell2: (String, Array[Int])): (String, Array[Int]) = {
+      merger.clear()
+      for (columnId <- cell1._2) merger += columnId
+      for (columnId <- cell2._2) merger += columnId
+      (cell1._1, merger.toArray)
+    }
+  }
+
+  /**
+   * UDF to create IND candidates from a cell group.
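+   *
+   * For instance, the (hypothetical) cell group `("a", [0, 1, 2])` yields the candidates
+   * `(0, [1, 2])`, `(1, [0, 2])`, and `(2, [0, 1])`.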
+   */
+  class IndCandidateGenerator extends SerializableFunction[(String, Array[Int]), java.lang.Iterable[(Int, Array[Int])]] {
+
+    override def apply(cellGroup: (String, Array[Int])): java.lang.Iterable[(Int, Array[Int])] = {
+      val columnIds = cellGroup._2
+      val result = new util.ArrayList[(Int, Array[Int])](columnIds.length)
+      for (i <- columnIds.indices) {
+        val refColumnIds = new Array[Int](columnIds.length - 1)
+        java.lang.System.arraycopy(columnIds, 0, refColumnIds, 0, i)
+        java.lang.System.arraycopy(columnIds, i + 1, refColumnIds, i, refColumnIds.length - i)
+        result.add((columnIds(i), refColumnIds))
+      }
+      result
+    }
+  }
+
+  /**
+   * UDF to merge two IND candidates.
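+   *
+   * For instance, merging the (hypothetical) candidates `(0, [1, 2])` and `(0, [2, 3])` yields
+   * `(0, [2])`, i.e., the intersection of the referenced columns.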
+   */
+  class IndCandidateMerger extends SerializableBinaryOperator[(Int, Array[Int])] {
+
+    lazy val merger = mutable.Set[Int]()
+
+    override def apply(indc1: (Int, Array[Int]), indc2: (Int, Array[Int])): (Int, Array[Int]) = {
+      merger.clear()
+      for (columnId <- indc1._2) merger += columnId
+      (indc1._1, indc2._2.filter(merger.contains))
+    }
+  }
+
+}