Skip to content

Commit

Permalink
release 0.0.0 on sonatype (#14)
Browse files Browse the repository at this point in the history
* Using data-tc dep (local), simplify dep lists, using version 
* Move Hashable, CustomHashMap to separate files. WIP build fixing
* add inline warnings for scalac
* made cluster_h not @inline, due to failure to inline
* using scalafmt instead of scalariform format
* formatted all using scalafmt
* fix pomExtra info for publishing
  • Loading branch information
malcolmgreaves authored Jan 7, 2017
1 parent cc92b2e commit 347d869
Show file tree
Hide file tree
Showing 29 changed files with 618 additions and 588 deletions.
38 changes: 16 additions & 22 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
name := "fp4ml"

com.typesafe.sbt.SbtScalariform.defaultScalariformSettings

scalaVersion in ThisBuild := "2.11.8"
organization in ThisBuild := "io.malcolmgreaves"
version in ThisBuild := {
version in ThisBuild := {
val major: Int = 0
val minor: Int = 0
val patch: Int = 0
s"$major.$minor.$patch"
}

import SharedBuild._

lazy val root = project
.in(file("."))
.aggregate(
Expand All @@ -19,20 +17,16 @@ lazy val root = project
)
.settings {
publishArtifact := false
publishLocal := {}
publish := {}
publishLocal := {}
publish := {}
}

lazy val `fp4ml-main` = project
.in(file("fp4ml-main"))
.settings {
publishArtifact := true
}
lazy val `fp4ml-main` = project.in(file("fp4ml-main")).settings {
publishArtifact := true
}

lazy val `fp4ml-spark` = project
.in(file("fp4ml-spark"))
.dependsOn(`fp4ml-main`)
.settings {
lazy val `fp4ml-spark` =
project.in(file("fp4ml-spark")).dependsOn(`fp4ml-main`).settings {
publishArtifact := true
}

Expand All @@ -41,19 +35,18 @@ lazy val publishTasks = subprojects.map { publish.in }

resolvers in ThisBuild := Seq(
// sonatype, maven central
"Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/",
"Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/",
"Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/",

// bintray
"Scalaz Bintray" at "http://dl.bintray.com/scalaz/releases",
Resolver.bintrayRepo("mfglabs", "maven"),
Resolver.bintrayRepo("dwhjames", "maven"),

// etc.
"Confluent" at "http://packages.confluent.io/maven/"
)

lazy val javaV = "1.8"
scalaVersion in ThisBuild := "2.11.8"
scalacOptions in ThisBuild := Seq(
"-optimize",
"-deprecation",
Expand All @@ -70,15 +63,16 @@ scalacOptions in ThisBuild := Seq(
"-language:reflectiveCalls",
"-Yno-adapted-args",
"-Ywarn-value-discard",
"-Yinline-warnings",
"-Xlint",
"-Xfuture",
"-Ywarn-dead-code",
"-Xfatal-warnings" // Every warning is escalated to an error.
)
javacOptions in ThisBuild := Seq("-source", javaV, "-target", javaV)
javaOptions in ThisBuild := Seq(
"-server",
"-XX:+AggressiveOpts",
javaOptions in ThisBuild := Seq(
"-server",
"-XX:+AggressiveOpts",
"-XX:+TieredCompilation",
"-XX:CompileThreshold=100",
"-Xmx3000M",
Expand Down
15 changes: 6 additions & 9 deletions fp4ml-main/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,17 @@ name := "fp4ml-main"

import SharedBuild._

com.typesafe.sbt.SbtScalariform.defaultScalariformSettings
ScalariformKeys.preferences := sharedCodeFmt

addCompilerPlugin(scalaMacros)

libraryDependencies ++=
miscDeps ++
mathMlDeps ++
testDeps
libraryDependencies ++=
fp4mlMainDeps ++
testDeps

//
// test, runtime settings
//
fork in run := true
fork in Test := true
fork in run := true
fork in Test := true
parallelExecution in Test := true

pomExtra := pomExtraInfo
106 changes: 45 additions & 61 deletions fp4ml-main/src/main/scala/mlbigbook/app/Exp20NG.scala
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package mlbigbook.app

import java.io.{ FileReader, BufferedReader, File }
import java.io.{FileReader, BufferedReader, File}
import java.nio.charset.Charset
import java.nio.file.Files

import breeze.linalg.{ SparseVector, DenseVector }
import breeze.linalg.{SparseVector, DenseVector}
import mlbigbook.math.MathVectorOps
import mlbigbook.ml.{ ImplicitHashable, KnnClassifier }
import mlbigbook.ml.{ImplicitHashable, KnnClassifier}

import scala.io.Source
import scala.util.Random
Expand All @@ -17,9 +17,10 @@ object Exp20NG extends App {
s => s.trim.toLowerCase

lazy val filterLine: String => Boolean =
s => s.nonEmpty &&
headerPrefixes.forall { !s.startsWith(_) } &&
headerSuffixes.forall { !s.endsWith(_) }
s =>
s.nonEmpty &&
headerPrefixes.forall { !s.startsWith(_) } &&
headerSuffixes.forall { !s.endsWith(_) }

lazy val labelTransform: String => String =
label => {
Expand Down Expand Up @@ -68,29 +69,22 @@ object Exp20NG extends App {
| >
|:
|<
""".stripMargin
.trim
.toLowerCase
.split { "\n" }
.toSeq
""".stripMargin.trim.toLowerCase.split { "\n" }.toSeq

lazy val headerSuffixes: Seq[String] =
"""
|writes:
|.com
""".stripMargin
.trim
.toLowerCase
.split { "\n" }
.toSeq
""".stripMargin.trim.toLowerCase.split { "\n" }.toSeq

lazy val ngDirectory = new File("./20_newsgroups")
println(s"Loading 20 Newsgroup Data from:\n${ngDirectory.getCanonicalPath}\n")
println(
s"Loading 20 Newsgroup Data from:\n${ngDirectory.getCanonicalPath}\n")

import scala.collection.JavaConverters._
lazy val loadNgFi: File => Seq[String] =
fi => if (fi isFile)
{
fi =>
if (fi isFile) {
val br = new BufferedReader(new FileReader(fi))
val buf = new scala.collection.mutable.ArrayBuffer[String](420)
var line: String = br.readLine()
Expand All @@ -99,23 +93,18 @@ object Exp20NG extends App {
line = br.readLine()
}
buf.toSeq
}
.map { normalizeLine }
.filter { filterLine }
else
Seq.empty
}.map { normalizeLine }.filter { filterLine } else
Seq.empty

lazy val loadNgData: File => Seq[(File, Seq[String])] =
f => {
if (f.isDirectory) {
Option(f.listFiles())
.map { _.toSeq }
.getOrElse { Seq.empty }
.flatMap { loadNgData }
Option(f.listFiles()).map { _.toSeq }.getOrElse { Seq.empty }.flatMap {
loadNgData
}

} else if (f.isFile)
Seq((f, loadNgFi(f)))

else
Seq.empty
}
Expand All @@ -130,13 +119,11 @@ object Exp20NG extends App {
println(s"There are ${ng20.size} newsgroup directories")

val newsgroup2fileandcontent =
ng20
.map { ngDir =>
println(s"loading data from the ${ngDir.getName} newsgroup ... ")
val bothFiLines = loadNgData(ngDir)
(ngDir.getName, bothFiLines)
}
.toMap
ng20.map { ngDir =>
println(s"loading data from the ${ngDir.getName} newsgroup ... ")
val bothFiLines = loadNgData(ngDir)
(ngDir.getName, bothFiLines)
}.toMap

type Document = String

Expand Down Expand Up @@ -166,25 +153,22 @@ object Exp20NG extends App {

lazy val vectorize = (s: Document) =>
SparseVector[Float](word2index.size)({
val bothIndexValue = s
.split(" ")
.foldLeft(Map.empty[Int, Float]) {
case (accum, word) =>

if (word2index contains word) {
val index = word2index(word)
if (accum.contains(index))
(accum - index) + (index -> (accum(index) + 1.0f))
else
accum + (index -> 1.0f)

} else
accum
}

bothIndexValue
.map { case (index, count) => (index, math.log(count).toFloat) }
.toSeq
val bothIndexValue = s.split(" ").foldLeft(Map.empty[Int, Float]) {
case (accum, word) =>
if (word2index contains word) {
val index = word2index(word)
if (accum.contains(index))
(accum - index) + (index -> (accum(index) + 1.0f))
else
accum + (index -> 1.0f)

} else
accum
}

bothIndexValue.map {
case (index, count) => (index, math.log(count).toFloat)
}.toSeq
}: _*)

lazy val nDimensions = word2index.size
Expand All @@ -204,10 +188,9 @@ object Exp20NG extends App {

val (train, test): (Seq[(Document, String)], Seq[(Document, String)]) = {

val shuffled: Seq[(Document, String)] = allLabeledData
.map { x => (x, math.random) }
.sortBy { case (_, rando) => rando }
.map { case (x, _) => x }
val shuffled: Seq[(Document, String)] = allLabeledData.map { x =>
(x, math.random)
}.sortBy { case (_, rando) => rando }.map { case (x, _) => x }

val si = (shuffled.size * .9).toInt

Expand All @@ -231,5 +214,6 @@ object Exp20NG extends App {
if (predicted == testLabel)
nCorrect += 1
}
println(s"\n\nAccuracy: $nCorrect / $nTake = ${(nCorrect.toFloat / nTake.toFloat) * 100.0f} %")
}
println(
s"\n\nAccuracy: $nCorrect / $nTake = ${(nCorrect.toFloat / nTake.toFloat) * 100.0f} %")
}
18 changes: 9 additions & 9 deletions fp4ml-main/src/main/scala/mlbigbook/math/Argmax.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,19 @@ import fif.Data
import scala.reflect.ClassTag

/**
* Generic algorithm for finding the maximal argument. Uses the `Val`
* type class as evidence of an argument's value.
*/
* Generic algorithm for finding the maximal argument. Uses the `Val`
* type class as evidence of an argument's value.
*/
object Argmax {

import Data.ops._

/**
* Finds the maximal argument of `elements` in linear time. Uses the `Val`
* type class as evidence of an argument's value.
*
* throws IllegalArgumentException Iff `elements` is empty.
*/
* Finds the maximal argument of `elements` in linear time. Uses the `Val`
* type class as evidence of an argument's value.
*
* throws IllegalArgumentException Iff `elements` is empty.
*/
def apply[T: Val: ClassTag, D[_]: Data](elements: D[T]): Option[T] =
if (elements isEmpty)
None
Expand All @@ -34,4 +34,4 @@ object Argmax {
b
}
}
}
}
2 changes: 1 addition & 1 deletion fp4ml-main/src/main/scala/mlbigbook/math/Argmin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ object Argmin {
implicitly[ClassTag[T]],
implicitly[Data[D]]
)
}
}
5 changes: 2 additions & 3 deletions fp4ml-main/src/main/scala/mlbigbook/math/BaseMathVecOps.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ import breeze.storage.Zero
import scala.language.higherKinds

private[math] abstract class BaseMathVecOps[Num, V[_]](
implicit
no: Fractional[Num],
implicit no: Fractional[Num],
zo: Zero[Num],
so: Semiring[Num]
) extends MathVectorOps[V] {
Expand All @@ -18,4 +17,4 @@ private[math] abstract class BaseMathVecOps[Num, V[_]](
override final implicit lazy val z = zo
override final implicit lazy val s = so

}
}
Loading

0 comments on commit 347d869

Please sign in to comment.