Skip to content

Commit

Permalink
A few more edits from review of text
Browse files (browse the repository at this point in the history)
  • Loading branch information
srowen committed May 23, 2017
1 parent 1eb2e50 commit 1643bfd
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,15 @@ object RunIntro extends Serializable {

val preview = spark.read.csv("hdfs:///user/ds/linkage")
preview.show()
- preview.schema.foreach(println)
+ preview.printSchema()

val parsed = spark.read
.option("header", "true")
.option("nullValue", "?")
.option("inferSchema", "true")
.csv("hdfs:///user/ds/linkage")
parsed.show()
- val schema = parsed.schema
- schema.foreach(println)
+ parsed.printSchema()

parsed.count()
parsed.cache()
Expand All @@ -61,7 +60,7 @@ object RunIntro extends Serializable {
summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()

val matches = parsed.where("is_match = true")
- val misses = parsed.filter($"is_match" === lit(false))
+ val misses = parsed.filter($"is_match" === false)
val matchSummary = matches.describe()
val missSummary = misses.describe()

Expand All @@ -76,9 +75,9 @@ object RunIntro extends Serializable {
""").show()

val matchData = parsed.as[MatchData]
- val scored = matchData.map(md => {
+ val scored = matchData.map { md =>
  (scoreMatchData(md), md.is_match)
- }).toDF("score", "is_match")
+ }.toDF("score", "is_match")
crossTabs(scored, 4.0).show()
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ object RunLSA {
vecRdd.cache()
val mat = new RowMatrix(vecRdd)
val svd = mat.computeSVD(k, computeU=true)
- val u = svd.U.rows.zipWithUniqueId()

println("Singular values: " + svd.s)
val topConceptTerms = topTermsInTopConcepts(svd, 10, 10, termIds)
Expand Down

0 comments on commit 1643bfd

Please sign in to comment.