Skip to content
Open

hw2 #23

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 0 additions & 17 deletions hw0/README.md

This file was deleted.

19 changes: 0 additions & 19 deletions hw1/README.md

This file was deleted.

20 changes: 20 additions & 0 deletions hw2/zybkin/build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Project identity: Scala 2.11 to match the Spark 2.2.0 binary artifacts.
name := "zybkin"

version := "0.1"

scalaVersion := "2.11.12"

// Spark 2.2.0 stack (core + SQL for Structured Streaming, MLlib for the
// PipelineModel), plus the spark-stemming package published on the
// spark-packages Bintray repository.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "2.2.0",
  "org.apache.spark" %% "spark-sql"   % "2.2.0",
  "org.apache.spark" %% "spark-mllib" % "2.2.0",
  "master"            % "spark-stemming" % "0.2.0"
)

// Resolver required for the spark-stemming coordinate above.
resolvers += "spark-stemming" at "https://dl.bintray.com/spark-packages/maven/"


Binary file added hw2/zybkin/model/metadata/._SUCCESS.crc
Binary file not shown.
Binary file added hw2/zybkin/model/metadata/.part-00000.crc
Binary file not shown.
Empty file.
1 change: 1 addition & 0 deletions hw2/zybkin/model/metadata/part-00000
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1607870543172,"sparkVersion":"2.2.0","uid":"pipeline_26333c77dbbe","paramMap":{"stageUids":["regexTok_c5a528de9f1e","stopWords_948a1ac99ff9","hashingTF_2f8c7a6e570e","idf_9f1592d87f1e","strIdx_d99abd9ddb82","gbtc_ee3785e7a472"]}}
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.RegexTokenizer","timestamp":1607870543255,"sparkVersion":"2.2.0","uid":"regexTok_c5a528de9f1e","paramMap":{"pattern":"[\\W]","minTokenLength":1,"toLowercase":true,"outputCol":"words","inputCol":"text","gaps":true}}
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.StopWordsRemover","timestamp":1607870543292,"sparkVersion":"2.2.0","uid":"stopWords_948a1ac99ff9","paramMap":{"caseSensitive":false,"outputCol":"importantWords","stopWords":["i","me","my","myself","we","our","ours","ourselves","you","your","yours","yourself","yourselves","he","him","his","himself","she","her","hers","herself","it","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","should","now","i'll","you'll","he'll","she'll","we'll","they'll","i'd","you'd","he'd","she'd","we'd","they'd","i'm","you're","he's","she's","it's","we're","they're","i've","we've","you've","they've","isn't","aren't","wasn't","weren't","haven't","hasn't","hadn't","don't","doesn't","didn't","won't","wouldn't","shan't","shouldn't","mustn't","can't","couldn't","cannot","could","here's","how's","let's","ought","that's","there's","what's","when's","where's","who's","why's","would"],"inputCol":"words"}}
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.HashingTF","timestamp":1607870543325,"sparkVersion":"2.2.0","uid":"hashingTF_2f8c7a6e570e","paramMap":{"binary":false,"outputCol":"featureTF","inputCol":"importantWords","numFeatures":5500}}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.IDFModel","timestamp":1607870543359,"sparkVersion":"2.2.0","uid":"idf_9f1592d87f1e","paramMap":{"inputCol":"featureTF","outputCol":"features","minDocFreq":0}}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1607870543819,"sparkVersion":"2.2.0","uid":"strIdx_d99abd9ddb82","paramMap":{"handleInvalid":"error","inputCol":"target","outputCol":"label"}}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.GBTClassificationModel","timestamp":1607870543912,"sparkVersion":"2.2.0","uid":"gbtc_ee3785e7a472","paramMap":{"featuresCol":"features","maxBins":32,"stepSize":0.1,"subsamplingRate":1.0,"maxMemoryInMB":256,"maxIter":26,"lossType":"logistic","seed":-1287390502,"minInfoGain":0.0,"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"cacheNodeIds":false,"checkpointInterval":10,"impurity":"gini","probabilityCol":"probability","labelCol":"label","predictionCol":"prediction","maxDepth":5},"numFeatures":5500,"numTrees":26}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
48 changes: 48 additions & 0 deletions hw2/zybkin/src/main/scala/hw2.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataTypes, StructType}


/**
 * Structured Streaming driver: reads JSON records of the form
 * {"id": <int>, "text": <string>} line-by-line from a TCP socket,
 * runs each text through a pre-trained ML PipelineModel loaded from
 * "model/", and appends (id, target) predictions as CSV under "data/".
 *
 * Usage: hw2 [host] [port]  — defaults to localhost:9999, so invoking
 * with no arguments preserves the original behavior.
 */
object hw2 {
  def main(args: Array[String]): Unit = {
    // Socket source location, overridable from the command line.
    val host = args.lift(0).getOrElse("localhost")
    val port = args.lift(1).map(_.toInt).getOrElse(9999)

    val spark = SparkSession.builder()
      .appName("zybkin")
      .master("local")
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    // Expected shape of each incoming JSON line.
    val schema = new StructType()
      .add("id", DataTypes.IntegerType)
      .add("text", DataTypes.StringType)

    // The socket source delivers each line in a single string column "value".
    val received = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port.toString)
      .load()

    // Parse the JSON payload and flatten it to (id, text).
    // Lines that fail to parse yield null fields (from_json semantics).
    val receivedJson = received
      .withColumn("json", from_json($"value", schema))
      .select("json.*")

    // Pre-trained pipeline (tokenizer -> stop words -> TF/IDF -> indexer -> GBT
    // per the saved metadata); loaded from the working directory.
    val model = PipelineModel.read.load("model/")
    receivedJson.printSchema()

    // "prediction" is the pipeline's output column; rename and cast it to the
    // integer "target" label expected downstream.
    val result = model
      .transform(receivedJson)
      .select($"id", $"prediction".as("target").cast(DataTypes.IntegerType))

    // One CSV part file per micro-batch (repartition(1)); the checkpoint
    // directory lets the query resume after a restart. "csv" is Spark's
    // built-in source (the old "com.databricks.spark.csv" name is only an
    // alias in Spark 2.x).
    val query = result
      .repartition(1)
      .writeStream
      .outputMode("append")
      .format("csv")
      .option("header", "true")
      .option("path", "data")
      .option("checkpointLocation", "checkpoint")
      .start()

    query.awaitTermination()
  }
}