Skip to content
Open

hw2 #23

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 0 additions & 17 deletions hw0/README.md

This file was deleted.

19 changes: 0 additions & 19 deletions hw1/README.md

This file was deleted.

20 changes: 20 additions & 0 deletions hw2/zybkin/build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Project identity: Scala 2.11 to match the Spark 2.2.0 binary artifacts.
name := "zybkin"

version := "0.1"

scalaVersion := "2.11.12"

// Spark 2.2.0 stack (core + SQL for Structured Streaming, MLlib for the
// PipelineModel), plus the spark-stemming package published on the
// spark-packages Bintray repository.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "2.2.0",
  "org.apache.spark" %% "spark-sql"   % "2.2.0",
  "org.apache.spark" %% "spark-mllib" % "2.2.0",
  "master"            % "spark-stemming" % "0.2.0"
)

// Resolver required for the spark-stemming coordinate above.
resolvers += "spark-stemming" at "https://dl.bintray.com/spark-packages/maven/"


Binary file added hw2/zybkin/model/metadata/._SUCCESS.crc
Binary file not shown.
Binary file added hw2/zybkin/model/metadata/.part-00000.crc
Binary file not shown.
Empty file.
1 change: 1 addition & 0 deletions hw2/zybkin/model/metadata/part-00000
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1607870543172,"sparkVersion":"2.2.0","uid":"pipeline_26333c77dbbe","paramMap":{"stageUids":["regexTok_c5a528de9f1e","stopWords_948a1ac99ff9","hashingTF_2f8c7a6e570e","idf_9f1592d87f1e","strIdx_d99abd9ddb82","gbtc_ee3785e7a472"]}}
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.RegexTokenizer","timestamp":1607870543255,"sparkVersion":"2.2.0","uid":"regexTok_c5a528de9f1e","paramMap":{"pattern":"[\\W]","minTokenLength":1,"toLowercase":true,"outputCol":"words","inputCol":"text","gaps":true}}
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.StopWordsRemover","timestamp":1607870543292,"sparkVersion":"2.2.0","uid":"stopWords_948a1ac99ff9","paramMap":{"caseSensitive":false,"outputCol":"importantWords","stopWords":["i","me","my","myself","we","our","ours","ourselves","you","your","yours","yourself","yourselves","he","him","his","himself","she","her","hers","herself","it","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","should","now","i'll","you'll","he'll","she'll","we'll","they'll","i'd","you'd","he'd","she'd","we'd","they'd","i'm","you're","he's","she's","it's","we're","they're","i've","we've","you've","they've","isn't","aren't","wasn't","weren't","haven't","hasn't","hadn't","don't","doesn't","didn't","won't","wouldn't","shan't","shouldn't","mustn't","can't","couldn't","cannot","could","here's","how's","let's","ought","that's","there's","what's","when's","where's","who's","why's","would"],"inputCol":"words"}}
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.HashingTF","timestamp":1607870543325,"sparkVersion":"2.2.0","uid":"hashingTF_2f8c7a6e570e","paramMap":{"binary":false,"outputCol":"featureTF","inputCol":"importantWords","numFeatures":5500}}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.IDFModel","timestamp":1607870543359,"sparkVersion":"2.2.0","uid":"idf_9f1592d87f1e","paramMap":{"inputCol":"featureTF","outputCol":"features","minDocFreq":0}}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1607870543819,"sparkVersion":"2.2.0","uid":"strIdx_d99abd9ddb82","paramMap":{"handleInvalid":"error","inputCol":"target","outputCol":"label"}}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.GBTClassificationModel","timestamp":1607870543912,"sparkVersion":"2.2.0","uid":"gbtc_ee3785e7a472","paramMap":{"featuresCol":"features","maxBins":32,"stepSize":0.1,"subsamplingRate":1.0,"maxMemoryInMB":256,"maxIter":26,"lossType":"logistic","seed":-1287390502,"minInfoGain":0.0,"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"cacheNodeIds":false,"checkpointInterval":10,"impurity":"gini","probabilityCol":"probability","labelCol":"label","predictionCol":"prediction","maxDepth":5},"numFeatures":5500,"numTrees":26}
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
48 changes: 48 additions & 0 deletions hw2/zybkin/src/main/scala/hw2.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataTypes, StructType}


/**
 * Structured Streaming driver: reads JSON records of the form
 * {"id": <int>, "text": <string>} line-by-line from a TCP socket,
 * runs each text through a pre-trained ML PipelineModel loaded from
 * "model/", and appends (id, target) predictions as CSV under "data/".
 *
 * Usage: hw2 [host] [port]  — defaults to localhost:9999, so invoking
 * with no arguments preserves the original behavior.
 */
object hw2 {
  def main(args: Array[String]): Unit = {
    // Socket source location, overridable from the command line.
    val host = args.lift(0).getOrElse("localhost")
    val port = args.lift(1).map(_.toInt).getOrElse(9999)

    val spark = SparkSession.builder()
      .appName("zybkin")
      .master("local")
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    // Expected shape of each incoming JSON line.
    val schema = new StructType()
      .add("id", DataTypes.IntegerType)
      .add("text", DataTypes.StringType)

    // The socket source delivers each line in a single string column "value".
    val received = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port.toString)
      .load()

    // Parse the JSON payload and flatten it to (id, text).
    // Lines that fail to parse yield null fields (from_json semantics).
    val receivedJson = received
      .withColumn("json", from_json($"value", schema))
      .select("json.*")

    // Pre-trained pipeline (tokenizer -> stop words -> TF/IDF -> indexer -> GBT
    // per the saved metadata); loaded from the working directory.
    val model = PipelineModel.read.load("model/")
    receivedJson.printSchema()

    // "prediction" is the pipeline's output column; rename and cast it to the
    // integer "target" label expected downstream.
    val result = model
      .transform(receivedJson)
      .select($"id", $"prediction".as("target").cast(DataTypes.IntegerType))

    // One CSV part file per micro-batch (repartition(1)); the checkpoint
    // directory lets the query resume after a restart. "csv" is Spark's
    // built-in source (the old "com.databricks.spark.csv" name is only an
    // alias in Spark 2.x).
    val query = result
      .repartition(1)
      .writeStream
      .outputMode("append")
      .format("csv")
      .option("header", "true")
      .option("path", "data")
      .option("checkpointLocation", "checkpoint")
      .start()

    query.awaitTermination()
  }
}