diff --git a/hw0/README.md b/hw0/README.md deleted file mode 100644 index b9d1672..0000000 --- a/hw0/README.md +++ /dev/null @@ -1,17 +0,0 @@ -Данные: - -https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data?select=AB_NYC_2019.csv - - -Задачи: - -1) Посчитать медиану, моду и среднее и дисперсию для каждого room_type -2) Найти самое дорогое и самое дешевое предложение -3) Посчитать корреляцию между ценой и минимальный количеством ночей, кол-вом отзывов -4) Нужно найти гео квадрат размером 5км на 5км с самой высокой средней стоимостью жилья - -P.S Пул ревест и пжлст сделайте подпапку со своей фамилией - -Технологии: - -Spark / Scala / Java \ No newline at end of file diff --git a/hw1/README.md b/hw1/README.md deleted file mode 100644 index e38a446..0000000 --- a/hw1/README.md +++ /dev/null @@ -1,19 +0,0 @@ -Данные: - -https://www.kaggle.com/c/nlp-getting-started/overview - -Задачи: - -1) Решить на Spark задачу -2) Сделать сабмит в соревнование - -Подсказки: - -TF/IDF,LDA,StopWords,Tokenizing,Stemming - - -P.S Пул ревест и пжлст сделайте подпапку со своей фамилией - -Технологии: - -Spark / Scala / Java \ No newline at end of file diff --git a/hw2/zybkin/build.sbt b/hw2/zybkin/build.sbt new file mode 100644 index 0000000..dbbb342 --- /dev/null +++ b/hw2/zybkin/build.sbt @@ -0,0 +1,20 @@ +name := "zybkin" + +version := "0.1" + +scalaVersion := "2.11.12" + +// https://mvnrepository.com/artifact/org.apache.spark/spark-core +libraryDependencies += "org.apache.spark" %% "spark-core" % "2.2.0" + +// https://mvnrepository.com/artifact/org.apache.spark/spark-sql +libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.2.0" + +// https://mvnrepository.com/artifact/org.apache.spark/spark-mllib +libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.2.0" + +resolvers ++= Seq("spark-stemming" at "https://dl.bintray.com/spark-packages/maven/") + +libraryDependencies += "master" % "spark-stemming" % "0.2.0" + + diff --git a/hw2/zybkin/model/metadata/._SUCCESS.crc b/hw2/zybkin/model/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/metadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/metadata/.part-00000.crc b/hw2/zybkin/model/metadata/.part-00000.crc new file mode 100644 index 0000000..2f44448 Binary files /dev/null and b/hw2/zybkin/model/metadata/.part-00000.crc differ diff --git a/hw2/zybkin/model/metadata/_SUCCESS b/hw2/zybkin/model/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/metadata/part-00000 b/hw2/zybkin/model/metadata/part-00000 new file mode 100644 index 0000000..8da8475 --- /dev/null +++ b/hw2/zybkin/model/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.PipelineModel","timestamp":1607870543172,"sparkVersion":"2.2.0","uid":"pipeline_26333c77dbbe","paramMap":{"stageUids":["regexTok_c5a528de9f1e","stopWords_948a1ac99ff9","hashingTF_2f8c7a6e570e","idf_9f1592d87f1e","strIdx_d99abd9ddb82","gbtc_ee3785e7a472"]}} diff --git a/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/._SUCCESS.crc b/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/.part-00000.crc b/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/.part-00000.crc new file mode 100644 index 0000000..79a2bd0 Binary files /dev/null and b/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/.part-00000.crc differ diff --git a/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/_SUCCESS b/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/part-00000 b/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/part-00000 new file mode 100644 index 0000000..0e6c1d7 --- /dev/null +++ b/hw2/zybkin/model/stages/0_regexTok_c5a528de9f1e/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.feature.RegexTokenizer","timestamp":1607870543255,"sparkVersion":"2.2.0","uid":"regexTok_c5a528de9f1e","paramMap":{"pattern":"[\\W]","minTokenLength":1,"toLowercase":true,"outputCol":"words","inputCol":"text","gaps":true}} diff --git a/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/._SUCCESS.crc b/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/.part-00000.crc b/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/.part-00000.crc new file mode 100644 index 0000000..e5d10bf Binary files /dev/null and b/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/.part-00000.crc differ diff --git a/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/_SUCCESS b/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/part-00000 b/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/part-00000 new file mode 100644 index 0000000..9af0569 --- /dev/null +++ b/hw2/zybkin/model/stages/1_stopWords_948a1ac99ff9/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.feature.StopWordsRemover","timestamp":1607870543292,"sparkVersion":"2.2.0","uid":"stopWords_948a1ac99ff9","paramMap":{"caseSensitive":false,"outputCol":"importantWords","stopWords":["i","me","my","myself","we","our","ours","ourselves","you","your","yours","yourself","yourselves","he","him","his","himself","she","her","hers","herself","it","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","should","now","i'll","you'll","he'll","she'll","we'll","they'll","i'd","you'd","he'd","she'd","we'd","they'd","i'm","you're","he's","she's","it's","we're","they're","i've","we've","you've","they've","isn't","aren't","wasn't","weren't","haven't","hasn't","hadn't","don't","doesn't","didn't","won't","wouldn't","shan't","shouldn't","mustn't","can't","couldn't","cannot","could","here's","how's","let's","ought","that's","there's","what's","when's","where's","who's","why's","would"],"inputCol":"words"}} diff --git a/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/._SUCCESS.crc b/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/.part-00000.crc b/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/.part-00000.crc new file mode 100644 index 0000000..e7d813e Binary files /dev/null and b/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/.part-00000.crc differ diff --git a/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/_SUCCESS b/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/part-00000 b/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/part-00000 new file mode 100644 index 0000000..9235fbc --- /dev/null +++ b/hw2/zybkin/model/stages/2_hashingTF_2f8c7a6e570e/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.feature.HashingTF","timestamp":1607870543325,"sparkVersion":"2.2.0","uid":"hashingTF_2f8c7a6e570e","paramMap":{"binary":false,"outputCol":"featureTF","inputCol":"importantWords","numFeatures":5500}} diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/._SUCCESS.crc b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/.part-00000-6e56c146-b712-4cea-93ed-38ad53570704-c000.snappy.parquet.crc b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/.part-00000-6e56c146-b712-4cea-93ed-38ad53570704-c000.snappy.parquet.crc new file mode 100644 index 0000000..6191fc3 Binary files /dev/null and b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/.part-00000-6e56c146-b712-4cea-93ed-38ad53570704-c000.snappy.parquet.crc differ diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/_SUCCESS b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/part-00000-6e56c146-b712-4cea-93ed-38ad53570704-c000.snappy.parquet b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/part-00000-6e56c146-b712-4cea-93ed-38ad53570704-c000.snappy.parquet new file mode 100644 index 0000000..7a2a6ea Binary files /dev/null and b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/data/part-00000-6e56c146-b712-4cea-93ed-38ad53570704-c000.snappy.parquet differ diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/._SUCCESS.crc b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/.part-00000.crc b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/.part-00000.crc new file mode 100644 index 0000000..d5981b7 Binary files /dev/null and b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/.part-00000.crc differ diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/_SUCCESS b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/part-00000 b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/part-00000 new file mode 100644 index 0000000..947bf21 --- /dev/null +++ b/hw2/zybkin/model/stages/3_idf_9f1592d87f1e/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.feature.IDFModel","timestamp":1607870543359,"sparkVersion":"2.2.0","uid":"idf_9f1592d87f1e","paramMap":{"inputCol":"featureTF","outputCol":"features","minDocFreq":0}} diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/._SUCCESS.crc b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/.part-00000-df6d0cd9-2007-4c2e-9730-fd4e59a18879-c000.snappy.parquet.crc b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/.part-00000-df6d0cd9-2007-4c2e-9730-fd4e59a18879-c000.snappy.parquet.crc new file mode 100644 index 0000000..aa05ec6 Binary files /dev/null and b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/.part-00000-df6d0cd9-2007-4c2e-9730-fd4e59a18879-c000.snappy.parquet.crc differ diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/_SUCCESS b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/part-00000-df6d0cd9-2007-4c2e-9730-fd4e59a18879-c000.snappy.parquet b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/part-00000-df6d0cd9-2007-4c2e-9730-fd4e59a18879-c000.snappy.parquet new file mode 100644 index 0000000..8ac2ead Binary files /dev/null and b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/data/part-00000-df6d0cd9-2007-4c2e-9730-fd4e59a18879-c000.snappy.parquet differ diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/._SUCCESS.crc b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/.part-00000.crc b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/.part-00000.crc new file mode 100644 index 0000000..d5a8100 Binary files /dev/null and b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/.part-00000.crc differ diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/_SUCCESS b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/part-00000 b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/part-00000 new file mode 100644 index 0000000..3e66885 --- /dev/null +++ b/hw2/zybkin/model/stages/4_strIdx_d99abd9ddb82/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1607870543819,"sparkVersion":"2.2.0","uid":"strIdx_d99abd9ddb82","paramMap":{"handleInvalid":"error","inputCol":"target","outputCol":"label"}} diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/._SUCCESS.crc b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/.part-00000-6ca2f721-e1f6-4ba0-aa00-6e87be27c07a-c000.snappy.parquet.crc b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/.part-00000-6ca2f721-e1f6-4ba0-aa00-6e87be27c07a-c000.snappy.parquet.crc new file mode 100644 index 0000000..e353c45 Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/.part-00000-6ca2f721-e1f6-4ba0-aa00-6e87be27c07a-c000.snappy.parquet.crc differ diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/_SUCCESS b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/part-00000-6ca2f721-e1f6-4ba0-aa00-6e87be27c07a-c000.snappy.parquet b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/part-00000-6ca2f721-e1f6-4ba0-aa00-6e87be27c07a-c000.snappy.parquet new file mode 100644 index 0000000..2778bbb Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/data/part-00000-6ca2f721-e1f6-4ba0-aa00-6e87be27c07a-c000.snappy.parquet differ diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/._SUCCESS.crc b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/.part-00000.crc b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/.part-00000.crc new file mode 100644 index 0000000..e950725 Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/.part-00000.crc differ diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/_SUCCESS b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/part-00000 b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/part-00000 new file mode 100644 index 0000000..ef4d378 --- /dev/null +++ b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.classification.GBTClassificationModel","timestamp":1607870543912,"sparkVersion":"2.2.0","uid":"gbtc_ee3785e7a472","paramMap":{"featuresCol":"features","maxBins":32,"stepSize":0.1,"subsamplingRate":1.0,"maxMemoryInMB":256,"maxIter":26,"lossType":"logistic","seed":-1287390502,"minInfoGain":0.0,"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"cacheNodeIds":false,"checkpointInterval":10,"impurity":"gini","probabilityCol":"probability","labelCol":"label","predictionCol":"prediction","maxDepth":5},"numFeatures":5500,"numTrees":26} diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/._SUCCESS.crc b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/._SUCCESS.crc differ diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/.part-00000-7964372e-7851-46e6-aba6-fdb0a6a70ebf-c000.snappy.parquet.crc b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/.part-00000-7964372e-7851-46e6-aba6-fdb0a6a70ebf-c000.snappy.parquet.crc new file mode 100644 index 0000000..1e5af56 Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/.part-00000-7964372e-7851-46e6-aba6-fdb0a6a70ebf-c000.snappy.parquet.crc differ diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/_SUCCESS b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/part-00000-7964372e-7851-46e6-aba6-fdb0a6a70ebf-c000.snappy.parquet b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/part-00000-7964372e-7851-46e6-aba6-fdb0a6a70ebf-c000.snappy.parquet new file mode 100644 index 0000000..6037372 Binary files /dev/null and b/hw2/zybkin/model/stages/5_gbtc_ee3785e7a472/treesMetadata/part-00000-7964372e-7851-46e6-aba6-fdb0a6a70ebf-c000.snappy.parquet differ diff --git a/hw2/zybkin/src/main/scala/hw2.scala b/hw2/zybkin/src/main/scala/hw2.scala new file mode 100644 index 0000000..8c57b39 --- /dev/null +++ b/hw2/zybkin/src/main/scala/hw2.scala @@ -0,0 +1,48 @@ +import org.apache.spark.ml.PipelineModel +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{DataTypes, StructType} + + +object hw2 { + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .appName("zybkin") + .master("local") + .getOrCreate() + + spark.sparkContext.setLogLevel("WARN") + import spark.implicits._ + + val scheme = new StructType().add("id", DataTypes.IntegerType).add("text", DataTypes.StringType) + + val recieved = spark.readStream + .format("socket") + .option("host", "localhost") + .option("port", 9999) + .load() + + val recievedJson = + recieved.withColumn("json", from_json($"value", scheme)) + .select("json.*") + .select($"id", $"text") + + + val model = PipelineModel.read.load("model/") + recievedJson.printSchema() + val result = model.transform(recievedJson.select($"id", $"text")) + .select($"id", $"prediction".as("target").cast(DataTypes.IntegerType)) + + val query = result + .repartition(1) + .writeStream + .outputMode("append") + .format("com.databricks.spark.csv") + .option("header", "true") + .option("path", "data") + .option("checkpointLocation", "checkpoint") + .start() + + query.awaitTermination() + } +}