Hw1 #19

11 changes: 11 additions & 0 deletions hw0/zybkin/build.sbt
@@ -0,0 +1,11 @@
name := "zybkin"

version := "0.1"

scalaVersion := "2.11.12"

// https://mvnrepository.com/artifact/org.apache.spark/spark-core
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.3"

// https://mvnrepository.com/artifact/org.apache.spark/spark-sql
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.3"
49,081 changes: 49,081 additions & 0 deletions hw0/zybkin/data/AB_NYC_2019.csv

Large diffs are not rendered by default.

Binary file added hw0/zybkin/data/New_York_City_.png
120 changes: 120 additions & 0 deletions hw0/zybkin/src/main/scala/hw0.scala
@@ -0,0 +1,120 @@
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType

object hw0 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("zybkin")
      .master("local")
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    // CSV options: the file has a header row, embedded quotes are escaped
    // with ", and malformed rows are dropped instead of failing the read.
    val optionsMap = Map(
      "header" -> "true",
      "inferSchema" -> "true",
      "escape" -> "\"",
      "mode" -> "DROPMALFORMED")

    var df = spark.read
      .options(optionsMap)
      .csv("data/AB_NYC_2019.csv")

    // Make sure price is numeric before aggregating.
    df = df.withColumn("price", df("price").cast(IntegerType))

println("----------MODE----------")
df.select($"room_type", $"price")
.groupBy($"room_type", $"price")
.count()
.sort($"count".desc)
.groupBy($"room_type")
.agg(first($"price").alias("price"), max($"count").alias("mode"))
.sort($"mode".desc)
.show(10)
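    // Note: relying on first() to see the pre-sorted order after a groupBy is
    // not guaranteed by Spark. A deterministic sketch (row_number over a
    // window; an addition, not part of the original submission):
    //
    //   import org.apache.spark.sql.expressions.Window
    //   val counts = df.groupBy($"room_type", $"price").count()
    //   val w = Window.partitionBy($"room_type").orderBy($"count".desc)
    //   counts.withColumn("rn", row_number().over(w))
    //     .filter($"rn" === 1)
    //     .show(10)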

println("\n\n----------MEAN----------")
df.select($"room_type", $"price")
.groupBy($"room_type")
.agg(mean($"price").alias("avgPrice"))
.sort($"avgPrice".desc)
.show(10)

println("\n\n----------MEDIAN----------")
df.createOrReplaceTempView("df")
spark.sql("select room_type, percentile_approx(price, 0.5) as median from df group by room_type")
.show()
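    // DataFrame-API alternative (an assumption, not in the original):
    // approxQuantile gives the same approximate median, though only over the
    // whole column, so per-group use would need a loop over room types:
    //
    //   val Array(median) = df.stat.approxQuantile("price", Array(0.5), 0.01)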

println("\n\n----------DISPERSION----------")
df.select($"room_type", $"price")
.groupBy($"room_type")
.agg(stddev($"price").alias("stddevPrice"))
.select($"room_type", ($"stddevPrice" * $"stddevPrice").alias("dispersion"))
.show()
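    // Equivalent one-step form (an assumption, not in the original): stddev
    // is stddev_samp, so its square equals the built-in sample variance:
    //
    //   df.groupBy($"room_type")
    //     .agg(var_samp($"price").alias("dispersion"))
    //     .show()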


println("\n\n----------THE MOST EXPENSIVE----------")
df.select($"room_type", $"price")
.sort($"price".desc)
.show(1)

println("\n\n----------THE MOST CHEAPEST----------")
df.select($"room_type", $"price")
.sort($"price")
.show(1)


println("\n\n----------the Pearson Correlation----------")
df.select($"price", $"minimum_nights", $"number_of_reviews")
.agg(
corr($"price", $"minimum_nights"),
corr($"price", $"number_of_reviews")
)
.show()
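    // Single-pair alternative (an assumption, not in the original):
    // df.stat.corr returns the Pearson coefficient directly as a Double:
    //
    //   val r = df.stat.corr("price", "minimum_nights")
    //   println(s"price vs minimum_nights: $r")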


    // Geohash encoding: alternately bisect the longitude (even bit positions)
    // and latitude (odd bit positions) ranges, packing each group of five
    // bits into one base32 character.
    val encode = (lat: Double, lng: Double, precision: Int) => {
      val base32 = "0123456789bcdefghjkmnpqrstuvwxyz"

      var (minLat, maxLat) = (-90.0, 90.0)
      var (minLng, maxLng) = (-180.0, 180.0)
      val bits = List(16, 8, 4, 2, 1)

      (0 until precision).map { p =>
        base32 apply (0 until 5).map { i =>
          if (((5 * p) + i) % 2 == 0) {
            val mid = (minLng + maxLng) / 2.0
            if (lng > mid) {
              minLng = mid
              bits(i)
            } else {
              maxLng = mid
              0
            }
          } else {
            val mid = (minLat + maxLat) / 2.0
            if (lat > mid) {
              minLat = mid
              bits(i)
            } else {
              maxLat = mid
              0
            }
          }
        }.reduceLeft((a, b) => a | b)
      }.mkString("")
    }

    val geohash_udf = udf(encode)
    // Average price per 5-character geohash cell; show the priciest cell.
    df.select($"price", geohash_udf($"latitude", $"longitude", lit(5)).alias("geohash"))
      .groupBy($"geohash")
      .mean("price")
      .sort($"avg(price)".desc)
      .show(1)
  }
}
20 changes: 20 additions & 0 deletions hw1/zybkin/build.sbt
@@ -0,0 +1,20 @@
name := "zybkin"

version := "0.1"

scalaVersion := "2.11.12"

// https://mvnrepository.com/artifact/org.apache.spark/spark-core
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.2.0"

// https://mvnrepository.com/artifact/org.apache.spark/spark-sql
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.2.0"

// https://mvnrepository.com/artifact/org.apache.spark/spark-mllib
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.2.0"

resolvers ++= Seq("spark-stemming" at "https://dl.bintray.com/spark-packages/maven/")

libraryDependencies += "master" % "spark-stemming" % "0.2.0"

