diff --git a/.gitignore b/.gitignore index 3d20886..e76ecb0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ spark-warehouses/ *.DS_Store* .clj-kondo/.cache +.clj-kondo/marick pom.xml pom.xml.asc diff --git a/lein-template/resources/leiningen/new/geni/project.clj b/lein-template/resources/leiningen/new/geni/project.clj index 224f7f9..72d500d 100644 --- a/lein-template/resources/leiningen/new/geni/project.clj +++ b/lein-template/resources/leiningen/new/geni/project.clj @@ -8,13 +8,16 @@ [metosin/jsonista "0.3.3" :exclusions [com.fasterxml.jackson.core/jackson-databind]] [expound "0.8.9"] + [io.netty/netty-all "4.1.74.Final"] + [com.fasterxml.jackson.core/jackson-core "2.15.3"] + [com.fasterxml.jackson.core/jackson-annotations "2.15.3"] ;; Spark - [org.apache.spark/spark-core_2.12 "3.1.2"] - [org.apache.spark/spark-hive_2.12 "3.1.2"] - [org.apache.spark/spark-mllib_2.12 "3.1.2"] - [org.apache.spark/spark-sql_2.12 "3.1.2"] - [org.apache.spark/spark-streaming_2.12 "3.1.2"] - [org.apache.spark/spark-yarn_2.12 "3.1.2"] + [org.apache.spark/spark-core_2.12 "3.3.3"] + [org.apache.spark/spark-hive_2.12 "3.3.3"] + [org.apache.spark/spark-mllib_2.12 "3.3.3"] + [org.apache.spark/spark-sql_2.12 "3.3.3"] + [org.apache.spark/spark-streaming_2.12 "3.3.3"] + [org.apache.spark/spark-yarn_2.12 "3.3.3"] [com.github.fommil.netlib/all "1.1.2" :extension "pom"] ; Arrow [org.apache.arrow/arrow-memory-netty "4.0.0"] @@ -40,6 +43,12 @@ "--class" "{{namespace}}.core" "target/uberjar/{{raw-name}}-standalone.jar"]]}{{/dataproc?}} + :jvm-opts ["--add-opens=java.base/java.io=ALL-UNNAMED" + "--add-opens=java.base/java.nio=ALL-UNNAMED" + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED" + "--add-opens=java.base/java.util=ALL-UNNAMED" + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"] :profiles {:uberjar {:aot :all} :dev {:plugins [[lein-ancient "0.7.0"]]}} :main ^:skip-aot {{namespace}}.core diff --git a/project.clj b/project.clj index 267c2c4..1b6169c 100644 --- a/project.clj +++ b/project.clj @@ -1,29 +1,38 @@ (def spark-deps - '[;; Spark + '[[io.netty/netty-all "4.1.74.Final"] + [com.fasterxml.jackson.core/jackson-core "2.15.3"] + [com.fasterxml.jackson.core/jackson-annotations "2.15.3"] + ;; Spark ; This breaks cljcdoc: https://github.com/cljdoc/cljdoc/issues/407 ; Frozen until issue is resolved. ;[com.github.fommil.netlib/all "1.1.2" :extension "pom"] - [org.apache.spark/spark-avro_2.12 "3.1.1"] - [org.apache.spark/spark-core_2.12 "3.1.1"] - [org.apache.spark/spark-hive_2.12 "3.1.1"] - [org.apache.spark/spark-mllib_2.12 "3.1.1"] - [org.apache.spark/spark-sql_2.12 "3.1.1"] - [org.apache.spark/spark-streaming_2.12 "3.1.1"] + [org.apache.spark/spark-avro_2.12 "3.3.3"] + [org.apache.spark/spark-core_2.12 "3.3.3"] + [org.apache.spark/spark-hive_2.12 "3.3.3"] + [org.apache.spark/spark-mllib_2.12 "3.3.3"] + [org.apache.spark/spark-sql_2.12 "3.3.3"] + [org.apache.spark/spark-streaming_2.12 "3.3.3"] ; Arrow - [org.apache.arrow/arrow-memory-netty "3.0.0"] - [org.apache.arrow/arrow-memory-core "3.0.0"] - [org.apache.arrow/arrow-vector "3.0.0" + [org.apache.arrow/arrow-memory-netty "4.0.0"] + [org.apache.arrow/arrow-memory-core "4.0.0"] + [org.apache.arrow/arrow-vector "4.0.0" :exclusions [commons-codec com.fasterxml.jackson.core/jackson-databind]] ; Databases - [mysql/mysql-connector-java "8.0.23"] - [org.postgresql/postgresql "42.2.19"] + [mysql/mysql-connector-java "8.0.25"] + [org.postgresql/postgresql "42.2.20"] [org.xerial/sqlite-jdbc "3.34.0"] ;; Optional: Spark XGBoost [ml.dmlc/xgboost4j-spark_2.12 "1.2.0"] [ml.dmlc/xgboost4j_2.12 "1.2.0"]]) (defproject zero.one/geni "0.0.40" - :jvm-opts ["-Duser.country=US" "-Duser.language=en"] + :jvm-opts ["-Duser.country=US" "-Duser.language=en" + "--add-opens=java.base/java.io=ALL-UNNAMED" + "--add-opens=java.base/java.nio=ALL-UNNAMED" + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED" + "--add-opens=java.base/java.util=ALL-UNNAMED" + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"] :description "A Clojure dataframe library that runs on Spark" :url "https://github.com/zero-one-group/geni" :license {:name "Apache License" diff --git a/test/zero_one/geni/data_sources_test.clj b/test/zero_one/geni/data_sources_test.clj index 837e7e0..841f0fe 100644 --- a/test/zero_one/geni/data_sources_test.clj +++ b/test/zero_one/geni/data_sources_test.clj @@ -30,7 +30,7 @@ (g/dtypes dummy-df) => {:coord "ArrayType(DoubleType,true)" :prop "MapType(StringType,StringType,true)" :rooms (str "StructType(" - "StructField(rooms,LongType,true), " + "StructField(rooms,LongType,true)," "StructField(bathroom,DoubleType,true))")}) (fact "correct direct schema option" (-> (g/read-parquet! @@ -46,7 +46,7 @@ g/dtypes) => {:coord "ArrayType(LongType,true)" :prop "MapType(StringType,StringType,true)" :rooms (str "StructType(" - "StructField(rooms,IntegerType,true), " + "StructField(rooms,IntegerType,true)," "StructField(bathroom,FloatType,true))")}) (fact "correct data-oriented schema option" (-> (g/read-parquet! @@ -57,7 +57,7 @@ g/dtypes) => {:coord "ArrayType(ShortType,true)" :prop "MapType(StringType,StringType,true)" :rooms (str "StructType(" - "StructField(rooms,FloatType,true), " + "StructField(rooms,FloatType,true)," "StructField(bathroom,LongType,true))")}))) (facts "On binary data" :binary diff --git a/test/zero_one/geni/dataset_creation_test.clj b/test/zero_one/geni/dataset_creation_test.clj index 7480749..4d83f9a 100644 --- a/test/zero_one/geni/dataset_creation_test.clj +++ b/test/zero_one/geni/dataset_creation_test.clj @@ -51,7 +51,7 @@ [(g/row (g/row 27 42)) (g/row (g/row 57 18))] {:coord {:x :int :y :int}})) - => {:coord "StructType(StructField(x,IntegerType,true), StructField(y,IntegerType,true))"}) + => {:coord "StructType(StructField(x,IntegerType,true),StructField(y,IntegerType,true))"}) (fact "of struct array fields" (g/dtypes (g/create-dataframe @@ -59,7 +59,7 @@ [(g/row [(g/row 27 42)]) (g/row [(g/row 57 18)])] {:coords [{:x :int :y :int}]})) - => {:coords "ArrayType(StructType(StructField(x,IntegerType,true), StructField(y,IntegerType,true)),true)"})) + => {:coords "ArrayType(StructType(StructField(x,IntegerType,true),StructField(y,IntegerType,true)),true)"})) (facts "On building blocks" (fact "can instantiate vectors" @@ -266,7 +266,7 @@ (instance? Dataset dataset) => true (g/column-names dataset) => ["a" "b"] (g/dtypes dataset) => {:a "LongType" - :b "StructType(StructField(z,ArrayType(StringType,true),true), StructField(y,BooleanType,true))"})) + :b "StructType(StructField(z,ArrayType(StringType,true),true),StructField(y,BooleanType,true))"})) (fact "should create the right schema for list of maps" (let [dataset (g/table->dataset @tr/spark @@ -276,7 +276,7 @@ (instance? Dataset dataset) => true (g/column-names dataset) => ["a" "b"] (g/dtypes dataset) => {:a "LongType" - :b "ArrayType(StructType(StructField(z,LongType,true), StructField(y,DoubleType,true)),true)"})) + :b "ArrayType(StructType(StructField(z,LongType,true),StructField(y,DoubleType,true)),true)"})) (fact "should create the right schema for list of list of maps" (let [dataset (g/table->dataset @tr/spark @@ -286,7 +286,7 @@ (instance? Dataset dataset) => true (g/column-names dataset) => ["a" "b"] (g/dtypes dataset) => {:a "LongType" - :b "ArrayType(ArrayType(StructType(StructField(z,LongType,true), StructField(y,BooleanType,true)),true),true)"}))) + :b "ArrayType(ArrayType(StructType(StructField(z,LongType,true),StructField(y,BooleanType,true)),true),true)"}))) (facts "On spark range" (fact "should create simple datasets" diff --git a/test/zero_one/geni/dataset_test.clj b/test/zero_one/geni/dataset_test.clj index ff54c18..7f3f284 100644 --- a/test/zero_one/geni/dataset_test.clj +++ b/test/zero_one/geni/dataset_test.clj @@ -430,7 +430,7 @@ (-> (df-20) (g/repartition :Suburb :SellerG) g/partitions - count) => #(< 1 %)) + count) => #(<= 1 %)) (fact "able to repartition by number and columns" (-> (df-20) (g/repartition 10 :Suburb :SellerG) diff --git a/test/zero_one/geni/rdd_test.clj b/test/zero_one/geni/rdd_test.clj index ef53553..f5f03dd 100644 --- a/test/zero_one/geni/rdd_test.clj +++ b/test/zero_one/geni/rdd_test.clj @@ -55,7 +55,7 @@ (rdd/resources) => {} (rdd/spark-home) => (System/getenv "SPARK_HOME") (rdd/sc) => (partial instance? SparkContext) - (rdd/version) => "3.1.1")) + (rdd/version) => "3.3.3")) (facts "On repartitioning" :rdd (fact "partition-by works" diff --git a/test/zero_one/geni/sql_functions_test.clj b/test/zero_one/geni/sql_functions_test.clj index 2d0bba2..8f1cc74 100644 --- a/test/zero_one/geni/sql_functions_test.clj +++ b/test/zero_one/geni/sql_functions_test.clj @@ -24,8 +24,8 @@ :to-2 (g/to-json (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")}) {:timestampFormat "dd/MM/yyyy"})}) g/collect - first) => {:schema-1 "ARRAY>" - :schema-2 "ARRAY>" + first) => {:schema-1 "ARRAY>" + :schema-2 "ARRAY>" :from-1 {:a 1 :b 0.8} :from-2 {:time (Timestamp. 1440547200000)} :to-1 "{\"a\":1,\"b\":2}" @@ -44,8 +44,8 @@ :to-2 (g/to-csv (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")}) {:timestampFormat "dd/MM/yyyy"})}) g/collect - first) => {:schema-1 "STRUCT<`_c0`: INT, `_c1`: STRING>" - :schema-2 "STRUCT<`_c0`: INT, `_c1`: STRING>" + first) => {:schema-1 "STRUCT<_c0: INT, _c1: STRING>" + :schema-2 "STRUCT<_c0: INT, _c1: STRING>" :from-1 {:a 1 :b 0.8} :from-2 {:time (Timestamp. 1440547200000)} :to-1 "1,2" @@ -214,7 +214,7 @@ (-> (df-20) (g/cube :SellerG :Regionname) (g/agg (g/grouping-id :SellerG :Regionname)) - g/first-vals) => ["Nelson" nil 1] + g/first-vals) => ["Biggin" "Northern Metropolitan" 0] (-> (df-20) (g/group-by :SellerG) (g/agg (-> (g/collect-list :Regionname) (g/as :regions))) @@ -503,7 +503,7 @@ (g/agg (g/count-distinct {:seller :SellerG :suburb :Suburb})) - g/column-names) => ["count(SellerG AS `seller`, Suburb AS `suburb`)"]))) + g/column-names) => ["count(SellerG AS seller, Suburb AS suburb)"]))) (facts "On window functions" :slow (let [window (g/window {:partition-by :SellerG :order-by :Price})]