From f0bb380c9cb0a86e4d471a0399b1842d1603b11b Mon Sep 17 00:00:00 2001 From: Waqas Ali Date: Wed, 12 Jan 2022 15:00:06 +0800 Subject: [PATCH] Fix bug where g/->dataset doesn't work for more than 8 columns (#340) --- .../zero_one/geni/core/dataset_creation.clj | 8 +++++++- test/zero_one/geni/dataset_creation_test.clj | 20 ++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/clojure/zero_one/geni/core/dataset_creation.clj b/src/clojure/zero_one/geni/core/dataset_creation.clj index ebf0236..14621ef 100644 --- a/src/clojure/zero_one/geni/core/dataset_creation.clj +++ b/src/clojure/zero_one/geni/core/dataset_creation.clj @@ -164,6 +164,8 @@ For list of maps, it returns a list of one map with first non-nil value for each nested key. Examples: + [] => [] + [nil nil] => [] [1 2 3] => [1] [nil [1 2]] => [[1]] [{:a 1} {:a 3 :b true}] => [{:a 1 :b true}] @@ -186,6 +188,10 @@ The sample non-nil value can be generated using first-non-nil function above. Examples: + [] | [] + => [] + [nil nil] | [] + => [nil nil] [1 2 3] | [1] => [1 2 3] [nil [1 2]] | [[1]] @@ -235,7 +241,7 @@ (let [col-names (map name col-names) transposed (transpose table) values (map first-non-nil transposed) - table (transpose (map (partial apply fill-missing-nested-keys) (zipmap transposed values))) + table (transpose (map (partial apply fill-missing-nested-keys) (map vector transposed values))) rows (interop/->java-list (map interop/->spark-row (transform-maps table))) schema (infer-schema col-names (map first values))] (.createDataFrame spark rows schema))))) diff --git a/test/zero_one/geni/dataset_creation_test.clj b/test/zero_one/geni/dataset_creation_test.clj index 5fb8054..7480749 100644 --- a/test/zero_one/geni/dataset_creation_test.clj +++ b/test/zero_one/geni/dataset_creation_test.clj @@ -227,7 +227,25 @@ (instance? Dataset dataset) => true (g/collect-vals dataset) => [[0 [[{:z 1 :h nil :g nil} {:z 2 :h nil :g nil}] [{:z nil :h true :g nil}]]] - [1 [[{:z nil :h nil :g 3.0}]]]]))) + [1 [[{:z nil :h nil :g 3.0}]]]])) + (fact "should work for several number of columns" + (let [dataset (g/records->dataset + @tr/spark + [{:a 1 :b 2 :c 3 :d 4 :e 5 :f 6 :g 7 :h 8 :i 9} + {:a 10 :b 11 :c 12 :d 13 :e 14 :f 15 :g 16 :h 17 :i 18}])] + (instance? Dataset dataset) => true + (g/collect dataset) => [{:a 1 :b 2 :c 3 :d 4 :e 5 :f 6 :g 7 :h 8 :i 9} + {:a 10 :b 11 :c 12 :d 13 :e 14 :f 15 :g 16 :h 17 :i 18}])) + (fact "should work for nil and empty values" + (let [dataset (g/records->dataset + @tr/spark + [{:i nil :s [] :b []} + {:i nil :s [nil nil] :b []} + {:i nil :s nil :b []}])] + (instance? Dataset dataset) => true + (g/collect-vals dataset) => [[nil [] []] + [nil [nil nil] []] + [nil nil []]]))) (facts "On table->dataset" (fact "should create the right dataset"