Merge pull request #36 from lyft/v3.1.2-rc1_branch
Spark 3.1.2-RC1
catalinii authored May 25, 2021
2 parents bf60b67 + de351e3 commit 0ae05fa
Showing 539 changed files with 27,468 additions and 5,678 deletions.
116 changes: 95 additions & 21 deletions .github/workflows/build_and_test.yml
@@ -79,10 +79,9 @@ jobs:
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
SPARK_LOCAL_IP: localhost
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -166,10 +165,9 @@ jobs:
MODULES_TO_TEST: ${{ matrix.modules }}
HADOOP_PROFILE: hadoop3.2
HIVE_PROFILE: hive2.3
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
SPARK_LOCAL_IP: localhost
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -211,9 +209,14 @@ jobs:
run: |
python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
python3.6 -m pip list
- name: Install Conda for pip packaging test
run: |
curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
# Run the tests.
- name: Run tests
run: |
export PATH=$PATH:$HOME/miniconda/bin
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
- name: Upload test results to report
if: always()
@@ -238,6 +241,7 @@ jobs:
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
SPARK_LOCAL_IP: localhost
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -285,6 +289,9 @@ jobs:
lint:
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-20.04
env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
container:
image: dongjoon/apache-spark-github-action-image:20201025
steps:
@@ -326,20 +333,24 @@ jobs:
run: |
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc 'jinja2<3.0.0'
- name: Install R linter dependencies and SparkR
run: |
apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
Rscript -e "devtools::install_github('jimhester/lintr@v2.0.1')"
./R/install-dev.sh
- name: Install dependencies for documentation generation
run: |
# pandoc is required to generate PySpark APIs as well in nbsphinx.
apt-get install -y libcurl4-openssl-dev pandoc
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0'
apt-get update -y
apt-get install -y ruby ruby-dev
gem install jekyll jekyll-redirect-from rouge
@@ -359,8 +370,6 @@ jobs:
- name: Run documentation build
run: |
cd docs
export LC_ALL=C.UTF-8
export LANG=C.UTF-8
jekyll build
java-11:
@@ -369,6 +378,17 @@ jobs:
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v2
with:
@@ -388,45 +408,99 @@ jobs:
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
rm -rf ~/.m2/repository/org/apache/spark
scala-213:
name: Scala 2.13 build with SBT
hadoop-2:
name: Hadoop 2 build with SBT
runs-on: ubuntu-20.04
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
scala-213-coursier-
hadoop-2-coursier-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Build with SBT
run: |
./dev/change-scala-version.sh 2.13
./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile
./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile
hadoop-2:
name: Hadoop 2 build with SBT
tpcds-1g:
name: Run TPC-DS queries with SF=1
runs-on: ubuntu-20.04
env:
SPARK_LOCAL_IP: localhost
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
hadoop-2-coursier-
tpcds-coursier-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Build with SBT
- name: Cache TPC-DS generated data
id: cache-tpcds-sf-1
uses: actions/cache@v2
with:
path: ./tpcds-sf-1
key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
- name: Checkout tpcds-kit repository
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: databricks/tpcds-kit
ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
path: ./tpcds-kit
- name: Build tpcds-kit
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
run: cd tpcds-kit/tools && make OS=LINUX
- name: Generate TPC-DS (SF=1) table data
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
- name: Run TPC-DS queries
run: |
./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-tpcds--8-hadoop3.2-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
path: "**/target/unit-tests.log"
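For reference, the new tpcds-1g job above can be reproduced locally with roughly the following commands — a sketch assuming a Linux shell with git, make, a C compiler, and JDK 8 available, run from the Spark source root (the sbt invocations are the same ones the workflow uses):

```bash
# Fetch and build dsdgen from databricks/tpcds-kit at the ref the workflow pins
git clone https://github.com/databricks/tpcds-kit.git
(cd tpcds-kit && git checkout 2a5078a782192ddb6efbcead8de9973d6ab4f069 && cd tools && make OS=LINUX)

# Generate the scale-factor-1 data set into ./tpcds-sf-1
build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData \
  --dsdgenDir $(pwd)/tpcds-kit/tools --location $(pwd)/tpcds-sf-1 \
  --scaleFactor 1 --numPartitions 1 --overwrite"

# Point the query test suite at the generated data and run it
SPARK_TPCDS_DATA=$(pwd)/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
```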
1 change: 1 addition & 0 deletions .github/workflows/test_report.yml
@@ -15,6 +15,7 @@ jobs:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
workflow_conclusion: completed
- name: Publish test report
uses: scacap/action-surefire-report@v1
with:
17 changes: 0 additions & 17 deletions .sbtopts

This file was deleted.

2 changes: 1 addition & 1 deletion R/DOCUMENTATION.md
@@ -19,7 +19,7 @@ license: |
# SparkR Documentation

SparkR documentation is generated by using in-source comments and annotated by using
[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages,
[`roxygen2`](https://cran.r-project.org/package=roxygen2). After making changes to the documentation and generating man pages,
you can run the following from an R console in the SparkR home directory
```R
library(devtools)
```
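For context, a minimal sketch of how the SparkR man pages are typically regenerated from those roxygen2 comments — assuming devtools and roxygen2 are installed; this is an illustration, not necessarily the exact command the file goes on to show:

```R
library(devtools)

# Rebuild the Rd man pages under R/pkg/man from the in-source roxygen2 comments
devtools::document(pkg = "./pkg", roclets = c("rd"))
```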
7 changes: 4 additions & 3 deletions R/pkg/DESCRIPTION
@@ -1,13 +1,13 @@
Package: SparkR
Type: Package
Version: 3.1.1
Version: 3.1.2
Title: R Front End for 'Apache Spark'
Description: Provides an R Front end for 'Apache Spark' <https://spark.apache.org>.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
Authors@R: c(person("Shivaram", "Venkataraman", role = "aut",
email = "shivaram@cs.berkeley.edu"),
person("Xiangrui", "Meng", role = "aut",
email = "meng@databricks.com"),
person("Felix", "Cheung", role = "aut",
person("Felix", "Cheung", role = c("aut", "cre"),
email = "felixcheung@apache.org"),
person(family = "The Apache Software Foundation", role = c("aut", "cph")))
License: Apache License (== 2.0)
@@ -20,6 +20,7 @@ Depends:
Suggests:
knitr,
rmarkdown,
markdown,
testthat,
e1071,
survival,
7 changes: 6 additions & 1 deletion R/pkg/R/functions.R
@@ -3578,7 +3578,12 @@ unresolved_named_lambda_var <- function(...) {
"org.apache.spark.sql.Column",
newJObject(
"org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
list(...)
lapply(list(...), function(x) {
handledCallJStatic(
"org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
"freshVarName",
x)
})
)
)
column(jc)
2 changes: 1 addition & 1 deletion R/pkg/R/mllib_classification.R
@@ -574,7 +574,7 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode
#' @rdname spark.naiveBayes
#' @aliases spark.naiveBayes,SparkDataFrame,formula-method
#' @name spark.naiveBayes
#' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/index.html}
#' @seealso e1071: \url{https://cran.r-project.org/package=e1071}
#' @examples
#' \dontrun{
#' data <- as.data.frame(UCBAdmissions)
4 changes: 2 additions & 2 deletions R/pkg/R/mllib_clustering.R
@@ -204,7 +204,7 @@ setMethod("write.ml", signature(object = "BisectingKMeansModel", path = "charact
#' @return \code{spark.gaussianMixture} returns a fitted multivariate gaussian mixture model.
#' @rdname spark.gaussianMixture
#' @name spark.gaussianMixture
#' @seealso mixtools: \url{https://cran.r-project.org/web/packages/mixtools/index.html}
#' @seealso mixtools: \url{https://cran.r-project.org/package=mixtools}
#' @examples
#' \dontrun{
#' sparkR.session()
@@ -483,7 +483,7 @@ setMethod("write.ml", signature(object = "KMeansModel", path = "character"),
#' @return \code{spark.lda} returns a fitted Latent Dirichlet Allocation model.
#' @rdname spark.lda
#' @aliases spark.lda,SparkDataFrame-method
#' @seealso topicmodels: \url{https://cran.r-project.org/web/packages/topicmodels/index.html}
#' @seealso topicmodels: \url{https://cran.r-project.org/package=topicmodels}
#' @examples
#' \dontrun{
#' text <- read.df("data/mllib/sample_lda_libsvm_data.txt", source = "libsvm")
2 changes: 1 addition & 1 deletion R/pkg/R/mllib_regression.R
@@ -475,7 +475,7 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
#' @param ... additional arguments passed to the method.
#' @return \code{spark.survreg} returns a fitted AFT survival regression model.
#' @rdname spark.survreg
#' @seealso survival: \url{https://cran.r-project.org/web/packages/survival/index.html}
#' @seealso survival: \url{https://cran.r-project.org/package=survival}
#' @examples
#' \dontrun{
#' df <- createDataFrame(ovarian)
14 changes: 14 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2153,6 +2153,20 @@ test_that("higher order functions", {
expect_error(array_transform("xs", function(...) 42))
})

test_that("SPARK-34794: lambda vars must be resolved properly in nested higher order functions", {
df <- sql("SELECT array(1, 2, 3) as numbers, array('a', 'b', 'c') as letters")
ret <- first(select(
df,
array_transform("numbers", function(number) {
array_transform("letters", function(latter) {
struct(alias(number, "n"), alias(latter, "l"))
})
})
))

expect_equal(1, ret[[1]][[1]][[1]][[1]]$n)
})

test_that("group by, agg functions", {
df <- read.json(jsonPath)
df1 <- agg(df, name = "max", age = "sum")
2 changes: 1 addition & 1 deletion assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.12</artifactId>
<version>3.1.1</version>
<version>3.1.2</version>
<relativePath>../pom.xml</relativePath>
</parent>
