From 9b05432a3f88b849452b4ed482fc1fa5dda0a35d Mon Sep 17 00:00:00 2001
From: Eduard Tudenhoefner
Date: Mon, 12 Aug 2024 11:55:24 +0200
Subject: [PATCH 1/2] Separate test groups

---
 .github/workflows/delta_test.yaml   | 82 +++++++++++++++++++++++++++++
 .github/workflows/hudi_test.yaml    | 82 +++++++++++++++++++++++++++++
 .github/workflows/iceberg_test.yaml | 82 +++++++++++++++++++++++++++++
 .github/workflows/storage_test.yaml | 82 +++++++++++++++++++++++++++++
 build.sbt                           | 38 ++++++++++++-
 run-tests.py                        |  2 +-
 6 files changed, 366 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/delta_test.yaml
 create mode 100644 .github/workflows/hudi_test.yaml
 create mode 100644 .github/workflows/iceberg_test.yaml
 create mode 100644 .github/workflows/storage_test.yaml

diff --git a/.github/workflows/delta_test.yaml b/.github/workflows/delta_test.yaml
new file mode 100644
index 00000000000..bfbea57aa8a
--- /dev/null
+++ b/.github/workflows/delta_test.yaml
@@ -0,0 +1,82 @@
+name: "Delta Tests"
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        # These Scala versions must match those in the build.sbt
+        scala: [2.12.18, 2.13.13]
+    env:
+      SCALA_VERSION: ${{ matrix.scala }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: technote-space/get-diff-action@v4
+        id: git-diff
+        with:
+          PATTERNS: |
+            **
+            .github/workflows/**
+            !kernel/**
+            !connectors/**
+      - name: install java
+        uses: actions/setup-java@v3
+        with:
+          distribution: "zulu"
+          java-version: "8"
+      - name: Cache Scala, SBT
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.sbt
+            ~/.ivy2
+            ~/.cache/coursier
+          # Change the key if dependencies are changed. For each key, GitHub Actions will cache
+          # the above directories when we use the key for the first time. After that, each run will
+          # just use the cache. The cache is immutable so we need to use a new key when trying to
+          # cache new stuff.
+          key: delta-sbt-cache-spark3.2-scala${{ matrix.scala }}
+      - name: Install Job dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
+          sudo apt install libedit-dev
+          curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz
+          mkdir -p ~/buf
+          tar -xvzf buf-Linux-x86_64.tar.gz -C ~/buf --strip-components 1
+          rm buf-Linux-x86_64.tar.gz
+          sudo apt install python3-pip --fix-missing
+          sudo pip3 install pipenv==2021.5.29
+          curl https://pyenv.run | bash
+          export PATH="~/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          eval "$(pyenv virtualenv-init -)"
+          pyenv install 3.8.18
+          pyenv global system 3.8.18
+          pipenv --python 3.8 install
+          # Update the pip version to 24.0. By default `pyenv.run` installs the latest pip version
+          # available. From version 24.1, `pip` doesn't allow installing python packages
+          # with version string containing `-`. In Delta-Spark case, the pypi package generated has
+          # `-SNAPSHOT` in version (e.g. `3.3.0-SNAPSHOT`) as the version is picked up from
+          # the `version.sbt` file.
+          pipenv run pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0
+          pipenv run pip install pyspark==3.5.0
+          pipenv run pip install flake8==3.5.0 pypandoc==1.3.3
+          pipenv run pip install black==23.9.1
+          pipenv run pip install importlib_metadata==3.10.0
+          pipenv run pip install mypy==0.982
+          pipenv run pip install mypy-protobuf==3.3.0
+          pipenv run pip install cryptography==37.0.4
+          pipenv run pip install twine==4.0.1
+          pipenv run pip install wheel==0.33.4
+          pipenv run pip install setuptools==41.1.0
+          pipenv run pip install pydocstyle==3.0.0
+          pipenv run pip install pandas==1.1.3
+          pipenv run pip install pyarrow==8.0.0
+          pipenv run pip install numpy==1.20.3
+        if: steps.git-diff.outputs.diff
+      - name: Run Scala/Java and Python tests
+        # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
+        run: |
+          TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group delta
+        if: steps.git-diff.outputs.diff
diff --git a/.github/workflows/hudi_test.yaml b/.github/workflows/hudi_test.yaml
new file mode 100644
index 00000000000..43d0d7c21b5
--- /dev/null
+++ b/.github/workflows/hudi_test.yaml
@@ -0,0 +1,82 @@
+name: "Delta Hudi Tests"
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        # These Scala versions must match those in the build.sbt
+        scala: [2.12.18, 2.13.13]
+    env:
+      SCALA_VERSION: ${{ matrix.scala }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: technote-space/get-diff-action@v4
+        id: git-diff
+        with:
+          PATTERNS: |
+            **
+            .github/workflows/**
+            !kernel/**
+            !connectors/**
+      - name: install java
+        uses: actions/setup-java@v3
+        with:
+          distribution: "zulu"
+          java-version: "8"
+      - name: Cache Scala, SBT
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.sbt
+            ~/.ivy2
+            ~/.cache/coursier
+          # Change the key if dependencies are changed. For each key, GitHub Actions will cache
+          # the above directories when we use the key for the first time. After that, each run will
+          # just use the cache. The cache is immutable so we need to use a new key when trying to
+          # cache new stuff.
+          key: delta-sbt-cache-spark3.2-scala${{ matrix.scala }}
+      - name: Install Job dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
+          sudo apt install libedit-dev
+          curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz
+          mkdir -p ~/buf
+          tar -xvzf buf-Linux-x86_64.tar.gz -C ~/buf --strip-components 1
+          rm buf-Linux-x86_64.tar.gz
+          sudo apt install python3-pip --fix-missing
+          sudo pip3 install pipenv==2021.5.29
+          curl https://pyenv.run | bash
+          export PATH="~/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          eval "$(pyenv virtualenv-init -)"
+          pyenv install 3.8.18
+          pyenv global system 3.8.18
+          pipenv --python 3.8 install
+          # Update the pip version to 24.0. By default `pyenv.run` installs the latest pip version
+          # available. From version 24.1, `pip` doesn't allow installing python packages
+          # with version string containing `-`. In Delta-Spark case, the pypi package generated has
+          # `-SNAPSHOT` in version (e.g. `3.3.0-SNAPSHOT`) as the version is picked up from
+          # the `version.sbt` file.
+          pipenv run pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0
+          pipenv run pip install pyspark==3.5.0
+          pipenv run pip install flake8==3.5.0 pypandoc==1.3.3
+          pipenv run pip install black==23.9.1
+          pipenv run pip install importlib_metadata==3.10.0
+          pipenv run pip install mypy==0.982
+          pipenv run pip install mypy-protobuf==3.3.0
+          pipenv run pip install cryptography==37.0.4
+          pipenv run pip install twine==4.0.1
+          pipenv run pip install wheel==0.33.4
+          pipenv run pip install setuptools==41.1.0
+          pipenv run pip install pydocstyle==3.0.0
+          pipenv run pip install pandas==1.1.3
+          pipenv run pip install pyarrow==8.0.0
+          pipenv run pip install numpy==1.20.3
+        if: steps.git-diff.outputs.diff
+      - name: Run Scala/Java and Python tests
+        # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
+        run: |
+          TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group hudi
+        if: steps.git-diff.outputs.diff
diff --git a/.github/workflows/iceberg_test.yaml b/.github/workflows/iceberg_test.yaml
new file mode 100644
index 00000000000..f4623efd5f4
--- /dev/null
+++ b/.github/workflows/iceberg_test.yaml
@@ -0,0 +1,82 @@
+name: "Delta Iceberg Tests"
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        # These Scala versions must match those in the build.sbt
+        scala: [2.12.18, 2.13.13]
+    env:
+      SCALA_VERSION: ${{ matrix.scala }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: technote-space/get-diff-action@v4
+        id: git-diff
+        with:
+          PATTERNS: |
+            **
+            .github/workflows/**
+            !kernel/**
+            !connectors/**
+      - name: install java
+        uses: actions/setup-java@v3
+        with:
+          distribution: "zulu"
+          java-version: "8"
+      - name: Cache Scala, SBT
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.sbt
+            ~/.ivy2
+            ~/.cache/coursier
+          # Change the key if dependencies are changed. For each key, GitHub Actions will cache
+          # the above directories when we use the key for the first time. After that, each run will
+          # just use the cache. The cache is immutable so we need to use a new key when trying to
+          # cache new stuff.
+          key: delta-sbt-cache-spark3.2-scala${{ matrix.scala }}
+      - name: Install Job dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
+          sudo apt install libedit-dev
+          curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz
+          mkdir -p ~/buf
+          tar -xvzf buf-Linux-x86_64.tar.gz -C ~/buf --strip-components 1
+          rm buf-Linux-x86_64.tar.gz
+          sudo apt install python3-pip --fix-missing
+          sudo pip3 install pipenv==2021.5.29
+          curl https://pyenv.run | bash
+          export PATH="~/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          eval "$(pyenv virtualenv-init -)"
+          pyenv install 3.8.18
+          pyenv global system 3.8.18
+          pipenv --python 3.8 install
+          # Update the pip version to 24.0. By default `pyenv.run` installs the latest pip version
+          # available. From version 24.1, `pip` doesn't allow installing python packages
+          # with version string containing `-`. In Delta-Spark case, the pypi package generated has
+          # `-SNAPSHOT` in version (e.g. `3.3.0-SNAPSHOT`) as the version is picked up from
+          # the `version.sbt` file.
+          pipenv run pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0
+          pipenv run pip install pyspark==3.5.0
+          pipenv run pip install flake8==3.5.0 pypandoc==1.3.3
+          pipenv run pip install black==23.9.1
+          pipenv run pip install importlib_metadata==3.10.0
+          pipenv run pip install mypy==0.982
+          pipenv run pip install mypy-protobuf==3.3.0
+          pipenv run pip install cryptography==37.0.4
+          pipenv run pip install twine==4.0.1
+          pipenv run pip install wheel==0.33.4
+          pipenv run pip install setuptools==41.1.0
+          pipenv run pip install pydocstyle==3.0.0
+          pipenv run pip install pandas==1.1.3
+          pipenv run pip install pyarrow==8.0.0
+          pipenv run pip install numpy==1.20.3
+        if: steps.git-diff.outputs.diff
+      - name: Run Scala/Java and Python tests
+        # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
+        run: |
+          TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group iceberg
+        if: steps.git-diff.outputs.diff
diff --git a/.github/workflows/storage_test.yaml b/.github/workflows/storage_test.yaml
new file mode 100644
index 00000000000..e1f05ed0565
--- /dev/null
+++ b/.github/workflows/storage_test.yaml
@@ -0,0 +1,82 @@
+name: "Delta Storage Tests"
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        # These Scala versions must match those in the build.sbt
+        scala: [2.12.18, 2.13.13]
+    env:
+      SCALA_VERSION: ${{ matrix.scala }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: technote-space/get-diff-action@v4
+        id: git-diff
+        with:
+          PATTERNS: |
+            **
+            .github/workflows/**
+            !kernel/**
+            !connectors/**
+      - name: install java
+        uses: actions/setup-java@v3
+        with:
+          distribution: "zulu"
+          java-version: "8"
+      - name: Cache Scala, SBT
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.sbt
+            ~/.ivy2
+            ~/.cache/coursier
+          # Change the key if dependencies are changed. For each key, GitHub Actions will cache
+          # the above directories when we use the key for the first time. After that, each run will
+          # just use the cache. The cache is immutable so we need to use a new key when trying to
+          # cache new stuff.
+          key: delta-sbt-cache-spark3.2-scala${{ matrix.scala }}
+      - name: Install Job dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
+          sudo apt install libedit-dev
+          curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz
+          mkdir -p ~/buf
+          tar -xvzf buf-Linux-x86_64.tar.gz -C ~/buf --strip-components 1
+          rm buf-Linux-x86_64.tar.gz
+          sudo apt install python3-pip --fix-missing
+          sudo pip3 install pipenv==2021.5.29
+          curl https://pyenv.run | bash
+          export PATH="~/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          eval "$(pyenv virtualenv-init -)"
+          pyenv install 3.8.18
+          pyenv global system 3.8.18
+          pipenv --python 3.8 install
+          # Update the pip version to 24.0. By default `pyenv.run` installs the latest pip version
+          # available. From version 24.1, `pip` doesn't allow installing python packages
+          # with version string containing `-`. In Delta-Spark case, the pypi package generated has
+          # `-SNAPSHOT` in version (e.g. `3.3.0-SNAPSHOT`) as the version is picked up from
+          # the `version.sbt` file.
+          pipenv run pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0
+          pipenv run pip install pyspark==3.5.0
+          pipenv run pip install flake8==3.5.0 pypandoc==1.3.3
+          pipenv run pip install black==23.9.1
+          pipenv run pip install importlib_metadata==3.10.0
+          pipenv run pip install mypy==0.982
+          pipenv run pip install mypy-protobuf==3.3.0
+          pipenv run pip install cryptography==37.0.4
+          pipenv run pip install twine==4.0.1
+          pipenv run pip install wheel==0.33.4
+          pipenv run pip install setuptools==41.1.0
+          pipenv run pip install pydocstyle==3.0.0
+          pipenv run pip install pandas==1.1.3
+          pipenv run pip install pyarrow==8.0.0
+          pipenv run pip install numpy==1.20.3
+        if: steps.git-diff.outputs.diff
+      - name: Run Scala/Java and Python tests
+        # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
+        run: |
+          TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group storage
+        if: steps.git-diff.outputs.diff
diff --git a/build.sbt b/build.sbt
index 479c2247bd1..c6759562ef9 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1541,7 +1541,43 @@ val createTargetClassesDir = taskKey[Unit]("create target classes dir")
 
 // Don't use these groups for any other projects
 lazy val sparkGroup = project
-  .aggregate(spark, contribs, storage, storageS3DynamoDB, iceberg, testDeltaIcebergJar, sharing, hudi)
+  .aggregate(spark)
+  .settings(
+    // crossScalaVersions must be set to Nil on the aggregating project
+    crossScalaVersions := Nil,
+    publishArtifact := false,
+    publish / skip := false,
+  )
+
+lazy val storageGroup = project
+  .aggregate(storage, storageS3DynamoDB)
+  .settings(
+    // crossScalaVersions must be set to Nil on the aggregating project
+    crossScalaVersions := Nil,
+    publishArtifact := false,
+    publish / skip := false,
+  )
+
+lazy val icebergGroup = project
+  .aggregate(iceberg, testDeltaIcebergJar)
+  .settings(
+    // crossScalaVersions must be set to Nil on the aggregating project
+    crossScalaVersions := Nil,
+    publishArtifact := false,
+    publish / skip := false,
+  )
+
+lazy val hudiGroup = project
+  .aggregate(hudi)
+  .settings(
+    // crossScalaVersions must be set to Nil on the aggregating project
+    crossScalaVersions := Nil,
+    publishArtifact := false,
+    publish / skip := false,
+  )
+
+lazy val deltaGroup = project
+  .aggregate(contribs, sharing)
   .settings(
     // crossScalaVersions must be set to Nil on the aggregating project
     crossScalaVersions := Nil,
diff --git a/run-tests.py b/run-tests.py
index 7823c1dcfe9..1b889249a78 100755
--- a/run-tests.py
+++ b/run-tests.py
@@ -25,7 +25,7 @@
 # Define groups of subprojects that can be tested separately from other groups.
 # As of now, we have only defined project groups in the SBT build, so these must match
 # the group names defined in build.sbt.
-valid_project_groups = ["spark", "kernel"]
+valid_project_groups = ["spark", "kernel", "storage", "hudi", "iceberg", "delta"]
 
 
 def get_args():

From daffe09e87f1a9602f38f92f7fb430a496dbc35d Mon Sep 17 00:00:00 2001
From: Eduard Tudenhoefner
Date: Mon, 12 Aug 2024 12:42:52 +0200
Subject: [PATCH 2/2] Separate test groups for Spark Master tests

---
 .../spark_master_connect_client_test.yaml | 50 +++++++++++++++++++
 .../spark_master_connect_server_test.yaml | 50 +++++++++++++++++++
 .github/workflows/spark_master_test.yaml  |  2 -
 3 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/spark_master_connect_client_test.yaml
 create mode 100644 .github/workflows/spark_master_connect_server_test.yaml

diff --git a/.github/workflows/spark_master_connect_client_test.yaml b/.github/workflows/spark_master_connect_client_test.yaml
new file mode 100644
index 00000000000..373038e5a61
--- /dev/null
+++ b/.github/workflows/spark_master_connect_client_test.yaml
@@ -0,0 +1,50 @@
+name: "Delta Spark Master ConnectClient Tests"
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        # These Scala versions must match those in the build.sbt
+        scala: [2.13.13]
+    env:
+      SCALA_VERSION: ${{ matrix.scala }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: technote-space/get-diff-action@v4
+        id: git-diff
+        with:
+          PATTERNS: |
+            **
+            .github/workflows/**
+            !kernel/**
+            !connectors/**
+      - name: install java
+        uses: actions/setup-java@v3
+        with:
+          distribution: "zulu"
+          java-version: "17"
+      - name: Cache Scala, SBT
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.sbt
+            ~/.ivy2
+            ~/.cache/coursier
+            !~/.cache/coursier/v1/https/repository.apache.org/content/groups/snapshots
+          # Change the key if dependencies are changed. For each key, GitHub Actions will cache
+          # the above directories when we use the key for the first time. After that, each run will
+          # just use the cache. The cache is immutable so we need to use a new key when trying to
+          # cache new stuff.
+          key: delta-sbt-cache-spark-master-scala${{ matrix.scala }}
+      - name: Install Job dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
+          sudo apt install libedit-dev
+        if: steps.git-diff.outputs.diff
+      - name: Run Spark Master tests
+        # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_test.yaml
+        run: |
+          TEST_PARALLELISM_COUNT=2 build/sbt -DsparkVersion=master "++ ${{ matrix.scala }}" clean connectServer/assembly connectClient/test
+        if: steps.git-diff.outputs.diff
diff --git a/.github/workflows/spark_master_connect_server_test.yaml b/.github/workflows/spark_master_connect_server_test.yaml
new file mode 100644
index 00000000000..c03a1960963
--- /dev/null
+++ b/.github/workflows/spark_master_connect_server_test.yaml
@@ -0,0 +1,50 @@
+name: "Delta Spark Master ConnectServer Tests"
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        # These Scala versions must match those in the build.sbt
+        scala: [2.13.13]
+    env:
+      SCALA_VERSION: ${{ matrix.scala }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: technote-space/get-diff-action@v4
+        id: git-diff
+        with:
+          PATTERNS: |
+            **
+            .github/workflows/**
+            !kernel/**
+            !connectors/**
+      - name: install java
+        uses: actions/setup-java@v3
+        with:
+          distribution: "zulu"
+          java-version: "17"
+      - name: Cache Scala, SBT
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.sbt
+            ~/.ivy2
+            ~/.cache/coursier
+            !~/.cache/coursier/v1/https/repository.apache.org/content/groups/snapshots
+          # Change the key if dependencies are changed. For each key, GitHub Actions will cache
+          # the above directories when we use the key for the first time. After that, each run will
+          # just use the cache. The cache is immutable so we need to use a new key when trying to
+          # cache new stuff.
+          key: delta-sbt-cache-spark-master-scala${{ matrix.scala }}
+      - name: Install Job dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
+          sudo apt install libedit-dev
+        if: steps.git-diff.outputs.diff
+      - name: Run Spark Master tests
+        # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_test.yaml
+        run: |
+          TEST_PARALLELISM_COUNT=2 build/sbt -DsparkVersion=master "++ ${{ matrix.scala }}" clean connectServer/test
+        if: steps.git-diff.outputs.diff
diff --git a/.github/workflows/spark_master_test.yaml b/.github/workflows/spark_master_test.yaml
index dfd6a5bf85c..bf33f04df16 100644
--- a/.github/workflows/spark_master_test.yaml
+++ b/.github/workflows/spark_master_test.yaml
@@ -47,6 +47,4 @@ jobs:
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_test.yaml
         run: |
           TEST_PARALLELISM_COUNT=2 build/sbt -DsparkVersion=master "++ ${{ matrix.scala }}" clean spark/test
-          TEST_PARALLELISM_COUNT=2 build/sbt -DsparkVersion=master "++ ${{ matrix.scala }}" clean connectServer/test
-          TEST_PARALLELISM_COUNT=2 build/sbt -DsparkVersion=master "++ ${{ matrix.scala }}" clean connectServer/assembly connectClient/test
       if: steps.git-diff.outputs.diff
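
Usage note (a sketch, not part of the patches above): with valid_project_groups extended as in run-tests.py, each new group can presumably be exercised locally through the same entry point the new workflows call, for example:

    TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group storage
    TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group iceberg
    TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group hudi
    TEST_PARALLELISM_COUNT=2 pipenv run python run-tests.py --group delta

run-tests.py is assumed to resolve each --group value to the matching aggregate project declared in build.sbt (e.g. storage -> storageGroup, delta -> deltaGroup); that mapping is not visible in the hunks shown here. Because sbt aggregation fans tasks out to every member project, an invocation such as `build/sbt "++ 2.12.18" storageGroup/test` should run the storage and storageS3DynamoDB suites together, mirroring what the new storage_test.yaml job does in CI.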