
Commit

Merge branch 'master' into lenient-stats-cols
Kimahriman committed Sep 21, 2024
2 parents 86c148f + 1753cb5 commit 25e6faa
Showing 370 changed files with 13,749 additions and 3,236 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/connectors_test.yaml
@@ -1,8 +1,8 @@
name: "Delta Connectors Tests"
name: "Delta Connectors"
on: [push, pull_request]
jobs:
build:
name: "Run tests"
name: "DC: Scala ${{ matrix.scala }}"
runs-on: ubuntu-20.04
strategy:
matrix:
3 changes: 2 additions & 1 deletion .github/workflows/kernel_test.yaml
@@ -1,7 +1,8 @@
name: "Delta Kernel Tests"
name: "Delta Kernel"
on: [push, pull_request]
jobs:
test:
name: "DK"
runs-on: ubuntu-20.04
env:
SCALA_VERSION: 2.12.18
3 changes: 2 additions & 1 deletion .github/workflows/spark_examples_test.yaml
@@ -1,7 +1,8 @@
name: "Delta Spark Local Publishing and Examples Compilation"
name: "Delta Spark Publishing and Examples"
on: [push, pull_request]
jobs:
test:
name: "DSP&E: Scala ${{ matrix.scala }}"
runs-on: ubuntu-20.04
strategy:
matrix:
3 changes: 2 additions & 1 deletion .github/workflows/spark_master_test.yaml
@@ -1,7 +1,8 @@
name: "Delta Spark Master Tests"
name: "Delta Spark Master"
on: [push, pull_request]
jobs:
test:
name: "DSM: Scala ${{ matrix.scala }}, Shard ${{ matrix.shard }}"
runs-on: ubuntu-20.04
strategy:
matrix:
83 changes: 83 additions & 0 deletions .github/workflows/spark_python_test.yaml
@@ -0,0 +1,83 @@
name: "Delta Spark Python"
on: [push, pull_request]
jobs:
test:
name: "DSP"
runs-on: ubuntu-20.04
strategy:
matrix:
# These Scala versions must match those in the build.sbt
scala: [2.12.18]
env:
SCALA_VERSION: ${{ matrix.scala }}
steps:
- uses: actions/checkout@v3
- uses: technote-space/get-diff-action@v4
id: git-diff
with:
PATTERNS: |
**
.github/workflows/**
!kernel/**
!connectors/**
- name: install java
uses: actions/setup-java@v3
with:
distribution: "zulu"
java-version: "8"
- name: Cache Scala, SBT
uses: actions/cache@v3
with:
path: |
~/.sbt
~/.ivy2
~/.cache/coursier
# Change the key if dependencies are changed. For each key, GitHub Actions will cache the
# the above directories when we use the key for the first time. After that, each run will
# just use the cache. The cache is immutable so we need to use a new key when trying to
# cache new stuff.
key: delta-sbt-cache-spark3.2-scala${{ matrix.scala }}
- name: Install Job dependencies
run: |
sudo apt-get update
sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
sudo apt install libedit-dev
curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz
mkdir -p ~/buf
tar -xvzf buf-Linux-x86_64.tar.gz -C ~/buf --strip-components 1
rm buf-Linux-x86_64.tar.gz
sudo apt install python3-pip --fix-missing
sudo pip3 install pipenv==2021.5.29
curl https://pyenv.run | bash
export PATH="~/.pyenv/bin:$PATH"
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"
pyenv install 3.8.18
pyenv global system 3.8.18
pipenv --python 3.8 install
# Update the pip version to 24.0. By default `pyenv.run` installs the latest pip version
# available. From version 24.1, `pip` doesn't allow installing python packages
# with version string containing `-`. In Delta-Spark case, the pypi package generated has
# `-SNAPSHOT` in version (e.g. `3.3.0-SNAPSHOT`) as the version is picked up from
# the`version.sbt` file.
pipenv run pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0
pipenv run pip install pyspark==3.5.0
pipenv run pip install flake8==3.5.0 pypandoc==1.3.3
pipenv run pip install black==23.9.1
pipenv run pip install importlib_metadata==3.10.0
pipenv run pip install mypy==0.982
pipenv run pip install mypy-protobuf==3.3.0
pipenv run pip install cryptography==37.0.4
pipenv run pip install twine==4.0.1
pipenv run pip install wheel==0.33.4
pipenv run pip install setuptools==41.1.0
pipenv run pip install pydocstyle==3.0.0
pipenv run pip install pandas==1.1.3
pipenv run pip install pyarrow==8.0.0
pipenv run pip install numpy==1.20.3
if: steps.git-diff.outputs.diff
- name: Run Python tests
# when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
run: |
TEST_PARALLELISM_COUNT=4 pipenv run python run-tests.py --group spark-python
if: steps.git-diff.outputs.diff
3 changes: 2 additions & 1 deletion .github/workflows/spark_test.yaml
@@ -1,7 +1,8 @@
name: "Delta Spark Tests"
name: "Delta Spark Latest"
on: [push, pull_request]
jobs:
test:
name: "DSL: Scala ${{ matrix.scala }}, Shard ${{ matrix.shard }}"
runs-on: ubuntu-20.04
strategy:
matrix:
6 changes: 3 additions & 3 deletions .github/workflows/unidoc.yaml
@@ -1,12 +1,12 @@
name: "Unidoc generation"
name: "Unidoc"
on: [push, pull_request]
jobs:
build:
name: "Generate unidoc"
name: "U: Scala ${{ matrix.scala }}"
runs-on: ubuntu-20.04
strategy:
matrix:
# These Scala versions must match those in the build.sbt
# These Scala versions must match those in the build.sbt
scala: [2.13.13, 2.12.18]
steps:
- name: install java
9 changes: 6 additions & 3 deletions PROTOCOL.md
@@ -1791,13 +1791,16 @@ Type | Serialization Format
 string | No translation required
 numeric types | The string representation of the number
 date | Encoded as `{year}-{month}-{day}`. For example, `1970-01-01`
-timestamp | Encoded as `{year}-{month}-{day} {hour}:{minute}:{second}` or `{year}-{month}-{day} {hour}:{minute}:{second}.{microsecond}` For example: `1970-01-01 00:00:00`, or `1970-01-01 00:00:00.123456`
+timestamp | Encoded as `{year}-{month}-{day} {hour}:{minute}:{second}` or `{year}-{month}-{day} {hour}:{minute}:{second}.{microsecond}`. For example: `1970-01-01 00:00:00`, or `1970-01-01 00:00:00.123456`. Timestamps may also be encoded as an ISO8601 formatted timestamp adjusted to UTC, such as `1970-01-01T00:00:00.123456Z`
 timestamp without timezone | Encoded as `{year}-{month}-{day} {hour}:{minute}:{second}` or `{year}-{month}-{day} {hour}:{minute}:{second}.{microsecond}` For example: `1970-01-01 00:00:00`, or `1970-01-01 00:00:00.123456` To use this type, a table must support a feature `timestampNtz`. See section [Timestamp without timezone (TimestampNtz)](#timestamp-without-timezone-timestampNtz) for more information.
 boolean | Encoded as the string "true" or "false"
 binary | Encoded as a string of escaped binary values. For example, `"\u0001\u0002\u0003"`
 
-Note: A `timestamp` value in a partition value doesn't store the time zone due to historical reasons.
-It means its behavior looks similar to `timestamp without time zone` when it is used in a partition column.
+Note: A timestamp value in a partition value may be stored in one of the following ways:
+1. Without a timezone, where the timestamp should be interpreted using the time zone of the system which wrote to the table.
+2. Adjusted to UTC and stored in ISO8601 format.
+
+It is highly recommended that modern writers adjust the timestamp to UTC and store the timestamp in ISO8601 format as outlined in 2.
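
For illustration only (not part of this diff), a minimal Scala sketch of producing the recommended ISO8601, UTC-adjusted partition value from an epoch-microsecond timestamp could look like the following; the object and method names are hypothetical:

```scala
import java.time.Instant
import java.time.format.DateTimeFormatter

// Hypothetical helper, not from the Delta codebase: encode a timestamp
// (microseconds since the Unix epoch) as an ISO8601 string adjusted to UTC,
// the form recommended above for timestamp partition values.
object PartitionTimestampEncoder {
  def encode(epochMicros: Long): String = {
    val instant = Instant.EPOCH.plusNanos(epochMicros * 1000L)
    // ISO_INSTANT always renders in UTC with a trailing 'Z'.
    DateTimeFormatter.ISO_INSTANT.format(instant)
  }

  def main(args: Array[String]): Unit = {
    println(encode(123456L)) // prints 1970-01-01T00:00:00.123456Z
  }
}
```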

## Schema Serialization Format

50 changes: 41 additions & 9 deletions build.sbt
Expand Up @@ -59,6 +59,7 @@ spark / sparkVersion := getSparkVersion()
 connectCommon / sparkVersion := getSparkVersion()
 connectClient / sparkVersion := getSparkVersion()
 connectServer / sparkVersion := getSparkVersion()
+sharing / sparkVersion := getSparkVersion()
 
 // Dependent library versions
 val defaultSparkVersion = LATEST_RELEASED_SPARK_VERSION
@@ -175,6 +176,7 @@ def crossSparkSettings(): Seq[Setting[_]] = getSparkVersion() match {
 Compile / unmanagedSourceDirectories += (Compile / baseDirectory).value / "src" / "main" / "scala-spark-3.5",
 Test / unmanagedSourceDirectories += (Test / baseDirectory).value / "src" / "test" / "scala-spark-3.5",
 Antlr4 / antlr4Version := "4.9.3",
+Test / javaOptions ++= Seq("-Dlog4j.configurationFile=log4j2.properties"),
 
 // Java-/Scala-/Uni-Doc Settings
 scalacOptions ++= Seq(
@@ -203,8 +205,9 @@ def crossSparkSettings(): Seq[Setting[_]] = getSparkVersion() match {
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
"--add-opens=java.base/sun.nio.cs=ALL-UNNAMED",
"--add-opens=java.base/sun.security.action=ALL-UNNAMED",
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"
)
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED",
"-Dlog4j.configurationFile=log4j2_spark_master.properties"
),

// Java-/Scala-/Uni-Doc Settings
// This isn't working yet against Spark Master.
@@ -539,21 +542,22 @@ lazy val sharing = (project in file("sharing"))
 commonSettings,
 scalaStyleSettings,
 releaseSettings,
+crossSparkSettings(),
 Test / javaOptions ++= Seq("-ea"),
 libraryDependencies ++= Seq(
-  "org.apache.spark" %% "spark-sql" % defaultSparkVersion % "provided",
+  "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided",
 
-  "io.delta" %% "delta-sharing-client" % "1.1.1",
+  "io.delta" %% "delta-sharing-client" % "1.2.0",
 
   // Test deps
   "org.scalatest" %% "scalatest" % scalaTestVersion % "test",
   "org.scalatestplus" %% "scalacheck-1-15" % "3.2.9.0" % "test",
   "junit" % "junit" % "4.13.2" % "test",
   "com.novocode" % "junit-interface" % "0.11" % "test",
-  "org.apache.spark" %% "spark-catalyst" % defaultSparkVersion % "test" classifier "tests",
-  "org.apache.spark" %% "spark-core" % defaultSparkVersion % "test" classifier "tests",
-  "org.apache.spark" %% "spark-sql" % defaultSparkVersion % "test" classifier "tests",
-  "org.apache.spark" %% "spark-hive" % defaultSparkVersion % "test" classifier "tests",
+  "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "test" classifier "tests",
+  "org.apache.spark" %% "spark-core" % sparkVersion.value % "test" classifier "tests",
+  "org.apache.spark" %% "spark-sql" % sparkVersion.value % "test" classifier "tests",
+  "org.apache.spark" %% "spark-hive" % sparkVersion.value % "test" classifier "tests",
 )
 ).configureUnidoc()

@@ -569,13 +573,41 @@ lazy val kernelApi = (project in file("kernel/kernel-api"))
"org.roaringbitmap" % "RoaringBitmap" % "0.9.25",
"org.slf4j" % "slf4j-api" % "1.7.36",

"com.fasterxml.jackson.core" % "jackson-databind" % "2.13.5" % "test",
"com.fasterxml.jackson.core" % "jackson-databind" % "2.13.5",
"com.fasterxml.jackson.core" % "jackson-core" % "2.13.5",
"com.fasterxml.jackson.core" % "jackson-annotations" % "2.13.5",

"org.scalatest" %% "scalatest" % scalaTestVersion % "test",
"junit" % "junit" % "4.13.2" % "test",
"com.novocode" % "junit-interface" % "0.11" % "test",
"org.slf4j" % "slf4j-log4j12" % "1.7.36" % "test",
"org.assertj" % "assertj-core" % "3.26.3" % "test"
),
// Shade jackson libraries so that connector developers don't have to worry
// about jackson version conflicts.
Compile / packageBin := assembly.value,
assembly / assemblyJarName := s"${name.value}-${version.value}.jar",
assembly / logLevel := Level.Info,
assembly / test := {},
assembly / assemblyExcludedJars := {
val cp = (assembly / fullClasspath).value
val allowedPrefixes = Set("META_INF", "io", "jackson")
cp.filter { f =>
!allowedPrefixes.exists(prefix => f.data.getName.startsWith(prefix))
}
},
assembly / assemblyShadeRules := Seq(
ShadeRule.rename("com.fasterxml.jackson.**" -> "io.delta.kernel.shaded.com.fasterxml.jackson.@1").inAll
),
assembly / assemblyMergeStrategy := {
// Discard `module-info.class` to fix the `different file contents found` error.
// TODO Upgrade SBT to 1.5 which will do this automatically
case "module-info.class" => MergeStrategy.discard
case PathList("META-INF", "services", xs @ _*) => MergeStrategy.discard
case x =>
val oldStrategy = (assembly / assemblyMergeStrategy).value
oldStrategy(x)
},
// Generate the package object to provide the version information in runtime.
Compile / sourceGenerators += Def.task {
val file = (Compile / sourceManaged).value / "io" / "delta" / "kernel" / "Meta.java"
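
As context for the shading change above: the `ShadeRule.rename` added to the kernel-api assembly relocates the bundled jackson classes under `io.delta.kernel.shaded`, so they cannot clash with whatever jackson version a connector brings on its own classpath. A minimal, hypothetical Scala sketch (illustrative only, not part of the build) of the resulting class-name mapping:

```scala
// Illustrative only: the package relocation performed by
// ShadeRule.rename("com.fasterxml.jackson.**" -> "io.delta.kernel.shaded.com.fasterxml.jackson.@1")
// when the kernel-api assembly jar is built.
object JacksonShadingExample {
  private val OriginalPrefix = "com.fasterxml.jackson."
  private val ShadedPrefix = "io.delta.kernel.shaded.com.fasterxml.jackson."

  def shadedName(className: String): String =
    if (className.startsWith(OriginalPrefix))
      ShadedPrefix + className.stripPrefix(OriginalPrefix)
    else className

  def main(args: Array[String]): Unit = {
    // prints io.delta.kernel.shaded.com.fasterxml.jackson.databind.ObjectMapper
    println(shadedName("com.fasterxml.jackson.databind.ObjectMapper"))
  }
}
```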
16 changes: 0 additions & 16 deletions connectors/.github/workflows/new_pull_request.yaml

This file was deleted.

19 changes: 0 additions & 19 deletions connectors/.github/workflows/new_updated_issue.yaml

This file was deleted.

43 changes: 0 additions & 43 deletions connectors/.github/workflows/test.yaml

This file was deleted.

20 changes: 0 additions & 20 deletions connectors/.github/workflows/updated_pull_request.yaml

This file was deleted.

