Commit 24b3103: Merge pull request #284 from smart-on-fhir/mikix/delta3
build: bump delta-spark to 3.0
mikix authored Oct 26, 2023
2 parents a12aa01 + e5cda08 commit 24b3103
Showing 2 changed files with 13 additions and 18 deletions.
pyproject.toml (1 addition, 1 deletion)
```diff
@@ -12,7 +12,7 @@ requires-python = ">= 3.10"
 # to fix any breakages since users won't immediately see the problem).
 dependencies = [
     "ctakesclient >= 5.1, < 6",
-    "delta-spark >= 2.3, < 3",
+    "delta-spark >= 3, < 4",
     "fhirclient < 5",
     "httpx < 1",
     "inscriptis < 3",
```
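For context on what the new range pulls in: delta-spark 3.0 is built against the Spark 3.5 line, and a Delta-enabled session is normally constructed with the `configure_spark_with_delta_pip` helper that the package ships. A minimal sketch of that setup (the app name is illustrative, not from this repository):

```python
# Standard delta-spark session setup per the Delta Lake quickstart; illustrative only.
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

builder = (
    SparkSession.builder.appName("delta-demo")  # hypothetical app name
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaSparkSessionCatalog",
    )
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()
```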
tests/formats/test_deltalake.py (12 additions, 17 deletions)
```diff
@@ -189,16 +189,12 @@ def test_altered_field(self):
             ]
         )
         self.assertTrue(self.store(self.df(a=1), schema=schema))
 
+        # Confirm that Delta Lake will error out when presented with an altered type, with or without a schema.
         self.assertFalse(self.store(self.df(b="string"), schema=schema))
         self.assert_lake_equal(self.df(a=1))
+        self.assertFalse(self.store(self.df(b="string")))
+        self.assert_lake_equal(self.df(a=1))
-
-        # And just confirm the mildly buggy behavior that Delta Lake will silently ignore
-        # altered types when we don't force a schema. This is one reason we like to force a schema!
-        # We don't desire or care about this behavior, but just testing it here as a sort of documentation,
-        # in case they ever fix that, and then we get to know about it.
-        # Upstream issue: https://github.com/delta-io/delta/issues/1551
-        self.assertTrue(self.store(self.df(b="string")))
-        self.assert_lake_equal([{"id": "a", "value": 1}, {"id": "b"}])
 
     def test_schema_has_names(self):
         """Verify that the lake's schemas has valid nested names, which may not always happen with spark"""
```
```diff
@@ -274,31 +270,30 @@ def test_merged_schema_for_resource(self):
 
     @ddt.data(
         # In general, the first type used wins
-        (pyarrow.int64(), 2000, pyarrow.int32(), 2000, "long", 2000),
-        (pyarrow.int32(), 2000, pyarrow.int64(), 2000, "integer", 2000),
-        (pyarrow.int64(), 3000000000, pyarrow.int32(), 2000, "long", 2000),
-        # Interestingly, delta lake will silently down-convert for us.
-        # This is not an expected scenario, but we should beware this gotcha.
-        (pyarrow.int32(), 2000, pyarrow.int64(), 3000000000, "integer", -1294967296),
+        (pyarrow.int64(), 2000, pyarrow.int32(), 2001, True, "long", 2001),
+        (pyarrow.int32(), 2000, pyarrow.int64(), 2001, True, "integer", 2001),
+        (pyarrow.int64(), 3000000000, pyarrow.int32(), 2000, True, "long", 2000),
+        # Delta lake will refuse to store too large a value for the type
+        (pyarrow.int32(), 2000, pyarrow.int64(), 3000000000, False, "integer", 2000),
     )
     @ddt.unpack
-    def test_column_type_merges(self, type1, val1, type2, val2, expected_type, expected_value):
+    def test_column_type_merges(self, type1, val1, type2, val2, expected_success, expected_type, expected_value):
         """Verify that if we write a slightly different, but compatible field to the delta lake, it works"""
         schema1 = pyarrow.schema(
             [
                 pyarrow.field("id", pyarrow.string()),
                 pyarrow.field("int", type1),
             ]
         )
-        self.store([{"id": "1", "int": val1}], schema=schema1)
+        self.assertTrue(self.store([{"id": "1", "int": val1}], schema=schema1))
 
         schema2 = pyarrow.schema(
             [
                 pyarrow.field("id", pyarrow.string()),
                 pyarrow.field("int", type2),
             ]
         )
-        self.store([{"id": "1", "int": val2}], schema=schema2)
+        self.assertEqual(expected_success, self.store([{"id": "1", "int": val2}], schema=schema2))
 
         table_path = os.path.join(self.output_dir, "patient")
         table_df = DeltaLakeFormat.spark.read.format("delta").load(table_path)
```
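The new boolean in each `ddt` row (`expected_success`, unpacked into the test's arguments) captures the behavior change: delta-spark 2.x silently down-converted an oversized value into the existing column type, while 3.x refuses the write and leaves the stored value untouched. The old expected value of -1294967296 is simply 3,000,000,000 wrapped around the signed 32-bit range; a quick sketch of that two's-complement arithmetic:

```python
import struct

# 3,000,000,000 exceeds the signed 32-bit maximum of 2**31 - 1 = 2,147,483,647.
# Reinterpreting its unsigned bit pattern as signed reproduces the old test value:
wrapped = struct.unpack("<i", struct.pack("<I", 3_000_000_000))[0]
print(wrapped)  # -1294967296
assert wrapped == 3_000_000_000 - 2**32
```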
