From 2cf3bb814db23a9fbd45a55165b03e0587ac7638 Mon Sep 17 00:00:00 2001 From: Michel Davit Date: Thu, 23 Jan 2025 10:49:31 +0100 Subject: [PATCH] Fix literal string and json string disambiguation --- .../scio/bigquery/TypedBigQueryIT.scala | 8 ++-- .../scio/bigquery/syntax/TableRowSyntax.scala | 5 +- .../spotify/scio/bigquery/types/package.scala | 9 ++-- .../types/ConverterProviderTest.scala | 46 ++++++++++++++++++- 4 files changed, 56 insertions(+), 12 deletions(-) diff --git a/integration/src/test/scala/com/spotify/scio/bigquery/TypedBigQueryIT.scala b/integration/src/test/scala/com/spotify/scio/bigquery/TypedBigQueryIT.scala index 0b21b716d2..008e10e406 100644 --- a/integration/src/test/scala/com/spotify/scio/bigquery/TypedBigQueryIT.scala +++ b/integration/src/test/scala/com/spotify/scio/bigquery/TypedBigQueryIT.scala @@ -84,9 +84,9 @@ object TypedBigQueryIT { .oneOf( alphaLowerStr.flatMap(str => arbInt.arbitrary.map(num => s"""{"$str":$num}""")), alphaLowerStr.flatMap(str => arbInt.arbitrary.map(num => s"""["$str",$num]""")), - alphaLowerStr.map(str => s"\"$str\"") - // arbInt.arbitrary.map(_.toString), TableRow serialization converts to string literal - // arbBool.arbitrary.map(_.toString), TableRow serialization converts to string literal + alphaLowerStr.map(str => s"\"$str\""), + arbInt.arbitrary.map(_.toString), + arbBool.arbitrary.map(_.toString) // Gen.const("null"), null json literal is lost, interpreted as missing field ) .map(wkt => Json(wkt)) @@ -112,7 +112,7 @@ object TypedBigQueryIT { private val tableRowTable = table("records_tablerow") private val avroTable = table("records_avro") - private val records = Gen.listOfN(5, recordGen).sample.get + private val records = Gen.listOfN(100, recordGen).sample.get private val options = PipelineOptionsFactory .fromArgs( "--project=data-integration-test", diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/syntax/TableRowSyntax.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/syntax/TableRowSyntax.scala index ba174bbe09..fb018d8cf2 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/syntax/TableRowSyntax.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/syntax/TableRowSyntax.scala @@ -126,13 +126,12 @@ object TableRowOps { case null => Json(null) case x: java.lang.Number => Json(x) case x: java.lang.Boolean => Json(x) + case x: java.lang.String => Json(x) // also handles json string // object case x: java.util.Map[_, _] => Json(x) // array case x: java.util.List[_] => Json(x) - // json string / literal - case x: String => Json(x) - case _ => throw new UnsupportedOperationException("Cannot convert to json: " + value) + case _ => throw new UnsupportedOperationException("Cannot convert to json: " + value) } def bignumeric(value: AnyRef): BigNumeric = value match { diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/package.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/package.scala index adbe124558..588aa8a5ac 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/package.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/package.scala @@ -27,7 +27,7 @@ import org.typelevel.scalaccompat.annotation.nowarn import java.math.MathContext import java.nio.ByteBuffer -import scala.annotation.StaticAnnotation +import scala.annotation.{unused, StaticAnnotation} import scala.util.Try package object types { @@ -77,12 +77,15 @@ package object types { .disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS) .disable(SerializationFeature.FAIL_ON_EMPTY_BEANS); + // force to use the apply(AnyRef) + @unused private def apply(wkt: String): Json = new Json(wkt) + def apply(value: AnyRef): Json = value match { case str: String if Try(mapper.readTree(str)).isSuccess => // string formatted json vs string literal - Json(str) + new Json(str) case _ => - Json(mapper.writeValueAsString(value)) + new Json(mapper.writeValueAsString(value)) } def parse(json: Json): AnyRef = mapper.readValue(json.wkt, classOf[Object]) } diff --git a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/types/ConverterProviderTest.scala b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/types/ConverterProviderTest.scala index 614dcb2a3b..46034fd5d3 100644 --- a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/types/ConverterProviderTest.scala +++ b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/types/ConverterProviderTest.scala @@ -53,16 +53,19 @@ class ConverterProviderTest extends AnyFlatSpec with Matchers { it should "handle required json type" in { // JSON object { - val wkt = """{"name":"Alice","age":30,"address":{"street":"Broadway","city":"New York"}}""" + val wkt = + """{"name":"Alice","age":30,"job":null,"address":{"street":"Broadway","city":"New York"}}""" val parsed = Map( "name" -> "Alice", "age" -> 30, + "job" -> null, "address" -> Map( "street" -> "Broadway", "city" -> "New York" ).asJava ).asJava + RequiredJson.fromTableRow(TableRow("a" -> wkt)) shouldBe RequiredJson(Json(wkt)) RequiredJson.fromTableRow(TableRow("a" -> parsed)) shouldBe RequiredJson(Json(wkt)) BigQueryType.toTableRow[RequiredJson](RequiredJson(Json(wkt))) shouldBe TableRow( "a" -> parsed @@ -71,22 +74,61 @@ class ConverterProviderTest extends AnyFlatSpec with Matchers { // JSON array { - val wkt = """["Alice",30,{"street":"Broadway","city":"New York"}]""" + val wkt = """["Alice",30,null,{"street":"Broadway","city":"New York"}]""" val parsed = List( "Alice", 30, + null, Map( "street" -> "Broadway", "city" -> "New York" ).asJava ).asJava + RequiredJson.fromTableRow(TableRow("a" -> wkt)) shouldBe RequiredJson(Json(wkt)) RequiredJson.fromTableRow(TableRow("a" -> parsed)) shouldBe RequiredJson(Json(wkt)) BigQueryType.toTableRow[RequiredJson](RequiredJson(Json(wkt))) shouldBe TableRow( "a" -> parsed ) } + // JSON string literal + { + val wkt = "\"Hello world!\"" + val parsed = "Hello world!" + + RequiredJson.fromTableRow(TableRow("a" -> wkt)) shouldBe RequiredJson(Json(wkt)) + RequiredJson.fromTableRow(TableRow("a" -> parsed)) shouldBe RequiredJson(Json(wkt)) + BigQueryType.toTableRow[RequiredJson](RequiredJson(Json(wkt))) shouldBe TableRow( + "a" -> parsed + ) + } + + // JSON boolean literal + { + val wkt = "false" + val parsed = false + + RequiredJson.fromTableRow(TableRow("a" -> wkt)) shouldBe RequiredJson(Json(wkt)) + RequiredJson.fromTableRow(TableRow("a" -> parsed)) shouldBe RequiredJson(Json(wkt)) + BigQueryType.toTableRow[RequiredJson](RequiredJson(Json(wkt))) shouldBe TableRow( + "a" -> parsed + ) + } + + // JSON number literal + { + val wkt = "42" + val parsed = 42 + + RequiredJson.fromTableRow(TableRow("a" -> wkt)) shouldBe RequiredJson(Json(wkt)) + RequiredJson.fromTableRow(TableRow("a" -> parsed)) shouldBe RequiredJson(Json(wkt)) + BigQueryType.toTableRow[RequiredJson](RequiredJson(Json(wkt))) shouldBe TableRow( + "a" -> parsed + ) + } + + // JSON null literal is ambiguous with NULLABLE column } it should "handle required big numeric type" in {