From f71a43ad84b75298f8a43916dc5309dde8575610 Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Wed, 11 Dec 2024 15:57:29 +0900 Subject: [PATCH 1/2] Add in base implementation for confluent schema registry, ability to parse proto schemas, add in map data type --- .../datacaterer/api/model/Constants.scala | 10 + .../datacaterer/api/model/DataType.scala | 8 + app/build.gradle.kts | 6 +- .../core/exception/Exceptions.scala | 20 ++ .../ConfluentSchemaRegistryMetadata.scala | 112 ++++++++ .../generator/provider/DataGenerator.scala | 24 +- .../provider/RandomDataGenerator.scala | 42 ++- .../SchemaModels.scala | 19 ++ .../core/parser/ProtobufParser.scala | 110 ++++++++ .../datacaterer/core/util/MetadataUtil.scala | 2 + .../datacaterer/core/util/ProtobufUtil.scala | 253 ++++++++++++++---- .../sample/files/protobuf/example.proto | 4 + .../datasource/jms/JmsMetadataTest.scala | 10 +- .../provider/RandomDataGeneratorTest.scala | 27 +- .../core/plan/PlanProcessorTest.scala | 3 +- 15 files changed, 583 insertions(+), 67 deletions(-) create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/confluentschemaregistry/ConfluentSchemaRegistryMetadata.scala create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/model/confluentschemaregistry/SchemaModels.scala create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/parser/ProtobufParser.scala diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala index a39c8b49..8ea1f4e8 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala @@ -79,6 +79,8 @@ object Constants { lazy val MAXIMUM_LENGTH = "maxLen" lazy val ARRAY_MAXIMUM_LENGTH = "arrayMaxLen" lazy val SOURCE_MAXIMUM_LENGTH = "sourceMaxLen" + lazy val MAP_MINIMUM_SIZE = "mapMinSize" + lazy val MAP_MAXIMUM_SIZE = "mapMaxSize" lazy val MINIMUM = "min" lazy val MAXIMUM = "max" lazy val STANDARD_DEVIATION = "stddev" @@ -324,6 +326,7 @@ object Constants { lazy val DATA_CONTRACT_CLI = "dataContractCli" lazy val AMUNDSEN = "amundsen" lazy val DATAHUB = "datahub" + lazy val CONFLUENT_SCHEMA_REGISTRY = "confluentSchemaRegistry" lazy val DEFAULT_METADATA_SOURCE_NAME = "defaultMetadataSource" //alert source @@ -378,6 +381,13 @@ object Constants { lazy val OPEN_METADATA_TABLE_FQN = "tableFqn" lazy val OPEN_METADATA_SERVICE = "service" + //confluent schema registry + lazy val CONFLUENT_SCHEMA_REGISTRY_SUBJECT = "subject" + lazy val CONFLUENT_SCHEMA_REGISTRY_VERSION = "version" + lazy val CONFLUENT_SCHEMA_REGISTRY_ID = "id" + lazy val CONFLUENT_SCHEMA_REGISTRY_MESSAGE_NAME = "messageName" + lazy val DEFAULT_CONFLUENT_SCHEMA_REGISTRY_VERSION = "latest" + //delta lazy val DELTA_LAKE_SPARK_CONF = Map( "spark.sql.catalog.spark_catalog" -> "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/DataType.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/DataType.scala index ce156c9e..c914071f 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/DataType.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/DataType.scala @@ -111,6 +111,14 @@ case object ArrayType extends ArrayType(StringType) { def instance: ArrayType.type = this } +class MapType(keyType: DataType 
= StringType, valueType: DataType = StringType) extends DataType { + override def toString: String = s"map<${keyType.toString},${valueType.toString}>" +} + +case object MapType extends MapType(StringType, StringType) { + def instance: MapType.type = this +} + class StructType(innerType: List[(String, DataType)] = List()) extends DataType { def this(innerType: java.util.List[java.util.Map.Entry[String, DataType]]) = { diff --git a/app/build.gradle.kts b/app/build.gradle.kts index 7ffee824..a303dfe3 100644 --- a/app/build.gradle.kts +++ b/app/build.gradle.kts @@ -83,7 +83,7 @@ dependencies { } // vulnerabilities in Spark - basicImpl("com.google.protobuf:protobuf-java:3.25.5") +// basicImpl("com.google.protobuf:protobuf-java:3.25.5") fails with https://github.com/protostuff/protostuff/issues/367 basicImpl("io.netty:netty-codec-http:4.1.110.Final") basicImpl("io.netty:netty-codec-http2:4.1.110.Final") basicImpl("io.netty:netty-tcnative-boringssl-static:2.0.65.Final:windows-x86_64") @@ -161,12 +161,14 @@ dependencies { //TODO implementation("jakarta.jms:jakarta.jms-api:3.1.0") jms 3.x basicImpl("javax.jms:javax.jms-api:2.0.1") basicImpl("com.solacesystems:sol-jms:10.21.0") - // metadata + // open metadata basicImpl("org.open-metadata:openmetadata-java-client:1.1.7") { //1.2.0 has component reliant on java 17 exclude(group = "org.antlr") exclude(module = "logback-core") exclude(module = "logback-classic") } + // protobuf + basicImpl("io.protostuff:protostuff-parser:3.1.40") // data generation helpers basicImpl("net.datafaker:datafaker:1.9.0") diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/exception/Exceptions.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/exception/Exceptions.scala index 8a5dccf2..a9e019c5 100644 --- a/app/src/main/scala/io/github/datacatering/datacaterer/core/exception/Exceptions.scala +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/exception/Exceptions.scala @@ -193,6 +193,21 @@ case class InvalidOpenMetadataTableRowCountBetweenException() extends RuntimeExc "Expected at least one of 'minValue' or 'maxValue' to be defined in test parameters from OpenMetadata" ) +//confluent schema registry +case class FailedConfluentSchemaRegistryHttpCallException(url: String, throwable: Throwable) extends RuntimeException( + s"Failed to call HTTP url for Confluent Schema Registry, url=$url", + throwable +) + +case class InvalidConfluentSchemaRegistryResponseException(throwable: Throwable) extends RuntimeException( + "Failed to parse response from Confluent Schema Registry", + throwable +) + +case class InvalidConfluentSchemaRegistrySchemaRequestException(missingField: String) extends RuntimeException( + s"Required field for Confluent Schema Registry Schema Request is missing, missing-field(s)=$missingField" +) + //jms case class FailedJmsMessageSendException(throwable: Throwable) extends RuntimeException(throwable) @@ -209,3 +224,8 @@ case class InvalidColumnAsDataTypeException(columnName: String, throwable: Throw case class MissingColumnException(columnName: String) extends RuntimeException( s"Invalid schema definition due to missing column, column-name=$columnName" ) + +//protobuf +case class UnsupportedProtobufType(protobufType: String) extends RuntimeException( + s"Unsupported protobuf data type, protobuf-type=$protobufType" +) diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/confluentschemaregistry/ConfluentSchemaRegistryMetadata.scala 
b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/confluentschemaregistry/ConfluentSchemaRegistryMetadata.scala new file mode 100644 index 00000000..e44eeae7 --- /dev/null +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/confluentschemaregistry/ConfluentSchemaRegistryMetadata.scala @@ -0,0 +1,112 @@ +package io.github.datacatering.datacaterer.core.generator.metadata.datasource.confluentschemaregistry + +import io.github.datacatering.datacaterer.api.model.Constants.{CONFLUENT_SCHEMA_REGISTRY_ID, CONFLUENT_SCHEMA_REGISTRY_MESSAGE_NAME, CONFLUENT_SCHEMA_REGISTRY_SUBJECT, CONFLUENT_SCHEMA_REGISTRY_VERSION, DATA_SOURCE_NAME, DEFAULT_CONFLUENT_SCHEMA_REGISTRY_VERSION, DEFAULT_FIELD_TYPE, FACET_DATA_SOURCE, FIELD_DATA_TYPE, FIELD_DESCRIPTION, JDBC, JDBC_TABLE, METADATA_IDENTIFIER, METADATA_SOURCE_URL, OPEN_LINEAGE_NAMESPACE, URI} +import io.github.datacatering.datacaterer.core.exception.{FailedConfluentSchemaRegistryHttpCallException, InvalidConfluentSchemaRegistryResponseException, InvalidConfluentSchemaRegistrySchemaRequestException} +import io.github.datacatering.datacaterer.core.generator.metadata.datasource.database.FieldMetadata +import io.github.datacatering.datacaterer.core.generator.metadata.datasource.{DataSourceMetadata, SubDataSourceMetadata} +import io.github.datacatering.datacaterer.core.model.confluentschemaregistry.{ConfluentSchemaRegistrySchemaResponse, ConfluentSchemaRegistrySubjectVersionResponse} +import io.github.datacatering.datacaterer.core.model.openlineage.{ListDatasetResponse, OpenLineageDataset} +import io.github.datacatering.datacaterer.core.parser.ProtobufParser +import io.github.datacatering.datacaterer.core.util.ObjectMapperUtil +import org.apache.log4j.Logger +import org.apache.spark.sql.{Dataset, SparkSession} +import org.asynchttpclient.Dsl.asyncHttpClient +import org.asynchttpclient.{AsyncHttpClient, Response} + +import scala.util.{Failure, Success, Try} + +case class ConfluentSchemaRegistryMetadata( + name: String, + format: String, + connectionConfig: Map[String, String], + asyncHttpClient: AsyncHttpClient = asyncHttpClient + ) extends DataSourceMetadata { + require( + connectionConfig.contains(METADATA_SOURCE_URL), + s"Configuration missing for Confluent Schema Registry metadata source, metadata-source=$name, missing-configuration=$METADATA_SOURCE_URL" + ) + + private val LOGGER = Logger.getLogger(getClass.getName) + private val OPT_SUBJECT = connectionConfig.get(CONFLUENT_SCHEMA_REGISTRY_SUBJECT) + private val OPT_VERSION = connectionConfig.get(CONFLUENT_SCHEMA_REGISTRY_VERSION) + private val OPT_SCHEMA_ID = connectionConfig.get(CONFLUENT_SCHEMA_REGISTRY_ID) + private val BASE_URL = connectionConfig(METADATA_SOURCE_URL) + + override val hasSourceData: Boolean = false + + override def getSubDataSourcesMetadata(implicit sparkSession: SparkSession): Array[SubDataSourceMetadata] = { + val (schemaType, schema) = (OPT_SCHEMA_ID, OPT_SUBJECT, OPT_VERSION) match { + case (Some(id), _, _) => + val baseSchema = getSchema(id) + (baseSchema.schemaType, baseSchema.schema) + case (None, Some(subject), Some(version)) => + val baseSchema = getSchema(subject, version) + (baseSchema.schemaType, baseSchema.schema) + case (None, Some(subject), None) => + val baseSchema = getSchema(subject, DEFAULT_CONFLUENT_SCHEMA_REGISTRY_VERSION) + (baseSchema.schemaType, baseSchema.schema) + case (None, None, Some(version)) => + throw InvalidConfluentSchemaRegistrySchemaRequestException("subject") + case _ 
=>
+        throw InvalidConfluentSchemaRegistrySchemaRequestException("id, subject, version")
+    }
+
+    val parsedSchema = if (schemaType.equalsIgnoreCase("protobuf")) {
+      val optMessageName = connectionConfig.get(CONFLUENT_SCHEMA_REGISTRY_MESSAGE_NAME)
+      optMessageName match {
+        case Some(messageName) => ProtobufParser.getMessageFromProtoString(schema, messageName)
+        case None => throw InvalidConfluentSchemaRegistrySchemaRequestException(CONFLUENT_SCHEMA_REGISTRY_MESSAGE_NAME)
+      }
+    } else if (schemaType.equalsIgnoreCase("avro")) {
+      //TODO parse avro schema into field definitions
+    } else if (schemaType.equalsIgnoreCase("json")) {
+      //TODO parse json schema into field definitions
+    }
+    Array()
+  }
+
+  override def getAdditionalColumnMetadata(implicit sparkSession: SparkSession): Dataset[FieldMetadata] = {
+    sparkSession.emptyDataset
+  }
+
+  override def close(): Unit = {
+    asyncHttpClient.close()
+  }
+
+  def toMetadataIdentifier(dataset: OpenLineageDataset) = s"${dataset.id.namespace}_${dataset.id.name}"
+
+  def getSchema(id: String): ConfluentSchemaRegistrySchemaResponse = {
+    val response = getResponse(s"$BASE_URL/schemas/id/$id")
+    val tryParseResponse = Try(
+      ObjectMapperUtil.jsonObjectMapper.readValue(response.getResponseBody, classOf[ConfluentSchemaRegistrySchemaResponse])
+    )
+    getResponse(tryParseResponse)
+  }
+
+  def getSchema(subject: String, version: String): ConfluentSchemaRegistrySubjectVersionResponse = {
+    val response = getResponse(s"$BASE_URL/subjects/$subject/versions/$version")
+    val tryParseResponse = Try(
+      ObjectMapperUtil.jsonObjectMapper.readValue(response.getResponseBody, classOf[ConfluentSchemaRegistrySubjectVersionResponse])
+    )
+    getResponse(tryParseResponse)
+  }
+
+  private def getResponse(url: String): Response = {
+    val tryRequest = Try(asyncHttpClient.prepareGet(url).execute().get())
+    tryRequest match {
+      case Failure(exception) => throw FailedConfluentSchemaRegistryHttpCallException(url, exception)
+      case Success(value) => value
+    }
+  }
+
+  private def getResponse[T](tryParse: Try[T]): T = {
+    tryParse match {
+      case Failure(exception) =>
+        LOGGER.error("Failed to parse response from Confluent Schema Registry")
+        throw InvalidConfluentSchemaRegistryResponseException(exception)
+      case Success(value) =>
+        LOGGER.debug("Successfully parsed response from Confluent Schema Registry")
+        value
+    }
+  }
+}
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/DataGenerator.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/DataGenerator.scala
index f71b4ef1..71008b8a 100644
--- a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/DataGenerator.scala
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/DataGenerator.scala
@@ -1,11 +1,10 @@
 package io.github.datacatering.datacaterer.core.generator.provider

-import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, ENABLED_EDGE_CASE, ENABLED_NULL, IS_UNIQUE, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, RANDOM_SEED, STATIC}
+import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, ENABLED_EDGE_CASE, ENABLED_NULL, IS_UNIQUE, MAP_MAXIMUM_SIZE, MAP_MINIMUM_SIZE, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, RANDOM_SEED, STATIC}
 import io.github.datacatering.datacaterer.api.model.generator.BaseGenerator
 import io.github.datacatering.datacaterer.core.exception.ExhaustedUniqueValueGenerationException
 import io.github.datacatering.datacaterer.core.model.Constants.DATA_CATERER_RANDOM_LENGTH
import net.datafaker.Faker
-import org.apache.spark.sql.execution.SparkSqlParser
 import org.apache.spark.sql.functions.{expr, rand, when}
 import org.apache.spark.sql.types.StructField

@@ -133,9 +132,26 @@ trait ArrayDataGenerator[T] extends NullableDataGenerator[List[T]] {

   def elementGenerator: DataGenerator[T]

   override def generate: List[T] = {
-    val listSize = random.nextInt(arrayMaxSize) + arrayMinSize
-    (arrayMinSize to listSize)
+    val listSize = random.nextInt(arrayMaxSize - arrayMinSize + 1) + arrayMinSize
+    (1 to listSize)
       .map(_ => elementGenerator.generate)
       .toList
   }
+}
+
+trait MapDataGenerator[T, K] extends NullableDataGenerator[Map[T, K]] {
+
+  lazy val mapMaxSize: Int = if (structField.metadata.contains(MAP_MAXIMUM_SIZE)) structField.metadata.getString(MAP_MAXIMUM_SIZE).toInt else 5
+  lazy val mapMinSize: Int = if (structField.metadata.contains(MAP_MINIMUM_SIZE)) structField.metadata.getString(MAP_MINIMUM_SIZE).toInt else 0
+
+  def keyGenerator: DataGenerator[T]
+
+  def valueGenerator: DataGenerator[K]
+
+  override def generate: Map[T, K] = {
+    val mapSize = random.nextInt(mapMaxSize - mapMinSize + 1) + mapMinSize
+    (1 to mapSize)
+      .map(_ => keyGenerator.generate -> valueGenerator.generate)
+      .toMap
+  }
 }
\ No newline at end of file
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala
index 315dcacd..cc35e53e 100644
--- a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala
@@ -1,6 +1,6 @@
 package io.github.datacatering.datacaterer.core.generator.provider

-import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, DEFAULT_VALUE, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, ROUND, ROW_COUNT, STANDARD_DEVIATION}
+import io.github.datacatering.datacaterer.api.model.Constants._
 import io.github.datacatering.datacaterer.core.exception.UnsupportedDataGeneratorType
 import io.github.datacatering.datacaterer.core.model.Constants._
 import io.github.datacatering.datacaterer.core.util.GeneratorUtil

@@ -34,11 +34,16 @@ object RandomDataGenerator {
       case BinaryType => new RandomBinaryDataGenerator(structField, faker)
       case ByteType => new RandomByteDataGenerator(structField, faker)
       case ArrayType(dt, _) => new RandomArrayDataGenerator(structField, dt, faker)
+      case MapType(kt, vt, _) => new RandomMapDataGenerator(structField, kt, vt, faker)
       case StructType(_) => new RandomStructTypeDataGenerator(structField, faker)
       case x => throw UnsupportedDataGeneratorType(s"Unsupported type for random data generation: name=${structField.name}, type=${x.typeName}")
     }
   }

+  def getGeneratorForDataType(dataType: DataType, faker: Faker = new Faker()): DataGenerator[_] = {
+    getGeneratorForStructField(StructField("", dataType), faker)
+  }
+
   class RandomStringDataGenerator(val structField: StructField, val faker: Faker = new Faker()) extends NullableDataGenerator[String] {
     private val minLength = tryGetValue(structField.metadata, MINIMUM_LENGTH, 1)
     private val maxLength = tryGetValue(structField.metadata, MAXIMUM_LENGTH, 20)

@@ -239,7 +244,8 @@
   class RandomBinaryDataGenerator(val structField: StructField, val faker: Faker = new Faker()) extends NullableDataGenerator[Array[Byte]] {
     private lazy val minLength = tryGetValue(structField.metadata, MINIMUM_LENGTH, 1)
     private lazy val maxLength = tryGetValue(structField.metadata, MAXIMUM_LENGTH, 20)
-    assert(minLength <= maxLength, s"minLength has to be less than or equal to maxLength, field-name=${structField.name}, minLength=$minLength, maxLength=$maxLength")
+    assert(minLength <= maxLength, s"minLength has to be less than or equal to maxLength, " +
+      s"field-name=${structField.name}, minLength=$minLength, maxLength=$maxLength")

     override val edgeCases: List[Array[Byte]] = List(Array(), "\n".getBytes, "\r".getBytes, "\t".getBytes, " ".getBytes, "\\u0000".getBytes, "\\ufff".getBytes, Array(Byte.MinValue), Array(Byte.MaxValue))

@@ -269,6 +275,10 @@
   class RandomArrayDataGenerator[T](val structField: StructField, val dataType: DataType, val faker: Faker = new Faker()) extends ArrayDataGenerator[T] {
     override lazy val arrayMinSize: Int = tryGetValue(structField.metadata, ARRAY_MINIMUM_LENGTH, 0)
     override lazy val arrayMaxSize: Int = tryGetValue(structField.metadata, ARRAY_MAXIMUM_LENGTH, 5)
+    assert(arrayMinSize >= 0, s"arrayMinSize has to be greater than or equal to 0, " +
+      s"field-name=${structField.name}, arrayMinSize=$arrayMinSize")
+    assert(arrayMinSize <= arrayMaxSize, s"arrayMinSize has to be less than or equal to arrayMaxSize, " +
+      s"field-name=${structField.name}, arrayMinSize=$arrayMinSize, arrayMaxSize=$arrayMaxSize")

     override def elementGenerator: DataGenerator[T] = {
       dataType match {
@@ -291,6 +301,34 @@
     }
   }

+  class RandomMapDataGenerator[T, K](
+                                      val structField: StructField,
+                                      val keyDataType: DataType,
+                                      val valueDataType: DataType,
+                                      val faker: Faker = new Faker()
+                                    ) extends MapDataGenerator[T, K] {
+    override lazy val mapMinSize: Int = tryGetValue(structField.metadata, MAP_MINIMUM_SIZE, 0)
+    override lazy val mapMaxSize: Int = tryGetValue(structField.metadata, MAP_MAXIMUM_SIZE, 5)
+    assert(mapMinSize >= 0, s"mapMinSize has to be greater than or equal to 0, " +
+      s"field-name=${structField.name}, mapMinSize=$mapMinSize")
+    assert(mapMinSize <= mapMaxSize, s"mapMinSize has to be less than or equal to mapMaxSize, " +
+      s"field-name=${structField.name}, mapMinSize=$mapMinSize, mapMaxSize=$mapMaxSize")
+
+    override def keyGenerator: DataGenerator[T] = getGeneratorForDataType(keyDataType).asInstanceOf[DataGenerator[T]]
+
+    override def valueGenerator: DataGenerator[K] = getGeneratorForDataType(valueDataType).asInstanceOf[DataGenerator[K]]
+
+    //generate two arrays, key array and value array, then use map_from_arrays(col(keyArr), col(valueArr))
+    //how to make it empty map when size is 0
+    //note: STR_TO_MAP takes (text, pairDelim, keyValueDelim), so pairs split on ',' and key/value split on '->'
+    override def generateSqlExpression: String = {
+      val keyDataGenerator = getGeneratorForDataType(keyDataType)
+      val valueDataGenerator = getGeneratorForDataType(valueDataType)
+      val keySql = keyDataGenerator.generateSqlExpressionWrapper
+      val valueSql = valueDataGenerator.generateSqlExpressionWrapper
+      s"STR_TO_MAP(CONCAT_WS(',', TRANSFORM(ARRAY_REPEAT(1, CAST($sqlRandom * ${mapMaxSize - mapMinSize} + $mapMinSize AS INT)), i -> CONCAT($keySql, '->', $valueSql))), ',', '->')"
+    }
+  }
+
   class RandomStructTypeDataGenerator(val structField: StructField, val faker: Faker = new Faker()) extends DataGenerator[Row] {
     override def generate: Row = {
       structField.dataType match {
diff --git
a/app/src/main/scala/io/github/datacatering/datacaterer/core/model/confluentschemaregistry/SchemaModels.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/model/confluentschemaregistry/SchemaModels.scala new file mode 100644 index 00000000..301a9555 --- /dev/null +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/model/confluentschemaregistry/SchemaModels.scala @@ -0,0 +1,19 @@ +package io.github.datacatering.datacaterer.core.model.confluentschemaregistry + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties + + +@JsonIgnoreProperties(ignoreUnknown = true) +case class ConfluentSchemaRegistrySubjectVersionResponse( + subject: String, + version: Int, + id: Int, + schemaType: String, + schema: String, + ) + +@JsonIgnoreProperties(ignoreUnknown = true) +case class ConfluentSchemaRegistrySchemaResponse( + schemaType: String, + schema: String, + ) diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/parser/ProtobufParser.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/parser/ProtobufParser.scala new file mode 100644 index 00000000..b9c3efb2 --- /dev/null +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/parser/ProtobufParser.scala @@ -0,0 +1,110 @@ +package io.github.datacatering.datacaterer.core.parser + +import com.google.inject.Guice +import io.github.datacatering.datacaterer.api.model.{BooleanType, ByteType, DataType, DecimalType, DoubleType, Field, FloatType, Generator, IntegerType, LongType, MapType, Schema, StringType, StructType} +import io.protostuff.compiler.ParserModule +import io.protostuff.compiler.model.{Message, ScalarFieldType, UserType} +import io.protostuff.compiler.parser.{Importer, LocalFileReader} + +import java.nio.file.{Files, Path} +import scala.jdk.CollectionConverters.iterableAsScalaIterableConverter + +object ProtobufParser { + + def getSchemaFromProtoFile(filePath: Path, messageName: String): Schema = { + val message = getMessageFromProtoFile(filePath, messageName) + protoMessageToSchema(message) + } + + def getMessageFromProtoString(protoString: String, messageName: String): Schema = { + val tmpFile = Files.createTempFile("proto", s"$messageName.proto") + Files.writeString(tmpFile, protoString) + val message = getMessageFromProtoFile(tmpFile, messageName) + protoMessageToSchema(message) + } + + private def getMessageFromProtoFile(filePath: Path, messageName: String): Message = { + val injector = Guice.createInjector(new ParserModule) + val importer = injector.getInstance(classOf[Importer]) + val protoContext = importer.importFile(new LocalFileReader(filePath.getParent), filePath.getFileName.toString) + + protoContext.getProto.getMessage(messageName) + } + + def protoMessageToSchema(message: Message): Schema = { + if (message.getFields != null && !message.getFields.isEmpty) { + val mappedFields = message.getFields.asScala.map(field => { + val (dataType, opts) = field.getType match { + case fieldType: ScalarFieldType => + val baseType = fieldType match { + case ScalarFieldType.INT32 | ScalarFieldType.SINT32 | + ScalarFieldType.FIXED32 | ScalarFieldType.SFIXED32 => IntegerType + case ScalarFieldType.UINT32 | ScalarFieldType.INT64 | ScalarFieldType.SINT64 | + ScalarFieldType.FIXED64 | ScalarFieldType.SFIXED64 => LongType + case ScalarFieldType.UINT64 => DecimalType + case ScalarFieldType.FLOAT => FloatType + case ScalarFieldType.DOUBLE => DoubleType + case ScalarFieldType.BOOL => BooleanType + case ScalarFieldType.STRING => StringType + case ScalarFieldType.BYTES => ByteType + } 
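+            // For example (a sketch based on the mapping above, not exhaustive): a proto field
+            // `int32 id = 1` maps to IntegerType, `uint64 counter = 2` maps to DecimalType since
+            // uint64 can exceed LongType's range, and `bytes raw = 3` maps to ByteType.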
+
+            (baseType, Map[String, String]())
+          case userType: UserType =>
+            if (userType.isEnum) {
+              val baseEnum = if (userType.isNested) {
+                userType.getParent.getEnum(userType.getName)
+              } else {
+                userType.getProto.getEnum(userType.getName)
+              }
+              val oneOf = baseEnum.getConstantNames.asScala.mkString(",")
+              (StringType, Map("oneOf" -> oneOf))
+            } else if (userType.isMessage && !field.isMap) {
+              val baseMessage = if (userType.isNested) {
+                userType.getParent.getMessage(userType.getName)
+              } else {
+                userType.getProto.getMessage(userType.getName)
+              }
+              val innerSchema = protoMessageToSchema(baseMessage)
+              val innerFields = innerSchema.fields.getOrElse(List())
+                .map(f => f.name -> DataType.fromString(f.`type`.getOrElse(StringType.toString)))
+              (new StructType(innerFields), Map[String, String]())
+            } else if (field.isMap) {
+              val innerMessage = userType.asInstanceOf[Message]
+              val innerSchema = protoMessageToSchema(innerMessage).fields.getOrElse(List())
+              val keyType = innerSchema.find(f => f.name == "key")
+                .map(_.`type`.getOrElse("string"))
+                .getOrElse("string")
+              val valueType = innerSchema.find(f => f.name == "value")
+                .map(_.`type`.getOrElse("string"))
+                .getOrElse("string")
+              (new MapType(DataType.fromString(keyType), DataType.fromString(valueType)), Map[String, String]())
+            } else {
+              (StringType, Map[String, String]())
+            }
+          case _ =>
+            //TODO handle remaining field types (map without a resolved user type, repeated, oneof); fall back to string for now
+            (StringType, Map[String, String]())
+        }
+        Field(
+          field.getName,
+          Some(dataType.toString),
+          Some(Generator(options = opts)),
+          nullable = field.getModifier.name() == "OPTIONAL"
+        )
+      }).toList
+      Schema(Some(mappedFields))
+    } else {
+      Schema(None)
+    }
+  }
+
+}
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala
index ed17f4cb..b700e098 100644
--- a/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala
@@ -5,6 +5,7 @@ import io.github.datacatering.datacaterer.api.model.{Field, MetadataConfig, Step
 import io.github.datacatering.datacaterer.core.exception.UnsupportedDataFormatForTrackingException
 import io.github.datacatering.datacaterer.core.generator.metadata.ExpressionPredictor
 import io.github.datacatering.datacaterer.core.generator.metadata.datasource.DataSourceMetadata
+import io.github.datacatering.datacaterer.core.generator.metadata.datasource.confluentschemaregistry.ConfluentSchemaRegistryMetadata
 import io.github.datacatering.datacaterer.core.generator.metadata.datasource.database.{CassandraMetadata, FieldMetadata, MysqlMetadata, PostgresMetadata}
 import io.github.datacatering.datacaterer.core.generator.metadata.datasource.datacontractcli.DataContractCliDataSourceMetadata
 import io.github.datacatering.datacaterer.core.generator.metadata.datasource.file.FileMetadata
@@ -217,6 +218,7 @@ object MetadataUtil {
       case GREAT_EXPECTATIONS => Some(GreatExpectationsDataSourceMetadata(connectionConfig._1, format, connectionConfig._2))
       case OPEN_DATA_CONTRACT_STANDARD => Some(OpenDataContractStandardDataSourceMetadata(connectionConfig._1, format, connectionConfig._2))
       case DATA_CONTRACT_CLI => Some(DataContractCliDataSourceMetadata(connectionConfig._1, format, connectionConfig._2))
+      case CONFLUENT_SCHEMA_REGISTRY =>
Some(ConfluentSchemaRegistryMetadata(connectionConfig._1, format, connectionConfig._2)) case metadataSourceType => LOGGER.warn(s"Unsupported external metadata source, connection-name=${connectionConfig._1}, metadata-source-type=$metadataSourceType") None diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/util/ProtobufUtil.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/util/ProtobufUtil.scala index cc5d52ee..1eab20cf 100644 --- a/app/src/main/scala/io/github/datacatering/datacaterer/core/util/ProtobufUtil.scala +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/util/ProtobufUtil.scala @@ -1,75 +1,224 @@ package io.github.datacatering.datacaterer.core.util -import com.google.protobuf.DescriptorProtos import com.google.protobuf.DescriptorProtos.FieldDescriptorProto import com.google.protobuf.Descriptors.FieldDescriptor import com.google.protobuf.Descriptors.FieldDescriptor.JavaType -import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType} +import com.google.protobuf.{BoolValue, BytesValue, DescriptorProtos, DoubleValue, FloatValue, Int32Value, Int64Value, StringValue, UInt32Value, UInt64Value, WireFormat} +import io.github.datacatering.datacaterer.api.model.{ArrayType, BinaryType, BooleanType, DataType, DecimalType, DoubleType, Field, FloatType, IntegerType, LongType, MapType, StringType, StructType, TimestampType} +import io.github.datacatering.datacaterer.core.exception.UnsupportedProtobufType +import org.apache.log4j.Logger +import org.apache.spark.sql.protobuf.utils.SchemaConverters +import org.apache.spark.sql.types.{DataTypes, StructField} import java.io.{BufferedInputStream, FileInputStream} import scala.collection.JavaConverters.asScalaBufferConverter object ProtobufUtil { + private val LOGGER = Logger.getLogger(getClass.getName) + def toStructType(descriptorFile: String): Map[String, StructType] = { val file = new BufferedInputStream(new FileInputStream(descriptorFile)) + val descriptorProto = DescriptorProtos.DescriptorProto.parseFrom(file) val fileDescriptorSet = DescriptorProtos.FileDescriptorSet.parseFrom(file) - fileDescriptorSet.getFileList.asScala - .flatMap(fd => { - fd.getMessageTypeList.asScala.toList.map(message => { - (message.getName, StructType(getSchemaFromFieldsProto(message.getFieldList.asScala.toList))) - }) - // (fd.getName, StructType(getSchemaFromFields(fd.getMessageTypeList.asScala.toList))) - }).toMap +// fileDescriptorSet.getFileList.asScala +// .flatMap(fd => { +// fd.getMessageTypeList.asScala.toList.map(message => { +// (message.getName, StructType(getSchemaFromFieldsProto(message.getFieldList.asScala.toList))) +// }) +// // (fd.getName, StructType(getSchemaFromFields(fd.getMessageTypeList.asScala.toList))) +// }).toMap + Map() } - private def getSchemaFromFields(fields: List[FieldDescriptor]): Array[StructField] = { - fields.map(field => { - val dataType = getDataTypeForField(field) - StructField(field.getName, dataType, !field.isRequired) - }).toArray - } +// private def getSchemaFromFields(fields: List[FieldDescriptor]): Array[StructField] = { +// fields.map(field => { +// val dataType = getDataTypeForField(field) +// StructField(field.getName, dataType, !field.isRequired) +// }).toArray +// } +// +// private def getSchemaFromFieldsProto(fields: List[FieldDescriptorProto]): Array[StructField] = { +// fields.map(field => { +// val dataType = getDataTypeForField(field) +// StructField(field.getName, dataType) +// }).toArray +// } +// +// private def 
getDataTypeForField(fieldDescriptor: FieldDescriptor): DataType = { +// fieldDescriptor.getJavaType match { +// case JavaType.BOOLEAN => DataTypes.BooleanType +// case JavaType.INT => DataTypes.IntegerType +// case JavaType.LONG => DataTypes.LongType +// case JavaType.DOUBLE => DataTypes.DoubleType +// case JavaType.FLOAT => DataTypes.FloatType +// case JavaType.STRING => DataTypes.StringType +// case JavaType.ENUM => DataTypes.StringType +// case JavaType.BYTE_STRING => DataTypes.BinaryType +// case JavaType.MESSAGE => +// new StructType(getSchemaFromFields(fieldDescriptor.getMessageType.getFields.asScala.toList)) +// case _ => throw new RuntimeException(s"Unable to parse proto type, type=${fieldDescriptor.getType}") +// } +// } +// +// private def getDataTypeForField(fieldDescriptor: FieldDescriptorProto): DataType = { +// // val nonProtoField = FieldDescriptor.Type.valueOf(fieldDescriptor.getType) +// FieldDescriptor.Type.valueOf(fieldDescriptor.getType).getJavaType match { +// case JavaType.BOOLEAN => DataTypes.BooleanType +// case JavaType.INT => DataTypes.IntegerType +// case JavaType.LONG => DataTypes.LongType +// case JavaType.DOUBLE => DataTypes.DoubleType +// case JavaType.FLOAT => DataTypes.FloatType +// case JavaType.STRING => DataTypes.StringType +// case JavaType.ENUM => DataTypes.StringType +// case JavaType.BYTE_STRING => DataTypes.BinaryType +// case JavaType.MESSAGE => +// new StructType(getSchemaFromFields(fieldDescriptor.getDescriptorForType.getFields.asScala.toList)) +// case _ => throw new RuntimeException(s"Unable to parse proto type, type=${fieldDescriptor}") +// } +// } - private def getSchemaFromFieldsProto(fields: List[FieldDescriptorProto]): Array[StructField] = { - fields.map(field => { - val dataType = getDataTypeForField(field) - StructField(field.getName, dataType) - }).toArray - } + //comes from https://github.com/apache/spark/blob/master/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala#L68 + def fieldFromDescriptor( + fd: FieldDescriptor, + existingRecordNames: Map[String, Int] + ): Option[Field] = { + import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._ - private def getDataTypeForField(fieldDescriptor: FieldDescriptor): DataType = { - fieldDescriptor.getJavaType match { - case JavaType.BOOLEAN => DataTypes.BooleanType - case JavaType.INT => DataTypes.IntegerType - case JavaType.LONG => DataTypes.LongType - case JavaType.DOUBLE => DataTypes.DoubleType - case JavaType.FLOAT => DataTypes.FloatType - case JavaType.STRING => DataTypes.StringType - case JavaType.ENUM => DataTypes.StringType - case JavaType.BYTE_STRING => DataTypes.BinaryType - case JavaType.MESSAGE => { - new StructType(getSchemaFromFields(fieldDescriptor.getMessageType.getFields.asScala.toList)) + val dataType = fd.getJavaType match { + // When the protobuf type is unsigned and upcastUnsignedIntegers has been set, + // use a larger type (LongType and Decimal(20,0) for uint32 and uint64). 
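+      // (Here there is no `upcastUnsignedIntegers` option as in Spark's SchemaConverters;
+      // the upcast below is always applied so generated values cannot overflow the unsigned range.)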
+ case INT => + if (fd.getLiteType == WireFormat.FieldType.UINT32) { + Some(LongType) + } else { + Some(IntegerType) + } + case LONG => if (fd.getLiteType == WireFormat.FieldType.UINT64) { + Some(DecimalType) + } else { + Some(LongType) } - case _ => throw new RuntimeException(s"Unable to parse proto type, type=${fieldDescriptor.getType}") - } - } + case FLOAT => Some(FloatType) + case DOUBLE => Some(DoubleType) + case BOOLEAN => Some(BooleanType) + case STRING => Some(StringType) + case BYTE_STRING => Some(BinaryType) + case ENUM => Some(StringType) + case MESSAGE + if fd.getMessageType.getName == "Duration" && + fd.getMessageType.getFields.size() == 2 && + fd.getMessageType.getFields.get(0).getName.equals("seconds") && + fd.getMessageType.getFields.get(1).getName.equals("nanos") => + LOGGER.warn(s"DateTimeInterval is not a supported data type, field-name=${fd.getFullName}") + None + case MESSAGE + if fd.getMessageType.getName == "Timestamp" && + fd.getMessageType.getFields.size() == 2 && + fd.getMessageType.getFields.get(0).getName.equals("seconds") && + fd.getMessageType.getFields.get(1).getName.equals("nanos") => + Some(TimestampType) + case MESSAGE if fd.getMessageType.getFullName == "google.protobuf.Any" => + Some(StringType) + // Unwrap well known primitive wrapper types if the option has been set. + case MESSAGE if fd.getMessageType.getFullName == BoolValue.getDescriptor.getFullName => + Some(BooleanType) + case MESSAGE if fd.getMessageType.getFullName == Int32Value.getDescriptor.getFullName => + Some(IntegerType) + case MESSAGE if fd.getMessageType.getFullName == UInt32Value.getDescriptor.getFullName => + Some(LongType) + case MESSAGE if fd.getMessageType.getFullName == Int64Value.getDescriptor.getFullName => + Some(LongType) + case MESSAGE if fd.getMessageType.getFullName == UInt64Value.getDescriptor.getFullName => + Some(DecimalType) + case MESSAGE if fd.getMessageType.getFullName == StringValue.getDescriptor.getFullName => + Some(StringType) + case MESSAGE if fd.getMessageType.getFullName == BytesValue.getDescriptor.getFullName => + Some(BinaryType) + case MESSAGE if fd.getMessageType.getFullName == FloatValue.getDescriptor.getFullName => + Some(FloatType) + case MESSAGE if fd.getMessageType.getFullName == DoubleValue.getDescriptor.getFullName => + Some(DoubleType) - private def getDataTypeForField(fieldDescriptor: FieldDescriptorProto): DataType = { - // val nonProtoField = FieldDescriptor.Type.valueOf(fieldDescriptor.getType) - FieldDescriptor.Type.valueOf(fieldDescriptor.getType).getJavaType match { - case JavaType.BOOLEAN => DataTypes.BooleanType - case JavaType.INT => DataTypes.IntegerType - case JavaType.LONG => DataTypes.LongType - case JavaType.DOUBLE => DataTypes.DoubleType - case JavaType.FLOAT => DataTypes.FloatType - case JavaType.STRING => DataTypes.StringType - case JavaType.ENUM => DataTypes.StringType - case JavaType.BYTE_STRING => DataTypes.BinaryType - case JavaType.MESSAGE => { - new StructType(getSchemaFromFields(fieldDescriptor.getDescriptorForType.getFields.asScala.toList)) - } - case _ => throw new RuntimeException(s"Unable to parse proto type, type=${fieldDescriptor}") + case MESSAGE if fd.isRepeated && fd.getMessageType.getOptions.hasMapEntry => + var keyType: Option[DataType] = None + var valueType: Option[DataType] = None + fd.getMessageType.getFields.forEach { field => + field.getName match { + case "key" => + keyType = + fieldFromDescriptor( + field, + existingRecordNames + ).map(f => DataType.fromString(f.`type`.getOrElse("string"))) + case "value" 
=>
+              valueType =
+                fieldFromDescriptor(
+                  field,
+                  existingRecordNames
+                ).map(f => DataType.fromString(f.`type`.getOrElse("string")))
+          }
+        }
+        (keyType, valueType) match {
+          case (None, _) =>
+            // This is probably never expected. Protobuf does not allow complex types for keys.
+            LOGGER.info(s"Dropping map field ${fd.getFullName}. Key reached max recursive depth.")
+            None
+          case (_, None) =>
+            LOGGER.info(s"Dropping map field ${fd.getFullName}. Value reached max recursive depth.")
+            None
+          case (Some(kt), Some(vt)) => Some(new MapType(kt, vt))
+        }
+      case MESSAGE =>
+        // The maximum recursive depth is fixed at 2 here (Spark exposes the equivalent as
+        // `recursive.fields.max.depth`). Setting it to 1 drops all recursive fields, 2 allows
+        // a field to be recursed once, 3 allows it to be recursed twice, and so on. If a
+        // protobuf record has more depth for recursive fields than the allowed value, it will
+        // be truncated and some fields may be discarded.
+        // SQL schema for protobuf `message Person { string name = 1; Person bff = 2;}`
+        // will vary based on the max depth:
+        // 1: struct<name: string>
+        // 2: struct<name: string, bff: struct<name: string>>
+        // 3: struct<name: string, bff: struct<name: string, bff: struct<name: string>>>
+        // and so on.
+        // TODO(rangadi): A better way to terminate would be replace the remaining recursive struct
+        // with the byte array of corresponding protobuf. This way no information is lost.
+        // i.e. with max depth 2, the above looks like this:
+        // struct<name: string, bff: struct<name: string, _serialized_bff: bytes>>
+        val recordName = fd.getMessageType.getFullName
+        val recursiveDepth = existingRecordNames.getOrElse(recordName, 0)
+        val recursiveFieldMaxDepth = 2
+        if (existingRecordNames.contains(recordName) &&
+          recursiveDepth >= recursiveFieldMaxDepth) {
+          // Recursive depth limit is reached. This field is dropped.
+          // If it is inside a container like map or array, the containing field is dropped.
+          LOGGER.info(
+            s"The field ${fd.getFullName} of type $recordName is dropped " +
+              s"at recursive depth $recursiveDepth"
+          )
+          None
+        } else {
+          val newRecordNames = existingRecordNames + (recordName -> (recursiveDepth + 1))
+          val fields = fd.getMessageType.getFields.asScala.flatMap(
+            fieldFromDescriptor(_, newRecordNames)
+          ).toList
+          fields match {
+            case Nil =>
+              LOGGER.info(
+                s"Dropping ${fd.getFullName} as it does not have any fields left " +
+                  "likely due to recursive depth limit."
+ ) + None + case fds => Some(new StructType(fds.map(f => f.name -> DataType.fromString(f.`type`.getOrElse("string"))))) + } + } + case other => throw UnsupportedProtobufType(other.toString) + } + dataType.map { + case dt: MapType => Field(fd.getName, Some(dt.toString)) + case dt if fd.isRepeated => Field(fd.getName, Some(new ArrayType(dt).toString)) + case dt => Field(fd.getName, Some(dt.toString), nullable = !fd.isRequired) } } - } diff --git a/app/src/test/resources/sample/files/protobuf/example.proto b/app/src/test/resources/sample/files/protobuf/example.proto index b1d6af1b..3c31c8a6 100644 --- a/app/src/test/resources/sample/files/protobuf/example.proto +++ b/app/src/test/resources/sample/files/protobuf/example.proto @@ -14,6 +14,9 @@ message Proto3AllTypes { FIRST = 1; SECOND = 2; } + message NestedMessage { + string nst_msg = 1; + } int64 int = 1; string text = 2; @@ -32,4 +35,5 @@ message Proto3AllTypes { string option_b = 12; } map map = 13; + NestedMessage nst_msg_val = 14; } \ No newline at end of file diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/jms/JmsMetadataTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/jms/JmsMetadataTest.scala index ec2fe78a..6243ba0c 100644 --- a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/jms/JmsMetadataTest.scala +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/jms/JmsMetadataTest.scala @@ -1,6 +1,7 @@ package io.github.datacatering.datacaterer.core.generator.metadata.datasource.jms import io.github.datacatering.datacaterer.core.generator.Holder +import io.github.datacatering.datacaterer.core.parser.ProtobufParser import io.github.datacatering.datacaterer.core.util.{ProtobufUtil, SparkSuite} import org.apache.spark.sql.avro.functions.from_avro import org.apache.spark.sql.functions.lit @@ -10,6 +11,7 @@ import org.junit.runner.RunWith import org.scalatestplus.junit.JUnitRunner import java.io.File +import java.nio.file.Paths @RunWith(classOf[JUnitRunner]) class JmsMetadataTest extends SparkSuite { @@ -48,10 +50,16 @@ class JmsMetadataTest extends SparkSuite { protobufData.printSchema() } - ignore("can read all structs from proto descriptor file") { + test("can read all structs from proto descriptor file") { val protoFile = new File("app/src/test/resources/sample/files/protobuf/example.desc").getAbsolutePath val structs = ProtobufUtil.toStructType(protoFile) structs } + test("can read all structs from proto file") { + val protoFile = new File("app/src/test/resources/sample/files/protobuf/example.proto") + val schema = ProtobufParser.getSchemaFromProtoFile(Paths.get(protoFile.toURI), "Proto3AllTypes") + schema + } + } diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala index 1c7d782a..b19a514f 100644 --- a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala @@ -1,6 +1,6 @@ package io.github.datacatering.datacaterer.core.generator.provider -import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, 
DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROUND, ROW_COUNT, STANDARD_DEVIATION} +import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAP_MAXIMUM_SIZE, MAP_MINIMUM_SIZE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROUND, ROW_COUNT, STANDARD_DEVIATION} import io.github.datacatering.datacaterer.core.generator.provider.RandomDataGenerator._ import io.github.datacatering.datacaterer.core.model.Constants.INDEX_INC_COL import org.apache.spark.sql.types._ @@ -356,7 +356,7 @@ class RandomDataGeneratorTest extends AnyFunSuite { val intGenerator = new RandomIntDataGenerator(StructField("random_int", IntegerType, false, metadata)) assert(intGenerator.edgeCases.nonEmpty) - assertResult(s"CAST(ROUND(RANDN() + 0, 0) AS INT)")(intGenerator.generateSqlExpression) + assertResult("CAST(ROUND(RANDN() + 0, 0) AS INT)")(intGenerator.generateSqlExpression) } test("Can create random int generator with exponential distribution") { @@ -364,7 +364,7 @@ class RandomDataGeneratorTest extends AnyFunSuite { val intGenerator = new RandomIntDataGenerator(StructField("random_int", IntegerType, false, metadata)) assert(intGenerator.edgeCases.nonEmpty) - assertResult(s"CAST(ROUND(GREATEST(0, LEAST(100000, 100000 * (-LN(1 - RAND()) / 1.0) + 0)), 0) AS INT)")(intGenerator.generateSqlExpression) + assertResult("CAST(ROUND(GREATEST(0, LEAST(100000, 100000 * (-LN(1 - RAND()) / 1.0) + 0)), 0) AS INT)")(intGenerator.generateSqlExpression) } test("Can create random int generator with exponential distribution within max and min") { @@ -372,7 +372,7 @@ class RandomDataGeneratorTest extends AnyFunSuite { val intGenerator = new RandomIntDataGenerator(StructField("random_int", IntegerType, false, metadata)) assert(intGenerator.edgeCases.nonEmpty) - assertResult(s"CAST(ROUND(GREATEST(10, LEAST(100, 90 * (-LN(1 - RAND()) / 1.0) + 10)), 0) AS INT)")(intGenerator.generateSqlExpression) + assertResult("CAST(ROUND(GREATEST(10, LEAST(100, 90 * (-LN(1 - RAND()) / 1.0) + 10)), 0) AS INT)")(intGenerator.generateSqlExpression) } test("Can create random int generator with exponential distribution with rate parameter") { @@ -380,6 +380,23 @@ class RandomDataGeneratorTest extends AnyFunSuite { val intGenerator = new RandomIntDataGenerator(StructField("random_int", IntegerType, false, metadata)) assert(intGenerator.edgeCases.nonEmpty) - assertResult(s"CAST(ROUND(GREATEST(0, LEAST(100000, 100000 * (-LN(1 - RAND()) / 2.0) + 0)), 0) AS INT)")(intGenerator.generateSqlExpression) + assertResult("CAST(ROUND(GREATEST(0, LEAST(100000, 100000 * (-LN(1 - RAND()) / 2.0) + 0)), 0) AS INT)")(intGenerator.generateSqlExpression) + } + + test("Can create random map generator") { + val metadata = new MetadataBuilder().build() + val mapGenerator = new RandomMapDataGenerator[String, String](StructField("random_map", StringType, false, metadata), StringType, StringType) + + val res = mapGenerator.generate + assert(res.isInstanceOf[Map[String, String]]) + assert(mapGenerator.generateSqlExpression.startsWith("STR_TO_MAP(CONCAT_WS(',', TRANSFORM(ARRAY_REPEAT(1, CAST(RAND() * 5 + 0 AS INT)), i -> CONCAT(")) + } + + test("Can create random map generator with min and max map size") { + 
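// mapMinSize == mapMaxSize pins the generated size, so exactly 3 entries are expected here,
+    // assuming the randomly generated String keys do not collide (duplicates would be merged by toMap).
+   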
val metadata = new MetadataBuilder().putString(MAP_MAXIMUM_SIZE, "3").putString(MAP_MINIMUM_SIZE, "3").build() + val mapGenerator = new RandomMapDataGenerator[String, String](StructField("random_map", StringType, false, metadata), StringType, StringType) + + val res = mapGenerator.generate + assertResult(3)(res.size) } } diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala index e43a44dc..ef21d275 100644 --- a/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala @@ -2,7 +2,7 @@ package io.github.datacatering.datacaterer.core.plan import io.github.datacatering.datacaterer.api.PlanRun import io.github.datacatering.datacaterer.api.model.Constants.{OPEN_METADATA_AUTH_TYPE_OPEN_METADATA, OPEN_METADATA_JWT_TOKEN, OPEN_METADATA_TABLE_FQN, PARTITIONS, ROWS_PER_SECOND, SAVE_MODE, VALIDATION_IDENTIFIER} -import io.github.datacatering.datacaterer.api.model.{ArrayType, DateType, DoubleType, HeaderType, IntegerType, TimestampType} +import io.github.datacatering.datacaterer.api.model.{ArrayType, DateType, DoubleType, HeaderType, IntegerType, MapType, TimestampType} import io.github.datacatering.datacaterer.core.model.Constants.METADATA_FILTER_OUT_SCHEMA import io.github.datacatering.datacaterer.core.util.{ObjectMapperUtil, SparkSuite} import org.asynchttpclient.DefaultAsyncHttpClientConfig @@ -33,6 +33,7 @@ class PlanProcessorTest extends SparkSuite { field.name("balance").`type`(DoubleType).min(10).max(1000).round(2), field.name("date").`type`(DateType).min(Date.valueOf("2022-01-01")), field.name("status").oneOf(accountStatus: _*), + field.name("rand_map").`type`(MapType), field.name("update_history") .`type`(ArrayType) .schema( From 09b044c6ce8b597492a0db40fadab0102b53d012 Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Wed, 25 Dec 2024 19:48:15 +0800 Subject: [PATCH 2/2] Major refactor of plan attribute, add new types of validations, refactor usage of HTTP APIs, remove spray-json --- README.md | 32 +- .../datacaterer/javaapi/api/PlanRun.java | 46 +- .../api/MetadataSourceBuilder.scala | 26 +- .../datacaterer/api/PlanBuilder.scala | 44 +- .../datacaterer/api/PlanRun.scala | 38 +- .../datacaterer/api/SinkOptionsBuilder.scala | 24 +- .../datacaterer/api/TaskBuilder.scala | 393 ++++------ .../datacaterer/api/ValidationBuilder.scala | 686 ++++++++++++------ .../api/connection/ConnectionBuilder.scala | 13 +- .../datacaterer/api/model/Constants.scala | 71 +- .../api/model/MetadataSourceModels.scala | 8 +- .../datacaterer/api/model/PlanModels.scala | 43 +- .../api/model/ValidationModels.scala | 272 ++++++- .../parser/ValidationBuilderSerializer.scala | 74 ++ .../api/parser/ValidationIdResolver.scala | 108 --- .../javaapi/api/DocumentationJavaPlanRun.java | 2 +- .../javaapi/api/ExampleJavaPlanRun.java | 2 +- .../datacaterer/api/ExamplePlanRun.scala | 70 +- .../api/MetadataSourceBuilderTest.scala | 35 +- .../datacaterer/api/PlanBuilderTest.scala | 49 +- .../datacaterer/api/PlanRunTest.scala | 40 +- .../api/SinkOptionsBuilderTest.scala | 11 +- .../datacaterer/api/TasksBuilderTest.scala | 131 ++-- .../ValidationConfigurationBuilderTest.scala | 636 ++++++++++++---- app/build.gradle.kts | 9 +- app/src/main/resources/report/main.css | 2 +- app/src/main/resources/ui/blah.json | 150 ++++ .../main/resources/ui/configuration-data.js | 48 +- 
.../main/resources/ui/helper-configuration.js | 5 +- .../main/resources/ui/helper-foreign-keys.js | 46 +- .../main/resources/ui/helper-generation.js | 44 +- .../main/resources/ui/helper-record-count.js | 179 ++--- .../main/resources/ui/helper-validation.js | 217 +++--- app/src/main/resources/ui/history/history.js | 4 +- app/src/main/resources/ui/index.html | 32 +- app/src/main/resources/ui/index.js | 130 +++- app/src/main/resources/ui/plan/plan.js | 11 +- app/src/main/resources/ui/shared.js | 70 +- .../core/exception/Exceptions.scala | 20 +- .../core/generator/BatchDataProcessor.scala | 6 +- .../core/generator/DataGeneratorFactory.scala | 56 +- .../generator/DataGeneratorProcessor.scala | 4 +- .../delete/DeleteRecordProcessor.scala | 4 +- .../delete/JdbcDeleteRecordService.scala | 4 +- .../metadata/CombinationCalculator.scala | 39 +- .../metadata/ExpressionPredictor.scala | 2 +- .../generator/metadata/PlanGenerator.scala | 6 +- .../generator/metadata/StepNameProvider.scala | 6 +- .../datasource/DataSourceMetadata.scala | 24 +- .../DataSourceMetadataFactory.scala | 63 +- .../ConfluentSchemaRegistryMetadata.scala | 106 +-- .../database/CassandraMetadata.scala | 18 +- .../database/DatabaseMetadata.scala | 34 +- .../datasource/database/MysqlMetadata.scala | 22 +- .../database/PostgresMetadata.scala | 28 +- .../model/GreatExpectationsModels.scala | 248 +++---- .../datasource/http/HttpMetadata.scala | 4 +- .../datasource/http/OpenAPIConverter.scala | 50 +- ...taContractStandardDataSourceMetadata.scala | 7 +- .../OpenDataContractStandardV2Mapper.scala | 36 +- .../OpenDataContractStandardV3Mapper.scala | 4 +- .../OpenDataContractStandardV3Models.scala | 4 +- .../openlineage/OpenLineageMetadata.scala | 6 +- .../OpenMetadataDataSourceMetadata.scala | 16 +- .../OpenMetadataDataValidations.scala | 20 +- .../model/OpenMetadataModels.scala | 116 +-- .../ExpressionValidationPredictionCheck.scala | 8 +- .../PrimaryKeyValidationPredictionCheck.scala | 2 +- .../validation/ValidationPredictor.scala | 4 +- .../generator/provider/DataGenerator.scala | 4 +- .../provider/RandomDataGenerator.scala | 6 +- .../result/DataGenerationResultWriter.scala | 2 +- .../generator/result/ResultHtmlWriter.scala | 15 +- .../track/RecordTrackingProcessor.scala | 8 +- .../datacaterer/core/model/Constants.scala | 40 +- .../core/model/ValidationModels.scala | 4 +- ...onfluentSchemaRegistrySchemaResponse.scala | 10 + .../SchemaModels.scala | 19 - .../model/openlineage/DatasetModels.scala | 2 +- .../datacaterer/core/parser/PlanParser.scala | 72 +- .../core/parser/ProtobufParser.scala | 68 +- .../datacaterer/core/plan/PlanProcessor.scala | 1 + .../datacaterer/core/sink/SinkFactory.scala | 87 ++- .../core/sink/http/HttpSinkProcessor.scala | 24 +- .../core/sink/jms/JmsSinkProcessor.scala | 21 +- .../core/ui/mapper/ConfigurationMapper.scala | 2 +- .../core/ui/mapper/ConnectionMapper.scala | 108 --- .../core/ui/mapper/CountMapper.scala | 34 - .../core/ui/mapper/DateTimeFormat.scala | 22 - .../core/ui/mapper/FieldMapper.scala | 82 --- .../core/ui/mapper/ForeignKeyMapper.scala | 49 -- .../datacaterer/core/ui/mapper/UiMapper.scala | 59 -- .../core/ui/mapper/ValidationMapper.scala | 207 ------ .../core/ui/model/JsonSupport.scala | 51 -- .../datacaterer/core/ui/model/models.scala | 71 +- .../core/ui/plan/ConnectionRepository.scala | 13 +- .../core/ui/plan/PlanRepository.scala | 196 +++-- .../datacaterer/core/ui/plan/PlanRoutes.scala | 35 +- .../core/util/ForeignKeyUtil.scala | 80 +- .../datacaterer/core/util/GeneratorUtil.scala | 36 +- 
.../datacaterer/core/util/MetadataUtil.scala | 82 +-- .../core/util/ObjectMapperUtil.scala | 3 + .../core/util/RecordCountUtil.scala | 4 +- .../datacaterer/core/util/RowUtil.scala | 14 +- .../datacaterer/core/util/SchemaUtil.scala | 279 +++---- .../datacaterer/core/util/TaskHelper.scala | 2 +- .../core/util/UniqueFieldsUtil.scala | 64 +- .../core/util/ValidationUtil.scala | 3 +- .../core/validator/ValidationOperations.scala | 122 +++- .../core/validator/ValidationProcessor.scala | 10 +- .../sample/files/protobuf/example.proto | 10 +- .../sample/files/protobuf/my-import.proto | 7 + .../files/protobuf/register-sample-schemas.sh | 46 ++ .../confluentschemaregistry/get-example.json | 14 + .../get-my-import-by-subject.json | 7 + .../get-my-import.json | 7 + .../marquez/get_dataset_api_response.json | 2 +- .../marquez/list_datasets_api_response.json | 2 +- .../openmetadata/get_table_response.json | 50 +- .../account-balance-transaction-plan.yaml | 11 +- .../sample/plan/account-create-plan-test.yaml | 11 +- .../sample/plan/account-create-plan.yaml | 11 +- .../sample/plan/customer-create-plan.yaml | 6 - .../plan/example-account-create-plan.yaml | 12 +- .../resources/sample/plan/large-plan.yaml | 11 +- .../cassandra/cassandra-customer-task.yaml | 65 +- .../task/file/csv-transaction-task.yaml | 71 +- .../sample/task/file/json-account-task.yaml | 213 +++--- .../sample/task/file/large-csv-task.yaml | 63 +- .../task/file/large-json-account-task.yaml | 150 ++-- .../task/file/parquet-transaction-task.yaml | 60 +- .../sample/task/file/simple-json-task.yaml | 144 ++-- .../sample/task/http/http-account-task.yaml | 109 ++- .../sample/task/jms/jms-account-task.yaml | 95 +-- .../sample/task/kafka/kafka-account-task.yaml | 114 ++- .../postgres-balance-transaction-task.yaml | 59 +- .../task/postgres/postgres-customer-task.yaml | 51 +- .../postgres/postgres-transaction-task.yaml | 19 +- .../sample/validation/all/all-validation.yaml | 54 +- .../validation/json/json-validation.yaml | 2 +- .../{ => simple}/simple-validation.yaml | 9 + .../resources/script/responses/plans.json | 2 +- .../generator/DataGeneratorFactoryTest.scala | 51 +- .../metadata/CombinationCalculatorTest.scala | 15 +- .../metadata/PlanGeneratorTest.scala | 18 +- .../DataSourceMetadataFactoryTest.scala | 18 +- .../ConfluentSchemaRegistryMetadataTest.scala | 114 +++ ...ataContractCliDataSourceMetadataTest.scala | 2 +- ...GreatExpectationsDataValidationsTest.scala | 92 +-- .../http/OpenAPIConverterTest.scala | 70 +- .../datasource/jms/JmsMetadataTest.scala | 11 +- ...ntractStandardDataSourceMetadataTest.scala | 4 +- .../openlineage/OpenLineageMetadataTest.scala | 2 +- .../OpenMetadataDataValidationsTest.scala | 82 +-- .../validation/ValidationPredictorTest.scala | 7 +- .../provider/RandomDataGeneratorTest.scala | 4 +- .../track/RecordTrackingProcessorTest.scala | 21 +- .../model/ForeignKeyRelationHelperTest.scala | 8 +- .../core/model/PlanImplicitsTest.scala | 11 +- .../core/model/TaskHelperTest.scala | 49 +- .../core/model/ValidationOperationsTest.scala | 282 +++++-- .../core/parser/PlanParserTest.scala | 11 +- .../core/parser/ProtobufParserTest.scala | 33 + .../core/plan/ExampleJavaPlanRun.java | 10 +- .../core/plan/PlanProcessorTest.scala | 159 ++-- .../core/sink/jms/JmsSinkProcessorTest.scala | 27 +- .../core/ui/mapper/ConnectionMapperTest.scala | 179 ----- .../core/ui/mapper/CountMapperTest.scala | 82 --- .../core/ui/mapper/FieldMapperTest.scala | 57 -- .../core/ui/mapper/ForeignKeyMapperTest.scala | 29 - .../core/ui/mapper/UiMapperTest.scala | 
27 - .../core/ui/mapper/ValidationMapperTest.scala | 324 --------- .../core/util/ForeignKeyUtilTest.scala | 59 +- .../core/util/MetadataUtilTest.scala | 14 +- .../core/util/RecordCountUtilTest.scala | 12 +- .../core/util/UniqueFieldsUtilTest.scala | 62 +- .../validator/ValidationProcessorTest.scala | 28 +- gradle.properties | 2 +- insta-integration.yaml | 2 +- misc/CHANGELOG.md | 42 ++ workspace.xml | 1 + 181 files changed, 5122 insertions(+), 5165 deletions(-) create mode 100644 api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationBuilderSerializer.scala delete mode 100644 api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationIdResolver.scala create mode 100644 app/src/main/resources/ui/blah.json create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/model/confluentschemaregistry/ConfluentSchemaRegistrySchemaResponse.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/model/confluentschemaregistry/SchemaModels.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapper.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapper.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/DateTimeFormat.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapper.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapper.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapper.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapper.scala delete mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/model/JsonSupport.scala create mode 100644 app/src/test/resources/sample/files/protobuf/my-import.proto create mode 100644 app/src/test/resources/sample/files/protobuf/register-sample-schemas.sh create mode 100644 app/src/test/resources/sample/metadata/confluentschemaregistry/get-example.json create mode 100644 app/src/test/resources/sample/metadata/confluentschemaregistry/get-my-import-by-subject.json create mode 100644 app/src/test/resources/sample/metadata/confluentschemaregistry/get-my-import.json rename app/src/test/resources/sample/validation/{ => simple}/simple-validation.yaml (62%) create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/confluentschemaregistry/ConfluentSchemaRegistryMetadataTest.scala create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/parser/ProtobufParserTest.scala delete mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapperTest.scala delete mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapperTest.scala delete mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapperTest.scala delete mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapperTest.scala delete mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapperTest.scala delete mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapperTest.scala create mode 100644 misc/CHANGELOG.md diff --git a/README.md b/README.md index e20dc73f..cf0e8494 100644 --- a/README.md +++ 
b/README.md @@ -137,14 +137,14 @@ postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") //na ```scala postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") - .schema(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) + .fields(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) ``` ##### [I then want to test my job ingests all the data after generating](https://github.com/data-catering/data-caterer-example/blob/b0f03fb26f185ec8613241205b998aef1d5f5a01/src/main/scala/io/github/datacatering/plan/ValidationPlanRun.scala) ```scala val postgresTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") - .schema(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) + .fields(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) val parquetValidation = parquet("output_parquet", "/data/parquet/customer") .validation(validation.count.isEqual(1000)) @@ -154,12 +154,12 @@ val parquetValidation = parquet("output_parquet", "/data/parquet/customer") ```scala val postgresTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") - .schema(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) + .fields(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) val parquetValidation = parquet("output_parquet", "/data/parquet/customer") .validation( validation.upstreamData(postgresTask) - .joinColumns("account_id") + .joinFields("account_id") .withValidation(validation.count().isEqual(1000)) ) ``` @@ -168,12 +168,12 @@ val parquetValidation = parquet("output_parquet", "/data/parquet/customer") ```scala val postgresTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") - .schema(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) + .fields(field.name("account_id").regex("ACC[0-9]{10}").unique(true)) val parquetValidation = parquet("output_parquet", "/data/parquet/customer") .validation( validation.upstreamData(postgresTask) - .joinColumns("account_id") + .joinFields("account_id") .withValidation(validation.count().isEqual(1000)) ) .validationWait(waitCondition.file("/data/parquet/customer")) @@ -186,18 +186,18 @@ val parquetValidation = parquet("output_parquet", "/data/parquet/customer") ```scala kafka("my_kafka", "localhost:29092") .topic("account-topic") - .schema(...) + .fields(...) ``` ##### [But I want the same `account_id` to show in Postgres and Kafka](https://github.com/data-catering/data-caterer-example/blob/b0f03fb26f185ec8613241205b998aef1d5f5a01/src/main/scala/io/github/datacatering/plan/AdvancedBatchEventPlanRun.scala) ```scala val postgresTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") - .schema(field.name("account_id").regex("ACC[0-9]{10}")) + .fields(field.name("account_id").regex("ACC[0-9]{10}")) val kafkaTask = kafka("my_kafka", "localhost:29092") .topic("account-topic") - .schema(...) + .fields(...) 
plan.addForeignKeyRelationship( postgresTask, List("account_id"), @@ -212,7 +212,7 @@ plan.addForeignKeyRelationship( ```scala postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") .table("account", "transactions") - .count(count.recordsPerColumn(5, "account_id")) + .count(count.recordsPerField(5, "account_id")) ``` ##### [Randomly generate 1 to 5 transactions per `account_id`](https://github.com/data-catering/data-caterer-example/blob/b0f03fb26f185ec8613241205b998aef1d5f5a01/src/main/scala/io/github/datacatering/plan/MultipleRecordsPerColPlan.scala) @@ -220,7 +220,7 @@ postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") ```scala postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") .table("account", "transactions") - .count(count.recordsPerColumnGenerator(generator.min(1).max(5), "account_id")) + .count(count.recordsPerFieldGenerator(generator.min(1).max(5), "account_id")) ``` ##### [I want to delete the generated data](https://github.com/data-catering/data-caterer-example/blob/b0f03fb26f185ec8613241205b998aef1d5f5a01/src/main/scala/io/github/datacatering/plan/AdvancedDeletePlanRun.scala) @@ -228,7 +228,7 @@ postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") ```scala val postgresTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") .table("account", "transactions") - .count(count.recordsPerColumnGenerator(generator.min(0).max(5), "account_id")) + .count(count.recordsPerFieldGenerator(generator.min(0).max(5), "account_id")) val conf = configuration .enableDeleteGeneratedRecords(true) @@ -240,7 +240,7 @@ val conf = configuration ```scala val postgresTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") .table("account", "transactions") - .count(count.recordsPerColumnGenerator(generator.min(0).max(5), "account_id")) + .count(count.recordsPerFieldGenerator(generator.min(0).max(5), "account_id")) val cassandraTxns = cassandra("ingested_data", "localhost:9042") .table("account", "transactions") @@ -260,7 +260,7 @@ val conf = configuration ```scala val postgresTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer") - .count(count.recordsPerColumnGenerator(generator.min(0).max(5), "account_id")) + .count(count.recordsPerFieldGenerator(generator.min(0).max(5), "account_id")) val cassandraTxns = cassandra("ingested_data", "localhost:9042") .table("account", "transactions") @@ -282,14 +282,14 @@ val conf = configuration ```scala parquet("customer_parquet", "/data/parquet/customer") - .schema(metadataSource.openDataContractStandard("/data/odcs/full-example.odcs.yaml")) + .fields(metadataSource.openDataContractStandard("/data/odcs/full-example.odcs.yaml")) ``` ##### [I have an OpenAPI/Swagger doc](https://github.com/data-catering/data-caterer-example/blob/b0f03fb26f185ec8613241205b998aef1d5f5a01/src/main/scala/io/github/datacatering/plan/AdvancedHttpPlanRun.scala) ```scala http("my_http") - .schema(metadataSource.openApi("/data/http/petstore.json")) + .fields(metadataSource.openApi("/data/http/petstore.json")) ``` #### Validate data using validations from metadata source diff --git a/api/src/main/java/io/github/datacatering/datacaterer/javaapi/api/PlanRun.java b/api/src/main/java/io/github/datacatering/datacaterer/javaapi/api/PlanRun.java index db59bee3..5203b8ea 100644 --- a/api/src/main/java/io/github/datacatering/datacaterer/javaapi/api/PlanRun.java +++ 
b/api/src/main/java/io/github/datacatering/datacaterer/javaapi/api/PlanRun.java @@ -2,17 +2,16 @@ import io.github.datacatering.datacaterer.api.BasePlanRun; -import io.github.datacatering.datacaterer.api.ColumnValidationBuilder; import io.github.datacatering.datacaterer.api.CombinationPreFilterBuilder; import io.github.datacatering.datacaterer.api.CountBuilder; import io.github.datacatering.datacaterer.api.DataCatererConfigurationBuilder; import io.github.datacatering.datacaterer.api.DataSourceValidationBuilder; import io.github.datacatering.datacaterer.api.FieldBuilder; +import io.github.datacatering.datacaterer.api.FieldValidationBuilder; import io.github.datacatering.datacaterer.api.GeneratorBuilder; import io.github.datacatering.datacaterer.api.MetadataSourceBuilder; import io.github.datacatering.datacaterer.api.PlanBuilder; import io.github.datacatering.datacaterer.api.PreFilterBuilder; -import io.github.datacatering.datacaterer.api.SchemaBuilder; import io.github.datacatering.datacaterer.api.StepBuilder; import io.github.datacatering.datacaterer.api.TaskBuilder; import io.github.datacatering.datacaterer.api.TaskSummaryBuilder; @@ -98,15 +97,6 @@ public StepBuilder step() { return new StepBuilder(); } - /** - * Creates a SchemaBuilder instance. - * - * @return A SchemaBuilder instance. - */ - public SchemaBuilder schema() { - return new SchemaBuilder(); - } - /** * Creates a FieldBuilder instance. * @@ -172,13 +162,13 @@ public CombinationPreFilterBuilder preFilterBuilder(ValidationBuilder validation } /** - * Creates a ColumnValidationBuilder instance for the specified column. + * Creates a FieldValidationBuilder instance for the specified field. * - * @param column The name of the column. - * @return A ColumnValidationBuilder instance for the specified column. + * @param field The name of the field. + * @return A FieldValidationBuilder instance for the specified field. */ - public ColumnValidationBuilder columnPreFilter(String column) { - return new ValidationBuilder().col(column); + public FieldValidationBuilder fieldPreFilter(String field) { + return new ValidationBuilder().field(field); } /** @@ -209,39 +199,39 @@ public MetadataSourceBuilder metadataSource() { } /** - * Creates a ForeignKeyRelation instance with the provided data source, step, and column. + * Creates a ForeignKeyRelation instance with the provided data source, step, and field. * * @param dataSource The name of the data source. * @param step The step associated with the ForeignKeyRelation. - * @param column The column for the ForeignKeyRelation. + * @param field The field for the ForeignKeyRelation. * @return A ForeignKeyRelation instance. */ - public ForeignKeyRelation foreignField(String dataSource, String step, String column) { - return new ForeignKeyRelation(dataSource, step, column); + public ForeignKeyRelation foreignField(String dataSource, String step, String field) { + return new ForeignKeyRelation(dataSource, step, field); } /** - * Creates a ForeignKeyRelation instance with the provided data source, step, and columns. + * Creates a ForeignKeyRelation instance with the provided data source, step, and fields. * * @param dataSource The name of the data source. * @param step The step associated with the ForeignKeyRelation. - * @param columns The list of columns for the ForeignKeyRelation. + * @param fields The list of fields for the ForeignKeyRelation. * @return A ForeignKeyRelation instance. 
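As a quick sketch of how the renamed, field-based foreign key API reads in practice (the Scala side; data source names, step names and the `account_id` field below are illustrative placeholders, not taken from this patch):

```scala
import io.github.datacatering.datacaterer.api.PlanRun

class ForeignKeyExamplePlan extends PlanRun {
  // hypothetical data sources and step names, for illustration only
  val postgresAccounts = foreignField("customer_postgres", "accounts", List("account_id"))
  val kafkaAccounts = foreignField("my_kafka", "account-topic", List("account_id"))

  // generate matching account_id values in Kafka based on the Postgres records
  val myPlan = plan.addForeignKeyRelationship(postgresAccounts, List(kafkaAccounts), List())
}
```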
*/ - public ForeignKeyRelation foreignField(String dataSource, String step, List columns) { - return new ForeignKeyRelation(dataSource, step, toScalaList(columns)); + public ForeignKeyRelation foreignField(String dataSource, String step, List fields) { + return new ForeignKeyRelation(dataSource, step, toScalaList(fields)); } /** - * Creates a ForeignKeyRelation instance with the provided ConnectionTaskBuilder, step, and columns. + * Creates a ForeignKeyRelation instance with the provided ConnectionTaskBuilder, step, and fields. * * @param connectionTaskBuilder The ConnectionTaskBuilder instance representing the task. * @param step The step associated with the ForeignKeyRelation. - * @param columns The list of columns for the ForeignKeyRelation. + * @param fields The list of fields for the ForeignKeyRelation. * @return A ForeignKeyRelation instance. */ - public ForeignKeyRelation foreignField(ConnectionTaskBuilder connectionTaskBuilder, String step, List columns) { - return new ForeignKeyRelation(connectionTaskBuilder.connectionConfigWithTaskBuilder().dataSourceName(), step, toScalaList(columns)); + public ForeignKeyRelation foreignField(ConnectionTaskBuilder connectionTaskBuilder, String step, List fields) { + return new ForeignKeyRelation(connectionTaskBuilder.connectionConfigWithTaskBuilder().dataSourceName(), step, toScalaList(fields)); } /** diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilder.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilder.scala index 9288f58b..6578d945 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilder.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilder.scala @@ -1,9 +1,9 @@ package io.github.datacatering.datacaterer.api import io.github.datacatering.datacaterer.api.converter.Converters.toScalaMap -import io.github.datacatering.datacaterer.api.model.Constants.{DATA_CONTRACT_FILE, DATA_CONTRACT_SCHEMA, GREAT_EXPECTATIONS_FILE, METADATA_SOURCE_URL, OPEN_LINEAGE_DATASET, OPEN_LINEAGE_NAMESPACE, OPEN_METADATA_API_VERSION, OPEN_METADATA_AUTH_TYPE, OPEN_METADATA_AUTH_TYPE_OPEN_METADATA, OPEN_METADATA_DEFAULT_API_VERSION, OPEN_METADATA_HOST, OPEN_METADATA_JWT_TOKEN, SCHEMA_LOCATION} +import io.github.datacatering.datacaterer.api.model.Constants.{CONFLUENT_SCHEMA_REGISTRY_ID, CONFLUENT_SCHEMA_REGISTRY_SUBJECT, CONFLUENT_SCHEMA_REGISTRY_VERSION, DATA_CONTRACT_FILE, DATA_CONTRACT_SCHEMA, GREAT_EXPECTATIONS_FILE, METADATA_SOURCE_URL, OPEN_LINEAGE_DATASET, OPEN_LINEAGE_NAMESPACE, OPEN_METADATA_API_VERSION, OPEN_METADATA_AUTH_TYPE, OPEN_METADATA_AUTH_TYPE_OPEN_METADATA, OPEN_METADATA_DEFAULT_API_VERSION, OPEN_METADATA_HOST, OPEN_METADATA_JWT_TOKEN, SCHEMA_LOCATION} import com.softwaremill.quicklens.ModifyPimp -import io.github.datacatering.datacaterer.api.model.{DataContractCliSource, GreatExpectationsSource, MarquezMetadataSource, MetadataSource, OpenAPISource, OpenDataContractStandardSource, OpenMetadataSource} +import io.github.datacatering.datacaterer.api.model.{ConfluentSchemaRegistrySource, DataContractCliSource, GreatExpectationsSource, MarquezMetadataSource, MetadataSource, OpenAPISource, OpenDataContractStandardSource, OpenMetadataSource} case class MetadataSourceBuilder(metadataSource: MetadataSource = MarquezMetadataSource()) { def this() = this(MarquezMetadataSource()) @@ -105,4 +105,26 @@ case class MetadataSourceBuilder(metadataSource: MetadataSource = MarquezMetadat DATA_CONTRACT_SCHEMA -> 
modelNames.mkString(",") ))) } + + def confluentSchemaRegistry(url: String, schemaId: Int): MetadataSourceBuilder = { + this.modify(_.metadataSource).setTo(ConfluentSchemaRegistrySource(Map( + METADATA_SOURCE_URL -> url, + CONFLUENT_SCHEMA_REGISTRY_ID -> schemaId.toString + ))) + } + + def confluentSchemaRegistry(url: String, schemaSubject: String): MetadataSourceBuilder = { + this.modify(_.metadataSource).setTo(ConfluentSchemaRegistrySource(Map( + METADATA_SOURCE_URL -> url, + CONFLUENT_SCHEMA_REGISTRY_SUBJECT -> schemaSubject + ))) + } + + def confluentSchemaRegistry(url: String, schemaSubject: String, version: Int): MetadataSourceBuilder = { + this.modify(_.metadataSource).setTo(ConfluentSchemaRegistrySource(Map( + METADATA_SOURCE_URL -> url, + CONFLUENT_SCHEMA_REGISTRY_SUBJECT -> schemaSubject, + CONFLUENT_SCHEMA_REGISTRY_VERSION -> version.toString, + ))) + } } diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanBuilder.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanBuilder.scala index eb49ce2d..401b57bf 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanBuilder.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanBuilder.scala @@ -44,50 +44,50 @@ case class PlanBuilder(plan: Plan = Plan(), tasks: List[TasksBuilder] = List()) def addForeignKeyRelationship(foreignKey: ForeignKeyRelation, generationLinks: List[ForeignKeyRelation], deleteLinks: List[ForeignKeyRelation]): PlanBuilder = this.modify(_.plan.sinkOptions).setTo(Some(getSinkOpt.foreignKey(foreignKey, generationLinks, deleteLinks).sinkOptions)) - def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], columns: List[String], + def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], fields: List[String], generationLinks: List[(ConnectionTaskBuilder[_], List[String])]): PlanBuilder = { - val baseRelation = toForeignKeyRelation(connectionTaskBuilder, columns) + val baseRelation = toForeignKeyRelation(connectionTaskBuilder, fields) val otherRelations = generationLinks.map(r => toForeignKeyRelation(r._1, r._2)) addForeignKeyRelationship(baseRelation, otherRelations: _*) } - def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], columns: List[String], + def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], fields: List[String], generationLinks: List[(ConnectionTaskBuilder[_], List[String])], deleteLinks: List[(ConnectionTaskBuilder[_], List[String])], ): PlanBuilder = { - val baseRelation = toForeignKeyRelation(connectionTaskBuilder, columns) + val baseRelation = toForeignKeyRelation(connectionTaskBuilder, fields) val mappedGeneration = generationLinks.map(r => toForeignKeyRelation(r._1, r._2)) val mappedDelete = deleteLinks.map(r => toForeignKeyRelation(r._1, r._2, true)) addForeignKeyRelationship(baseRelation, mappedGeneration, mappedDelete) } - def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], columns: java.util.List[String], + def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], fields: java.util.List[String], relations: java.util.List[java.util.Map.Entry[ConnectionTaskBuilder[_], java.util.List[String]]]): PlanBuilder = { val scalaListRelations = toScalaList(relations) val mappedRelations = scalaListRelations.map(r => (r.getKey, toScalaList(r.getValue))) - addForeignKeyRelationship(connectionTaskBuilder, toScalaList(columns), mappedRelations) + addForeignKeyRelationship(connectionTaskBuilder, 
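To make the new `confluentSchemaRegistry` overloads above concrete, a minimal sketch of wiring the metadata source into a Kafka task, using the subject-based `String` overload (the registry URL, subject name and topic are assumptions for illustration; version defaults to latest when omitted):

```scala
import io.github.datacatering.datacaterer.api.PlanRun

class SchemaRegistryExamplePlan extends PlanRun {
  // registry URL and subject are placeholders
  val kafkaTask = kafka("my_kafka", "localhost:29092")
    .topic("account-topic")
    .fields(metadataSource.confluentSchemaRegistry("http://localhost:8081", "account-value"))

  execute(kafkaTask)
}
```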
toScalaList(fields), mappedRelations) } - def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], column: String, + def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], field: String, relations: List[(ConnectionTaskBuilder[_], String)]): PlanBuilder = - addForeignKeyRelationship(connectionTaskBuilder, List(column), relations.map(r => (r._1, List(r._2)))) + addForeignKeyRelationship(connectionTaskBuilder, List(field), relations.map(r => (r._1, List(r._2)))) - def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], column: String, + def addForeignKeyRelationship(connectionTaskBuilder: ConnectionTaskBuilder[_], field: String, relations: java.util.List[java.util.Map.Entry[ConnectionTaskBuilder[_], String]]): PlanBuilder = { val scalaListRelations = toScalaList(relations) val mappedRelations = scalaListRelations.map(r => (r.getKey, List(r.getValue))) - addForeignKeyRelationship(connectionTaskBuilder, List(column), mappedRelations) + addForeignKeyRelationship(connectionTaskBuilder, List(field), mappedRelations) } - def addForeignKeyRelationships(connectionTaskBuilder: ConnectionTaskBuilder[_], columns: List[String], + def addForeignKeyRelationships(connectionTaskBuilder: ConnectionTaskBuilder[_], fields: List[String], relations: List[ForeignKeyRelation]): PlanBuilder = { - val baseRelation = toForeignKeyRelation(connectionTaskBuilder, columns) + val baseRelation = toForeignKeyRelation(connectionTaskBuilder, fields) addForeignKeyRelationship(baseRelation, relations: _*) } - def addForeignKeyRelationships(connectionTaskBuilder: ConnectionTaskBuilder[_], columns: java.util.List[String], + def addForeignKeyRelationships(connectionTaskBuilder: ConnectionTaskBuilder[_], fields: java.util.List[String], relations: java.util.List[ForeignKeyRelation]): PlanBuilder = - addForeignKeyRelationships(connectionTaskBuilder, toScalaList(columns), toScalaList(relations)) + addForeignKeyRelationships(connectionTaskBuilder, toScalaList(fields), toScalaList(relations)) def addForeignKeyRelationship(foreignKey: ForeignKeyRelation, relations: List[(ConnectionTaskBuilder[_], List[String])]): PlanBuilder = @@ -97,19 +97,19 @@ case class PlanBuilder(plan: Plan = Plan(), tasks: List[TasksBuilder] = List()) relations: java.util.List[(ConnectionTaskBuilder[_], java.util.List[String])]): PlanBuilder = addForeignKeyRelationship(foreignKey, toScalaList(relations).map(r => toForeignKeyRelation(r._1, toScalaList(r._2))): _*) - private def toForeignKeyRelation(connectionTaskBuilder: ConnectionTaskBuilder[_], columns: List[String], isDeleteFk: Boolean = false) = { + private def toForeignKeyRelation(connectionTaskBuilder: ConnectionTaskBuilder[_], fields: List[String], isDeleteFk: Boolean = false) = { val dataSource = connectionTaskBuilder.connectionConfigWithTaskBuilder.dataSourceName - val colNames = columns.mkString(",") + val fieldNames = fields.mkString(",") connectionTaskBuilder.step match { case Some(value) => - val fields = value.step.schema.fields.getOrElse(List()) - val hasColumns = columns.forall(c => fields.exists(_.name == c)) - if (!hasColumns && !value.step.options.contains(METADATA_SOURCE_TYPE) && !isDeleteFk) { - throw new RuntimeException(s"Column name defined in foreign key relationship does not exist, data-source=$dataSource, column-name=$colNames") + val schemaFields = value.step.fields + val hasFields = fields.forall(c => schemaFields.exists(_.name == c)) + if (!hasFields && !value.step.options.contains(METADATA_SOURCE_TYPE) && !isDeleteFk) 
{ + throw new RuntimeException(s"Field name defined in foreign key relationship does not exist, data-source=$dataSource, field-name=$fieldNames") } - ForeignKeyRelation(dataSource, value.step.name, columns) + ForeignKeyRelation(dataSource, value.step.name, fields) case None => - throw new RuntimeException(s"No schema defined for data source. Cannot create foreign key relationship, data-source=$dataSource, column-name=$colNames") + throw new RuntimeException(s"No fields defined for data source. Cannot create foreign key relationship, data-source=$dataSource, field-name=$fieldNames") } } diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanRun.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanRun.scala index d149f0bf..29e3c3a0 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanRun.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/PlanRun.scala @@ -24,8 +24,6 @@ trait PlanRun { def step: StepBuilder = StepBuilder() - def schema: SchemaBuilder = SchemaBuilder() - def field: FieldBuilder = FieldBuilder() def generator: GeneratorBuilder = GeneratorBuilder() @@ -40,20 +38,20 @@ trait PlanRun { def preFilterBuilder(validationBuilder: ValidationBuilder): CombinationPreFilterBuilder = PreFilterBuilder().filter(validationBuilder) - def columnPreFilter(column: String): ColumnValidationBuilder = ValidationBuilder().col(column) + def fieldPreFilter(field: String): FieldValidationBuilder = ValidationBuilder().field(field) def dataSourceValidation: DataSourceValidationBuilder = DataSourceValidationBuilder() def validationConfig: ValidationConfigurationBuilder = ValidationConfigurationBuilder() - def foreignField(dataSource: String, step: String, column: String): ForeignKeyRelation = - new ForeignKeyRelation(dataSource, step, column) + def foreignField(dataSource: String, step: String, field: String): ForeignKeyRelation = + new ForeignKeyRelation(dataSource, step, field) - def foreignField(dataSource: String, step: String, columns: List[String]): ForeignKeyRelation = - ForeignKeyRelation(dataSource, step, columns) + def foreignField(dataSource: String, step: String, fields: List[String]): ForeignKeyRelation = + ForeignKeyRelation(dataSource, step, fields) - def foreignField(connectionTask: ConnectionTaskBuilder[_], step: String, columns: List[String]): ForeignKeyRelation = - ForeignKeyRelation(connectionTask.connectionConfigWithTaskBuilder.dataSourceName, step, columns) + def foreignField(connectionTask: ConnectionTaskBuilder[_], step: String, fields: List[String]): ForeignKeyRelation = + ForeignKeyRelation(connectionTask.connectionConfigWithTaskBuilder.dataSourceName, step, fields) def metadataSource: MetadataSourceBuilder = MetadataSourceBuilder() @@ -513,17 +511,17 @@ trait PlanRun { private def getValidations(allConnectionTasks: Seq[ConnectionTaskBuilder[_]]) = { val validationsByDataSource = allConnectionTasks.map(x => { - val dataSource = x.connectionConfigWithTaskBuilder.dataSourceName - val optValidation = x.step - .flatMap(_.optValidation) - .map(dsValid => { - DataSourceValidationBuilder() - .options(x.step.map(_.step.options).getOrElse(Map()) ++ x.connectionConfigWithTaskBuilder.options) - .wait(dsValid.dataSourceValidation.waitCondition) - .validations(dsValid.dataSourceValidation.validations: _*) - }) - (dataSource, optValidation) - }) + val dataSource = x.connectionConfigWithTaskBuilder.dataSourceName + val optValidation = x.step + .flatMap(_.optValidation) + .map(dsValid => { + DataSourceValidationBuilder() 
+ .options(x.step.map(_.step.options).getOrElse(Map()) ++ x.connectionConfigWithTaskBuilder.options) + .wait(dsValid.dataSourceValidation.waitCondition) + .validations(dsValid.dataSourceValidation.validations: _*) + }) + (dataSource, optValidation) + }) .filter(_._2.isDefined) .map(ds => (ds._1, validationConfig.addDataSourceValidation(ds._1, ds._2.get))) diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilder.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilder.scala index 9bb110cd..3470b828 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilder.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilder.scala @@ -1,7 +1,7 @@ package io.github.datacatering.datacaterer.api import com.softwaremill.quicklens.ModifyPimp -import io.github.datacatering.datacaterer.api.model.{ForeignKeyRelation, SinkOptions} +import io.github.datacatering.datacaterer.api.model.{ForeignKey, ForeignKeyRelation, SinkOptions} import scala.annotation.varargs @@ -28,9 +28,9 @@ case class SinkOptionsBuilder(sinkOptions: SinkOptions = SinkOptions()) { def locale(locale: String): SinkOptionsBuilder = this.modify(_.sinkOptions.locale).setTo(Some(locale)) /** - * Define a foreign key relationship between columns across any data source for data generation. - * To define which column to use, it is defined by the following:
- * dataSourceName + stepName + columnName + * Define a foreign key relationship between fields across any data source for data generation. + * To define which field to use, it is defined by the following:
+ * dataSourceName + stepName + fieldName * * @param foreignKey Base foreign key * @param generationLinks Foreign key relations for data generation @@ -38,13 +38,13 @@ case class SinkOptionsBuilder(sinkOptions: SinkOptions = SinkOptions()) { * @see Docs for details */ @varargs def foreignKey(foreignKey: ForeignKeyRelation, generationLinks: ForeignKeyRelation*): SinkOptionsBuilder = - this.modify(_.sinkOptions.foreignKeys)(_ ++ List((foreignKey.toString, generationLinks.map(_.toString).toList, List()))) + this.modify(_.sinkOptions.foreignKeys)(_ ++ List(ForeignKey(foreignKey, generationLinks.toList, List()))) /** - * Define a foreign key relationship between columns across any data source for data generation and deletion. * Can be used for data generation and deletion. - * To define which column to use, it is defined by the following:
- * dataSourceName + stepName + columnName + * Define a foreign key relationship between fields across any data source for data generation and deletion. + * To define which field to use, it is defined by the following:
+ * dataSourceName + stepName + fieldName * * @param foreignKey Base foreign key * @param generationLinks Foreign key relations for data generation @@ -53,12 +53,12 @@ case class SinkOptionsBuilder(sinkOptions: SinkOptions = SinkOptions()) { * @see Docs for details */ def foreignKey(foreignKey: ForeignKeyRelation, generationLinks: List[ForeignKeyRelation], deleteLinks: List[ForeignKeyRelation]): SinkOptionsBuilder = - this.modify(_.sinkOptions.foreignKeys)(_ ++ List((foreignKey.toString, generationLinks.map(_.toString), deleteLinks.map(_.toString)))) + this.modify(_.sinkOptions.foreignKeys)(_ ++ List(ForeignKey(foreignKey, generationLinks, deleteLinks))) /** - * Define a foreign key relationship between columns across any data source for data generation. - * To define which column to use, it is defined by the following:
- * dataSourceName + stepName + columnName + * Define a foreign key relationship between fields across any data source for data generation. + * To define which field to use, it is defined by the following:
+ * dataSourceName + stepName + fieldName * * @param foreignKey Base foreign key * @param generationLinks Foreign key relations for data generation diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala index f194145a..86408e60 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala @@ -3,7 +3,7 @@ package io.github.datacatering.datacaterer.api import com.softwaremill.quicklens.ModifyPimp import io.github.datacatering.datacaterer.api.converter.Converters.toScalaMap import io.github.datacatering.datacaterer.api.model.Constants._ -import io.github.datacatering.datacaterer.api.model.{Count, DataType, Field, Generator, PerColumnCount, Schema, Step, StringType, Task, TaskSummary} +import io.github.datacatering.datacaterer.api.model.{Count, DataType, Field, PerFieldCount, Step, StringType, Task, TaskSummary} import scala.annotation.varargs @@ -266,13 +266,13 @@ case class StepBuilder(step: Step = Step(), optValidation: Option[DataSourceVali this.modify(_.step.options)(_ ++ Map(PATH -> path)) /** - * The columns within the generated data to use as partitions for a file data source. - * Order of partition columns defined is used to define order of partitions.
+ * The fields within the generated data to use as partitions for a file data source. + * Order of partition fields defined is used to define order of partitions.
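A small sketch of how `partitionBy` combines with the renamed `fields` API inside a `PlanRun` (the path, field names and bounds are made up; assumes the `api.model` type imports such as `IntegerType`):

```scala
val accountsCsv = csv("accounts_csv", "/opt/app/data/accounts")
  .fields(
    field.name("account_id").regex("ACC[0-9]{8}"),
    field.name("year").`type`(IntegerType).min(2020).max(2024)
  )
  .partitionBy("year", "account_id") // year becomes the top-level partition
```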
* For example, {{{partitionBy("year", "account_id")}}} * will ensure that `year` is used as the top level partition * before `account_id`. * - * @param partitionsBy Partition column names in order + * @param partitionsBy Partition field names in order * @return StepBuilder */ @varargs def partitionBy(partitionsBy: String*): StepBuilder = @@ -314,7 +314,7 @@ case class StepBuilder(step: Step = Step(), optValidation: Option[DataSourceVali /** * Define number of records to be generated. - * If you also have defined a per column count, this value will not represent the full number of records generated. + * If you also have defined a per field count, this value will not represent the full number of records generated. * * @param records Number of records to generate * @return StepBuilder @@ -325,7 +325,7 @@ case class StepBuilder(step: Step = Step(), optValidation: Option[DataSourceVali /** * Define a generator to be used for determining the number of records to generate. - * If you also have defined a per column count, the value generated will be combined with the per column count to + * If you also have defined a per field count, the value generated will be combined with the per field count to * determine the total number of records * * @param generator Generator builder for determining number of records to generate @@ -336,28 +336,17 @@ case class StepBuilder(step: Step = Step(), optValidation: Option[DataSourceVali this.modify(_.step.count).setTo(CountBuilder().generator(generator).count) /** - * Define the number of records to generate based off certain columns.
- * For example, if you had a data set with columns account_id and amount, you can set that 10 records to be generated - * per account_id via {{{.count(new PerColumnCountBuilder().total(10, "account_id")}}}. + * Define the number of records to generate based off certain fields.
+ * For example, if you had a data set with fields account_id and amount, you can set that 10 records to be generated + * per account_id via {{{.count(new PerFieldCountBuilder().total(10, "account_id"))}}}. * The total number of records generated is also influenced by other count configurations. * - * @param perColumnCountBuilder Per column count builder + * @param perFieldCountBuilder Per field count builder * @return StepBuilder * @see Count definition for details */ - def count(perColumnCountBuilder: PerColumnCountBuilder): StepBuilder = - this.modify(_.step.count).setTo(CountBuilder().perColumn(perColumnCountBuilder).count) - - /** - * Schema to use when generating data for data source. - * The schema includes various metadata about each field to guide the data generator on what the data should look - * like. - * - * @param schemaBuilder Schema builder - * @return StepBuilder - */ - def schema(schemaBuilder: SchemaBuilder): StepBuilder = - this.modify(_.step.schema).setTo(schemaBuilder.schema) + def count(perFieldCountBuilder: PerFieldCountBuilder): StepBuilder = + this.modify(_.step.count).setTo(CountBuilder().perField(perFieldCountBuilder).count) /** * Define fields of the schema of the data source to use when generating data. @@ -365,8 +354,8 @@ case class StepBuilder(step: Step = Step(), optValidation: Option[DataSourceVali * @param fields Fields of the schema * @return StepBuilder */ - @varargs def schema(fields: FieldBuilder*): StepBuilder = - this.modify(_.step.schema).setTo(SchemaBuilder().addFields(fields: _*).schema) + @varargs def fields(fields: FieldBuilder*): StepBuilder = + this.modify(_.step.fields).setTo(step.fields ++ fields.map(_.field)) /** * Define data validations once data has been generated. The result of the validations is logged out and included @@ -424,162 +413,131 @@ case class CountBuilder(count: Count = Count()) { * @return the modified count builder */ def generator(generator: GeneratorBuilder): CountBuilder = - this.modify(_.count.generator).setTo(Some(generator.generator)) + this.modify(_.count.options).setTo(generator.options) .modify(_.count.records).setTo(None) /** - * Sets the per-column count for the task builder. + * Sets the per-field count for the task builder. * - * @param perColumnCountBuilder the builder for the per-column count + * @param perFieldCountBuilder the builder for the per-field count * @return the updated task builder */ - def perColumn(perColumnCountBuilder: PerColumnCountBuilder): CountBuilder = - this.modify(_.count.perColumn).setTo(Some(perColumnCountBuilder.perColumnCount)) + def perField(perFieldCountBuilder: PerFieldCountBuilder): CountBuilder = + this.modify(_.count.perField).setTo(Some(perFieldCountBuilder.perFieldCount)) /** - * Sets the number of records per column for the task builder. + * Sets the number of records per field for the task builder.
* - * @param records the number of records per column - * @param cols the column names to apply the records per column setting to + * @param records the number of records per field + * @param fields the field names to apply the records per field setting to * @return the updated task builder */ - @varargs def recordsPerColumn(records: Long, cols: String*): CountBuilder = - this.modify(_.count.perColumn).setTo(Some(perColCount.records(records, cols: _*).perColumnCount)) + @varargs def recordsPerField(records: Long, fields: String*): CountBuilder = + this.modify(_.count.perField).setTo(Some(perFieldCount.records(records, fields: _*).perFieldCount)) /** - * Generates a `CountBuilder` that records the number of records per column. * - * @param generator The `GeneratorBuilder` to use for generating the per-column counts. - * @param cols The column names to generate per-column counts for. - * @return A `CountBuilder` that records the number of records per column. + * Generates a `CountBuilder` that records the number of records per field. * + * @param generator The `GeneratorBuilder` to use for generating the per-field counts. + * @param fields The field names to generate per-field counts for. + * @return A `CountBuilder` that records the number of records per field. */ - @varargs def recordsPerColumnGenerator(generator: GeneratorBuilder, cols: String*): CountBuilder = - this.modify(_.count.perColumn).setTo(Some(perColCount.generator(generator, cols: _*).perColumnCount)) + @varargs def recordsPerFieldGenerator(generator: GeneratorBuilder, fields: String*): CountBuilder = + this.modify(_.count.perField).setTo(Some(perFieldCount.generator(generator, fields: _*).perFieldCount)) /** - * Generates a `CountBuilder` with the specified number of records and a generator for the per-column counts. + * Generates a `CountBuilder` with the specified number of records and a generator for the per-field counts. * * @param records the total number of records to generate - * @param generator the `GeneratorBuilder` to use for generating the per-column counts - * @param cols the names of the columns to generate counts for - * @return a `CountBuilder` with the specified record and per-column count settings + * @param generator the `GeneratorBuilder` to use for generating the per-field counts + * @param fields the names of the fields to generate counts for + * @return a `CountBuilder` with the specified record and per-field count settings */ - @varargs def recordsPerColumnGenerator(records: Long, generator: GeneratorBuilder, cols: String*): CountBuilder = + @varargs def recordsPerFieldGenerator(records: Long, generator: GeneratorBuilder, fields: String*): CountBuilder = this.modify(_.count.records).setTo(Some(records)) - .modify(_.count.perColumn).setTo(Some(perColCount.generator(generator, cols: _*).perColumnCount)) + .modify(_.count.perField).setTo(Some(perFieldCount.generator(generator, fields: _*).perFieldCount)) /** - * Generates a normal distribution of records per column for the specified columns. + * Generates a normal distribution of records per field for the specified fields.
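A short sketch of the per-field count builders after the rename, generating between 1 and 5 transactions per `account_id` (the Postgres connection details and table names are placeholders):

```scala
val transactionsTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer")
  .table("account", "transactions")
  .count(
    count
      .records(1000) // base number of records
      .recordsPerFieldGenerator(generator.min(1).max(5), "account_id")
  )
```

The distribution variants shown above, such as `recordsPerFieldNormalDistribution(1, 10, "account_id")`, slot into the same `.count(...)` call.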
* - * @param min the minimum number of records per column - * @param max the maximum number of records per column - * @param cols the columns to generate the normal distribution for + * @param min the minimum number of records per field + * @param max the maximum number of records per field + * @param fields the fields to generate the normal distribution for * @return a `CountBuilder` instance with the normal distribution configuration applied */ - @varargs def recordsPerColumnNormalDistribution(min: Long, max: Long, cols: String*): CountBuilder = { val generator = GeneratorBuilder().min(min).max(max).normalDistribution() - this.modify(_.count.perColumn).setTo(Some(perColCount.generator(generator, cols: _*).perColumnCount)) + @varargs def recordsPerFieldNormalDistribution(min: Long, max: Long, fields: String*): CountBuilder = { val generator = GeneratorBuilder().min(min).max(max).normalDistribution() + this.modify(_.count.perField).setTo(Some(perFieldCount.generator(generator, fields: _*).perFieldCount)) } /** - * Configures the task builder to generate records per column using an exponential distribution. + * Configures the task builder to generate records per field using an exponential distribution. * - * @param min the minimum number of records per column - * @param max the maximum number of records per column + * @param min the minimum number of records per field + * @param max the maximum number of records per field * @param rateParameter the rate parameter for the exponential distribution - * @param cols the columns to apply the distribution to + * @param fields the fields to apply the distribution to * @return the modified task builder */ - @varargs def recordsPerColumnExponentialDistribution(min: Long, max: Long, rateParameter: Double, cols: String*): CountBuilder = { val generator = GeneratorBuilder().min(min).max(max).exponentialDistribution(rateParameter) - this.modify(_.count.perColumn).setTo(Some(perColCount.generator(generator, cols: _*).perColumnCount)) + @varargs def recordsPerFieldExponentialDistribution(min: Long, max: Long, rateParameter: Double, fields: String*): CountBuilder = { val generator = GeneratorBuilder().min(min).max(max).exponentialDistribution(rateParameter) + this.modify(_.count.perField).setTo(Some(perFieldCount.generator(generator, fields: _*).perFieldCount)) } /** - * Generates a list of records per column using an exponential distribution. + * Generates a list of records per field using an exponential distribution. * * @param rateParameter the rate parameter for the exponential distribution - * @param cols the columns to generate records for + * @param fields the fields to generate records for * @return a [[CountBuilder]] that can be used to build the records */ - @varargs def recordsPerColumnExponentialDistribution(rateParameter: Double, cols: String*): CountBuilder = - recordsPerColumnExponentialDistribution(0, 100, rateParameter, cols: _*) + @varargs def recordsPerFieldExponentialDistribution(rateParameter: Double, fields: String*): CountBuilder = + recordsPerFieldExponentialDistribution(0, 100, rateParameter, fields: _*) - private def perColCount: PerColumnCountBuilder = { - count.perColumn match { - case Some(value) => PerColumnCountBuilder(value) - case None => PerColumnCountBuilder() + private def perFieldCount: PerFieldCountBuilder = { + count.perField match { + case Some(value) => PerFieldCountBuilder(value) + case None => PerFieldCountBuilder() } } } /** - * Define number of records to generate based on certain column values.
This is used in situations where - * you want to generate multiple records for a given set of column values to closer represent the real production + * Define number of records to generate based on certain field values. This is used in situations where + * you want to generate multiple records for a given set of field values to closer represent the real production * data setting. For example, you may have a data set containing bank transactions where you want to generate * multiple transactions per account. */ -case class PerColumnCountBuilder(perColumnCount: PerColumnCount = PerColumnCount()) { +case class PerFieldCountBuilder(perFieldCount: PerFieldCount = PerFieldCount()) { /** - * Define the set of columns that should have multiple records generated for. + * Define the set of fields that should have multiple records generated for. * - * @param cols Column names - * @return PerColumnCountBuilder + * @param fieldNames Field names + * @return PerFieldCountBuilder */ - @varargs def columns(cols: String*): PerColumnCountBuilder = - this.modify(_.perColumnCount.columnNames).setTo(cols.toList) + @varargs def fieldNames(fieldNames: String*): PerFieldCountBuilder = + this.modify(_.perFieldCount.fieldNames).setTo(fieldNames.toList) /** - * Number of records to generate per set of column values defined + * Number of records to generate per set of field values defined * * @param records Number of records - * @param cols Column names - * @return PerColumnCountBuilder + * @param fields Field names + * @return PerFieldCountBuilder */ - @varargs def records(records: Long, cols: String*): PerColumnCountBuilder = - columns(cols: _*).modify(_.perColumnCount.count).setTo(Some(records)) + @varargs def records(records: Long, fields: String*): PerFieldCountBuilder = + fieldNames(fields: _*).modify(_.perFieldCount.count).setTo(Some(records)) /** - * Define a generator to determine the number of records to generate per set of column value defined + * Define a generator to determine the number of records to generate per set of field value defined * * @param generator Generator for number of records - * @param cols Column names - * @return PerColumnCountBuilder - */ - @varargs def generator(generator: GeneratorBuilder, cols: String*): PerColumnCountBuilder = - columns(cols: _*).modify(_.perColumnCount.generator).setTo(Some(generator.generator)) -} - -/** - * Builds a new `Schema` instance with the provided initial state. - * - * @param schema the initial `Schema` instance to use, defaults to a new `Schema` instance - */ -case class SchemaBuilder(schema: Schema = Schema()) { - def this() = this(Schema()) - - /** - * Adds a new field to the schema builder with the specified name and data type. - * - * @param name the name of the field to add - * @param type the data type of the field, defaulting to `StringType` if not provided - * @return the updated schema builder + * @param fields Field names + * @return PerFieldCountBuilder */ - def addField(name: String, `type`: DataType = StringType): SchemaBuilder = - addFields(FieldBuilder().name(name).`type`(`type`)) - - /** - * Adds the specified fields to the schema. - * - * @param fields The fields to add to the schema. - * @return The updated `SchemaBuilder` instance.
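With `SchemaBuilder` removed, nested structures now hang directly off `FieldBuilder.fields`; a minimal sketch (field names, the faker expression and types are illustrative, and `DateType` is assumed imported from the `api.model` package):

```scala
val accountsJson = json("accounts_json", "/opt/app/data/accounts")
  .fields(
    field.name("account_id").regex("ACC[0-9]{8}"),
    field.name("details").fields( // nested object, previously a nested schema
      field.name("name").expression("#{Name.name}"),
      field.name("open_date").`type`(DateType)
    )
  )
```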
- */ @varargs def addFields(fields: FieldBuilder*): SchemaBuilder = - this.modify(_.schema.fields).setTo(schema.fields match { - case Some(value) => Some(value ++ fields.map(_.field)) - case None => Some(fields.map(_.field).toList) - }) + @varargs def generator(generator: GeneratorBuilder, fields: String*): PerFieldCountBuilder = + fieldNames(fields: _*).modify(_.perFieldCount.options).setTo(generator.options) } /** @@ -608,32 +566,14 @@ case class FieldBuilder(field: Field = Field()) { def `type`(`type`: DataType): FieldBuilder = this.modify(_.field.`type`).setTo(Some(`type`.toString)) - /** - * Sets the schema for the current field builder. - * - * @param schema the schema to set for the field - * @return the current field builder instance - */ - def schema(schema: SchemaBuilder): FieldBuilder = - this.modify(_.field.schema).setTo(Some(schema.schema)) - - /** - * Sets the schema for the current field. - * - * @param schema the schema to set for the field - * @return a new `FieldBuilder` instance with the schema set - */ - def schema(schema: Schema): FieldBuilder = - this.modify(_.field.schema).setTo(Some(schema)) - /** * Adds the specified fields to the schema of this `FieldBuilder`. * * @param fields the fields to add to the schema * @return a new `FieldBuilder` with the updated schema */ - @varargs def schema(fields: FieldBuilder*): FieldBuilder = - this.modify(_.field.schema).setTo(Some(getSchema.addFields(fields: _*).schema)) + @varargs def fields(fields: FieldBuilder*): FieldBuilder = + this.modify(_.field.fields).setTo(field.fields ++ fields.map(_.field)) /** * Sets the field generator for the `TaskBuilder` instance, using the options from the provided `MetadataSourceBuilder`. * @param metadataSourceBuilder the `MetadataSourceBuilder` instance to use for the field generator options * @return the updated `FieldBuilder` instance */ - def schema(metadataSourceBuilder: MetadataSourceBuilder): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.options(metadataSourceBuilder.metadataSource.allOptions).generator)) + def fields(metadataSourceBuilder: MetadataSourceBuilder): FieldBuilder = + this.modify(_.field.options).setTo(metadataSourceBuilder.metadataSource.allOptions) /** * Sets whether the field is nullable or not. @@ -660,24 +600,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated field builder */ def generator(generator: GeneratorBuilder): FieldBuilder = - this.modify(_.field.generator).setTo(Some(generator.generator)) - - /** - * Sets the generator for the current field builder. - * - * @param generator the generator to use for the field - * @return the updated field builder - */ - def generator(generator: Generator): FieldBuilder = - this.modify(_.field.generator).setTo(Some(generator)) - - /** - * Sets the field generator to a random generator. - * - * @return a new `FieldBuilder` instance with the field generator set to a random generator - */ - def random: FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.random.generator)) + this.modify(_.field.options).setTo(generator.options) /** * Sets the SQL query to be used for this field.
@@ -686,7 +609,7 @@ case class FieldBuilder(field: Field = Field()) { * @return a new `FieldBuilder` instance with the SQL query set */ def sql(sql: String): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.sql(sql).generator)) + this.modify(_.field.options).setTo(getGenBuilder.sql(sql).options) /** * Sets the regular expression pattern to be used for the field generator. * * @param regex the regular expression pattern to use * @return the updated `FieldBuilder` instance */ def regex(regex: String): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.regex(regex).generator)) + this.modify(_.field.options).setTo(getGenBuilder.regex(regex).options) /** * Builds a field that can take on one of the provided values. * * @param values The values that the field can take on. * @return A FieldBuilder that has been modified to use the provided values. */ @varargs def oneOf(values: Any*): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.oneOf(values: _*).generator)) + this.modify(_.field.options).setTo(getGenBuilder.oneOf(values: _*).options) .modify(_.field.`type`) .setTo( values match { @@ -724,7 +647,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated FieldBuilder instance */ def options(options: Map[String, Any]): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.options(options).generator)) + this.modify(_.field.options).setTo(getGenBuilder.options(options).options) /** * Adds an option to the field generator. * * @param option the key-value pair of the option to add * @return the updated `FieldBuilder` instance */ def option(option: (String, Any)): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.option(option).generator)) + this.modify(_.field.options).setTo(getGenBuilder.option(option).options) /** * Sets the seed for the field generator. * * @param seed the seed value to use for the field generator * @return the updated `FieldBuilder` instance */ - def seed(seed: Long): FieldBuilder = this.modify(_.field.generator).setTo(Some(getGenBuilder.seed(seed).generator)) + def seed(seed: Long): FieldBuilder = this.modify(_.field.options).setTo(getGenBuilder.seed(seed).options) /** * Enables or disables null values for the field. * * @param enable a boolean value indicating whether null values should be enabled or disabled * @return the updated `FieldBuilder` instance */ def enableNull(enable: Boolean): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.enableNull(enable).generator)) + this.modify(_.field.options).setTo(getGenBuilder.enableNull(enable).options) /** * Sets the null probability for the field generator. * * @param probability the probability of generating a null value, between 0.0 and 1.0 * @return The updated `FieldBuilder` instance. */ def nullProbability(probability: Double): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.nullProbability(probability).generator)) + this.modify(_.field.options).setTo(getGenBuilder.nullProbability(probability).options) /** * Enables or disables edge cases for the field generator.
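Since every helper now writes straight into `field.options` instead of a nested `Generator`, chained definitions are unchanged from a user's point of view; a few illustrative field definitions (names and values are made up):

```scala
// roughly 10% of generated values will be null
field.name("status").oneOf("open", "closed", "suspended").nullProbability(0.1)
// nulls enabled with the default probability
field.name("customer_ref").regex("CUS[0-9]{8}").enableNull(true)
```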
@@ -768,7 +691,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def enableEdgeCases(enable: Boolean): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.enableEdgeCases(enable).generator)) + this.modify(_.field.options).setTo(getGenBuilder.enableEdgeCases(enable).options) /** * Sets the edge case probability for the field generator. * * @param probability the probability of generating an edge case value, between 0.0 and 1.0 * @return The updated `FieldBuilder` instance. */ def edgeCaseProbability(probability: Double): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.edgeCaseProbability(probability).generator)) + this.modify(_.field.options).setTo(getGenBuilder.edgeCaseProbability(probability).options) /** * Creates a `FieldBuilder` with a static value generator. * * @param value The static value to set for the field. * @return A `FieldBuilder` with the static value generator set. */ def static(value: Any): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.static(value).generator)) + this.modify(_.field.options).setTo(getGenBuilder.static(value).options) /** * Constructs a `FieldBuilder` with a static value. @@ -803,7 +726,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance. */ def unique(isUnique: Boolean): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.unique(isUnique).generator)) + this.modify(_.field.options).setTo(getGenBuilder.unique(isUnique).options) /** * Sets the field generator to an array type with the specified element type. * * @param `type` the element type of the array * @return a new `FieldBuilder` instance with the array type set */ def arrayType(`type`: String): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.arrayType(`type`).generator)) + this.modify(_.field.options).setTo(getGenBuilder.arrayType(`type`).options) /** * Sets the faker expression for the field generator. * * @param expr the faker expression to use * @return the updated `FieldBuilder` instance */ def expression(expr: String): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.expression(expr).generator)) + this.modify(_.field.options).setTo(getGenBuilder.expression(expr).options) /** * Sets the field generator to use an average length generator with the specified length. * * @param length the average length to use * @return the updated `FieldBuilder` instance */ def avgLength(length: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.avgLength(length).generator)) + this.modify(_.field.options).setTo(getGenBuilder.avgLength(length).options) /** * Sets the minimum value for the field. * * @param min The minimum value to set. * @return The updated `FieldBuilder` instance. */ def min(min: Any): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.min(min).generator)) + this.modify(_.field.options).setTo(getGenBuilder.min(min).options) /** * Sets the minimum length of the field value.
@@ -848,7 +771,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def minLength(length: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.minLength(length).generator)) + this.modify(_.field.options).setTo(getGenBuilder.minLength(length).options) /** * Sets the minimum length for the array generated by this `FieldBuilder`. @@ -857,7 +780,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def arrayMinLength(length: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.arrayMinLength(length).generator)) + this.modify(_.field.options).setTo(getGenBuilder.arrayMinLength(length).options) /** * Sets the maximum value for the field generator. @@ -866,7 +789,7 @@ case class FieldBuilder(field: Field = Field()) { * @return The updated `FieldBuilder` instance. */ def max(max: Any): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.max(max).generator)) + this.modify(_.field.options).setTo(getGenBuilder.max(max).options) /** * Sets the maximum length of the field. @@ -875,7 +798,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def maxLength(length: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.maxLength(length).generator)) + this.modify(_.field.options).setTo(getGenBuilder.maxLength(length).options) /** * Sets the maximum length of the array generated by the field's generator. @@ -884,7 +807,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def arrayMaxLength(length: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.arrayMaxLength(length).generator)) + this.modify(_.field.options).setTo(getGenBuilder.arrayMaxLength(length).options) /** * Sets the numeric precision for the field. @@ -893,7 +816,7 @@ case class FieldBuilder(field: Field = Field()) { * @return The updated `FieldBuilder` instance. */ def numericPrecision(precision: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.numericPrecision(precision).generator)) + this.modify(_.field.options).setTo(getGenBuilder.numericPrecision(precision).options) /** * Sets the numeric scale for the field. @@ -902,7 +825,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def numericScale(scale: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.numericScale(scale).generator)) + this.modify(_.field.options).setTo(getGenBuilder.numericScale(scale).options) /** * Sets the rounding for the field. @@ -911,7 +834,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def round(round: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.round(round).generator)) + this.modify(_.field.options).setTo(getGenBuilder.round(round).options) /** * Sets whether the field should be omitted from the generated output. @@ -920,7 +843,7 @@ case class FieldBuilder(field: Field = Field()) { * @return a new `FieldBuilder` instance with the updated omit setting. */ def omit(omit: Boolean): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.omit(omit).generator)) + this.modify(_.field.options).setTo(getGenBuilder.omit(omit).options) /** * Sets the primary key flag for the current field. 
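A sketch combining the setters above for an intermediate value that feeds other fields but is omitted from the output; field names are illustrative:

{{{
val rawScore = FieldBuilder()
  .name("raw_score")
  .min(0)
  .max(100)
  .round(2)
  .omit(true)

// a derived field referencing the omitted one via SQL
val score = FieldBuilder()
  .name("score")
  .sql("CASE WHEN raw_score > 50 THEN 'pass' ELSE 'fail' END")
}}}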
@@ -929,7 +852,7 @@ case class FieldBuilder(field: Field = Field()) { * @return The updated `FieldBuilder` instance. */ def primaryKey(isPrimaryKey: Boolean): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.primaryKey(isPrimaryKey).generator)) + this.modify(_.field.options).setTo(getGenBuilder.primaryKey(isPrimaryKey).options) /** * Sets the primary key position for the field being built. @@ -938,7 +861,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated FieldBuilder instance */ def primaryKeyPosition(position: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.primaryKeyPosition(position).generator)) + this.modify(_.field.options).setTo(getGenBuilder.primaryKeyPosition(position).options) /** * Sets the clustering position for the field generator. @@ -947,7 +870,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def clusteringPosition(position: Int): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.clusteringPosition(position).generator)) + this.modify(_.field.options).setTo(getGenBuilder.clusteringPosition(position).options) /** * Sets the standard deviation of the field generator. @@ -956,7 +879,7 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def standardDeviation(stddev: Double): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.standardDeviation(stddev).generator)) + this.modify(_.field.options).setTo(getGenBuilder.standardDeviation(stddev).options) /** * Sets the mean value for the field generator. @@ -965,48 +888,28 @@ case class FieldBuilder(field: Field = Field()) { * @return the updated `FieldBuilder` instance */ def mean(mean: Double): FieldBuilder = - this.modify(_.field.generator).setTo(Some(getGenBuilder.mean(mean).generator)) + this.modify(_.field.options).setTo(getGenBuilder.mean(mean).options) private def getGenBuilder: GeneratorBuilder = { - field.generator match { - case Some(gen) => GeneratorBuilder(gen) - case None => GeneratorBuilder() - } - } - - private def getSchema: SchemaBuilder = { - field.schema match { - case Some(schema) => SchemaBuilder(schema) - case None => SchemaBuilder() - } + GeneratorBuilder(field.options) } } /** * Data generator contains all the metadata, related to either a field or count generation, required to create new data. */ -case class GeneratorBuilder(generator: Generator = Generator()) { - def this() = this(Generator()) - - /** - * Create a random data generator. Depending on the data type, particular defaults are set for the metadata - * - * @return GeneratorBuilder GeneratorBuilder - * @see Data generator default details here - */ - def random: GeneratorBuilder = - this.modify(_.generator.`type`).setTo(RANDOM_GENERATOR) +case class GeneratorBuilder(options: Map[String, Any] = Map()) { + def this() = this(Map()) /** - * Create a SQL based generator. You can reference other columns and SQL functions to generate data. The output data + * Create a SQL based generator. You can reference other fields and SQL functions to generate data. 
The output data * type from the SQL expression should also match the data type defined otherwise a runtime error will be thrown * * @param sql SQL expression * @return GeneratorBuilder */ def sql(sql: String): GeneratorBuilder = - this.modify(_.generator.`type`).setTo(SQL_GENERATOR) - .modify(_.generator.options)(_ ++ Map(SQL_GENERATOR -> sql)) + this.modify(_.options)(_ ++ Map(SQL_GENERATOR -> sql)) /** * Create a generator based on a particular regex @@ -1015,8 +918,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def regex(regex: String): GeneratorBuilder = - this.modify(_.generator.`type`).setTo(REGEX_GENERATOR) - .modify(_.generator.options)(_ ++ Map(REGEX_GENERATOR -> regex)) + this.modify(_.options)(_ ++ Map(REGEX_GENERATOR -> regex)) /** * Create a generator that can only generate values from a set of values defined. @@ -1024,8 +926,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @param values Set of valid values * @return GeneratorBuilder */ - @varargs def oneOf(values: Any*): GeneratorBuilder = this.modify(_.generator.`type`).setTo(ONE_OF_GENERATOR) - .modify(_.generator.options)(_ ++ Map(ONE_OF_GENERATOR -> values)) + @varargs def oneOf(values: Any*): GeneratorBuilder = this.modify(_.options)(_ ++ Map(ONE_OF_GENERATOR -> values)) /** * Define metadata map for your generator. Add/overwrites existing metadata @@ -1034,7 +935,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def options(options: Map[String, Any]): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ options) + this.modify(_.options)(_ ++ options) /** * Wrapper for Java Map @@ -1052,7 +953,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def option(option: (String, Any)): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(option)) + this.modify(_.options)(_ ++ Map(option)) /** * Seed to use for random generator. If you want to generate a consistent set of values, use this method @@ -1061,7 +962,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def seed(seed: Long): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(RANDOM_SEED -> seed.toString)) + this.modify(_.options)(_ ++ Map(RANDOM_SEED -> seed.toString)) /** * Enable/disable null values to be generated for this field @@ -1070,7 +971,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def enableNull(enable: Boolean): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(ENABLED_NULL -> enable.toString)) + this.modify(_.options)(_ ++ Map(ENABLED_NULL -> enable.toString)) /** * If [[enableNull]] is enabled, the generator will generate null values with the probability defined. @@ -1080,7 +981,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def nullProbability(probability: Double): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(PROBABILITY_OF_NULL -> probability.toString)) + this.modify(_.options)(_ ++ Map(PROBABILITY_OF_NULL -> probability.toString)) /** * Enable/disable edge case values to be generated. The edge cases are based on the data type defined. 
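With this refactor GeneratorBuilder becomes a thin immutable wrapper over Map[String, Any]; a minimal sketch of the methods above, with option keys coming from Constants:

{{{
val gen = GeneratorBuilder()
  .sql("CASE WHEN balance > 0 THEN 'open' ELSE 'closed' END")
  .seed(42)
  .enableNull(true)
  .nullProbability(0.05)
// gen.options now carries the SQL expression plus the seed and null entries,
// all non-string values stored as strings via toString
}}}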
@@ -1090,7 +991,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @see Generator details here */ def enableEdgeCases(enable: Boolean): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(ENABLED_EDGE_CASE -> enable.toString)) + this.modify(_.options)(_ ++ Map(ENABLED_EDGE_CASE -> enable.toString)) /** @@ -1101,7 +1002,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def edgeCaseProbability(probability: Double): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(PROBABILITY_OF_EDGE_CASE -> probability.toString)) + this.modify(_.options)(_ ++ Map(PROBABILITY_OF_EDGE_CASE -> probability.toString)) /** * Generator will always give back the static value, ignoring all other metadata defined @@ -1110,7 +1011,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def static(value: Any): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(STATIC -> value.toString)) + this.modify(_.options)(_ ++ Map(STATIC -> value.toString)) /** * Wrapper for Java given `static` is a keyword @@ -1130,7 +1031,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def unique(isUnique: Boolean): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(IS_UNIQUE -> isUnique.toString)) + this.modify(_.options)(_ ++ Map(IS_UNIQUE -> isUnique.toString)) /** * If data type is array, define the inner data type of the array @@ -1139,7 +1040,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def arrayType(`type`: String): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(ARRAY_TYPE -> `type`)) + this.modify(_.options)(_ ++ Map(ARRAY_TYPE -> `type`)) /** * Use a DataFaker expression to generate data. If you want to know what is possible to use as an expression, follow @@ -1150,7 +1051,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @see Expression details */ def expression(expr: String): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(EXPRESSION -> expr)) + this.modify(_.options)(_ ++ Map(EXPRESSION -> expr)) /** * Average length of data generated. Length is specifically used for String data type and is ignored for other data types @@ -1159,7 +1060,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def avgLength(length: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(AVERAGE_LENGTH -> length.toString)) + this.modify(_.options)(_ ++ Map(AVERAGE_LENGTH -> length.toString)) /** * Minimum value to be generated. This can be used for any data type except for Struct and Array. @@ -1168,7 +1069,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def min(min: Any): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(MINIMUM -> min.toString)) + this.modify(_.options)(_ ++ Map(MINIMUM -> min.toString)) /** * Minimum length of data generated. Length is specifically used for String data type and is ignored for other data types @@ -1177,7 +1078,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def minLength(length: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(MINIMUM_LENGTH -> length.toString)) + this.modify(_.options)(_ ++ Map(MINIMUM_LENGTH -> length.toString)) /** * Minimum length of array generated. 
Only used when data type is Array @@ -1186,17 +1087,17 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def arrayMinLength(length: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(ARRAY_MINIMUM_LENGTH -> length.toString)) + this.modify(_.options)(_ ++ Map(ARRAY_MINIMUM_LENGTH -> length.toString)) /** * Maximum value to be generated. This can be used for any data type except for Struct and Array. Can be ignored in - * scenario where database column is auto increment where values generated start from the max value. + * scenarios where the database field is auto-increment, in which case generated values start from the max value. * * @param max Maximum value * @return GeneratorBuilder */ def max(max: Any): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(MAXIMUM -> max.toString)) + this.modify(_.options)(_ ++ Map(MAXIMUM -> max.toString)) /** * Maximum length of data generated. Length is specifically used for String data type and is ignored for other data types @@ -1205,7 +1106,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def maxLength(length: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(MAXIMUM_LENGTH -> length.toString)) + this.modify(_.options)(_ ++ Map(MAXIMUM_LENGTH -> length.toString)) /** * Maximum length of array generated. Only used when data type is Array @@ -1214,7 +1115,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def arrayMaxLength(length: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(ARRAY_MAXIMUM_LENGTH -> length.toString)) + this.modify(_.options)(_ ++ Map(ARRAY_MAXIMUM_LENGTH -> length.toString)) /** * Numeric precision used for Decimal data type @@ -1223,7 +1124,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def numericPrecision(precision: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(NUMERIC_PRECISION -> precision.toString)) + this.modify(_.options)(_ ++ Map(NUMERIC_PRECISION -> precision.toString)) /** * Numeric scale for Decimal data type @@ -1232,7 +1133,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def numericScale(scale: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(NUMERIC_SCALE -> scale.toString)) + this.modify(_.options)(_ ++ Map(NUMERIC_SCALE -> scale.toString)) /** * Rounding to decimal places for numeric data types @@ -1241,17 +1142,17 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def round(round: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(ROUND -> round.toString)) + this.modify(_.options)(_ ++ Map(ROUND -> round.toString)) /** * Enable/disable including the value in the final output to the data source. Allows you to define intermediate values - * that can be used to generate other columns + * that can be used to generate other fields * * @param omit Enable/disable the value being in output to data source * @return GeneratorBuilder */ def omit(omit: Boolean): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(OMIT -> omit.toString)) + this.modify(_.options)(_ ++ Map(OMIT -> omit.toString)) /** * Field is a primary key of the data source. 
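A sketch of a Decimal-oriented option set using the setters above; the precision, scale and rounding values are illustrative:

{{{
val decimalGen = GeneratorBuilder()
  .min(0)
  .max(999999)
  .numericPrecision(10)
  .numericScale(2)
  .round(2)
}}}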
@@ -1260,7 +1161,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def primaryKey(isPrimaryKey: Boolean): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(IS_PRIMARY_KEY -> isPrimaryKey.toString)) + this.modify(_.options)(_ ++ Map(IS_PRIMARY_KEY -> isPrimaryKey.toString)) /** * If [[primaryKey]] is enabled, this defines the position of the primary key. Starts at 1. @@ -1269,7 +1170,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def primaryKeyPosition(position: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(PRIMARY_KEY_POSITION -> position.toString)) + this.modify(_.options)(_ ++ Map(PRIMARY_KEY_POSITION -> position.toString)) /** * If the data source supports clustering order (like Cassandra), this represents the order of the clustering key. @@ -1279,7 +1180,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def clusteringPosition(position: Int): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(CLUSTERING_POSITION -> position.toString)) + this.modify(_.options)(_ ++ Map(CLUSTERING_POSITION -> position.toString)) /** * The standard deviation of the data if it follows a normal distribution. @@ -1288,7 +1189,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def standardDeviation(stddev: Double): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(STANDARD_DEVIATION -> stddev.toString)) + this.modify(_.options)(_ ++ Map(STANDARD_DEVIATION -> stddev.toString)) /** * The mean of the data if it follows a normal distribution. @@ -1297,7 +1198,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def mean(mean: Double): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(MEAN -> mean.toString)) + this.modify(_.options)(_ ++ Map(MEAN -> mean.toString)) /** * The distribution of the data is exponential. @@ -1306,7 +1207,7 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def exponentialDistribution(rate: Double): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(DISTRIBUTION -> DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_RATE_PARAMETER -> rate.toString)) + this.modify(_.options)(_ ++ Map(DISTRIBUTION -> DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_RATE_PARAMETER -> rate.toString)) /** * The distribution of the data is normal. 
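A sketch of the distribution options above; the mean, standard deviation and rate values are illustrative:

{{{
// mean and standard deviation of the data, used if it follows a normal distribution
val normalGen = GeneratorBuilder().mean(50.0).standardDeviation(5.0)

// values following an exponential distribution with rate parameter 1.5
val exponentialGen = GeneratorBuilder().exponentialDistribution(1.5)
}}}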
@@ -1314,5 +1215,5 @@ case class GeneratorBuilder(generator: Generator = Generator()) { * @return GeneratorBuilder */ def normalDistribution(): GeneratorBuilder = - this.modify(_.generator.options)(_ ++ Map(DISTRIBUTION -> DISTRIBUTION_NORMAL)) + this.modify(_.options)(_ ++ Map(DISTRIBUTION -> DISTRIBUTION_NORMAL)) } \ No newline at end of file diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala index 4f5cfb9b..8b7ee24f 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala @@ -2,11 +2,11 @@ package io.github.datacatering.datacaterer.api import com.fasterxml.jackson.databind.annotation.JsonSerialize import com.softwaremill.quicklens.ModifyPimp -import io.github.datacatering.datacaterer.api.ValidationHelper.cleanColumnName +import io.github.datacatering.datacaterer.api.ValidationHelper.cleanFieldName import io.github.datacatering.datacaterer.api.connection.{ConnectionTaskBuilder, FileBuilder} import io.github.datacatering.datacaterer.api.model.ConditionType.ConditionType -import io.github.datacatering.datacaterer.api.model.Constants.{AGGREGATION_AVG, AGGREGATION_COUNT, AGGREGATION_MAX, AGGREGATION_MIN, AGGREGATION_STDDEV, AGGREGATION_SUM, DEFAULT_VALIDATION_JOIN_TYPE, DEFAULT_VALIDATION_WEBHOOK_HTTP_DATA_SOURCE_NAME, VALIDATION_COLUMN_NAME_COUNT_BETWEEN, VALIDATION_COLUMN_NAME_COUNT_EQUAL, VALIDATION_COLUMN_NAME_MATCH_ORDER, VALIDATION_COLUMN_NAME_MATCH_SET, VALIDATION_PREFIX_JOIN_EXPRESSION, VALIDATION_UNIQUE} -import io.github.datacatering.datacaterer.api.model.{ColumnNamesValidation, ConditionType, DataExistsWaitCondition, DataSourceValidation, ExpressionValidation, FileExistsWaitCondition, GroupByValidation, PauseWaitCondition, UpstreamDataSourceValidation, Validation, ValidationConfiguration, WaitCondition, WebhookWaitCondition} +import io.github.datacatering.datacaterer.api.model.Constants.{AGGREGATION_AVG, AGGREGATION_COUNT, AGGREGATION_MAX, AGGREGATION_MIN, AGGREGATION_STDDEV, AGGREGATION_SUM, DEFAULT_VALIDATION_JOIN_TYPE, DEFAULT_VALIDATION_WEBHOOK_HTTP_DATA_SOURCE_NAME, VALIDATION_FIELD_NAME_COUNT_BETWEEN, VALIDATION_FIELD_NAME_COUNT_EQUAL, VALIDATION_FIELD_NAME_MATCH_ORDER, VALIDATION_FIELD_NAME_MATCH_SET, VALIDATION_PREFIX_JOIN_EXPRESSION, VALIDATION_UNIQUE} +import io.github.datacatering.datacaterer.api.model.{ConditionType, DataExistsWaitCondition, DataSourceValidation, ExpressionValidation, FieldNamesValidation, FileExistsWaitCondition, GroupByValidation, PauseWaitCondition, UpstreamDataSourceValidation, Validation, ValidationConfiguration, WaitCondition, WebhookWaitCondition} import io.github.datacatering.datacaterer.api.parser.ValidationBuilderSerializer import java.sql.{Date, Timestamp} @@ -118,7 +118,7 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation(), op /** * SQL expression used to check if data is adhering to specified condition. Return result from SQL expression is - * required to be boolean. Can use any columns in the validation logic. + * required to be boolean. Can use any fields in the validation logic. 
* * For example, * {{{validation.expr("CASE WHEN status == 'open' THEN balance > 0 ELSE balance == 0 END")}}} * * @param expr SQL expression * @return ValidationBuilder */ def expr(expr: String): ValidationBuilder = { validation match { - case GroupByValidation(grpCols, aggCol, aggType, _) => - val grpWithExpr = GroupByValidation(grpCols, aggCol, aggType, expr) + case GroupByValidation(grpFields, aggField, aggType, _, _) => + val grpWithExpr = GroupByValidation(grpFields, aggField, aggType, expr) copyWithDescAndThreshold(grpWithExpr) case expressionValidation: ExpressionValidation => val withExpr = expressionValidation.modify(_.expr).setTo(expr) @@ -140,7 +140,7 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation(), op } /** - * SQL expression used to apply to columns before running validations. + * SQL expression to apply to fields before running validations. * * For example, * {{{validation.selectExpr("PERCENTILE(amount, 0.5) AS median_amount", "*")}}} @@ -159,42 +159,42 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation(), op } /** - * Define a column validation that can cover validations for any type of data. + * Define a field validation that can cover validations for any type of data. * - * @param column Name of the column to run validation against - * @return ColumnValidationBuilder + * @param field Name of the field to run validation against + * @return FieldValidationBuilder */ - def col(column: String): ColumnValidationBuilder = { - ColumnValidationBuilder(this, cleanColumnName(column)) + def field(field: String): FieldValidationBuilder = { + FieldValidationBuilder(this, cleanFieldName(field)) } /** - * Define columns to group by, so that validation can be run on grouped by dataset + * Define fields to group by, so that validation can be run on grouped by dataset * - * @param columns Name of the column to run validation against - * @return ColumnValidationBuilder + * @param fields Names of the fields to group by + * @return GroupByValidationBuilder */ - @varargs def groupBy(columns: String*): GroupByValidationBuilder = { - GroupByValidationBuilder(this, columns) + @varargs def groupBy(fields: String*): GroupByValidationBuilder = { + GroupByValidationBuilder(this, fields) } /** * Check row count of dataset * - * @return ColumnValidationBuilder to apply validation on row count + * @return FieldValidationBuilder to apply validation on row count */ - def count(): ColumnValidationBuilder = { + def count(): FieldValidationBuilder = { GroupByValidationBuilder().count() } /** - * Check if column(s) values are unique + * Check if field(s) values are unique * - * @param columns One or more columns whose values will be checked for uniqueness + * @param fields One or more fields whose values will be checked for uniqueness * @return ValidationBuilder */ - @varargs def unique(columns: String*): ValidationBuilder = { - this.modify(_.validation).setTo(GroupByValidation(columns, VALIDATION_UNIQUE, AGGREGATION_COUNT)) + @varargs def unique(fields: String*): ValidationBuilder = { + this.modify(_.validation).setTo(GroupByValidation(fields, VALIDATION_UNIQUE, AGGREGATION_COUNT)) .expr("count == 1") } @@ -205,16 +205,16 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation(), op * @return UpstreamDataSourceValidationBuilder */ def upstreamData(connectionTaskBuilder: ConnectionTaskBuilder[_]): UpstreamDataSourceValidationBuilder = { - 
UpstreamDataSourceValidationBuilder(this, connectionTaskBuilder) + UpstreamDataSourceValidationBuilder(List(this), connectionTaskBuilder) } /** - * Define validation for column names of dataset. + * Define validation for field names of dataset. * - * @return ColumnNamesValidationBuilder + * @return FieldNamesValidationBuilder */ - def columnNames: ColumnNamesValidationBuilder = { - ColumnNamesValidationBuilder() + def fieldNames: FieldNamesValidationBuilder = { + FieldNamesValidationBuilder() } def preFilter(combinationPreFilterBuilder: CombinationPreFilterBuilder): ValidationBuilder = { @@ -233,372 +233,575 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation(), op } } -case class ColumnValidationBuilder(validationBuilder: ValidationBuilder = ValidationBuilder(), column: String = "") { +case class FieldValidationBuilder(validationBuilder: ValidationBuilder = ValidationBuilder(), field: String = "") { def this() = this(ValidationBuilder(), "") /** - * Check if column values are equal to a certain value + * Check if field values are equal to a certain value * - * @param value Expected value for all column values + * @param value Expected value for all field values + * @param negate Check if not equal to when set to true * @return */ - def isEqual(value: Any): ValidationBuilder = { - validationBuilder.expr(s"$column == ${colValueToString(value)}") + def isEqual(value: Any, negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!=" else "==" + validationBuilder.expr(s"$field $sign ${fieldValueToString(value)}") } /** - * Check if column values are equal to another column for each record + * Check if field values are equal to another field for each record * - * @param value Other column name + * @param value Other field name + * @param negate Check if not equal to field when set to true * @return */ - def isEqualCol(value: String): ValidationBuilder = { - validationBuilder.expr(s"$column == $value") + def isEqualField(value: String, negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!=" else "==" + validationBuilder.expr(s"$field $sign $value") } /** - * Check if column values are not equal to a certain value + * Check if field values are null * - * @param value Value column should not equal to + * @param negate Check if not null when set to true * @return */ - def isNotEqual(value: Any): ValidationBuilder = { - validationBuilder.expr(s"$column != ${colValueToString(value)}") + def isNull(negate: Boolean = false): ValidationBuilder = { + val nullExpr = if (negate) "ISNOTNULL" else "ISNULL" + validationBuilder.expr(s"$nullExpr($field)") } /** - * Check if column values are not equal to another column's value for each record + * Check if field values contain particular string (only for string type fields) * - * @param value Other column name not equal to + * @param value Expected string that field values contain + * @param negate Check if not contains when set to true * @return */ - def isNotEqualCol(value: String): ValidationBuilder = { - validationBuilder.expr(s"$column != $value") + def contains(value: String, negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!" 
else "" + validationBuilder.expr(s"${sign}CONTAINS($field, '$value')") } /** - * Check if column values are null + * Check if field values are less than certain value * + * @param value Less than value + * @param strictly Check strictly less than when set to true, less than or equal to when set to false * @return */ - def isNull: ValidationBuilder = { - validationBuilder.expr(s"ISNULL($column)") + def lessThan(value: Any, strictly: Boolean = true): ValidationBuilder = { + val sign = if (strictly) "<" else "<=" + validationBuilder.expr(s"$field $sign ${fieldValueToString(value)}") } /** - * Check if column values are not null + * Check if field values are less than another field's values for each record * + * @param value Other field name + * @param strictly Check strictly less than field when set to true, less than or equal to when set to false * @return */ - def isNotNull: ValidationBuilder = { - validationBuilder.expr(s"ISNOTNULL($column)") + def lessThanField(value: String, strictly: Boolean = true): ValidationBuilder = { + val sign = if (strictly) "<" else "<=" + validationBuilder.expr(s"$field $sign $value") } /** - * Check if column values contain particular string (only for string type columns) + * Check if field is greater than a certain value * - * @param value Expected string that column values contain + * @param value Greater than value + * @param strictly Check strictly greater than when set to true, greater than or equal to when set to false * @return */ - def contains(value: String): ValidationBuilder = { - validationBuilder.expr(s"CONTAINS($column, '$value')") + def greaterThan(value: Any, strictly: Boolean = true): ValidationBuilder = { + val sign = if (strictly) ">" else ">=" + validationBuilder.expr(s"$field $sign ${fieldValueToString(value)}") } /** - * Check if column values do not contain particular string (only for string type columns) + * Check if field is greater than another field's values for each record * - * @param value String value not expected to contain in column values + * @param value Other field name + * @param strictly Check strictly greater than field when set to true, greater than or equal to when set to false * @return */ - def notContains(value: String): ValidationBuilder = { - validationBuilder.expr(s"!CONTAINS($column, '$value')") + def greaterThanField(value: String, strictly: Boolean = true): ValidationBuilder = { + val sign = if (strictly) ">" else ">=" + validationBuilder.expr(s"$field $sign $value") } /** - * Check if column values are less than certain value + * Check if field values are between two values (inclusive) * - * @param value Less than value + * @param minValue Minimum value (inclusive) + * @param maxValue Maximum value (inclusive) + * @param negate Check if not between when set to true * @return */ - def lessThan(value: Any): ValidationBuilder = { - validationBuilder.expr(s"$column < ${colValueToString(value)}") + def between(minValue: Any, maxValue: Any, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + validationBuilder.expr(s"$field ${prefix}BETWEEN ${fieldValueToString(minValue)} AND ${fieldValueToString(maxValue)}") } /** - * Check if column values are less than another column's values for each record + * Check if field values are between values of other fields (inclusive) * - * @param value Other column name + * @param minValue Other field name determining minimum value (inclusive) + * @param maxValue Other field name determining maximum value (inclusive) + * @param negate Check if not between fields when set to true * @return */ - def lessThanCol(value: String): ValidationBuilder = { - validationBuilder.expr(s"$column < 
$value") + def betweenFields(minValue: String, maxValue: String, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + validationBuilder.expr(s"$field ${prefix}BETWEEN $minValue AND $maxValue") } /** - * Check if column values are less than or equal to certain value + * Check if field values are in given set of expected values * - * @param value Less than or equal to value + * @param values Expected set of values * @return */ - def lessThanOrEqual(value: Any): ValidationBuilder = { - validationBuilder.expr(s"$column <= ${colValueToString(value)}") + @varargs def in(values: Any*): ValidationBuilder = { + in(values.toList, false) } /** - * Check if column values are less than or equal to another column's values for each record + * Check if field values are in given set of expected values * - * @param value Other column name + * @param values Expected set of values + * @param negate Check if not in set of values when set to true * @return */ - def lessThanOrEqualCol(value: String): ValidationBuilder = { - validationBuilder.expr(s"$column <= $value") + def in(values: List[Any], negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + validationBuilder.expr(s"$prefix$field IN (${values.map(fieldValueToString).mkString(",")})") } /** - * Check if column is greater than a certain value + * Check if field values match certain regex pattern (Java regular expression) * - * @param value Greater than value + * @param regex Java regular expression + * @param negate Check if not matches regex when set to true * @return */ - def greaterThan(value: Any): ValidationBuilder = { - validationBuilder.expr(s"$column > ${colValueToString(value)}") + def matches(regex: String, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "!" else "" + validationBuilder.expr(s"${prefix}REGEXP($field, '$regex')") } /** - * Check if column is greater than another column's values for each record + * Check if field values match certain regex patterns (Java regular expression) * - * @param value Other column name + * @param regexes Java regular expressions + * @param matchAll Check if all regex patterns match when set to true, at least one pattern matches when set to false + * @param negate Check if not matches regex patterns when set to true * @return */ - def greaterThanCol(value: String): ValidationBuilder = { - validationBuilder.expr(s"$column > $value") + def matchesList(regexes: List[String], matchAll: Boolean = true, negate: Boolean = false): ValidationBuilder = { + val mkStringValue = if (matchAll) " AND " else " OR " + val prefix = if (negate) "NOT " else "" + val checkAllPatterns = regexes + .map(regex => s"REGEXP($field, '$regex')") + .mkString(mkStringValue) + validationBuilder.expr(s"$prefix($checkAllPatterns)") } /** - * Check if column is greater than or equal to a certain value + * Check if field values start with certain string (only for string fields) * - * @param value Greater than or equal to value + * @param value Expected prefix for string values + * @param negate Check if not starts with when set to true * @return */ - def greaterThanOrEqual(value: Any): ValidationBuilder = { - validationBuilder.expr(s"$column >= ${colValueToString(value)}") + def startsWith(value: String, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "!" 
else "" + validationBuilder.expr(s"${prefix}STARTSWITH($field, '$value')") } /** - * Check if column is greater than or equal to another column's values for each record + * Check if field values end with certain string (only for string fields) * - * @param value Other column name + * @param value Expected suffix for string + * @param negate Check if not ends with when set to true * @return */ - def greaterThanOrEqualCol(value: String): ValidationBuilder = { - validationBuilder.expr(s"$column >= $value") + def endsWith(value: String, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "!" else "" + validationBuilder.expr(s"${prefix}ENDSWITH($field, '$value')") } /** - * Check if column values are between two values (inclusive) + * Check if field size is equal to certain amount (only for array or map fields) * - * @param minValue Minimum value (inclusive) - * @param maxValue Maximum value (inclusive) + * @param size Expected size + * @param negate Check if size is not equal when set to true * @return */ - def between(minValue: Any, maxValue: Any): ValidationBuilder = { - validationBuilder.expr(s"$column BETWEEN ${colValueToString(minValue)} AND ${colValueToString(maxValue)}") + def size(size: Int, negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!=" else "==" + validationBuilder.expr(s"SIZE($field) $sign $size") } /** - * Check if column values are between values of other columns (inclusive) + * Check if field size is less than certain amount (only for array or map fields) * - * @param minValue Other column name determining minimum value (inclusive) - * @param maxValue Other column name determining maximum value (inclusive) + * @param size Less than size + * @param strictly Check strictly less than size when set to true, less than or equal to when set to false * @return */ - def betweenCol(minValue: String, maxValue: String): ValidationBuilder = { - validationBuilder.expr(s"$column BETWEEN $minValue AND $maxValue") + def lessThanSize(size: Int, strictly: Boolean = true): ValidationBuilder = { + val sign = if (strictly) "<" else "<=" + validationBuilder.expr(s"SIZE($field) $sign $size") } /** - * Check if column values are not between two values + * Check if field size is greater than certain amount (only for array or map fields) * - * @param minValue Minimum value - * @param maxValue Maximum value + * @param size Greater than size + * @param strictly Check strictly greater than size when set to true, greater than or equal to when set to false * @return */ - def notBetween(minValue: Any, maxValue: Any): ValidationBuilder = { - validationBuilder.expr(s"$column NOT BETWEEN ${colValueToString(minValue)} AND ${colValueToString(maxValue)}") + def greaterThanSize(size: Int, strictly: Boolean = true): ValidationBuilder = { + val sign = if (strictly) ">" else ">=" + validationBuilder.expr(s"SIZE($field) $sign $size") } /** - * Check if column values are not between values of other columns + * Check if field values adhere to Luhn algorithm. Usually used for credit card or identification numbers. * - * @param minValue Other column name determining minimum value - * @param maxValue Other column name determining maximum value + * @param negate Check if not adheres to Luhn algorithm when set to true * @return */ - def notBetweenCol(minValue: String, maxValue: String): ValidationBuilder = { - validationBuilder.expr(s"$column NOT BETWEEN $minValue AND $maxValue") + def luhnCheck(negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "!" 
else "" + validationBuilder.expr(s"${prefix}LUHN_CHECK($field)") } /** - * Check if column values are in given set of expected values + * Check if field values adhere to expected type * - * @param values Expected set of values + * @param `type` Expected data type + * @param negate Check if type does not match when set to true * @return */ - @varargs def in(values: Any*): ValidationBuilder = { - validationBuilder.expr(s"$column IN (${values.map(colValueToString).mkString(",")})") + def hasType(`type`: String, negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!=" else "==" + validationBuilder.expr(s"TYPEOF($field) $sign '${`type`}'") } /** - * Check if column values are not in given set of values + * Check if field values adhere to expected types * - * @param values Set of unwanted values + * @param types Expected data types * @return */ - @varargs def notIn(values: Any*): ValidationBuilder = { - validationBuilder.expr(s"NOT $column IN (${values.map(colValueToString).mkString(",")})") + @varargs def hasTypes(types: String*): ValidationBuilder = { + hasTypes(types.toList) } /** - * Check if column values match certain regex pattern (Java regular expression) + * Check if field values adhere to expected types * - * @param regex Java regular expression + * @param types Expected data types + * @param negate Check if type matches none of the given types when set to true * @return */ - def matches(regex: String): ValidationBuilder = { - validationBuilder.expr(s"REGEXP($column, '$regex')") + def hasTypes(types: List[String], negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + validationBuilder.expr(s"TYPEOF($field) ${prefix}IN (${types.map(t => s"'$t'").mkString(",")})") } /** - * Check if column values do not match certain regex (Java regular expression) + * Check if distinct values of field exist in set * - * @param regex Java regular expression + * @param set Expected set of distinct values * @return */ - def notMatches(regex: String): ValidationBuilder = { - validationBuilder.expr(s"!REGEXP($column, '$regex')") + @varargs def distinctInSet(set: Any*): ValidationBuilder = { + distinctInSet(set.toList, false) } /** - * Check if column values start with certain string (only for string columns) + * Check if distinct values of field exist in set * - * @param value Expected prefix for string values + * @param set Expected set of distinct values + * @param negate Check if distinct values are not in set when set to true * @return */ - def startsWith(value: String): ValidationBuilder = { - validationBuilder.expr(s"STARTSWITH($column, '$value')") + def distinctInSet(set: List[Any], negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!" 
else "" + val removeTicksField = field.replaceAll("`", "") + validationBuilder.selectExpr(s"COLLECT_SET($field) AS ${removeTicksField}_distinct") + .expr(s"${sign}FORALL(${removeTicksField}_distinct, x -> ARRAY_CONTAINS(ARRAY(${seqToString(set)}), x))") } /** - * Check if column values do not start with certain string (only for string columns) + * Check if distinct values of field contain set * - * @param value Prefix string value should not start with + * @param set Expected contained set of distinct values * @return */ - def notStartsWith(value: String): ValidationBuilder = { - validationBuilder.expr(s"!STARTSWITH($column, '$value')") + @varargs def distinctContainsSet(set: Any*): ValidationBuilder = { + distinctContainsSet(set.toList, false) } /** - * Check if column values end with certain string (only for string columns) + * Check if distinct values of field contain set * - * @param value Expected suffix for string + * @param set Expected contained set of distinct values + * @param negate Check if distinct values do not contain set when set to true * @return */ - def endsWith(value: String): ValidationBuilder = { - validationBuilder.expr(s"ENDSWITH($column, '$value')") + def distinctContainsSet(set: List[Any], negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!" else "" + validationBuilder.selectExpr(s"COLLECT_SET($field) AS ${removeTicksField}_distinct") + .expr(s"${sign}FORALL(ARRAY(${seqToString(set)}), x -> ARRAY_CONTAINS(${removeTicksField}_distinct, x))") } /** - * Check if column values do not end with certain string (only for string columns) + * Check if distinct values of field equal set * - * @param value Suffix string value should not end with + * @param set Expected set of distinct values * @return */ - def notEndsWith(value: String): ValidationBuilder = { - validationBuilder.expr(s"!ENDSWITH($column, '$value')") + @varargs def distinctEqual(set: Any*): ValidationBuilder = { + distinctEqual(set.toList, false) } /** - * Check if column size is equal to certain amount (only for array or map columns) + * Check if distinct values of field equal set * - * @param size Expected size + * @param set Expected set of distinct values + * @param negate Check if distinct values do not equal set when set to true * @return */ - def size(size: Int): ValidationBuilder = { - validationBuilder.expr(s"SIZE($column) == $size") + def distinctEqual(set: List[Any], negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!=" else "==" + validationBuilder.selectExpr(s"COLLECT_SET($field) AS ${removeTicksField}_distinct") + .expr(s"ARRAY_SIZE(ARRAY_EXCEPT(ARRAY(${seqToString(set)}), ${removeTicksField}_distinct)) $sign 0") } /** - * Check if column size is not equal to certain amount (only for array or map columns) + * Check if max field value is between two values * - * @param size Array or map size should not equal + * @param min Minimum expected value for max + * @param max Maximum expected value for max + * @param negate Check if not between two values when set to true * @return */ - def notSize(size: Int): ValidationBuilder = { - validationBuilder.expr(s"SIZE($column) != $size") + def maxBetween(min: Any, max: Any, negate: Boolean = false): ValidationBuilder = { + validationBuilder.groupBy().max(field).between(min, max, negate) } /** - * Check if column size is less than certain amount (only for array or map columns) + * Check if mean field value is between two values * - * @param size Less than size + * @param min Minimum expected value for mean 
+ * @param max Maximum expected value for mean + * @param negate Check if not between two values when set to true * @return */ - def lessThanSize(size: Int): ValidationBuilder = { - validationBuilder.expr(s"SIZE($column) < $size") + def meanBetween(min: Any, max: Any, negate: Boolean = false): ValidationBuilder = { + validationBuilder.groupBy().avg(field).between(min, max, negate) } /** - * Check if column size is less than or equal to certain amount (only for array or map columns) + * Check if median field value is between two values * - * @param size Less than or equal to size + * @param min Minimum expected value for median + * @param max Maximum expected value for median + * @param negate Check if not between two values when set to true * @return */ - def lessThanOrEqualSize(size: Int): ValidationBuilder = { - validationBuilder.expr(s"SIZE($column) <= $size") + def medianBetween(min: Any, max: Any, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + validationBuilder + .selectExpr(s"PERCENTILE($field, 0.5) AS ${removeTicksField}_median") + .expr(s"${removeTicksField}_median ${prefix}BETWEEN $min AND $max") } /** - * Check if column size is greater than certain amount (only for array or map columns) + * Check if min field value is between two values * - * @param size Greater than size + * @param min Minimum expected value for min + * @param max Maximum expected value for min + * @param negate Check if not between two values when set to true * @return */ - def greaterThanSize(size: Int): ValidationBuilder = { - validationBuilder.expr(s"SIZE($column) > $size") + def minBetween(min: Any, max: Any, negate: Boolean = false): ValidationBuilder = { + validationBuilder.groupBy().min(field).between(min, max, negate) } /** - * Check if column size is greater than or equal to certain amount (only for array or map columns) + * Check if standard deviation field value is between two values * - * @param size Greater than or equal to size + * @param min Minimum expected value for standard deviation + * @param max Maximum expected value for standard deviation + * @param negate Check if not between two values when set to true * @return */ - def greaterThanOrEqualSize(size: Int): ValidationBuilder = { - validationBuilder.expr(s"SIZE($column) >= $size") + def stdDevBetween(min: Any, max: Any, negate: Boolean = false): ValidationBuilder = { + validationBuilder.groupBy().stddev(field).between(min, max, negate) } /** - * Check if column values adhere to Luhn algorithm. Usually used for credit card or identification numbers. 
+ * Check if sum of field values is between two values * + * @param min Minimum expected value for sum + * @param max Maximum expected value for sum + * @param negate Check if not between two values when set to true * @return */ - def luhnCheck: ValidationBuilder = { - validationBuilder.expr(s"LUHN_CHECK($column)") + def sumBetween(min: Any, max: Any, negate: Boolean = false): ValidationBuilder = { + validationBuilder.groupBy().sum(field).between(min, max, negate) } /** - * Check if column values adhere to expected type + * Check if length of field values is between two values * - * @param `type` Expected data type + * @param min Minimum expected value for length + * @param max Maximum expected value for length + * @param negate Check if not between two values when set to true * @return */ - def hasType(`type`: String): ValidationBuilder = { - validationBuilder.expr(s"TYPEOF($column) == '${`type`}'") + def lengthBetween(min: Int, max: Int, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + validationBuilder.expr(s"LENGTH($field) ${prefix}BETWEEN $min AND $max") } /** - * Check if SQL expression is true or not. Can include reference to any other columns in the dataset. + * Check if length of field values is equal to value + * + * @param value Expected length + * @param negate Check if length is not equal to value when set to true + * @return + */ + def lengthEqual(value: Int, negate: Boolean = false): ValidationBuilder = { + val sign = if (negate) "!=" else "==" + validationBuilder.expr(s"LENGTH($field) $sign $value") + } + + /** + * Check if field values are decreasing + * + * @param strictly Check values are strictly decreasing when set to true + * @return + */ + def isDecreasing(strictly: Boolean = true): ValidationBuilder = { + val lessSign = if (strictly) "<" else "<=" + validationBuilder + .selectExpr(s"$field $lessSign LAG($field) OVER (ORDER BY MONOTONICALLY_INCREASING_ID()) AS is_${removeTicksField}_decreasing") + .expr(s"is_${removeTicksField}_decreasing") + } + + /** + * Check if field values are increasing + * + * @param strictly Check values are strictly increasing when set to true + * @return + */ + def isIncreasing(strictly: Boolean = true): ValidationBuilder = { + val greaterThan = if (strictly) ">" else ">=" + validationBuilder + .selectExpr(s"$field $greaterThan LAG($field) OVER (ORDER BY MONOTONICALLY_INCREASING_ID()) AS is_${removeTicksField}_increasing") + .expr(s"is_${removeTicksField}_increasing") + } + + /** + * Check if field values can be parsed as JSON + * + * @param negate Check values cannot be parsed as JSON when set to true + * @return + */ + def isJsonParsable(negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "" else "NOT " + validationBuilder.expr(s"GET_JSON_OBJECT($field, '$$') IS ${prefix}NULL") + } + + /** + * Check if field values adhere to JSON schema + * + * @param schema Defined JSON schema + * @param negate Check values do not adhere to JSON schema when set to true + * @return + */ + def matchJsonSchema(schema: String, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "" else "NOT " + validationBuilder.expr(s"FROM_JSON($field, '$schema') IS ${prefix}NULL") + } + + /** + * Check if field values match date time format + * + * @param format Defined date time format ([defined formats](https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html)) + * @param negate Check values do not adhere to date time format when set to true + * @return + */ + def 
matchDateTimeFormat(format: String, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "" else "NOT " + validationBuilder.expr(s"TRY_TO_TIMESTAMP($field, '$format') IS ${prefix}NULL") + } + + /** + * Check if the most common field value exists in set of values + * + * @param values Expected set of values the most common value should exist in + * @param negate Check if the most common value does not exist in set of values when set to true + * @return + */ + def mostCommonValueInSet(values: List[Any], negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "!" else "" + validationBuilder + .selectExpr(s"MODE($field) AS ${removeTicksField}_mode") + .expr(s"${prefix}ARRAY_CONTAINS(ARRAY(${seqToString(values)}), ${removeTicksField}_mode)") + } + + /** + * Check if the field's proportion of unique values is between two values + * + * @param min Minimum proportion of unique values + * @param max Maximum proportion of unique values + * @param negate Check if proportion of unique values is not between two values when set to true + * @return + */ + def uniqueValuesProportionBetween(min: Double, max: Double, negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + validationBuilder + .selectExpr(s"COUNT(DISTINCT $field) / COUNT(1) AS ${removeTicksField}_unique_proportion") + .expr(s"${removeTicksField}_unique_proportion ${prefix}BETWEEN $min AND $max") + } + + /** + * Check if quantiles of field values are within range. + * + * For example, + * `Map(0.1 -> (1, 2))` -> 10th percentile should be between 1 and 2 + * + * @param quantileRanges Map of quantile to expected range + * @param negate Check if quantile value is not between two values when set to true + * @return + */ + def quantileValuesBetween(quantileRanges: Map[Double, (Double, Double)], negate: Boolean = false): ValidationBuilder = { + val prefix = if (negate) "NOT " else "" + val quantileExprs = quantileRanges.zipWithIndex.map(quantileEntry => { + val quantile = quantileEntry._1._1 + val min = quantileEntry._1._2._1 + val max = quantileEntry._1._2._2 + val idx = quantileEntry._2 + val percentileColName = s"${removeTicksField}_percentile_$idx" + val selectExpr = s"PERCENTILE($field, $quantile) AS $percentileColName" + val whereExpr = s"$percentileColName ${prefix}BETWEEN $min AND $max" + (selectExpr, whereExpr) + }) + val selectExpr = quantileExprs.keys.toList + val whereExpr = quantileExprs.values.mkString(" AND ") + validationBuilder.selectExpr(selectExpr: _*).expr(whereExpr) + } + + /** + * Check if SQL expression is true or not. Can include reference to any other fields in the dataset. 
* * @param expr SQL expression * @return @@ -607,7 +810,7 @@ case class ColumnValidationBuilder(validationBuilder: ValidationBuilder = Valida validationBuilder.expr(expr) } - private def colValueToString(value: Any): String = { + private def fieldValueToString(value: Any): String = { value match { case _: String => s"'$value'" case _: Date => s"DATE('$value')" @@ -615,32 +818,41 @@ case class ColumnValidationBuilder(validationBuilder: ValidationBuilder = Valida case _ => s"$value" } } + + private def removeTicksField: String = field.replaceAll("`", "") + + private def seqToString(seq: Seq[Any]): String = { + seq.head match { + case _: String => seq.mkString("'", "','", "'") + case _ => seq.mkString(",") + } + } } case class GroupByValidationBuilder( validationBuilder: ValidationBuilder = ValidationBuilder(), - groupByCols: Seq[String] = Seq() + groupByFields: Seq[String] = Seq() ) { def this() = this(ValidationBuilder(), Seq()) /** - * Sum all values for column + * Sum all values for field * - * @param column Name of column to sum + * @param field Name of field to sum * @return */ - def sum(column: String): ColumnValidationBuilder = { - setGroupValidation(column, AGGREGATION_SUM) + def sum(field: String): FieldValidationBuilder = { + setGroupValidation(field, AGGREGATION_SUM) } /** - * Count the number of records for a particular column + * Count the number of records for a particular field * - * @param column Name of column to count + * @param field Name of field to count * @return */ - def count(column: String): ColumnValidationBuilder = { - setGroupValidation(column, AGGREGATION_COUNT) + def count(field: String): FieldValidationBuilder = { + setGroupValidation(field, AGGREGATION_COUNT) } /** @@ -648,67 +860,67 @@ case class GroupByValidationBuilder( * * @return */ - def count(): ColumnValidationBuilder = { + def count(): FieldValidationBuilder = { setGroupValidation("", AGGREGATION_COUNT) } /** - * Get the minimum value for a particular column + * Get the minimum value for a particular field * - * @param column Name of column + * @param field Name of field * @return */ - def min(column: String): ColumnValidationBuilder = { - setGroupValidation(column, AGGREGATION_MIN) + def min(field: String): FieldValidationBuilder = { + setGroupValidation(field, AGGREGATION_MIN) } /** - * Get the maximum value for a particular column + * Get the maximum value for a particular field * - * @param column Name of column + * @param field Name of field * @return */ - def max(column: String): ColumnValidationBuilder = { - setGroupValidation(column, AGGREGATION_MAX) + def max(field: String): FieldValidationBuilder = { + setGroupValidation(field, AGGREGATION_MAX) } /** - * Get the average/mean for a particular column + * Get the average/mean for a particular field * - * @param column Name of column + * @param field Name of field * @return */ - def avg(column: String): ColumnValidationBuilder = { - setGroupValidation(column, AGGREGATION_AVG) + def avg(field: String): FieldValidationBuilder = { + setGroupValidation(field, AGGREGATION_AVG) } /** - * Get the standard deviation for a particular column + * Get the standard deviation for a particular field * - * @param column Name of column + * @param field Name of field * @return */ - def stddev(column: String): ColumnValidationBuilder = { - setGroupValidation(column, AGGREGATION_STDDEV) + def stddev(field: String): FieldValidationBuilder = { + setGroupValidation(field, AGGREGATION_STDDEV) } - private def setGroupValidation(column: String, aggType: String): 
ColumnValidationBuilder = {
-    val groupByValidation = GroupByValidation(groupByCols, column, aggType)
+  private def setGroupValidation(field: String, aggType: String): FieldValidationBuilder = {
+    val groupByValidation = GroupByValidation(groupByFields, field, aggType)
     groupByValidation.errorThreshold = validationBuilder.validation.errorThreshold
     groupByValidation.description = validationBuilder.validation.description
-    val colName = if (column.isEmpty) aggType else s"$aggType($column)"
-    ColumnValidationBuilder(validationBuilder.modify(_.validation).setTo(groupByValidation), colName)
+    val fieldName = if (field.isEmpty) aggType else s"$aggType($field)"
+    FieldValidationBuilder(validationBuilder.modify(_.validation).setTo(groupByValidation), fieldName)
   }
 }

 case class UpstreamDataSourceValidationBuilder(
-    validationBuilder: ValidationBuilder = ValidationBuilder(),
+    validationBuilders: List[ValidationBuilder] = List(),
     connectionTaskBuilder: ConnectionTaskBuilder[_] = FileBuilder(),
     readOptions: Map[String, String] = Map(),
-    joinColumns: List[String] = List(),
+    joinFields: List[String] = List(),
     joinType: String = DEFAULT_VALIDATION_JOIN_TYPE
   ) {
-  def this() = this(ValidationBuilder(), FileBuilder(), Map(), List(), DEFAULT_VALIDATION_JOIN_TYPE)
+  def this() = this(List(), FileBuilder(), Map(), List(), DEFAULT_VALIDATION_JOIN_TYPE)

   /**
    * Define any custom read options to control which dataset is the upstream dataset.
@@ -723,13 +935,13 @@ case class UpstreamDataSourceValidationBuilder(
   }

   /**
-   * Define set of column names to use for join with upstream dataset
+   * Define set of field names to use for join with upstream dataset
    *
-   * @param joinCols Column names used for join
+   * @param fields Field names used for join
    * @return
    */
-  @varargs def joinColumns(joinCols: String*): UpstreamDataSourceValidationBuilder = {
-    this.modify(_.joinColumns).setTo(joinCols.toList)
+  @varargs def joinFields(fields: String*): UpstreamDataSourceValidationBuilder = {
+    this.modify(_.joinFields).setTo(fields.toList)
   }

   /**
@@ -740,7 +952,7 @@ case class UpstreamDataSourceValidationBuilder(
    * @return
    */
   def joinExpr(expr: String): UpstreamDataSourceValidationBuilder = {
-    this.modify(_.joinColumns).setTo(List(s"$VALIDATION_PREFIX_JOIN_EXPRESSION$expr"))
+    this.modify(_.joinFields).setTo(List(s"$VALIDATION_PREFIX_JOIN_EXPRESSION$expr"))
   }

   /**
@@ -761,57 +973,57 @@ case class UpstreamDataSourceValidationBuilder(
   }

   /**
-   * Define validation to be used on joined dataset
+   * Define validations to be used on joined dataset
    *
-   * @param validationBuilder Validation check on joined dataset
+   * @param validations Validations to check on joined dataset
    * @return
    */
-  def withValidation(validationBuilder: ValidationBuilder): ValidationBuilder = {
-    validationBuilder.modify(_.validation).setTo(UpstreamDataSourceValidation(validationBuilder, connectionTaskBuilder, readOptions, joinColumns, joinType))
+  @varargs def validations(validations: ValidationBuilder*): ValidationBuilder = {
+    ValidationBuilder().modify(_.validation).setTo(UpstreamDataSourceValidation(validations.toList, connectionTaskBuilder, readOptions, joinFields, joinType))
   }
 }

-case class ColumnNamesValidationBuilder(
-    validationBuilder: ValidationBuilder = ValidationBuilder()
-  ) {
+case class FieldNamesValidationBuilder(
+    validationBuilder: ValidationBuilder = ValidationBuilder()
+  ) {

   def this() = this(ValidationBuilder())

   /**
-   * Check number of column is equal to certain value
+   * Check number of fields is equal to certain value
    *
-   * @param value Number of expected columns
+   * @param value Number of expected fields
    * @return ValidationBuilder
    */
   def countEqual(value: Int): ValidationBuilder =
-    validationBuilder.modify(_.validation).setTo(ColumnNamesValidation(VALIDATION_COLUMN_NAME_COUNT_EQUAL, value))
+    validationBuilder.modify(_.validation).setTo(FieldNamesValidation(VALIDATION_FIELD_NAME_COUNT_EQUAL, value))

   /**
-   * Check number of columns is between two values
+   * Check number of fields is between two values
    *
-   * @param min Minimum number of expected columns (inclusive)
-   * @param max Maximum number of expected columns (inclusive)
+   * @param min Minimum number of expected fields (inclusive)
+   * @param max Maximum number of expected fields (inclusive)
    * @return ValidationBuilder
    */
   def countBetween(min: Int, max: Int): ValidationBuilder =
-    validationBuilder.modify(_.validation).setTo(ColumnNamesValidation(VALIDATION_COLUMN_NAME_COUNT_BETWEEN, minCount = min, maxCount = max))
+    validationBuilder.modify(_.validation).setTo(FieldNamesValidation(VALIDATION_FIELD_NAME_COUNT_BETWEEN, min = min, max = max))

   /**
-   * Order of column names matches given order
+   * Order of field names matches given order
    *
-   * @param columnNameOrder Expected column name ordering
+   * @param fieldNameOrder Expected field name ordering
    * @return ValidationBuilder
    */
-  @varargs def matchOrder(columnNameOrder: String*): ValidationBuilder =
-    validationBuilder.modify(_.validation).setTo(ColumnNamesValidation(VALIDATION_COLUMN_NAME_MATCH_ORDER, names = columnNameOrder.toArray))
+  @varargs def matchOrder(fieldNameOrder: String*): ValidationBuilder =
+    validationBuilder.modify(_.validation).setTo(FieldNamesValidation(VALIDATION_FIELD_NAME_MATCH_ORDER, names = fieldNameOrder.toArray))

   /**
-   * Dataset column names contains set of column names
+   * Dataset field names contain the given set of field names
    *
-   * @param columnNames Column names expected to exist within dataset
+   * @param fieldNames Field names expected to exist within dataset
    * @return ValidationBuilder
    */
-  @varargs def matchSet(columnNames: String*): ValidationBuilder =
-    validationBuilder.modify(_.validation).setTo(ColumnNamesValidation(VALIDATION_COLUMN_NAME_MATCH_SET, names = columnNames.toArray))
+  @varargs def matchSet(fieldNames: String*): ValidationBuilder =
+    validationBuilder.modify(_.validation).setTo(FieldNamesValidation(VALIDATION_FIELD_NAME_MATCH_SET, names = fieldNames.toArray))
 }

 case class WaitConditionBuilder(waitCondition: WaitCondition = PauseWaitCondition()) {
@@ -936,5 +1148,5 @@ case class CombinationPreFilterBuilder(
 }

 object ValidationHelper {
-  def cleanColumnName(column: String): String = column.split("\\.").map(c => s"`$c`").mkString(".")
+  def cleanFieldName(field: String): String = field.split("\\.").map(c => s"`$c`").mkString(".")
 }
diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/connection/ConnectionBuilder.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/connection/ConnectionBuilder.scala
index 790eb861..788fc406 100644
--- a/api/src/main/scala/io/github/datacatering/datacaterer/api/connection/ConnectionBuilder.scala
+++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/connection/ConnectionBuilder.scala
@@ -1,8 +1,8 @@
 package io.github.datacatering.datacaterer.api.connection

 import io.github.datacatering.datacaterer.api.model.Constants.{ALL_COMBINATIONS, ENABLE_DATA_VALIDATION, FORMAT}
-import io.github.datacatering.datacaterer.api.{ConnectionConfigWithTaskBuilder, CountBuilder, FieldBuilder, GeneratorBuilder,
MetadataSourceBuilder, SchemaBuilder, StepBuilder, TaskBuilder, TasksBuilder, ValidationBuilder, WaitConditionBuilder} import io.github.datacatering.datacaterer.api.model.{Step, Task} +import io.github.datacatering.datacaterer.api.{ConnectionConfigWithTaskBuilder, CountBuilder, FieldBuilder, GeneratorBuilder, MetadataSourceBuilder, StepBuilder, TaskBuilder, TasksBuilder, ValidationBuilder, WaitConditionBuilder} import scala.annotation.varargs @@ -30,17 +30,12 @@ trait ConnectionTaskBuilder[T] { this } - @varargs def schema(fields: FieldBuilder*): ConnectionTaskBuilder[T] = { - this.step = Some(getStep.schema(fields: _*)) + @varargs def fields(fields: FieldBuilder*): ConnectionTaskBuilder[T] = { + this.step = Some(getStep.fields(fields: _*)) this } - def schema(schemaBuilder: SchemaBuilder): ConnectionTaskBuilder[T] = { - this.step = Some(getStep.schema(schemaBuilder)) - this - } - - def schema(metadataSourceBuilder: MetadataSourceBuilder): ConnectionTaskBuilder[T] = { + def fields(metadataSourceBuilder: MetadataSourceBuilder): ConnectionTaskBuilder[T] = { this.connectionConfigWithTaskBuilder = this.connectionConfigWithTaskBuilder.metadataSource(metadataSourceBuilder) this.step = Some(getStep.options(metadataSourceBuilder.metadataSource.allOptions)) this diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala index 8ea1f4e8..3b0ce043 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala @@ -101,7 +101,7 @@ object Constants { lazy val IS_NULLABLE = "isNullable" lazy val NULL_COUNT = "nullCount" lazy val HISTOGRAM = "histogram" - lazy val SOURCE_COLUMN_DATA_TYPE = "sourceDataType" + lazy val SOURCE_FIELD_DATA_TYPE = "sourceDataType" lazy val NUMERIC_PRECISION = "precision" lazy val NUMERIC_SCALE = "scale" lazy val DEFAULT_VALUE = "defaultValue" @@ -270,7 +270,7 @@ object Constants { lazy val DEFAULT_ICEBERG_CATALOG_TYPE = ICEBERG_CATALOG_HADOOP //foreign key defaults - lazy val DEFAULT_FOREIGN_KEY_COLUMN = "default_column" + lazy val DEFAULT_FOREIGN_KEY_FIELD = "default_field" lazy val FOREIGN_KEY_DELIMITER = "||" lazy val FOREIGN_KEY_DELIMITER_REGEX = "\\|\\|" lazy val FOREIGN_KEY_PLAN_FILE_DELIMITER = "." 
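These delimiter and default-field constants back the plan-level foreign key DSL. A rough sketch of the field-based relationship API, reusing the task and field names from the test updates later in this patch:

    val jsonTask = json("account_info", "/opt/app/data/json")
      .fields(field.name("account_id").regex("ACC[0-9]{8}"))
    val csvTxns = csv("transactions", "/opt/app/data/csv")
      .fields(field.name("account_id"))
    val foreignKeySetup = plan
      .addForeignKeyRelationship(jsonTask, "account_id", List((csvTxns, "account_id")))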
@@ -300,7 +300,7 @@ object Constants { //count defaults lazy val DEFAULT_COUNT_RECORDS = 1000L - lazy val DEFAULT_PER_COLUMN_COUNT_RECORDS = 10L + lazy val DEFAULT_PER_FIELD_COUNT_RECORDS = 10L //validation defaults lazy val DEFAULT_VALIDATION_CONFIG_NAME = "default_validation" @@ -311,7 +311,7 @@ object Constants { lazy val DEFAULT_VALIDATION_WEBHOOK_HTTP_DATA_SOURCE_NAME = "tmp_http_data_source" lazy val DEFAULT_VALIDATION_WEBHOOK_HTTP_METHOD = "GET" lazy val DEFAULT_VALIDATION_WEBHOOK_HTTP_STATUS_CODES = List(200) - lazy val DEFAULT_VALIDATION_COLUMN_NAME_TYPE = VALIDATION_COLUMN_NAME_COUNT_EQUAL + lazy val DEFAULT_VALIDATION_FIELD_NAME_TYPE = VALIDATION_FIELD_NAME_COUNT_EQUAL //metadata source lazy val METADATA_SOURCE_TYPE = "metadataSourceType" @@ -420,75 +420,80 @@ object Constants { lazy val AGGREGATION_STDDEV = "stddev" //validation types - lazy val VALIDATION_COLUMN = "column" lazy val VALIDATION_FIELD = "field" - lazy val VALIDATION_COLUMN_NAMES = "columnNames" lazy val VALIDATION_FIELD_NAMES = "fieldNames" lazy val VALIDATION_UPSTREAM = "upstream" lazy val VALIDATION_GROUP_BY = "groupBy" //validation support lazy val VALIDATION_DESCRIPTION = "description" lazy val VALIDATION_ERROR_THRESHOLD = "errorThreshold" - //column validations + //field validations lazy val VALIDATION_EQUAL = "equal" - lazy val VALIDATION_NOT_EQUAL = "notEqual" lazy val VALIDATION_NULL = "null" - lazy val VALIDATION_NOT_NULL = "notNull" lazy val VALIDATION_CONTAINS = "contains" - lazy val VALIDATION_NOT_CONTAINS = "notContains" lazy val VALIDATION_UNIQUE = "unique" lazy val VALIDATION_LESS_THAN = "lessThan" - lazy val VALIDATION_LESS_THAN_OR_EQUAL = "equalOrLessThan" lazy val VALIDATION_GREATER_THAN = "greaterThan" - lazy val VALIDATION_GREATER_THAN_OR_EQUAL = "equalOrGreaterThan" lazy val VALIDATION_BETWEEN = "between" - lazy val VALIDATION_NOT_BETWEEN = "notBetween" lazy val VALIDATION_IN = "in" - lazy val VALIDATION_NOT_IN = "notIn" lazy val VALIDATION_MATCHES = "matches" - lazy val VALIDATION_NOT_MATCHES = "notMatches" lazy val VALIDATION_STARTS_WITH = "startsWith" - lazy val VALIDATION_NOT_STARTS_WITH = "notStartsWith" lazy val VALIDATION_ENDS_WITH = "endsWith" - lazy val VALIDATION_NOT_ENDS_WITH = "notEndsWith" lazy val VALIDATION_SIZE = "size" - lazy val VALIDATION_NOT_SIZE = "notSize" lazy val VALIDATION_LESS_THAN_SIZE = "lessThanSize" - lazy val VALIDATION_LESS_THAN_OR_EQUAL_SIZE = "equalOrLessThanSize" lazy val VALIDATION_GREATER_THAN_SIZE = "greaterThanSize" - lazy val VALIDATION_GREATER_THAN_OR_EQUAL_SIZE = "equalOrGreaterThanSize" lazy val VALIDATION_LUHN_CHECK = "luhnCheck" lazy val VALIDATION_HAS_TYPE = "hasType" + lazy val VALIDATION_HAS_TYPES = "hasTypes" + lazy val VALIDATION_DISTINCT_IN_SET = "distinctInSet" + lazy val VALIDATION_DISTINCT_CONTAINS_SET = "distinctContainsSet" + lazy val VALIDATION_DISTINCT_EQUAL = "distinctEqual" + lazy val VALIDATION_MAX_BETWEEN = "maxBetween" + lazy val VALIDATION_MEAN_BETWEEN = "meanBetween" + lazy val VALIDATION_MEDIAN_BETWEEN = "medianBetween" + lazy val VALIDATION_MIN_BETWEEN = "minBetween" + lazy val VALIDATION_STD_DEV_BETWEEN = "stdDevBetween" + lazy val VALIDATION_SUM_BETWEEN = "sumBetween" + lazy val VALIDATION_LENGTH_BETWEEN = "lengthBetween" + lazy val VALIDATION_LENGTH_EQUAL = "lengthEqual" + lazy val VALIDATION_IS_DECREASING = "isDecreasing" + lazy val VALIDATION_IS_INCREASING = "isIncreasing" + lazy val VALIDATION_IS_JSON_PARSABLE = "isJsonParsable" + lazy val VALIDATION_MATCH_JSON_SCHEMA = "matchJsonSchema" + lazy val 
VALIDATION_MATCH_DATE_TIME_FORMAT = "matchDateTimeFormat"
+  lazy val VALIDATION_MOST_COMMON_VALUE_IN_SET = "mostCommonValueInSet"
+  lazy val VALIDATION_UNIQUE_VALUES_PROPORTION_BETWEEN = "uniqueValuesProportionBetween"
+  lazy val VALIDATION_QUANTILE_VALUES_BETWEEN = "quantileValuesBetween"
   lazy val VALIDATION_SQL = "sql"

   //group by validations
   lazy val VALIDATION_AGGREGATION_TYPE = "aggType"
-  lazy val VALIDATION_AGGREGATION_COLUMN = "aggCol"
+  lazy val VALIDATION_AGGREGATION_FIELD = "aggField"
   lazy val VALIDATION_MIN = "min"
   lazy val VALIDATION_MAX = "max"
   lazy val VALIDATION_COUNT = "count"
   lazy val VALIDATION_SUM = "sum"
   lazy val VALIDATION_AVERAGE = "average"
   lazy val VALIDATION_STANDARD_DEVIATION = "standardDeviation"
-  lazy val VALIDATION_GROUP_BY_COLUMNS = "groupByColumns"
+  lazy val VALIDATION_GROUP_BY_FIELDS = "groupByFields"

   //upstream validations
   lazy val VALIDATION_UPSTREAM_TASK_NAME = "upstreamTaskName"
-  lazy val VALIDATION_UPSTREAM_JOIN_COLUMNS = "joinColumns"
+  lazy val VALIDATION_UPSTREAM_JOIN_FIELDS = "joinFields"
   lazy val VALIDATION_UPSTREAM_JOIN_TYPE = "joinType"
   lazy val VALIDATION_UPSTREAM_JOIN_EXPR = "joinExpr"
-  //column name validations
-  lazy val VALIDATION_COLUMN_NAMES_COUNT_EQUAL = "countEqual"
-  lazy val VALIDATION_COLUMN_NAMES_COUNT_BETWEEN = "countBetween"
-  lazy val VALIDATION_COLUMN_NAMES_MATCH_ORDER = "matchOrder"
-  lazy val VALIDATION_COLUMN_NAMES_MATCH_SET = "matchSet"
+  //field name validations
+  lazy val VALIDATION_FIELD_NAMES_COUNT_EQUAL = "countEqual"
+  lazy val VALIDATION_FIELD_NAMES_COUNT_BETWEEN = "countBetween"
+  lazy val VALIDATION_FIELD_NAMES_MATCH_ORDER = "matchOrder"
+  lazy val VALIDATION_FIELD_NAMES_MATCH_SET = "matchSet"
   lazy val VALIDATION_OPTION_DELIMITER = ","
-  lazy val VALIDATION_SUPPORTING_OPTIONS = List(VALIDATION_COLUMN, VALIDATION_FIELD, VALIDATION_MIN, VALIDATION_MAX, VALIDATION_GROUP_BY_COLUMNS, VALIDATION_DESCRIPTION, VALIDATION_ERROR_THRESHOLD)
+  lazy val VALIDATION_SUPPORTING_OPTIONS = List(VALIDATION_FIELD, VALIDATION_MIN, VALIDATION_MAX, VALIDATION_GROUP_BY_FIELDS, VALIDATION_DESCRIPTION, VALIDATION_ERROR_THRESHOLD)
   lazy val VALIDATION_PREFIX_JOIN_EXPRESSION = "expr:"
-  lazy val VALIDATION_COLUMN_NAME_COUNT_EQUAL = "columnCountEqual"
-  lazy val VALIDATION_COLUMN_NAME_COUNT_BETWEEN = "columnCountBetween"
-  lazy val VALIDATION_COLUMN_NAME_MATCH_ORDER = "columnNameMatchOrder"
-  lazy val VALIDATION_COLUMN_NAME_MATCH_SET = "columnNameMatchSet"
+  lazy val VALIDATION_FIELD_NAME_COUNT_EQUAL = "fieldCountEqual"
+  lazy val VALIDATION_FIELD_NAME_COUNT_BETWEEN = "fieldCountBetween"
+  lazy val VALIDATION_FIELD_NAME_MATCH_ORDER = "fieldNameMatchOrder"
+  lazy val VALIDATION_FIELD_NAME_MATCH_SET = "fieldNameMatchSet"

   //configuration names
   //flags config
diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/MetadataSourceModels.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/MetadataSourceModels.scala
index d3f33bb3..505b42df 100644
--- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/MetadataSourceModels.scala
+++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/MetadataSourceModels.scala
@@ -1,6 +1,6 @@
 package io.github.datacatering.datacaterer.api.model

-import Constants.{DATA_CONTRACT_CLI, GREAT_EXPECTATIONS, MARQUEZ, METADATA_SOURCE_HAS_OPEN_LINEAGE_SUPPORT, METADATA_SOURCE_TYPE, OPEN_API, OPEN_DATA_CONTRACT_STANDARD, OPEN_METADATA}
+import Constants.{CONFLUENT_SCHEMA_REGISTRY, DATA_CONTRACT_CLI, GREAT_EXPECTATIONS, MARQUEZ,
METADATA_SOURCE_HAS_OPEN_LINEAGE_SUPPORT, METADATA_SOURCE_TYPE, OPEN_API, OPEN_DATA_CONTRACT_STANDARD, OPEN_METADATA} trait MetadataSource { @@ -49,3 +49,9 @@ case class DataContractCliSource(override val connectionOptions: Map[String, Str override val `type`: String = DATA_CONTRACT_CLI } + +case class ConfluentSchemaRegistrySource(override val connectionOptions: Map[String, String] = Map()) extends MetadataSource { + + override val `type`: String = CONFLUENT_SCHEMA_REGISTRY + +} diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/PlanModels.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/PlanModels.scala index 0abeb174..8edae571 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/PlanModels.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/PlanModels.scala @@ -1,7 +1,7 @@ package io.github.datacatering.datacaterer.api.model import com.fasterxml.jackson.databind.annotation.JsonDeserialize -import Constants.{DEFAULT_COUNT_RECORDS, DEFAULT_DATA_SOURCE_NAME, DEFAULT_FIELD_NAME, DEFAULT_FIELD_NULLABLE, DEFAULT_FIELD_TYPE, DEFAULT_GENERATOR_TYPE, DEFAULT_PER_COLUMN_COUNT_RECORDS, DEFAULT_STEP_ENABLED, DEFAULT_STEP_NAME, DEFAULT_STEP_TYPE, DEFAULT_TASK_NAME, DEFAULT_TASK_SUMMARY_ENABLE, FOREIGN_KEY_DELIMITER} +import io.github.datacatering.datacaterer.api.model.Constants.{DEFAULT_COUNT_RECORDS, DEFAULT_DATA_SOURCE_NAME, DEFAULT_FIELD_NAME, DEFAULT_FIELD_NULLABLE, DEFAULT_FIELD_TYPE, DEFAULT_GENERATOR_TYPE, DEFAULT_PER_FIELD_COUNT_RECORDS, DEFAULT_STEP_ENABLED, DEFAULT_STEP_NAME, DEFAULT_STEP_TYPE, DEFAULT_TASK_NAME, DEFAULT_TASK_SUMMARY_ENABLE, FOREIGN_KEY_DELIMITER} import scala.language.implicitConversions @@ -17,20 +17,26 @@ case class Plan( case class SinkOptions( seed: Option[String] = None, locale: Option[String] = None, - foreignKeys: List[(String, List[String], List[String])] = List() + foreignKeys: List[ForeignKey] = List() ) case class ForeignKeyRelation( dataSource: String = DEFAULT_DATA_SOURCE_NAME, step: String = DEFAULT_STEP_NAME, - columns: List[String] = List() + fields: List[String] = List() ) { - def this(dataSource: String, step: String, column: String) = this(dataSource, step, List(column)) + def this(dataSource: String, step: String, field: String) = this(dataSource, step, List(field)) - override def toString: String = s"$dataSource$FOREIGN_KEY_DELIMITER$step$FOREIGN_KEY_DELIMITER${columns.mkString(",")}" + override def toString: String = s"$dataSource$FOREIGN_KEY_DELIMITER$step$FOREIGN_KEY_DELIMITER${fields.mkString(",")}" } +case class ForeignKey( + source: ForeignKeyRelation = ForeignKeyRelation(), + generate: List[ForeignKeyRelation] = List(), + delete: List[ForeignKeyRelation] = List(), + ) + case class TaskSummary( name: String, dataSourceName: String, @@ -47,36 +53,27 @@ case class Step( `type`: String = DEFAULT_STEP_TYPE, count: Count = Count(), options: Map[String, String] = Map(), - schema: Schema = Schema(), + fields: List[Field] = List(), enabled: Boolean = DEFAULT_STEP_ENABLED ) case class Count( @JsonDeserialize(contentAs = classOf[java.lang.Long]) records: Option[Long] = Some(DEFAULT_COUNT_RECORDS), - perColumn: Option[PerColumnCount] = None, - generator: Option[Generator] = None + perField: Option[PerFieldCount] = None, + options: Map[String, Any] = Map() ) -case class PerColumnCount( - columnNames: List[String] = List(), - @JsonDeserialize(contentAs = classOf[java.lang.Long]) count: Option[Long] = Some(DEFAULT_PER_COLUMN_COUNT_RECORDS), - generator: 
Option[Generator] = None +case class PerFieldCount( + fieldNames: List[String] = List(), + @JsonDeserialize(contentAs = classOf[java.lang.Long]) count: Option[Long] = Some(DEFAULT_PER_FIELD_COUNT_RECORDS), + options: Map[String, Any] = Map() ) -case class Schema( - fields: Option[List[Field]] = None - ) - case class Field( name: String = DEFAULT_FIELD_NAME, `type`: Option[String] = Some(DEFAULT_FIELD_TYPE), - generator: Option[Generator] = Some(Generator()), + options: Map[String, Any] = Map(), nullable: Boolean = DEFAULT_FIELD_NULLABLE, static: Option[String] = None, - schema: Option[Schema] = None + fields: List[Field] = List() ) - -case class Generator( - `type`: String = DEFAULT_GENERATOR_TYPE, - options: Map[String, Any] = Map() - ) diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/ValidationModels.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/ValidationModels.scala index c6945c41..465e7dbb 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/ValidationModels.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/ValidationModels.scala @@ -4,15 +4,16 @@ import com.fasterxml.jackson.annotation.JsonSubTypes.Type import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonSubTypes, JsonTypeInfo} import com.fasterxml.jackson.databind.annotation.JsonDeserialize import io.github.datacatering.datacaterer.api.connection.{ConnectionTaskBuilder, FileBuilder} -import io.github.datacatering.datacaterer.api.model.Constants.{AGGREGATION_SUM, DEFAULT_VALIDATION_COLUMN_NAME_TYPE, DEFAULT_VALIDATION_CONFIG_NAME, DEFAULT_VALIDATION_DESCRIPTION, DEFAULT_VALIDATION_JOIN_TYPE, DEFAULT_VALIDATION_WEBHOOK_HTTP_METHOD, DEFAULT_VALIDATION_WEBHOOK_HTTP_STATUS_CODES, VALIDATION_COLUMN_NAME_COUNT_BETWEEN, VALIDATION_COLUMN_NAME_COUNT_EQUAL, VALIDATION_COLUMN_NAME_MATCH_ORDER, VALIDATION_COLUMN_NAME_MATCH_SET} +import io.github.datacatering.datacaterer.api.model.Constants._ import io.github.datacatering.datacaterer.api.{CombinationPreFilterBuilder, ValidationBuilder} @JsonSubTypes(Array( new Type(value = classOf[YamlUpstreamDataSourceValidation]), new Type(value = classOf[GroupByValidation]), - new Type(value = classOf[ColumnNamesValidation]), + new Type(value = classOf[FieldNamesValidation]), new Type(value = classOf[ExpressionValidation]), + new Type(value = classOf[FieldValidations]), )) @JsonTypeInfo(use = JsonTypeInfo.Id.DEDUCTION) @JsonIgnoreProperties(ignoreUnknown = true) @@ -55,65 +56,274 @@ case class ExpressionValidation( } case class GroupByValidation( - groupByCols: Seq[String] = Seq(), - aggCol: String = "", + groupByFields: Seq[String] = Seq(), + aggField: String = "", aggType: String = AGGREGATION_SUM, - aggExpr: String = "true" + aggExpr: String = "true", + validation: List[FieldValidation] = List() ) extends Validation { - override def toOptions: List[List[String]] = List( - List("aggExpr", aggExpr), - List("groupByCols", groupByCols.mkString(",")), - List("aggCol", aggCol), - List("aggType", aggType), - ) ++ baseOptions + override def toOptions: List[List[String]] = { + List( + List("aggExpr", aggExpr), + List("groupByFields", groupByFields.mkString(",")), + List("aggField", aggField), + List("aggType", aggType), + List("validation", validation.map(_.toString).mkString(",")), + ) ++ baseOptions + } } case class UpstreamDataSourceValidation( - validation: ValidationBuilder = ValidationBuilder(), + validations: List[ValidationBuilder] = List(), upstreamDataSource: ConnectionTaskBuilder[_] = 
FileBuilder(), upstreamReadOptions: Map[String, String] = Map(), - joinColumns: List[String] = List(), + joinFields: List[String] = List(), joinType: String = DEFAULT_VALIDATION_JOIN_TYPE, ) extends Validation { override def toOptions: List[List[String]] = { - val nestedValidation = validation.validation.toOptions + val nestedValidation = validations.flatMap(_.validation.toOptions) List( List("upstreamDataSource", upstreamDataSource.connectionConfigWithTaskBuilder.dataSourceName), List("upstreamReadOptions", upstreamReadOptions.mkString(", ")), - List("joinColumns", joinColumns.mkString(",")), + List("joinFields", joinFields.mkString(",")), List("joinType", joinType), ) ++ nestedValidation ++ baseOptions } } case class YamlUpstreamDataSourceValidation( - upstreamDataSource: String, - validation: Validation = ExpressionValidation(), + upstreamDataSource: String = "", + upstreamTaskName: String = "", + validation: List[Validation] = List(), upstreamReadOptions: Map[String, String] = Map(), - joinColumns: List[String] = List(), + joinFields: List[String] = List(), joinType: String = DEFAULT_VALIDATION_JOIN_TYPE, ) extends Validation { override def toOptions: List[List[String]] = List() } -case class ColumnNamesValidation( - columnNameType: String = DEFAULT_VALIDATION_COLUMN_NAME_TYPE, - count: Int = 0, - minCount: Int = 0, - maxCount: Int = 0, - names: Array[String] = Array() - ) extends Validation { +case class FieldNamesValidation( + fieldNameType: String = DEFAULT_VALIDATION_FIELD_NAME_TYPE, + count: Int = 0, + min: Int = 0, + max: Int = 0, + names: Array[String] = Array() + ) extends Validation { override def toOptions: List[List[String]] = { - val baseAttributes = columnNameType match { - case VALIDATION_COLUMN_NAME_COUNT_EQUAL => List(List("count", count.toString)) - case VALIDATION_COLUMN_NAME_COUNT_BETWEEN => List(List("minCount", minCount.toString), List("maxCount", maxCount.toString)) - case VALIDATION_COLUMN_NAME_MATCH_ORDER => List(List("names", names.mkString(","))) - case VALIDATION_COLUMN_NAME_MATCH_SET => List(List("names", names.mkString(","))) + val baseAttributes = fieldNameType match { + case VALIDATION_FIELD_NAME_COUNT_EQUAL => List(List("count", count.toString)) + case VALIDATION_FIELD_NAME_COUNT_BETWEEN => List(List("min", min.toString), List("max", max.toString)) + case VALIDATION_FIELD_NAME_MATCH_ORDER => List(List("names", names.mkString(","))) + case VALIDATION_FIELD_NAME_MATCH_SET => List(List("names", names.mkString(","))) } - List(List("columnNameType", columnNameType)) ++ baseAttributes ++ baseOptions + List(List("fieldNameType", fieldNameType)) ++ baseAttributes ++ baseOptions } } +case class FieldValidations( + field: String = "", + validation: List[FieldValidation] = List() + ) extends Validation { + override def toOptions: List[List[String]] = List( + List("field", field), + List("validation", validation.map(_.toString).mkString(",")), + ) ++ baseOptions +} + +@JsonSubTypes(Array( + new Type(value = classOf[EqualFieldValidation], name = "equal"), + new Type(value = classOf[NullFieldValidation], name = "null"), + new Type(value = classOf[ContainsFieldValidation], name = "contains"), + new Type(value = classOf[UniqueFieldValidation], name = "unique"), + new Type(value = classOf[LessThanFieldValidation], name = "lessThan"), + new Type(value = classOf[GreaterThanFieldValidation], name = "greaterThan"), + new Type(value = classOf[BetweenFieldValidation], name = "between"), + new Type(value = classOf[InFieldValidation], name = "in"), + new Type(value = 
classOf[MatchesFieldValidation], name = "matches"), + new Type(value = classOf[StartsWithFieldValidation], name = "startsWith"), + new Type(value = classOf[EndsWithFieldValidation], name = "endsWith"), + new Type(value = classOf[SizeFieldValidation], name = "size"), + new Type(value = classOf[LessThanSizeFieldValidation], name = "lessThanSize"), + new Type(value = classOf[GreaterThanSizeFieldValidation], name = "greaterThanSize"), + new Type(value = classOf[LuhnCheckFieldValidation], name = "luhnCheck"), + new Type(value = classOf[HasTypeFieldValidation], name = "hasType"), + new Type(value = classOf[HasTypesFieldValidation], name = "hasTypes"), + new Type(value = classOf[DistinctInSetFieldValidation], name = "distinctInSet"), + new Type(value = classOf[DistinctContainsSetFieldValidation], name = "distinctContainsSet"), + new Type(value = classOf[DistinctEqualFieldValidation], name = "distinctEqual"), + new Type(value = classOf[MaxBetweenFieldValidation], name = "maxBetween"), + new Type(value = classOf[MeanBetweenFieldValidation], name = "meanBetween"), + new Type(value = classOf[MedianBetweenFieldValidation], name = "medianBetween"), + new Type(value = classOf[MinBetweenFieldValidation], name = "minBetween"), + new Type(value = classOf[StdDevBetweenFieldValidation], name = "stdDevBetween"), + new Type(value = classOf[SumBetweenFieldValidation], name = "sumBetween"), + new Type(value = classOf[LengthBetweenFieldValidation], name = "lengthBetween"), + new Type(value = classOf[LengthEqualFieldValidation], name = "lengthEqual"), + new Type(value = classOf[IsDecreasingFieldValidation], name = "isDecreasing"), + new Type(value = classOf[IsIncreasingFieldValidation], name = "isIncreasing"), + new Type(value = classOf[IsJsonParsableFieldValidation], name = "isJsonParsable"), + new Type(value = classOf[MatchJsonSchemaFieldValidation], name = "matchJsonSchema"), + new Type(value = classOf[MatchDateTimeFormatFieldValidation], name = "matchDateTimeFormat"), + new Type(value = classOf[MostCommonValueInSetFieldValidation], name = "mostCommonValueInSet"), + new Type(value = classOf[UniqueValuesProportionBetweenFieldValidation], name = "uniqueValuesProportionBetween"), + new Type(value = classOf[QuantileValuesBetweenFieldValidation], name = "quantileValuesBetween"), +)) +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.EXISTING_PROPERTY, property = "type") +@JsonIgnoreProperties(ignoreUnknown = true) +trait FieldValidation extends Validation { + + val `type`: String + + override def toOptions: List[List[String]] = List( + List("type", `type`) + ) +} + +case class NullFieldValidation(negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_NULL +} + +case class EqualFieldValidation(value: Any, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_EQUAL +} + +case class ContainsFieldValidation(value: String, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_CONTAINS +} + +case class UniqueFieldValidation(negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_UNIQUE +} + +case class BetweenFieldValidation(min: Double, max: Double, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_BETWEEN +} + +case class LessThanFieldValidation(value: Any, strictly: Boolean = true) extends FieldValidation { + override val `type`: String = VALIDATION_LESS_THAN +} + +case class 
GreaterThanFieldValidation(value: Any, strictly: Boolean = true) extends FieldValidation { + override val `type`: String = VALIDATION_GREATER_THAN +} + +case class InFieldValidation(values: List[Any], negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_IN +} + +case class MatchesFieldValidation(regex: String, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MATCHES +} + +case class StartsWithFieldValidation(value: String, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_STARTS_WITH +} + +case class EndsWithFieldValidation(value: String, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_ENDS_WITH +} + +case class SizeFieldValidation(size: Int, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_SIZE +} + +case class LessThanSizeFieldValidation(size: Int, strictly: Boolean = true) extends FieldValidation { + override val `type`: String = VALIDATION_LESS_THAN_SIZE +} + +case class GreaterThanSizeFieldValidation(size: Int, strictly: Boolean = true) extends FieldValidation { + override val `type`: String = VALIDATION_GREATER_THAN_SIZE +} + +case class LuhnCheckFieldValidation(negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_LUHN_CHECK +} + +case class HasTypeFieldValidation(value: String, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_HAS_TYPE +} + +case class HasTypesFieldValidation(values: List[String], negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_HAS_TYPES +} + +case class DistinctInSetFieldValidation(values: List[Any], negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_DISTINCT_IN_SET +} + +case class DistinctContainsSetFieldValidation(values: List[Any], negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_DISTINCT_CONTAINS_SET +} + +case class DistinctEqualFieldValidation(values: List[Any], negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_DISTINCT_EQUAL +} + +case class MaxBetweenFieldValidation(min: Any, max: Any, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MAX_BETWEEN +} + +case class MeanBetweenFieldValidation(min: Any, max: Any, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MEAN_BETWEEN +} + +case class MedianBetweenFieldValidation(min: Any, max: Any, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MEDIAN_BETWEEN +} + +case class MinBetweenFieldValidation(min: Any, max: Any, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MIN_BETWEEN +} + +case class StdDevBetweenFieldValidation(min: Any, max: Any, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_STD_DEV_BETWEEN +} + +case class SumBetweenFieldValidation(min: Any, max: Any, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_SUM_BETWEEN +} + +case class LengthBetweenFieldValidation(min: Int, max: Int, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_LENGTH_BETWEEN +} + +case class LengthEqualFieldValidation(value: Int, negate: Boolean = 
false) extends FieldValidation { + override val `type`: String = VALIDATION_LENGTH_EQUAL +} + +case class IsDecreasingFieldValidation(strictly: Boolean = true) extends FieldValidation { + override val `type`: String = VALIDATION_IS_DECREASING +} + +case class IsIncreasingFieldValidation(strictly: Boolean = true) extends FieldValidation { + override val `type`: String = VALIDATION_IS_INCREASING +} + +case class IsJsonParsableFieldValidation(negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_IS_JSON_PARSABLE +} + +case class MatchJsonSchemaFieldValidation(schema: String, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MATCH_JSON_SCHEMA +} + +case class MatchDateTimeFormatFieldValidation(format: String, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MATCH_DATE_TIME_FORMAT +} + +case class MostCommonValueInSetFieldValidation(values: List[Any], negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_MOST_COMMON_VALUE_IN_SET +} + +case class UniqueValuesProportionBetweenFieldValidation(min: Double, max: Double, negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_UNIQUE_VALUES_PROPORTION_BETWEEN +} + +case class QuantileValuesBetweenFieldValidation(quantileRanges: Map[Double, (Double, Double)], negate: Boolean = false) extends FieldValidation { + override val `type`: String = VALIDATION_QUANTILE_VALUES_BETWEEN +} + + case class ValidationConfiguration( name: String = DEFAULT_VALIDATION_CONFIG_NAME, description: String = DEFAULT_VALIDATION_DESCRIPTION, diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationBuilderSerializer.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationBuilderSerializer.scala new file mode 100644 index 00000000..7e83c296 --- /dev/null +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationBuilderSerializer.scala @@ -0,0 +1,74 @@ +package io.github.datacatering.datacaterer.api.parser + +import com.fasterxml.jackson.core.JsonGenerator +import com.fasterxml.jackson.databind.{JsonSerializer, SerializerProvider} +import io.github.datacatering.datacaterer.api.ValidationBuilder +import io.github.datacatering.datacaterer.api.model.Constants.{VALIDATION_FIELD_NAME_COUNT_BETWEEN, VALIDATION_FIELD_NAME_COUNT_EQUAL, VALIDATION_FIELD_NAME_MATCH_ORDER, VALIDATION_FIELD_NAME_MATCH_SET} +import io.github.datacatering.datacaterer.api.model.{ExpressionValidation, FieldNamesValidation, FieldValidations, GroupByValidation, UpstreamDataSourceValidation} + +import scala.util.Try + +class ValidationBuilderSerializer extends JsonSerializer[ValidationBuilder] { + override def serialize(value: ValidationBuilder, gen: JsonGenerator, serializers: SerializerProvider): Unit = { + val validation = value.validation + Try(gen.writeStartObject()) + validation.preFilter.foreach(preFilter => { + gen.writeStringField("preFilterExpr", preFilter.toExpression) + }) + validation match { + case FieldValidations(field, fieldValid) => + gen.writeStringField("field", field) + gen.writeArrayFieldStart("validation") + fieldValid.foreach(fv => { +// gen.writeStringField("type", fv.`type`) +// fv match { +// case BetweenFieldValidation(min, max, negate) => +// gen.writeStringField("min", min.toString) +// gen.writeStringField("max", max.toString) +// gen.writeStringField("negate", negate.toString) +// } + }) + 
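+// NOTE: serialization of each FieldValidation subtype is stubbed out above,
+// so the "validation" array is currently written empty; each subtype needs
+// its own field writes (e.g. min/max/values/negate) before this branch is
+// complete.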
gen.writeEndArray() + case ExpressionValidation(expr, selectExpr) => + gen.writeArrayFieldStart("selectExpr") + selectExpr.foreach(gen.writeObject) + gen.writeEndArray() + gen.writeStringField("whereExpr", expr) + case GroupByValidation(groupByFields, aggField, aggType, expr, validation) => + gen.writeArrayFieldStart("groupByFields") + groupByFields.foreach(gen.writeObject) + gen.writeEndArray() + gen.writeStringField("aggField", aggField) + gen.writeStringField("aggType", aggType) + gen.writeStringField("expr", expr) + gen.writeArrayFieldStart("validation") + validation.foreach(v => serialize(ValidationBuilder(v), gen, serializers)) + gen.writeEndArray() + case FieldNamesValidation(fieldNameValidationType, count, min, max, names) => + gen.writeStringField("fieldNameType", fieldNameValidationType) + fieldNameValidationType match { + case VALIDATION_FIELD_NAME_COUNT_EQUAL => + gen.writeStringField("count", count.toString) + case VALIDATION_FIELD_NAME_COUNT_BETWEEN => + gen.writeStringField("min", min.toString) + gen.writeStringField("max", max.toString) + case VALIDATION_FIELD_NAME_MATCH_ORDER => + gen.writeStringField("matchOrder", names.mkString(",")) + case VALIDATION_FIELD_NAME_MATCH_SET => + gen.writeStringField("matchSet", names.mkString(",")) + } + case UpstreamDataSourceValidation(validationBuilders, upstreamDataSource, upstreamReadOptions, joinFields, joinType) => + gen.writeStringField("upstreamDataSource", upstreamDataSource.connectionConfigWithTaskBuilder.dataSourceName) + gen.writeObjectFieldStart("upstreamReadOptions") + upstreamReadOptions.foreach(opt => gen.writeObjectField(opt._1, opt._2)) + gen.writeEndObject() + gen.writeStringField("joinFields", joinFields.mkString(",")) + gen.writeStringField("joinType", joinType) + gen.writeArrayFieldStart("validations") + validationBuilders.foreach(v => serialize(v, gen, serializers)) + gen.writeEndArray() + case _ => + } + gen.writeEndObject() + } +} diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationIdResolver.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationIdResolver.scala deleted file mode 100644 index 38441c56..00000000 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/parser/ValidationIdResolver.scala +++ /dev/null @@ -1,108 +0,0 @@ -package io.github.datacatering.datacaterer.api.parser - -import com.fasterxml.jackson.annotation.JsonTypeInfo.Id -import com.fasterxml.jackson.core.JsonGenerator -import com.fasterxml.jackson.databind.jsontype.impl.TypeIdResolverBase -import com.fasterxml.jackson.databind.{DatabindContext, JavaType, JsonSerializer, SerializerProvider} -import io.github.datacatering.datacaterer.api.ValidationBuilder -import io.github.datacatering.datacaterer.api.model.Constants.{VALIDATION_COLUMN_NAME_COUNT_BETWEEN, VALIDATION_COLUMN_NAME_COUNT_EQUAL, VALIDATION_COLUMN_NAME_MATCH_ORDER, VALIDATION_COLUMN_NAME_MATCH_SET} -import io.github.datacatering.datacaterer.api.model.{ColumnNamesValidation, ExpressionValidation, GroupByValidation, UpstreamDataSourceValidation} - -import scala.util.Try - -class ValidationIdResolver extends TypeIdResolverBase { - private var superType: JavaType = null - - override def init(bt: JavaType): Unit = { - superType = bt - } - - override def idFromValue(value: Any): String = { - idFromValueAndType(value, value.getClass) - } - - override def idFromBaseType(): String = { - idFromValueAndType(null, superType.getRawClass) - } - - override def idFromValueAndType(value: Any, suggestedType: Class[_]): String = 
{ - val Expr = classOf[ExpressionValidation] - val Group = classOf[GroupByValidation] - val Upstream = classOf[UpstreamDataSourceValidation] - val Columns = classOf[ColumnNamesValidation] - suggestedType match { - case Expr => "ExpressionValidation" - case Group => "GroupByValidation" - case Upstream => "UpstreamDataSourceValidation" - case Columns => "ColumnNamesValidation" - case _ => "ExpressionValidation" - } - } - - override def getMechanism: Id = null - - override def typeFromId(context: DatabindContext, id: String): JavaType = { - val subType = id match { - case "ExpressionValidation" => classOf[ExpressionValidation] - case "GroupByValidation" => classOf[GroupByValidation] - case "UpstreamDataSourceValidation" => classOf[UpstreamDataSourceValidation] - case "ColumnNamesValidation" => classOf[ColumnNamesValidation] - case _ => classOf[ExpressionValidation] - } - context.constructSpecializedType(superType, subType) - } -} - -//class ValidationDeserializer extends JsonDeserializer[Validation] { -// override def deserialize(p: JsonParser, ctxt: DeserializationContext): Validation = { -// -// } -//} - -class ValidationBuilderSerializer extends JsonSerializer[ValidationBuilder] { - override def serialize(value: ValidationBuilder, gen: JsonGenerator, serializers: SerializerProvider): Unit = { - val validation = value.validation - Try(gen.writeStartObject()) - validation.preFilter.foreach(preFilter => { - gen.writeStringField("preFilterExpr", preFilter.toExpression) - }) - validation match { - case ExpressionValidation(expr, selectExpr) => - gen.writeArrayFieldStart("selectExpr") - selectExpr.foreach(gen.writeObject) - gen.writeEndArray() - gen.writeStringField("whereExpr", expr) - case GroupByValidation(groupByCols, aggCol, aggType, expr) => - gen.writeArrayFieldStart("groupByCols") - groupByCols.foreach(gen.writeObject) - gen.writeEndArray() - gen.writeStringField("aggCol", aggCol) - gen.writeStringField("aggType", aggType) - gen.writeStringField("expr", expr) - case ColumnNamesValidation(columnNameValidationType, count, minCount, maxCount, names) => - gen.writeStringField("columnNameType", columnNameValidationType) - columnNameValidationType match { - case VALIDATION_COLUMN_NAME_COUNT_EQUAL => - gen.writeStringField("count", count.toString) - case VALIDATION_COLUMN_NAME_COUNT_BETWEEN => - gen.writeStringField("min", minCount.toString) - gen.writeStringField("max", maxCount.toString) - case VALIDATION_COLUMN_NAME_MATCH_ORDER => - gen.writeStringField("matchOrder", names.mkString(",")) - case VALIDATION_COLUMN_NAME_MATCH_SET => - gen.writeStringField("matchSet", names.mkString(",")) - } - case UpstreamDataSourceValidation(validationBuilder, upstreamDataSource, upstreamReadOptions, joinCols, joinType) => - gen.writeStringField("upstreamDataSource", upstreamDataSource.connectionConfigWithTaskBuilder.dataSourceName) - gen.writeObjectFieldStart("upstreamReadOptions") - upstreamReadOptions.foreach(opt => gen.writeObjectField(opt._1, opt._2)) - gen.writeEndObject() - gen.writeStringField("joinColumns", joinCols.mkString(",")) - gen.writeStringField("joinType", joinType) - gen.writeObjectFieldStart("validation") - serialize(validationBuilder, gen, serializers) - case _ => - } - gen.writeEndObject() - } -} diff --git a/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/DocumentationJavaPlanRun.java b/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/DocumentationJavaPlanRun.java index 9926d587..fb9cbf8f 100644 --- 
a/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/DocumentationJavaPlanRun.java +++ b/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/DocumentationJavaPlanRun.java @@ -50,7 +50,7 @@ // .count( // count() // .records(100) -// .recordsPerColumnGenerator(generator().min(1).max(2), "account_id", "name") +// .recordsPerFieldGenerator(generator().min(1).max(2), "account_id", "name") // ); // // var foreignKeySetup = plan() diff --git a/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/ExampleJavaPlanRun.java b/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/ExampleJavaPlanRun.java index 0ce03402..6e07b8c7 100644 --- a/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/ExampleJavaPlanRun.java +++ b/api/src/test/java/io/github/datacatering/datacaterer/javaapi/api/ExampleJavaPlanRun.java @@ -8,7 +8,7 @@ // .table("my.table") // .schema(field().name("account_id")) // .count(count() -// .recordsPerColumn(10, "account_id", "name") +// .recordsPerField(10, "account_id", "name") // .generator(generator().min(10).max(100)) // ); // diff --git a/api/src/test/scala/io/github/datacatering/datacaterer/api/ExamplePlanRun.scala b/api/src/test/scala/io/github/datacatering/datacaterer/api/ExamplePlanRun.scala index 30c4c17c..ff39e725 100644 --- a/api/src/test/scala/io/github/datacatering/datacaterer/api/ExamplePlanRun.scala +++ b/api/src/test/scala/io/github/datacatering/datacaterer/api/ExamplePlanRun.scala @@ -12,11 +12,11 @@ class ExamplePlanRun extends PlanRun { addTask("account_json", "fs_json", step.name("account") .option(("path", "app/src/test/resources/sample/json/account")) - .schema(schema.addFields( + .fields( field.name("account_id"), field.name("year").`type`(IntegerType).min(2022), field.name("name").static("peter") - )) + ) ) execute(List(tasksBuilder), planBuilder) @@ -34,7 +34,7 @@ class MinimalPlanWithManualTaskRun extends PlanRun { val tasksBuilder = tasks.addTask("my_task", "mininal_json", step .option(("path", "app/src/test/resources/sample/json/minimal")) - .schema(schema.addFields(field.name("account_id"))) + .fields(field.name("account_id")) ) execute(tasksBuilder) } @@ -42,7 +42,7 @@ class MinimalPlanWithManualTaskRun extends PlanRun { class LargeCountRun extends PlanRun { val jsonTask = json("mininal_json", "app/src/test/resources/sample/json/large") - .schema(schema.addFields( + .fields( field.name("account_id"), field.name("year").`type`(IntegerType).min(2022), field.name("name").expression("#{Name.name}"), @@ -51,15 +51,15 @@ class LargeCountRun extends PlanRun { field.name("status").oneOf("open", "closed"), field.name("txn_list") .`type`(ArrayType) - .schema(schema.addFields( + .fields( field.name("id"), field.name("date").`type`(DateType).min(Date.valueOf("2022-01-01")), field.name("amount").`type`(DoubleType) - )) - )) + ) + ) .count(count .records(10000) - .recordsPerColumn(100, "account_id") + .recordsPerField(100, "account_id") ) val conf = configuration @@ -90,12 +90,12 @@ class DocsPlanRun extends PlanRun { .count( count .records(1000) - .recordsPerColumnGenerator( + .recordsPerFieldGenerator( generator.min(1).max(2), "account_id" ) ) - .schema(schema.addField("account_id")) + .fields(field.name("account_id")) ) } @@ -110,7 +110,7 @@ class FullExamplePlanRun extends PlanRun { step .name("transaction") .jdbcTable("account.transaction") - .schema(schema.addFields( + .fields( accountIdField, field.name("txn_id").regex("txn_[0-9]{5}"), field.name("year").`type`(IntegerType).sql("YEAR(date)"), @@ 
-118,16 +118,16 @@ class FullExamplePlanRun extends PlanRun { field.name("date").`type`(DateType).min(startDate), field.name("amount").`type`(DoubleType).max(10000), field.name("credit_debit").sql("CASE WHEN amount < 0 THEN 'C' ELSE 'D' END") - )), + ), step .name("account") .jdbcTable("account.account") - .schema(schema.addFields( + .fields( accountIdField, nameField, field.name("open_date").`type`(DateType).min(startDate), field.name("status").oneOf("open", "closed", "pending") - )) + ) ) val jsonTask = task.name("json_account_details") @@ -135,17 +135,17 @@ class FullExamplePlanRun extends PlanRun { step .name("account_info") .path("/tmp/src/main/resources/sample/json") - .schema(schema.addFields( + .fields( accountIdField, nameField, field.name("txn_list") .`type`(ArrayType) - .schema(schema.addFields( + .fields( field.name("id"), field.name("date").`type`(DateType).min(startDate), field.name("amount").`type`(DoubleType), - )) - )) + ) + ) ) val conf = configuration @@ -169,7 +169,7 @@ class FullExamplePlanRun extends PlanRun { class ConnectionBasedApiPlanRun extends PlanRun { val csvGenerate = csv("my_csv", "app/src/test/resources/sample/connection-api/csv") - .schema( + .fields( field.name("account_id"), field.name("year").`type`(IntegerType).min(2022) ) @@ -177,14 +177,14 @@ class ConnectionBasedApiPlanRun extends PlanRun { val jsonGenerate = json("my_json", "app/src/test/resources/sample/connection-api/json") .partitionBy("age") - .schema( + .fields( field.name("name").expression("#{Name.name}"), field.name("age").`type`(IntegerType).min(18).max(20), ) .count(count.records(100)) val x = json("account_info", "/tmp/data-caterer/json") - .schema( + .fields( field.name("account_id"), field.name("year").`type`(IntegerType).min(2022), field.name("name").expression("#{Name.name}"), @@ -193,11 +193,11 @@ class ConnectionBasedApiPlanRun extends PlanRun { field.name("status").oneOf("open", "closed"), field.name("txn_list") .`type`(ArrayType) - .schema(schema.addFields( + .fields( field.name("id"), field.name("date").`type`(DateType).min(Date.valueOf("2022-01-01")), field.name("amount").`type`(DoubleType), - )) + ) ) .count(count.records(100)) @@ -205,29 +205,29 @@ class ConnectionBasedApiPlanRun extends PlanRun { .task( step .jdbcTable("public.accounts") - .schema( + .fields( field.name("account_id"), field.name("name").expression("#{Name.name}"), ), step .jdbcTable("public.transactions") - .schema( + .fields( field.name("account_id"), field.name("amount").`type`(DoubleType).max(1000) ) - .count(count.recordsPerColumn(10, "account_id")) + .count(count.recordsPerField(10, "account_id")) ) val postgresAcc = postgres("my_postgres") .table("public.accounts") - .schema( + .fields( field.name("account_id") ) var jsonTask = json("my_json", "/tmp/json") - .schema( + .fields( field.name("account_id"), field.name("customer_details") - .schema( + .fields( field.name("name").sql("_join_txn_name").`type`(DoubleType).enableEdgeCases(true).edgeCaseProbability(0.1) ), field.name("_join_txn_name").omit(true) @@ -237,7 +237,7 @@ class ConnectionBasedApiPlanRun extends PlanRun { List(jsonTask -> List("account_id", "")) ) val csvTask = csv("my_csv", "s3a://my-bucket/csv/accounts") - .schema( + .fields( field.name("account_id"), ) val conf = configuration @@ -259,7 +259,7 @@ class ConnectionBasedApiPlanRun extends PlanRun { class DocumentationPlanRun extends PlanRun { val jsonTask = json("account_info", "/opt/app/data/json") - .schema( + .fields( field.name("account_id").regex("ACC[0-9]{8}"), 
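// the fields below derive year from date via SQL, generate name from a Faker expression and pick status from a fixed value set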
field.name("year").`type`(IntegerType).sql("YEAR(date)"), field.name("name").expression("#{Name.name}"), @@ -268,23 +268,23 @@ class DocumentationPlanRun extends PlanRun { field.name("status").oneOf("open", "closed"), field.name("txn_list") .`type`(ArrayType) - .schema(schema.addFields( + .fields( field.name("id").sql("_join_txn_id"), field.name("date").`type`(DateType).min(Date.valueOf("2022-01-01")), field.name("amount").`type`(DoubleType) - )), + ), field.name("_join_txn_id").omit(true) ) .count(count.records(100)) val csvTxns = csv("transactions", "/opt/app/data/csv") - .schema( + .fields( field.name("account_id"), field.name("txn_id"), field.name("amount"), field.name("merchant").expression("#{Company.name}"), ) - .count(count.recordsPerColumnGenerator(generator.min(1).max(5), "account_id")) + .count(count.recordsPerFieldGenerator(generator.min(1).max(5), "account_id")) val foreignKeySetup = plan .addForeignKeyRelationship(jsonTask, "account_id", List((csvTxns, "account_id"))) diff --git a/api/src/test/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilderTest.scala b/api/src/test/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilderTest.scala index dc43f674..c8065501 100644 --- a/api/src/test/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilderTest.scala +++ b/api/src/test/scala/io/github/datacatering/datacaterer/api/MetadataSourceBuilderTest.scala @@ -1,7 +1,7 @@ package io.github.datacatering.datacaterer.api -import io.github.datacatering.datacaterer.api.model.Constants.{DATA_CONTRACT_FILE, DATA_CONTRACT_SCHEMA, GREAT_EXPECTATIONS_FILE, METADATA_SOURCE_URL, OPEN_LINEAGE_DATASET, OPEN_LINEAGE_NAMESPACE, OPEN_METADATA_API_VERSION, OPEN_METADATA_AUTH_TYPE, OPEN_METADATA_AUTH_TYPE_BASIC, OPEN_METADATA_AUTH_TYPE_OPEN_METADATA, OPEN_METADATA_BASIC_AUTH_PASSWORD, OPEN_METADATA_BASIC_AUTH_USERNAME, OPEN_METADATA_DEFAULT_API_VERSION, OPEN_METADATA_HOST, OPEN_METADATA_JWT_TOKEN, SCHEMA_LOCATION} -import io.github.datacatering.datacaterer.api.model.{DataContractCliSource, GreatExpectationsSource, MarquezMetadataSource, OpenAPISource, OpenDataContractStandardSource, OpenMetadataSource} +import io.github.datacatering.datacaterer.api.model.Constants.{CONFLUENT_SCHEMA_REGISTRY_ID, CONFLUENT_SCHEMA_REGISTRY_SUBJECT, CONFLUENT_SCHEMA_REGISTRY_VERSION, DATA_CONTRACT_FILE, DATA_CONTRACT_SCHEMA, GREAT_EXPECTATIONS_FILE, METADATA_SOURCE_URL, OPEN_LINEAGE_DATASET, OPEN_LINEAGE_NAMESPACE, OPEN_METADATA_API_VERSION, OPEN_METADATA_AUTH_TYPE, OPEN_METADATA_AUTH_TYPE_BASIC, OPEN_METADATA_AUTH_TYPE_OPEN_METADATA, OPEN_METADATA_BASIC_AUTH_PASSWORD, OPEN_METADATA_BASIC_AUTH_USERNAME, OPEN_METADATA_DEFAULT_API_VERSION, OPEN_METADATA_HOST, OPEN_METADATA_JWT_TOKEN, SCHEMA_LOCATION} +import io.github.datacatering.datacaterer.api.model.{ConfluentSchemaRegistrySource, DataContractCliSource, GreatExpectationsSource, MarquezMetadataSource, OpenAPISource, OpenDataContractStandardSource, OpenMetadataSource} import org.junit.runner.RunWith import org.scalatest.funsuite.AnyFunSuite import org.scalatestplus.junit.JUnitRunner @@ -103,4 +103,35 @@ class MetadataSourceBuilderTest extends AnyFunSuite { )) } + test("Can create Confluent Schema Registry metadata source with schema ID") { + val result = MetadataSourceBuilder().confluentSchemaRegistry("localhost:8081", 1).metadataSource + + assert(result.isInstanceOf[ConfluentSchemaRegistrySource]) + assert(result.asInstanceOf[ConfluentSchemaRegistrySource].connectionOptions == Map( + METADATA_SOURCE_URL -> "localhost:8081", + 
CONFLUENT_SCHEMA_REGISTRY_ID -> "1" + )) + } + + test("Can create Confluent Schema Registry metadata source with schema subject") { + val result = MetadataSourceBuilder().confluentSchemaRegistry("localhost:8081", "my-proto").metadataSource + + assert(result.isInstanceOf[ConfluentSchemaRegistrySource]) + assert(result.asInstanceOf[ConfluentSchemaRegistrySource].connectionOptions == Map( + METADATA_SOURCE_URL -> "localhost:8081", + CONFLUENT_SCHEMA_REGISTRY_SUBJECT -> "my-proto" + )) + } + + test("Can create Confluent Schema Registry metadata source with schema subject and version") { + val result = MetadataSourceBuilder().confluentSchemaRegistry("localhost:8081", "my-proto", 2).metadataSource + + assert(result.isInstanceOf[ConfluentSchemaRegistrySource]) + assert(result.asInstanceOf[ConfluentSchemaRegistrySource].connectionOptions == Map( + METADATA_SOURCE_URL -> "localhost:8081", + CONFLUENT_SCHEMA_REGISTRY_SUBJECT -> "my-proto", + CONFLUENT_SCHEMA_REGISTRY_VERSION -> "2" + )) + } + } diff --git a/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanBuilderTest.scala b/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanBuilderTest.scala index e222304c..9da5764b 100644 --- a/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanBuilderTest.scala +++ b/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanBuilderTest.scala @@ -1,7 +1,6 @@ package io.github.datacatering.datacaterer.api -import io.github.datacatering.datacaterer.api.model.Constants.{ALL_COMBINATIONS, FOREIGN_KEY_DELIMITER} -import io.github.datacatering.datacaterer.api.connection.FileBuilder +import io.github.datacatering.datacaterer.api.model.Constants.ALL_COMBINATIONS import io.github.datacatering.datacaterer.api.model.{DataCatererConfiguration, ExpressionValidation, ForeignKeyRelation, PauseWaitCondition} import org.junit.runner.RunWith import org.scalatest.funsuite.AnyFunSuite @@ -34,7 +33,7 @@ class PlanBuilderTest extends AnyFunSuite { val t = tasks.addTask( "my task", dataSourceName, - step.schema(schema.addFields(field.name("account_id"))) + step.fields(field.name("account_id")) ) val p = plan.name("my plan") @@ -75,7 +74,7 @@ class PlanBuilderTest extends AnyFunSuite { assertResult(1)(result._tasks.size) assertResult("my task")(result._tasks.head.name) - assertResult("account_id")(result._tasks.head.steps.head.schema.fields.get.head.name) + assertResult("account_id")(result._tasks.head.steps.head.fields.head.name) assertResult("my plan")(result._plan.name) assertResult(1)(result._plan.tasks.size) @@ -85,15 +84,15 @@ class PlanBuilderTest extends AnyFunSuite { assert(result._plan.sinkOptions.get.seed.contains("1")) assert(result._plan.sinkOptions.get.locale.contains("en")) val fk = result._plan.sinkOptions.get.foreignKeys - assert(fk.exists(f => f._1.equalsIgnoreCase(s"account_json${FOREIGN_KEY_DELIMITER}default_step${FOREIGN_KEY_DELIMITER}account_id"))) + assert(fk.exists(f => f.source == ForeignKeyRelation("account_json", "default_step", List("account_id")))) assert( - fk.find(f => f._1.equalsIgnoreCase(s"account_json${FOREIGN_KEY_DELIMITER}default_step${FOREIGN_KEY_DELIMITER}account_id")).get._2 == - List(s"txn_db${FOREIGN_KEY_DELIMITER}txn_step${FOREIGN_KEY_DELIMITER}account_number") + fk.find(f => f.source == ForeignKeyRelation(s"account_json", "default_step", List("account_id"))).get.generate == + List(ForeignKeyRelation("txn_db", "txn_step", List("account_number"))) ) - assert(fk.exists(f => 
f._1.equalsIgnoreCase(s"account_json${FOREIGN_KEY_DELIMITER}default_step${FOREIGN_KEY_DELIMITER}customer_number"))) + assert(fk.exists(f => f.source == ForeignKeyRelation(s"account_json", "default_step", List("customer_number")))) assert( - fk.find(f => f._1.equalsIgnoreCase(s"account_json${FOREIGN_KEY_DELIMITER}default_step${FOREIGN_KEY_DELIMITER}customer_number")).get._2 == - List(s"acc_db${FOREIGN_KEY_DELIMITER}acc_step${FOREIGN_KEY_DELIMITER}customer_number") + fk.find(f => f.source == ForeignKeyRelation(s"account_json", "default_step", List("customer_number"))).get.generate == + List(ForeignKeyRelation("acc_db", "acc_step", List("customer_number"))) ) assert(result._configuration.flagsConfig.enableCount) @@ -138,9 +137,9 @@ class PlanBuilderTest extends AnyFunSuite { test("Can define foreign key via connection task builder") { val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json") - .schema(FieldBuilder().name("account_id")) + .fields(FieldBuilder().name("account_id")) val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", "csv") - .schema(FieldBuilder().name("account_id")) + .fields(FieldBuilder().name("account_id")) val result = PlanBuilder().addForeignKeyRelationship( jsonTask, List("account_id"), List(csvTask -> List("account_id")) @@ -150,8 +149,8 @@ class PlanBuilderTest extends AnyFunSuite { val fk = result.sinkOptions.get.foreignKeys assert(fk.nonEmpty) assertResult(1)(fk.size) - assert(fk.exists(f => f._1.startsWith("my_json") && f._1.endsWith("account_id") && - f._2.size == 1 && f._2.head.startsWith("my_csv") && f._2.head.endsWith("account_id") + assert(fk.exists(f => f.source.dataSource == "my_json" && f.source.fields == List("account_id") && + f.generate.size == 1 && f.generate.head.dataSource == "my_csv" && f.generate.head.fields == List("account_id") )) val result2 = PlanBuilder().addForeignKeyRelationship( @@ -165,7 +164,7 @@ class PlanBuilderTest extends AnyFunSuite { assertResult(1)(fk2.size) } - test("Throw runtime exception when foreign key column is not defined in data sources") { + test("Throw runtime exception when foreign key field is not defined in data sources") { val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json") val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", "csv") @@ -175,9 +174,9 @@ class PlanBuilderTest extends AnyFunSuite { ).plan) } - test("Throw runtime exception when foreign key column is not defined in data sources with other columns") { - val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json").schema(FieldBuilder().name("account_number")) - val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", "csv").schema(FieldBuilder().name("account_type")) + test("Throw runtime exception when foreign key field is not defined in data sources with other fields") { + val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json").fields(FieldBuilder().name("account_number")) + val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", "csv").fields(FieldBuilder().name("account_type")) assertThrows[RuntimeException](PlanBuilder().addForeignKeyRelationship( jsonTask, List("account_id"), @@ -186,8 +185,8 @@ class PlanBuilderTest extends AnyFunSuite { } test("Don't throw runtime exception when data source schema is defined from metadata source") { - val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json").schema(MetadataSourceBuilder().openApi("localhost:8080")) - val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", 
"csv").schema(MetadataSourceBuilder().openApi("localhost:8080")) + val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json").fields(MetadataSourceBuilder().openApi("localhost:8080")) + val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", "csv").fields(MetadataSourceBuilder().openApi("localhost:8080")) val result = PlanBuilder().addForeignKeyRelationship( jsonTask, List("account_id"), List(csvTask -> List("account_id")) @@ -199,9 +198,9 @@ class PlanBuilderTest extends AnyFunSuite { assertResult(1)(fk.size) } - test("Don't throw runtime exception when delete foreign key column, defined by SQL, is not defined in data sources") { - val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json").schema(FieldBuilder().name("account_id")) - val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", "csv").schema(FieldBuilder().name("account_number")) + test("Don't throw runtime exception when delete foreign key field, defined by SQL, is not defined in data sources") { + val jsonTask = ConnectionConfigWithTaskBuilder().file("my_json", "json").fields(FieldBuilder().name("account_id")) + val csvTask = ConnectionConfigWithTaskBuilder().file("my_csv", "csv").fields(FieldBuilder().name("account_number")) val result = PlanBuilder().addForeignKeyRelationship( jsonTask, List("account_id"), List(), @@ -212,8 +211,8 @@ class PlanBuilderTest extends AnyFunSuite { val fk = result.sinkOptions.get.foreignKeys assert(fk.nonEmpty) assertResult(1)(fk.size) - assert(fk.head._2.isEmpty) - assertResult(1)(fk.head._3.size) + assert(fk.head.generate.isEmpty) + assertResult(1)(fk.head.delete.size) } test("Can create a step that will generate records for all combinations") { diff --git a/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanRunTest.scala b/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanRunTest.scala index 31ec8699..21f67594 100644 --- a/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanRunTest.scala +++ b/api/src/test/scala/io/github/datacatering/datacaterer/api/PlanRunTest.scala @@ -1,6 +1,6 @@ package io.github.datacatering.datacaterer.api -import io.github.datacatering.datacaterer.api.model.Constants.{CSV, FORMAT, JDBC_TABLE, PATH, URL} +import io.github.datacatering.datacaterer.api.model.Constants.{FORMAT, JDBC_TABLE, PATH, URL} import io.github.datacatering.datacaterer.api.model.ExpressionValidation import org.junit.runner.RunWith import org.scalatest.funsuite.AnyFunSuite @@ -11,17 +11,17 @@ class PlanRunTest extends AnyFunSuite { test("Can create plan with each type of connection") { val result = new PlanRun { - val mySchema = schema.addFields(field.name("account_id")) - val myCsv = csv("my_csv", "/my/csv").schema(mySchema) - val myJson = json("my_json", "/my/json").schema(mySchema) - val myParquet = parquet("my_parquet", "/my/parquet").schema(mySchema) - val myOrc = orc("my_orc", "/my/orc").schema(mySchema) - val myPostgres = postgres("my_postgres").table("account").schema(mySchema) - val myMySql = mysql("my_mysql").table("transaction").schema(mySchema) - val myCassandra = cassandra("my_cassandra").table("account", "accounts").schema(mySchema) - val mySolace = solace("my_solace").destination("solace_topic").schema(mySchema) - val myKafka = kafka("my_kafka").topic("kafka_topic").schema(mySchema) - val myHttp = http("my_http").schema(mySchema) + val mySchema = field.name("account_id") + val myCsv = csv("my_csv", "/my/csv").fields(mySchema) + val myJson = json("my_json", "/my/json").fields(mySchema) + val myParquet = 
parquet("my_parquet", "/my/parquet").fields(mySchema) + val myOrc = orc("my_orc", "/my/orc").fields(mySchema) + val myPostgres = postgres("my_postgres").table("account").fields(mySchema) + val myMySql = mysql("my_mysql").table("transaction").fields(mySchema) + val myCassandra = cassandra("my_cassandra").table("account", "accounts").fields(mySchema) + val mySolace = solace("my_solace").destination("solace_topic").fields(mySchema) + val myKafka = kafka("my_kafka").topic("kafka_topic").fields(mySchema) + val myHttp = http("my_http").fields(mySchema) execute(myCsv, myJson, myParquet, myOrc, myPostgres, myMySql, myCassandra, mySolace, myKafka, myHttp) } @@ -38,10 +38,10 @@ class PlanRunTest extends AnyFunSuite { val result = new PlanRun { val myPostgresAccount = postgres("my_postgres", "my_postgres_url") .table("account.accounts") - .schema(field.name("account_id")) + .fields(field.name("account_id")) val myPostgresTransaction = postgres(myPostgresAccount) .table("account.transactions") - .schema(field.name("txn_id")) + .fields(field.name("txn_id")) execute(myPostgresAccount, myPostgresTransaction) } @@ -55,20 +55,18 @@ class PlanRunTest extends AnyFunSuite { assertResult(2)(result._tasks.size) val steps = result._tasks.flatMap(_.steps) val resAccount = steps.filter(s => s.options.get(JDBC_TABLE).contains("account.accounts")).head - assert(resAccount.schema.fields.isDefined) - assertResult(1)(resAccount.schema.fields.get.size) - assertResult("account_id")(resAccount.schema.fields.get.head.name) + assertResult(1)(resAccount.fields.size) + assertResult("account_id")(resAccount.fields.head.name) val resTxn = steps.filter(s => s.options.get(JDBC_TABLE).contains("account.transactions")).head - assert(resTxn.schema.fields.isDefined) - assertResult(1)(resTxn.schema.fields.get.size) - assertResult("txn_id")(resTxn.schema.fields.get.head.name) + assertResult(1)(resTxn.fields.size) + assertResult("txn_id")(resTxn.fields.head.name) assert(result._validations.isEmpty) } test("Can create plan with validations for one data source") { val result = new PlanRun { val myCsv = csv("my_csv", "/my/data/path") - .schema(field.name("account_id")) + .fields(field.name("account_id")) .validations(validation.expr("account_id != ''")) execute(myCsv) diff --git a/api/src/test/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilderTest.scala b/api/src/test/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilderTest.scala index 91b98ea2..d2396715 100644 --- a/api/src/test/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilderTest.scala +++ b/api/src/test/scala/io/github/datacatering/datacaterer/api/SinkOptionsBuilderTest.scala @@ -1,7 +1,6 @@ package io.github.datacatering.datacaterer.api -import io.github.datacatering.datacaterer.api.model.Constants.FOREIGN_KEY_DELIMITER -import io.github.datacatering.datacaterer.api.model.ForeignKeyRelation +import io.github.datacatering.datacaterer.api.model.{ForeignKey, ForeignKeyRelation} import org.junit.runner.RunWith import org.scalatest.funsuite.AnyFunSuite import org.scalatestplus.junit.JUnitRunner @@ -23,10 +22,10 @@ class SinkOptionsBuilderTest extends AnyFunSuite { assert(result.seed.contains("10")) assert(result.locale.contains("id")) assertResult(2)(result.foreignKeys.size) - assert(result.foreignKeys.contains((s"my_postgres${FOREIGN_KEY_DELIMITER}account${FOREIGN_KEY_DELIMITER}account_id", - List(s"my_json${FOREIGN_KEY_DELIMITER}account${FOREIGN_KEY_DELIMITER}account_id"), List()))) - 
assert(result.foreignKeys.contains((s"my_postgres${FOREIGN_KEY_DELIMITER}account${FOREIGN_KEY_DELIMITER}customer_number", - List(s"my_json${FOREIGN_KEY_DELIMITER}account${FOREIGN_KEY_DELIMITER}customer_number", s"my_parquet${FOREIGN_KEY_DELIMITER}transaction${FOREIGN_KEY_DELIMITER}cust_num"), List()))) + assert(result.foreignKeys.contains(ForeignKey(ForeignKeyRelation(s"my_postgres", "account", List("account_id")), + List(ForeignKeyRelation("my_json", "account", List("account_id"))), List()))) + assert(result.foreignKeys.contains(ForeignKey(ForeignKeyRelation("my_postgres", "account", List("customer_number")), + List(ForeignKeyRelation("my_json", "account", List("customer_number")), ForeignKeyRelation("my_parquet", "transaction", List("cust_num"))), List()))) } } diff --git a/api/src/test/scala/io/github/datacatering/datacaterer/api/TasksBuilderTest.scala b/api/src/test/scala/io/github/datacatering/datacaterer/api/TasksBuilderTest.scala index 28376b9a..00896a9b 100644 --- a/api/src/test/scala/io/github/datacatering/datacaterer/api/TasksBuilderTest.scala +++ b/api/src/test/scala/io/github/datacatering/datacaterer/api/TasksBuilderTest.scala @@ -1,6 +1,6 @@ package io.github.datacatering.datacaterer.api -import io.github.datacatering.datacaterer.api.model.{ArrayType, Count, DateType, Field, Generator, IntegerType, StringType} +import io.github.datacatering.datacaterer.api.model.{ArrayType, Count, DateType, Field, IntegerType, StringType} import org.junit.runner.RunWith import org.scalatest.funsuite.AnyFunSuite import org.scalatestplus.junit.JUnitRunner @@ -25,7 +25,7 @@ class TasksBuilderTest extends AnyFunSuite { .name("my step") .`type`("csv") .enabled(false) - .schema(SchemaBuilder()) + .fields() .count(CountBuilder()) .option("dbtable" -> "account.history") .options(Map("stringtype" -> "undefined")) @@ -34,7 +34,7 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("my step")(result.name) assertResult("csv")(result.`type`) assert(!result.enabled) - assert(result.schema.fields.isEmpty) + assert(result.fields.isEmpty) assertResult(Count())(result.count) assert(result.options == Map( "dbtable" -> "account.history", @@ -46,95 +46,77 @@ class TasksBuilderTest extends AnyFunSuite { val result = CountBuilder().records(20).count assert(result.records.contains(20)) - assert(result.perColumn.isEmpty) - assert(result.generator.isEmpty) + assert(result.perField.isEmpty) + assert(result.options.isEmpty) } - test("Can create per column count") { + test("Can create per field count") { val result = CountBuilder() - .perColumn(PerColumnCountBuilder() + .perField(PerFieldCountBuilder() .records(20, "account_id") ) .count assert(result.records.contains(1000)) - assert(result.perColumn.isDefined) - assert(result.perColumn.get.count.contains(20)) - assertResult(List("account_id"))(result.perColumn.get.columnNames) - assert(result.perColumn.get.generator.isEmpty) - assert(result.generator.isEmpty) + assert(result.perField.isDefined) + assert(result.perField.get.count.contains(20)) + assertResult(List("account_id"))(result.perField.get.fieldNames) + assert(result.perField.get.options.isEmpty) + assert(result.options.isEmpty) } - test("Can create records per column from count builder") { + test("Can create records per field from count builder") { val result = CountBuilder() - .recordsPerColumn(20, "account_id") + .recordsPerField(20, "account_id") .count assert(result.records.contains(1000)) - assert(result.perColumn.isDefined) - assert(result.perColumn.get.count.contains(20)) - 
assertResult(List("account_id"))(result.perColumn.get.columnNames) - assert(result.perColumn.get.generator.isEmpty) - assert(result.generator.isEmpty) + assert(result.perField.isDefined) + assert(result.perField.get.count.contains(20)) + assertResult(List("account_id"))(result.perField.get.fieldNames) + assert(result.perField.get.options.isEmpty) + assert(result.options.isEmpty) } - test("Can create generated records per column from count builder") { + test("Can create generated records per field from count builder") { val result = CountBuilder() - .recordsPerColumnGenerator(GeneratorBuilder(), "account_id") + .recordsPerFieldGenerator(GeneratorBuilder(), "account_id") .count assert(result.records.contains(1000)) - assert(result.perColumn.isDefined) - assert(result.perColumn.get.count.contains(10)) - assertResult(List("account_id"))(result.perColumn.get.columnNames) - assert(result.perColumn.get.generator.isDefined) - assert(result.generator.isEmpty) + assert(result.perField.isDefined) + assert(result.perField.get.count.contains(10)) + assertResult(List("account_id"))(result.perField.get.fieldNames) + assert(result.perField.get.options.isEmpty) + assert(result.options.isEmpty) } - test("Can create generated records per column with total records from count builder") { + test("Can create generated records per field with total records from count builder") { val result = CountBuilder() - .recordsPerColumnGenerator(100, GeneratorBuilder(), "account_id") + .recordsPerFieldGenerator(100, GeneratorBuilder(), "account_id") .count assert(result.records.contains(100)) - assert(result.perColumn.isDefined) - assert(result.perColumn.get.count.contains(10)) - assertResult(List("account_id"))(result.perColumn.get.columnNames) - assert(result.perColumn.get.generator.isDefined) - assert(result.generator.isEmpty) + assert(result.perField.isDefined) + assert(result.perField.get.count.contains(10)) + assertResult(List("account_id"))(result.perField.get.fieldNames) + assert(result.perField.get.options.isEmpty) + assert(result.options.isEmpty) } - test("Can create per column count with generator") { + test("Can create per field count with generator") { val result = CountBuilder() - .perColumn(PerColumnCountBuilder() - .generator( - GeneratorBuilder().min(5), - "account_id" - ) + .perField(PerFieldCountBuilder() + .generator(GeneratorBuilder().min(5), "account_id") ).count assert(result.records.contains(1000)) - assert(result.perColumn.isDefined) - assert(result.perColumn.get.count.contains(10)) - assertResult(List("account_id"))(result.perColumn.get.columnNames) - assert(result.perColumn.get.generator.isDefined) - assertResult("random")(result.perColumn.get.generator.get.`type`) - assertResult("5")(result.perColumn.get.generator.get.options("min")) - assert(result.generator.isEmpty) - } - - test("Can create schema with add fields") { - val result = SchemaBuilder() - .addField("account_id") - .addField("year", IntegerType) - .addFields(FieldBuilder().name("name")) - .schema - - assert(result.fields.isDefined) - assertResult(3)(result.fields.get.size) - assert(result.fields.get.contains(Field("account_id", Some("string")))) - assert(result.fields.get.contains(Field("year", Some("integer")))) - assert(result.fields.get.contains(Field("name", Some("string")))) + assert(result.perField.isDefined) + assert(result.perField.get.count.contains(10)) + assertResult(List("account_id"))(result.perField.get.fieldNames) + assert(result.perField.get.options.nonEmpty) + assertResult("5")(result.perField.get.options("min")) + 
assert(result.options.isEmpty) } test("Can create field") { @@ -142,14 +124,13 @@ class TasksBuilderTest extends AnyFunSuite { .name("account_id") .`type`(StringType) .nullable(false) - .generator(GeneratorBuilder()) + .options(Map("hello" -> "world")) .field assertResult("account_id")(result.name) assert(result.`type`.contains("string")) assert(!result.nullable) - assert(result.generator.isDefined) - assert(result.generator.contains(Generator())) + assertResult(Map("hello" -> "world"))(result.options) } test("Can create field generated from sql expression") { @@ -160,9 +141,8 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("account_id")(result.name) assert(result.`type`.contains("string")) - assert(result.generator.isDefined) - assertResult("sql")(result.generator.get.`type`) - assertResult("SUBSTRING(account, 1, 5)")(result.generator.get.options("sql")) + assert(result.options.nonEmpty) + assertResult("SUBSTRING(account, 1, 5)")(result.options("sql")) } test("Can create field generated from one of list of doubles") { @@ -170,8 +150,8 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("account_id")(result.name) assert(result.`type`.contains("double")) - assert(result.generator.isDefined) - assertResult(List(123.1, 789.2))(result.generator.get.options("oneOf")) + assert(result.options.nonEmpty) + assertResult(List(123.1, 789.2))(result.options("oneOf")) } test("Can create field generated from one of list of strings") { @@ -179,7 +159,7 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("status")(result.name) assert(result.`type`.contains("string")) - assertResult(List("open", "closed"))(result.generator.get.options("oneOf")) + assertResult(List("open", "closed"))(result.options("oneOf")) } test("Can create field generated from one of list of long") { @@ -187,7 +167,7 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("amount")(result.name) assert(result.`type`.contains("long")) - assertResult(List(100L, 200L))(result.generator.get.options("oneOf")) + assertResult(List(100L, 200L))(result.options("oneOf")) } test("Can create field generated from one of list of int") { @@ -195,7 +175,7 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("amount")(result.name) assert(result.`type`.contains("integer")) - assertResult(List(100, 200))(result.generator.get.options("oneOf")) + assertResult(List(100, 200))(result.options("oneOf")) } test("Can create field generated from one of list of boolean") { @@ -203,16 +183,14 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("is_open")(result.name) assert(result.`type`.contains("boolean")) - assertResult(List(true, false))(result.generator.get.options("oneOf")) + assertResult(List(true, false))(result.options("oneOf")) } test("Can create field with nested schema") { val result = FieldBuilder() .name("txn_list") .`type`(new ArrayType(DateType)) - .schema(SchemaBuilder().addFields( - FieldBuilder().name("date").`type`(DateType) - )) + .fields(FieldBuilder().name("date").`type`(DateType)) .field assertResult("txn_list")(result.name) @@ -255,8 +233,7 @@ class TasksBuilderTest extends AnyFunSuite { assertResult("account_id")(result.name) assert(result.`type`.contains("string")) assert(!result.nullable) - assertResult("regex")(result.generator.get.`type`) - val gen = result.generator.get.options + val gen = result.options assertResult("acc[0-9]{3}")(gen("regex")) assertResult("1")(gen("seed")) assertResult("2")(gen("min")) diff --git 
a/api/src/test/scala/io/github/datacatering/datacaterer/api/ValidationConfigurationBuilderTest.scala b/api/src/test/scala/io/github/datacatering/datacaterer/api/ValidationConfigurationBuilderTest.scala index 79da5e12..ee58ff7e 100644 --- a/api/src/test/scala/io/github/datacatering/datacaterer/api/ValidationConfigurationBuilderTest.scala +++ b/api/src/test/scala/io/github/datacatering/datacaterer/api/ValidationConfigurationBuilderTest.scala @@ -1,7 +1,7 @@ package io.github.datacatering.datacaterer.api -import io.github.datacatering.datacaterer.api.model.Constants.{DEFAULT_VALIDATION_JOIN_TYPE, DEFAULT_VALIDATION_WEBHOOK_HTTP_DATA_SOURCE_NAME, DEFAULT_VALIDATION_WEBHOOK_HTTP_METHOD, DEFAULT_VALIDATION_WEBHOOK_HTTP_STATUS_CODES, PATH, VALIDATION_COLUMN_NAME_COUNT_BETWEEN, VALIDATION_COLUMN_NAME_COUNT_EQUAL, VALIDATION_COLUMN_NAME_MATCH_ORDER, VALIDATION_COLUMN_NAME_MATCH_SET} -import io.github.datacatering.datacaterer.api.model.{ColumnNamesValidation, ConditionType, DataExistsWaitCondition, ExpressionValidation, FileExistsWaitCondition, GroupByValidation, PauseWaitCondition, UpstreamDataSourceValidation, WebhookWaitCondition} +import io.github.datacatering.datacaterer.api.model.Constants.{DEFAULT_VALIDATION_JOIN_TYPE, DEFAULT_VALIDATION_WEBHOOK_HTTP_DATA_SOURCE_NAME, DEFAULT_VALIDATION_WEBHOOK_HTTP_METHOD, DEFAULT_VALIDATION_WEBHOOK_HTTP_STATUS_CODES, PATH, VALIDATION_FIELD_NAME_COUNT_BETWEEN, VALIDATION_FIELD_NAME_COUNT_EQUAL, VALIDATION_FIELD_NAME_MATCH_ORDER, VALIDATION_FIELD_NAME_MATCH_SET} +import io.github.datacatering.datacaterer.api.model.{FieldNamesValidation, ConditionType, DataExistsWaitCondition, ExpressionValidation, FileExistsWaitCondition, GroupByValidation, PauseWaitCondition, UpstreamDataSourceValidation, WebhookWaitCondition} import org.junit.runner.RunWith import org.scalatest.funsuite.AnyFunSuite import org.scalatestplus.junit.JUnitRunner @@ -34,283 +34,623 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(headDsValid.validations.map(_.validation).contains(ExpressionValidation("name == 'Peter'"))) } - test("Can create column specific validation") { - val result = ValidationBuilder().col("my_col").greaterThan(10) + test("Can create field specific validation") { + val result = ValidationBuilder().field("my_col").greaterThan(10) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` > 10")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column equal to validation") { - val result = ValidationBuilder().col("my_col").isEqual(10) + test("Can create field equal to validation") { + val result = ValidationBuilder().field("my_col").isEqual(10) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` == 10")(result.validation.asInstanceOf[ExpressionValidation].expr) - val resultStr = ValidationBuilder().col("my_col").isEqual("created") + val resultStr = ValidationBuilder().field("my_col").isEqual("created") assert(resultStr.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` == 'created'")(resultStr.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column equal to another column validation") { - val result = ValidationBuilder().col("my_col").isEqualCol("other_col") + test("Can create field equal to another field validation") { + val result = ValidationBuilder().field("my_col").isEqualField("other_col") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` == 
other_col")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not equal to validation") { - val result = ValidationBuilder().col("my_col").isNotEqual(10) + test("Can create field not equal to validation") { + val result = ValidationBuilder().field("my_col").isEqual(10, true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` != 10")(result.validation.asInstanceOf[ExpressionValidation].expr) - val resultStr = ValidationBuilder().col("my_col").isNotEqual("created") + val resultStr = ValidationBuilder().field("my_col").isEqual("created", true) assert(resultStr.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` != 'created'")(resultStr.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not equal to another column validation") { - val result = ValidationBuilder().col("my_col").isNotEqualCol("other_col") + test("Can create field not equal to another field validation") { + val result = ValidationBuilder().field("my_col").isEqualField("other_col", true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` != other_col")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column is null validation") { - val result = ValidationBuilder().col("my_col").isNull + test("Can create field is null validation") { + val result = ValidationBuilder().field("my_col").isNull() assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("ISNULL(`my_col`)")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column is not null validation") { - val result = ValidationBuilder().col("my_col").isNotNull + test("Can create field is not null validation") { + val result = ValidationBuilder().field("my_col").isNull(true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("ISNOTNULL(`my_col`)")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column contains validation") { - val result = ValidationBuilder().col("my_col").contains("apple") + test("Can create field contains validation") { + val result = ValidationBuilder().field("my_col").contains("apple") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("CONTAINS(`my_col`, 'apple')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not contains validation") { - val result = ValidationBuilder().col("my_col").notContains("apple") + test("Can create field not contains validation") { + val result = ValidationBuilder().field("my_col").contains("apple", true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("!CONTAINS(`my_col`, 'apple')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column less than validation") { - val result = ValidationBuilder().col("my_col").lessThan(Date.valueOf("2023-01-01")) + test("Can create field less than validation") { + val result = ValidationBuilder().field("my_col").lessThan(Date.valueOf("2023-01-01")) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` < DATE('2023-01-01')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column less than other column validation") { - val result = ValidationBuilder().col("my_col").lessThanCol("other_col") + test("Can create field less than other field validation") { + val result = ValidationBuilder().field("my_col").lessThanField("other_col") 
     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` < other_col")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column less than or equal validation") {
-    val result = ValidationBuilder().col("my_col").lessThanOrEqual(Timestamp.valueOf("2023-01-01 00:00:00.0"))
+  test("Can create field less than or equal validation") {
+    val result = ValidationBuilder().field("my_col").lessThan(Timestamp.valueOf("2023-01-01 00:00:00.0"), false)

     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` <= TIMESTAMP('2023-01-01 00:00:00.0')")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column less than or equal other column validation") {
-    val result = ValidationBuilder().col("my_col").lessThanOrEqualCol("other_col")
+  test("Can create field less than or equal other field validation") {
+    val result = ValidationBuilder().field("my_col").lessThanField("other_col", false)

     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` <= other_col")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column greater than validation") {
-    val result = ValidationBuilder().col("my_col").greaterThan(10)
+  test("Can create field greater than validation") {
+    val result = ValidationBuilder().field("my_col").greaterThan(10)

     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` > 10")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column greater than other column validation") {
-    val result = ValidationBuilder().col("my_col").greaterThanCol("other_col")
+  test("Can create field greater than other field validation") {
+    val result = ValidationBuilder().field("my_col").greaterThanField("other_col")

     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` > other_col")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column greater than or equal validation") {
-    val result = ValidationBuilder().col("my_col").greaterThanOrEqual(10)
+  test("Can create field greater than or equal validation") {
+    val result = ValidationBuilder().field("my_col").greaterThan(10, false)

     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` >= 10")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column greater than or equal other column validation") {
-    val result = ValidationBuilder().col("my_col").greaterThanOrEqualCol("other_col")
+  test("Can create field greater than or equal other field validation") {
+    val result = ValidationBuilder().field("my_col").greaterThanField("other_col", false)

     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` >= other_col")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column between validation") {
-    val result = ValidationBuilder().col("my_col").between(10, 20)
+  test("Can create field between validation") {
+    val result = ValidationBuilder().field("my_col").between(10, 20)

     assert(result.validation.isInstanceOf[ExpressionValidation])
     assertResult("`my_col` BETWEEN 10 AND 20")(result.validation.asInstanceOf[ExpressionValidation].expr)
   }

-  test("Can create column between other col validation") {
-    val result = ValidationBuilder().col("my_col").betweenCol("other_col", "another_col")
+  test("Can create field between other field validation") {
+    val result = ValidationBuilder().field("my_col").betweenFields("other_col", "another_col")
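The between/betweenFields pair follows the same convention as isEqual/isEqualField above: the ...Fields variants compare against other fields of the same row, so the generated expression embeds raw field names rather than quoted literals. A rough sketch, assuming a hypothetical helper (not the actual implementation):

  // Field-to-field comparisons embed the raw field names so Spark SQL
  // resolves them per row; numeric literals are rendered as-is.
  def betweenExpr(field: String, lower: Any, upper: Any): String =
    s"`$field` BETWEEN $lower AND $upper"

  betweenExpr("my_col", 10, 20)
  // "`my_col` BETWEEN 10 AND 20"
  betweenExpr("my_col", "other_col", "another_col")
  // "`my_col` BETWEEN other_col AND another_col"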
ValidationBuilder().field("my_col").betweenFields("other_col", "another_col") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` BETWEEN other_col AND another_col")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not between validation") { - val result = ValidationBuilder().col("my_col").notBetween(10, 20) + test("Can create field not between validation") { + val result = ValidationBuilder().field("my_col").between(10, 20, true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` NOT BETWEEN 10 AND 20")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not between other col validation") { - val result = ValidationBuilder().col("my_col").notBetweenCol("other_col", "another_col") + test("Can create field not between other col validation") { + val result = ValidationBuilder().field("my_col").betweenFields("other_col", "another_col", true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` NOT BETWEEN other_col AND another_col")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column in validation") { - val result = ValidationBuilder().col("my_col").in("open", "closed") + test("Can create field in validation") { + val result = ValidationBuilder().field("my_col").in("open", "closed") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("`my_col` IN ('open','closed')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not in validation") { - val result = ValidationBuilder().col("my_col").notIn("open", "closed") + test("Can create field not in validation") { + val result = ValidationBuilder().field("my_col").in(List("open", "closed"), true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("NOT `my_col` IN ('open','closed')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column matches validation") { - val result = ValidationBuilder().col("my_col").matches("ACC[0-9]{8}") + test("Can create field matches validation") { + val result = ValidationBuilder().field("my_col").matches("ACC[0-9]{8}") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("REGEXP(`my_col`, 'ACC[0-9]{8}')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not matches validation") { - val result = ValidationBuilder().col("my_col").notMatches("ACC[0-9]{8}") + test("Can create field not matches validation") { + val result = ValidationBuilder().field("my_col").matches("ACC[0-9]{8}", true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("!REGEXP(`my_col`, 'ACC[0-9]{8}')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column starts with validation") { - val result = ValidationBuilder().col("my_col").startsWith("ACC") + test("Can create field starts with validation") { + val result = ValidationBuilder().field("my_col").startsWith("ACC") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("STARTSWITH(`my_col`, 'ACC')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not starts with validation") { - val result = ValidationBuilder().col("my_col").notStartsWith("ACC") + test("Can create field not starts with validation") { + val result = ValidationBuilder().field("my_col").startsWith("ACC", true) 
assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("!STARTSWITH(`my_col`, 'ACC')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column ends with validation") { - val result = ValidationBuilder().col("my_col").endsWith("ACC") + test("Can create field ends with validation") { + val result = ValidationBuilder().field("my_col").endsWith("ACC") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("ENDSWITH(`my_col`, 'ACC')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not ends with validation") { - val result = ValidationBuilder().col("my_col").notEndsWith("ACC") + test("Can create field not ends with validation") { + val result = ValidationBuilder().field("my_col").endsWith("ACC", true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("!ENDSWITH(`my_col`, 'ACC')")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column size validation") { - val result = ValidationBuilder().col("my_col").size(2) + test("Can create field size validation") { + val result = ValidationBuilder().field("my_col").size(2) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("SIZE(`my_col`) == 2")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column not size validation") { - val result = ValidationBuilder().col("my_col").notSize(5) + test("Can create field not size validation") { + val result = ValidationBuilder().field("my_col").size(5, true) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("SIZE(`my_col`) != 5")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column less than size validation") { - val result = ValidationBuilder().col("my_col").lessThanSize(5) + test("Can create field less than size validation") { + val result = ValidationBuilder().field("my_col").lessThanSize(5) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("SIZE(`my_col`) < 5")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column less than or equal size validation") { - val result = ValidationBuilder().col("my_col").lessThanOrEqualSize(5) + test("Can create field less than or equal size validation") { + val result = ValidationBuilder().field("my_col").lessThanSize(5, false) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("SIZE(`my_col`) <= 5")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column greater than size validation") { - val result = ValidationBuilder().col("my_col").greaterThanSize(5) + test("Can create field greater than size validation") { + val result = ValidationBuilder().field("my_col").greaterThanSize(5) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("SIZE(`my_col`) > 5")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column greater than or equal size validation") { - val result = ValidationBuilder().col("my_col").greaterThanOrEqualSize(5) + test("Can create field greater than or equal size validation") { + val result = ValidationBuilder().field("my_col").greaterThanSize(5, false) assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("SIZE(`my_col`) >= 5")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column greater luhn check validation") { - val result = ValidationBuilder().col("my_col").luhnCheck + test("Can create 
field greater luhn check validation") { + val result = ValidationBuilder().field("my_col").luhnCheck() assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("LUHN_CHECK(`my_col`)")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column type validation") { - val result = ValidationBuilder().col("my_col").hasType("double") + test("Can create field type validation") { + val result = ValidationBuilder().field("my_col").hasType("double") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("TYPEOF(`my_col`) == 'double'")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create column generic expression validation") { - val result = ValidationBuilder().col("my_col").expr("my_col * 2 < other_col / 4") + test("Can create field not type validation") { + val result = ValidationBuilder().field("my_col").hasType("double", true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("TYPEOF(`my_col`) != 'double'")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field types validation") { + val result = ValidationBuilder().field("my_col").hasTypes("double", "string") + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("TYPEOF(`my_col`) IN ('double','string')")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field not types validation") { + val result = ValidationBuilder().field("my_col").hasTypes(List("double", "string"), true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("TYPEOF(`my_col`) NOT IN ('double','string')")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field distinct in set validation") { + val result = ValidationBuilder().field("my_col").distinctInSet("open", "closed") + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COLLECT_SET(`my_col`) AS my_col_distinct"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("FORALL(my_col_distinct, x -> ARRAY_CONTAINS(ARRAY('open','closed'), x))")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field not distinct in set validation") { + val result = ValidationBuilder().field("my_col").distinctInSet(List("open", "closed"), true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COLLECT_SET(`my_col`) AS my_col_distinct"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("!FORALL(my_col_distinct, x -> ARRAY_CONTAINS(ARRAY('open','closed'), x))")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field distinct contains set validation") { + val result = ValidationBuilder().field("my_col").distinctContainsSet("open", "closed") + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COLLECT_SET(`my_col`) AS my_col_distinct"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("FORALL(ARRAY('open','closed'), x -> ARRAY_CONTAINS(my_col_distinct, x))")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field distinct not contains set validation") { + val result = ValidationBuilder().field("my_col").distinctContainsSet(List("open", "closed"), true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COLLECT_SET(`my_col`) AS 
my_col_distinct"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("!FORALL(ARRAY('open','closed'), x -> ARRAY_CONTAINS(my_col_distinct, x))")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field distinct equals validation") { + val result = ValidationBuilder().field("my_col").distinctEqual("open", "closed") + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COLLECT_SET(`my_col`) AS my_col_distinct"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("ARRAY_SIZE(ARRAY_EXCEPT(ARRAY('open','closed'), my_col_distinct)) == 0")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field distinct not equals validation") { + val result = ValidationBuilder().field("my_col").distinctEqual(List("open", "closed"), true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COLLECT_SET(`my_col`) AS my_col_distinct"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("ARRAY_SIZE(ARRAY_EXCEPT(ARRAY('open','closed'), my_col_distinct)) != 0")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field max between validation") { + val result = ValidationBuilder().field("my_col").maxBetween(1, 2) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult(Seq())(validation.groupByFields) + assertResult("`my_col`")(validation.aggField) + assertResult("max")(validation.aggType) + assertResult("max(`my_col`) BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field max not between validation") { + val result = ValidationBuilder().field("my_col").maxBetween(1, 2, true) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult("max(`my_col`) NOT BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field mean between validation") { + val result = ValidationBuilder().field("my_col").meanBetween(1, 2) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult(Seq())(validation.groupByFields) + assertResult("`my_col`")(validation.aggField) + assertResult("avg")(validation.aggType) + assertResult("avg(`my_col`) BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field mean not between validation") { + val result = ValidationBuilder().field("my_col").meanBetween(1, 2, true) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult("avg(`my_col`) NOT BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field median between validation") { + val result = ValidationBuilder().field("my_col").medianBetween(1, 2) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + val validation = result.validation.asInstanceOf[ExpressionValidation] + assertResult(List("PERCENTILE(`my_col`, 0.5) AS my_col_median"))(validation.selectExpr) + assertResult("my_col_median BETWEEN 1 AND 2")(validation.expr) + } + + test("Can create field median not between validation") { + val result = ValidationBuilder().field("my_col").medianBetween(1, 2, true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + val validation = result.validation.asInstanceOf[ExpressionValidation] + 
assertResult(List("PERCENTILE(`my_col`, 0.5) AS my_col_median"))(validation.selectExpr) + assertResult("my_col_median NOT BETWEEN 1 AND 2")(validation.expr) + } + + test("Can create field min between validation") { + val result = ValidationBuilder().field("my_col").minBetween(1, 2) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult(Seq())(validation.groupByFields) + assertResult("`my_col`")(validation.aggField) + assertResult("min")(validation.aggType) + assertResult("min(`my_col`) BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field min not between validation") { + val result = ValidationBuilder().field("my_col").minBetween(1, 2, true) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult("min(`my_col`) NOT BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field stddev between validation") { + val result = ValidationBuilder().field("my_col").stdDevBetween(1, 2) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult(Seq())(validation.groupByFields) + assertResult("`my_col`")(validation.aggField) + assertResult("stddev")(validation.aggType) + assertResult("stddev(`my_col`) BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field stddev not between validation") { + val result = ValidationBuilder().field("my_col").stdDevBetween(1, 2, true) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult("stddev(`my_col`) NOT BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field sum between validation") { + val result = ValidationBuilder().field("my_col").sumBetween(1, 2) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult(Seq())(validation.groupByFields) + assertResult("`my_col`")(validation.aggField) + assertResult("sum")(validation.aggType) + assertResult("sum(`my_col`) BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field sum not between validation") { + val result = ValidationBuilder().field("my_col").sumBetween(1, 2, true) + + assert(result.validation.isInstanceOf[GroupByValidation]) + val validation = result.validation.asInstanceOf[GroupByValidation] + assertResult("sum(`my_col`) NOT BETWEEN 1 AND 2")(validation.aggExpr) + } + + test("Can create field length between validation") { + val result = ValidationBuilder().field("my_col").lengthBetween(1, 2) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("LENGTH(`my_col`) BETWEEN 1 AND 2")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field length not between validation") { + val result = ValidationBuilder().field("my_col").lengthBetween(1, 2, true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("LENGTH(`my_col`) NOT BETWEEN 1 AND 2")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field length equal validation") { + val result = ValidationBuilder().field("my_col").lengthEqual(1) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("LENGTH(`my_col`) == 1")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field length not equal 
validation") { + val result = ValidationBuilder().field("my_col").lengthEqual(1, true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("LENGTH(`my_col`) != 1")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field is decreasing validation") { + val result = ValidationBuilder().field("my_col").isDecreasing() + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("`my_col` < LAG(`my_col`) OVER (ORDER BY MONOTONICALLY_INCREASING_ID()) AS is_my_col_decreasing"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("is_my_col_decreasing")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field is not strictly decreasing validation") { + val result = ValidationBuilder().field("my_col").isDecreasing(false) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("`my_col` <= LAG(`my_col`) OVER (ORDER BY MONOTONICALLY_INCREASING_ID()) AS is_my_col_decreasing"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("is_my_col_decreasing")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field is increasing validation") { + val result = ValidationBuilder().field("my_col").isIncreasing() + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("`my_col` > LAG(`my_col`) OVER (ORDER BY MONOTONICALLY_INCREASING_ID()) AS is_my_col_increasing"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("is_my_col_increasing")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field is not strictly increasing validation") { + val result = ValidationBuilder().field("my_col").isIncreasing(false) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("`my_col` >= LAG(`my_col`) OVER (ORDER BY MONOTONICALLY_INCREASING_ID()) AS is_my_col_increasing"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("is_my_col_increasing")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field is JSON parsable validation") { + val result = ValidationBuilder().field("my_col").isJsonParsable() + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("GET_JSON_OBJECT(`my_col`, '$') IS NOT NULL")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field is not JSON parsable validation") { + val result = ValidationBuilder().field("my_col").isJsonParsable(true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("GET_JSON_OBJECT(`my_col`, '$') IS NULL")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field matches JSON schema validation") { + val result = ValidationBuilder().field("my_col").matchJsonSchema("schema") + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("FROM_JSON(`my_col`, 'schema') IS NOT NULL")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field not matches JSON schema validation") { + val result = ValidationBuilder().field("my_col").matchJsonSchema("schema", true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("FROM_JSON(`my_col`, 'schema') IS NULL")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field matches date time format validation") { + val 
result = ValidationBuilder().field("my_col").matchDateTimeFormat("format") + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("TRY_TO_TIMESTAMP(`my_col`, 'format') IS NOT NULL")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field not matches date time format validation") { + val result = ValidationBuilder().field("my_col").matchDateTimeFormat("format", true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult("TRY_TO_TIMESTAMP(`my_col`, 'format') IS NULL")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field most common value in set validation") { + val result = ValidationBuilder().field("my_col").mostCommonValueInSet(List("open")) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("MODE(`my_col`) AS my_col_mode"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("ARRAY_CONTAINS(ARRAY('open'), my_col_mode)")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field most common value not in set validation") { + val result = ValidationBuilder().field("my_col").mostCommonValueInSet(List("open"), true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("MODE(`my_col`) AS my_col_mode"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("!ARRAY_CONTAINS(ARRAY('open'), my_col_mode)")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field unique values proportion between validation") { + val result = ValidationBuilder().field("my_col").uniqueValuesProportionBetween(0.1, 0.2) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COUNT(DISTINCT `my_col`) / COUNT(1) AS my_col_unique_proportion"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("my_col_unique_proportion BETWEEN 0.1 AND 0.2")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field unique values proportion not between validation") { + val result = ValidationBuilder().field("my_col").uniqueValuesProportionBetween(0.1, 0.2, true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("COUNT(DISTINCT `my_col`) / COUNT(1) AS my_col_unique_proportion"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("my_col_unique_proportion NOT BETWEEN 0.1 AND 0.2")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field quantile values between validation") { + val quantileRanges = Map( + 0.1 -> (1.0, 5.0), + 0.5 -> (10.0, 15.0) + ) + val result = ValidationBuilder().field("my_col").quantileValuesBetween(quantileRanges) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + assertResult(List("PERCENTILE(`my_col`, 0.1) AS my_col_percentile_0", "PERCENTILE(`my_col`, 0.5) AS my_col_percentile_1"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("my_col_percentile_0 BETWEEN 1.0 AND 5.0 AND my_col_percentile_1 BETWEEN 10.0 AND 15.0")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field quantile values not between validation") { + val quantileRanges = Map( + 0.1 -> (1.0, 5.0), + 0.5 -> (10.0, 15.0) + ) + val result = ValidationBuilder().field("my_col").quantileValuesBetween(quantileRanges, true) + + assert(result.validation.isInstanceOf[ExpressionValidation]) + 
assertResult(List("PERCENTILE(`my_col`, 0.1) AS my_col_percentile_0", "PERCENTILE(`my_col`, 0.5) AS my_col_percentile_1"))(result.validation.asInstanceOf[ExpressionValidation].selectExpr) + assertResult("my_col_percentile_0 NOT BETWEEN 1.0 AND 5.0 AND my_col_percentile_1 NOT BETWEEN 10.0 AND 15.0")(result.validation.asInstanceOf[ExpressionValidation].expr) + } + + test("Can create field generic expression validation") { + val result = ValidationBuilder().field("my_col").expr("my_col * 2 < other_col / 4") assert(result.validation.isInstanceOf[ExpressionValidation]) assertResult("my_col * 2 < other_col / 4")(result.validation.asInstanceOf[ExpressionValidation].expr) } - test("Can create group by column validation") { + test("Can create group by field validation") { val result = ValidationBuilder() .description("my_description") .errorThreshold(0.5) @@ -320,8 +660,8 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id", "year"))(validation.groupByCols) - assertResult("amount")(validation.aggCol) + assertResult(Seq("account_id", "year"))(validation.groupByFields) + assertResult("amount")(validation.aggField) assertResult("sum")(validation.aggType) assertResult("sum(amount) < 100")(validation.aggExpr) assert(validation.description.contains("my_description")) @@ -333,13 +673,13 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assert(validation.groupByCols.isEmpty) - assert(validation.aggCol.isEmpty) + assert(validation.groupByFields.isEmpty) + assert(validation.aggField.isEmpty) assertResult("count")(validation.aggType) assertResult("count < 10")(validation.aggExpr) } - test("Can create group by then get count column validation") { + test("Can create group by then get count field validation") { val result = ValidationBuilder() .groupBy("account_id") .count("amount") @@ -347,13 +687,13 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id"))(validation.groupByCols) - assertResult("amount")(validation.aggCol) + assertResult(Seq("account_id"))(validation.groupByFields) + assertResult("amount")(validation.aggField) assertResult("count")(validation.aggType) assertResult("count(amount) < 100")(validation.aggExpr) } - test("Can create group by then get max column validation") { + test("Can create group by then get max field validation") { val result = ValidationBuilder() .groupBy("account_id") .max("amount") @@ -361,13 +701,13 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id"))(validation.groupByCols) - assertResult("amount")(validation.aggCol) + assertResult(Seq("account_id"))(validation.groupByFields) + assertResult("amount")(validation.aggField) assertResult("max")(validation.aggType) assertResult("max(amount) < 100")(validation.aggExpr) } - test("Can create group by then get min column validation") { + test("Can create group by then get min field validation") { val result = ValidationBuilder() .groupBy("account_id") .min("amount") @@ -375,13 +715,13 @@ 
class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id"))(validation.groupByCols) - assertResult("amount")(validation.aggCol) + assertResult(Seq("account_id"))(validation.groupByFields) + assertResult("amount")(validation.aggField) assertResult("min")(validation.aggType) assertResult("min(amount) < 100")(validation.aggExpr) } - test("Can create group by then get average column validation") { + test("Can create group by then get average field validation") { val result = ValidationBuilder() .groupBy("account_id") .avg("amount") @@ -389,13 +729,13 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id"))(validation.groupByCols) - assertResult("amount")(validation.aggCol) + assertResult(Seq("account_id"))(validation.groupByFields) + assertResult("amount")(validation.aggField) assertResult("avg")(validation.aggType) assertResult("avg(amount) < 100")(validation.aggExpr) } - test("Can create group by then get stddev column validation") { + test("Can create group by then get stddev field validation") { val result = ValidationBuilder() .groupBy("account_id") .stddev("amount") @@ -403,32 +743,32 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id"))(validation.groupByCols) - assertResult("amount")(validation.aggCol) + assertResult(Seq("account_id"))(validation.groupByFields) + assertResult("amount")(validation.aggField) assertResult("stddev")(validation.aggType) assertResult("stddev(amount) < 100")(validation.aggExpr) } - test("Can create unique column validation") { + test("Can create unique field validation") { val result = ValidationBuilder().unique("account_id").description("my_description").errorThreshold(0.2) assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id"))(validation.groupByCols) - assertResult("unique")(validation.aggCol) + assertResult(Seq("account_id"))(validation.groupByFields) + assertResult("unique")(validation.aggField) assertResult("count")(validation.aggType) assertResult("count == 1")(validation.aggExpr) assert(validation.description.contains("my_description")) assert(validation.errorThreshold.contains(0.2)) } - test("Can create unique column validation with multiple columns") { + test("Can create unique field validation with multiple fields") { val result = ValidationBuilder().unique("account_id", "year", "name") assert(result.validation.isInstanceOf[GroupByValidation]) val validation = result.validation.asInstanceOf[GroupByValidation] - assertResult(Seq("account_id", "year", "name"))(validation.groupByCols) - assertResult("unique")(validation.aggCol) + assertResult(Seq("account_id", "year", "name"))(validation.groupByFields) + assertResult("unique")(validation.aggField) assertResult("count")(validation.aggType) assertResult("count == 1")(validation.aggExpr) } @@ -437,33 +777,33 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { val upstreamDataSource = ConnectionConfigWithTaskBuilder().file("other_data_source", "json") val result = ValidationBuilder() 
.upstreamData(upstreamDataSource) - .joinColumns("account_id") - .withValidation(ValidationBuilder().col("amount").lessThanOrEqualCol("other_data_source_balance")) + .joinFields("account_id") + .validations(ValidationBuilder().field("amount").lessThanField("other_data_source_balance", false)) assert(result.validation.isInstanceOf[UpstreamDataSourceValidation]) val validation = result.validation.asInstanceOf[UpstreamDataSourceValidation] assertResult("other_data_source")(validation.upstreamDataSource.connectionConfigWithTaskBuilder.dataSourceName) assertResult(DEFAULT_VALIDATION_JOIN_TYPE)(validation.joinType) - assertResult(List("account_id"))(validation.joinColumns) - assert(validation.validation.validation.isInstanceOf[ExpressionValidation]) - assertResult("`amount` <= other_data_source_balance")(validation.validation.validation.asInstanceOf[ExpressionValidation].expr) + assertResult(List("account_id"))(validation.joinFields) + assert(validation.validations.head.validation.isInstanceOf[ExpressionValidation]) + assertResult("`amount` <= other_data_source_balance")(validation.validations.head.validation.asInstanceOf[ExpressionValidation].expr) } test("Can create validation based on data from another data source as an anti-join") { val upstreamDataSource = ConnectionConfigWithTaskBuilder().file("other_data_source", "json") val result = ValidationBuilder() .upstreamData(upstreamDataSource) - .joinColumns("account_id") + .joinFields("account_id") .joinType("anti-join") - .withValidation(ValidationBuilder().count().isEqual(0)) + .validations(ValidationBuilder().count().isEqual(0)) assert(result.validation.isInstanceOf[UpstreamDataSourceValidation]) val validation = result.validation.asInstanceOf[UpstreamDataSourceValidation] assertResult("other_data_source")(validation.upstreamDataSource.connectionConfigWithTaskBuilder.dataSourceName) assertResult("anti-join")(validation.joinType) - assertResult(List("account_id"))(validation.joinColumns) - assert(validation.validation.validation.isInstanceOf[GroupByValidation]) - assertResult("count == 0")(validation.validation.validation.asInstanceOf[GroupByValidation].aggExpr) + assertResult(List("account_id"))(validation.joinFields) + assert(validation.validations.head.validation.isInstanceOf[GroupByValidation]) + assertResult("count == 0")(validation.validations.head.validation.asInstanceOf[GroupByValidation].aggExpr) } test("Can create validation based on data from another data source with expression for join logic") { @@ -471,48 +811,48 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { val result = ValidationBuilder() .upstreamData(upstreamDataSource) .joinExpr("account_id == CONCAT('ACC', other_data_source_account_number)") - .withValidation(ValidationBuilder().count().isEqual(0)) + .validations(ValidationBuilder().count().isEqual(0)) assert(result.validation.isInstanceOf[UpstreamDataSourceValidation]) val validation = result.validation.asInstanceOf[UpstreamDataSourceValidation] assertResult("other_data_source")(validation.upstreamDataSource.connectionConfigWithTaskBuilder.dataSourceName) assertResult(DEFAULT_VALIDATION_JOIN_TYPE)(validation.joinType) - assertResult(List("expr:account_id == CONCAT('ACC', other_data_source_account_number)"))(validation.joinColumns) - assert(validation.validation.validation.isInstanceOf[GroupByValidation]) - assertResult("count == 0")(validation.validation.validation.asInstanceOf[GroupByValidation].aggExpr) + assertResult(List("expr:account_id == CONCAT('ACC', 
other_data_source_account_number)"))(validation.joinFields) + assert(validation.validations.head.validation.isInstanceOf[GroupByValidation]) + assertResult("count == 0")(validation.validations.head.validation.asInstanceOf[GroupByValidation].aggExpr) } - test("Can create column count validation") { - val result = ValidationBuilder().columnNames.countEqual(5) + test("Can create field count validation") { + val result = ValidationBuilder().fieldNames.countEqual(5) - assert(result.validation.isInstanceOf[ColumnNamesValidation]) - assertResult(VALIDATION_COLUMN_NAME_COUNT_EQUAL)(result.validation.asInstanceOf[ColumnNamesValidation].columnNameType) - assertResult(5)(result.validation.asInstanceOf[ColumnNamesValidation].count) + assert(result.validation.isInstanceOf[FieldNamesValidation]) + assertResult(VALIDATION_FIELD_NAME_COUNT_EQUAL)(result.validation.asInstanceOf[FieldNamesValidation].fieldNameType) + assertResult(5)(result.validation.asInstanceOf[FieldNamesValidation].count) } - test("Can create column count between validation") { - val result = ValidationBuilder().columnNames.countBetween(5, 10) + test("Can create field count between validation") { + val result = ValidationBuilder().fieldNames.countBetween(5, 10) - assert(result.validation.isInstanceOf[ColumnNamesValidation]) - assertResult(VALIDATION_COLUMN_NAME_COUNT_BETWEEN)(result.validation.asInstanceOf[ColumnNamesValidation].columnNameType) - assertResult(5)(result.validation.asInstanceOf[ColumnNamesValidation].minCount) - assertResult(10)(result.validation.asInstanceOf[ColumnNamesValidation].maxCount) + assert(result.validation.isInstanceOf[FieldNamesValidation]) + assertResult(VALIDATION_FIELD_NAME_COUNT_BETWEEN)(result.validation.asInstanceOf[FieldNamesValidation].fieldNameType) + assertResult(5)(result.validation.asInstanceOf[FieldNamesValidation].min) + assertResult(10)(result.validation.asInstanceOf[FieldNamesValidation].max) } - test("Can create column names match ordered list of names") { - val result = ValidationBuilder().columnNames.matchOrder("account_id", "year") + test("Can create field names match ordered list of names") { + val result = ValidationBuilder().fieldNames.matchOrder("account_id", "year") - assert(result.validation.isInstanceOf[ColumnNamesValidation]) - assertResult(VALIDATION_COLUMN_NAME_MATCH_ORDER)(result.validation.asInstanceOf[ColumnNamesValidation].columnNameType) - assert(result.validation.asInstanceOf[ColumnNamesValidation].names sameElements Array("account_id", "year")) + assert(result.validation.isInstanceOf[FieldNamesValidation]) + assertResult(VALIDATION_FIELD_NAME_MATCH_ORDER)(result.validation.asInstanceOf[FieldNamesValidation].fieldNameType) + assert(result.validation.asInstanceOf[FieldNamesValidation].names sameElements Array("account_id", "year")) } - test("Can create column names exist in set of names") { - val result = ValidationBuilder().columnNames.matchSet("account_id", "year") + test("Can create field names exist in set of names") { + val result = ValidationBuilder().fieldNames.matchSet("account_id", "year") - assert(result.validation.isInstanceOf[ColumnNamesValidation]) - assertResult(VALIDATION_COLUMN_NAME_MATCH_SET)(result.validation.asInstanceOf[ColumnNamesValidation].columnNameType) - assert(result.validation.asInstanceOf[ColumnNamesValidation].names sameElements Array("account_id", "year")) + assert(result.validation.isInstanceOf[FieldNamesValidation]) + assertResult(VALIDATION_FIELD_NAME_MATCH_SET)(result.validation.asInstanceOf[FieldNamesValidation].fieldNameType) + 
assert(result.validation.asInstanceOf[FieldNamesValidation].names sameElements Array("account_id", "year")) } test("Can create validation pause wait condition") { @@ -584,18 +924,18 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assertResult(List(202))(waitCondition.statusCodes) } - test("Can create column pre-filter condition for validation") { - val result = PreFilterBuilder().filter(ValidationBuilder().col("balance").greaterThan(100)) + test("Can create field pre-filter condition for validation") { + val result = PreFilterBuilder().filter(ValidationBuilder().field("balance").greaterThan(100)) assertResult(1)(result.validationPreFilterBuilders.size) assert(result.validationPreFilterBuilders.head.isLeft) assert(result.validationPreFilterBuilders.head.left.exists(_.validation.isInstanceOf[ExpressionValidation])) } - test("Can create column pre-filter condition for validation with OR condition") { + test("Can create field pre-filter condition for validation with OR condition") { val result = PreFilterBuilder() - .filter(ValidationBuilder().col("balance").greaterThan(100)) - .or(ValidationBuilder().col("amount").greaterThan(10)) + .filter(ValidationBuilder().field("balance").greaterThan(100)) + .or(ValidationBuilder().field("amount").greaterThan(10)) assertResult(3)(result.validationPreFilterBuilders.size) assert(result.validationPreFilterBuilders.head.isLeft) @@ -608,8 +948,8 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { test("Can create pre-filter conditions for validation") { val result = ValidationBuilder() - .preFilter(PreFilterBuilder().filter(ValidationBuilder().col("category").in("utilities"))) - .col("amount").greaterThan(100) + .preFilter(PreFilterBuilder().filter(ValidationBuilder().field("category").in("utilities"))) + .field("amount").greaterThan(100) assert(result.optCombinationPreFilterBuilder.isDefined) assert(result.optCombinationPreFilterBuilder.get.validate()) diff --git a/app/build.gradle.kts b/app/build.gradle.kts index a303dfe3..eed5f590 100644 --- a/app/build.gradle.kts +++ b/app/build.gradle.kts @@ -178,10 +178,10 @@ dependencies { basicImpl("com.slack.api:slack-api-client:1.39.3") // UI/HTTP server - basicImpl("org.apache.pekko:pekko-http_$scalaVersion:1.0.1") - basicImpl("org.apache.pekko:pekko-stream_$scalaVersion:1.0.2") - basicImpl("org.apache.pekko:pekko-actor-typed_$scalaVersion:1.0.2") - basicImpl("org.apache.pekko:pekko-http-spray-json_$scalaVersion:1.0.1") + basicImpl("org.apache.pekko:pekko-http_$scalaVersion:1.0.0") + basicImpl("org.apache.pekko:pekko-stream_$scalaVersion:1.0.1") + basicImpl("org.apache.pekko:pekko-actor-typed_$scalaVersion:1.0.1") + basicImpl("com.github.pjfanning:pekko-http-jackson_$scalaVersion:2.2.0") // needed to work on Windows basicImpl("com.globalmentor:hadoop-bare-naked-local-fs:0.1.0") @@ -202,6 +202,7 @@ dependencies { exclude(group = "org.scala-lang") } basicImpl("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.15.3") + basicImpl("com.fasterxml.jackson.datatype:jackson-datatype-joda:2.15.3") //NoClassDefFoundError: shaded/parquet/com/fasterxml/jackson/databind/ObjectMapper basicImpl("org.apache.parquet:parquet-jackson:1.13.1") //new versions contain transitive deps that use java 21, shadowJar fails basicImpl("org.scala-lang.modules:scala-xml_$scalaVersion:2.2.0") { diff --git a/app/src/main/resources/report/main.css b/app/src/main/resources/report/main.css index e07128fc..2505a7fa 100644 --- a/app/src/main/resources/report/main.css +++ b/app/src/main/resources/report/main.css @@ 
-134,7 +134,7 @@ table, tr, td, th {
 
 .outer-container {
     display: flex;
    flex-direction: column;
     height: 100vh;
 }
 
diff --git a/app/src/main/resources/ui/blah.json b/app/src/main/resources/ui/blah.json
new file mode 100644
index 00000000..9e6cd244
--- /dev/null
+++ b/app/src/main/resources/ui/blah.json
@@ -0,0 +1,150 @@
+{
+  "id": "4dc36df2-92e3-44d8-9965-eecd2050ef67",
+  "plan": {
+    "name": "my-plan",
+    "description": "Data generation plan",
+    "tasks": [
+      {
+        "name": "task-1",
+        "dataSourceName": "account-json",
+        "enabled": true
+      }
+    ],
+    "sinkOptions": {
+      "foreignKeys": []
+    },
+    "validations": ["task-1"]
+  },
+  "tasks": [
+    {
+      "name": "task-1",
+      "type": "json",
+      "count": {
+        "records": 1000,
+        "options": {}
+      },
+      "options": {},
+      "fields": [
+        {
+          "name": "asd",
+          "type": "string",
+          "options": {
+            "isUnique": "true",
+            "minLen": "2"
+          },
+          "nullable": true,
+          "fields": []
+        }
+      ],
+      "enabled": true
+    }
+  ],
+  "validation": [
+    {
+      "name": "task-1",
+      "description": "Validation of data sources after generating data",
+      "dataSources": {
+        "account-json": [
+          {
+            "options": {},
+            "waitCondition": {
+              "pauseInSeconds": 0,
+              "isRetryable": false,
+              "maxRetries": 10,
+              "waitBeforeRetrySeconds": 2
+            },
+            "validations": [
+              {
+                "field": "asd",
+                "validation": [
+                  {
+                    "negate": true,
+                    "type": "null"
+                  }, {
+                    "value": "12",
+                    "strictly": false,
+                    "type": "lessThan"
+                  }
+                ]
+              }, {
+                "groupByFields": ["asd"],
+                "aggField": "asd",
+                "aggType": "sum",
+                "aggExpr": "true",
+                "validation": [
+                  {
+                    "value": "1",
+                    "strictly": false,
+                    "type": "lessThan"
+                  }
+                ]
+              }, {
+                "upstreamDataSource": "",
+                "upstreamTaskName": "abc123",
+                "validation": [
+                  {
+                    "field": "asd",
+                    "validation": [
+                      {
+                        "negate": true,
+                        "type": "null"
+                      }
+                    ]
+                  }
+                ],
+                "upstreamReadOptions": {},
+                "joinFields": ["account_id"],
+                "joinType": "inner"
+              }
+            ]
+          }
+        ]
+      }
+    }
+  ],
+  "configuration": {
+    "flag": {
+      "enableAlerts": "true",
+      "enableUniqueCheck": "false",
+      "enableGeneratePlanAndTasks": "false",
+      "enableSaveReports": "true",
+      "enableDeleteGeneratedRecords": "false",
+      "enableCount": "true",
+      "enableFailOnError": "true",
+      "enableGenerateData": "true",
+      "enableGenerateValidations": "false",
+      "enableRecordTracking": "true",
+      "enableValidation": "true",
+      "enableSinkMetadata": "false"
+    },
+    "folder": {
+      "generatedReportsFolderPath": "/tmp/report",
+      "recordTrackingForValidationFolderPath": "/tmp/record-tracking-validation",
+      "generatedPlanAndTasksFolderPath": "/tmp",
+      "taskFolderPath": "/tmp/task",
+      "recordTrackingFolderPath": "/tmp/record-tracking",
+      "validationFolderPath": "/tmp/validation",
+      "planFilePath": "/tmp/plan/customer-create-plan.yaml"
+    },
+    "metadata": {
+      "oneOfDistinctCountVsCountThreshold": "0.2",
+      "numGeneratedSamples": "10",
+      "numRecordsForAnalysis": "10000",
+      "numRecordsFromDataSource": "10000",
+      "oneOfMinCount": "1000"
+    },
+    "generation": {
+      "numRecordsPerBatch": "100000",
+      "numRecordsPerStep": "-1"
+    },
+    "validation": {
+      "numSampleErrorRecords": "5",
+      "enableDeleteRecordTrackingFiles": "true"
+    },
+    "alert": {
+      "triggerOn": "all",
+      "slackToken": "",
+      "slackChannels": ""
+    }
+  }
+}
\ No newline at end of file
diff --git a/app/src/main/resources/ui/configuration-data.js b/app/src/main/resources/ui/configuration-data.js
index ab874e90..05f7185d 100644
--- a/app/src/main/resources/ui/configuration-data.js
+++ b/app/src/main/resources/ui/configuration-data.js
@@ -262,10 +262,10 @@ dataTypeOptionsMap.set("array", {
 dataTypeOptionsMap.set("struct",
{...defaultDataTypeOptions, addBlock: {type: "field"}}); export const validationTypeDisplayNameMap = new Map(); -validationTypeDisplayNameMap.set("column", "Field"); +validationTypeDisplayNameMap.set("field", "Field"); validationTypeDisplayNameMap.set("groupBy", "Group By/Aggregate"); validationTypeDisplayNameMap.set("upstream", "Upstream"); -validationTypeDisplayNameMap.set("columnNames", "Field Names"); +validationTypeDisplayNameMap.set("fieldNames", "Field Names"); export const validationTypeOptionsMap = new Map(); const defaultValidationOptions = { description: { @@ -282,7 +282,7 @@ const defaultValidationOptions = { help: "Number or percentage (0.0 to 1.0) of errors before marking validation as failed." }, } -validationTypeOptionsMap.set("column", { +validationTypeOptionsMap.set("field", { ...defaultValidationOptions, defaultChildField: {displayName: "Field", default: "", type: "text", required: "", help: "Field to validate."}, equal: { @@ -292,8 +292,14 @@ validationTypeOptionsMap.set("column", { group: {type: "checkbox", innerText: "Not"}, help: "Equal to value. Select 'Not' for not equals." }, - null: {displayName: "Null", default: "", type: "text", disabled: "", help: "Values are null."}, - notNull: {displayName: "Not Null", default: "", type: "text", disabled: "", help: "Values are not null."}, + null: { + displayName: "Null", + default: "", + type: "text", + disabled: "", + group: {type: "checkbox", innerText: "Not"}, + help: "Values are null." + }, contains: { displayName: "Contains", default: "", @@ -386,6 +392,7 @@ validationTypeOptionsMap.set("column", { default: "", type: "text", disabled: "", + group: {type: "checkbox", innerText: "Not"}, help: "Values are valid credit card or identification numbers according to Luhn Algorithm.", required: "" }, @@ -394,6 +401,7 @@ validationTypeOptionsMap.set("column", { default: "string", type: "text", choice: baseDataTypes, + group: {type: "checkbox", innerText: "Not"}, help: "Values are of data type." 
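+        // assumed to mirror "equal" above: ticking "Not" negates the check, i.e. values are not of the given data type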
}, sql: { @@ -406,7 +414,7 @@ validationTypeOptionsMap.set("column", { }); validationTypeOptionsMap.set("groupBy", { ...defaultValidationOptions, - defaultChildGroupByColumns: { + defaultChildGroupByFields: { displayName: "Group By Field(s)", default: "", type: "text", @@ -418,14 +426,14 @@ validationTypeOptionsMap.set("groupBy", { default: "", type: "text", help: "Field name to count number of groups after group by.", - addBlock: {type: "column"} + addBlock: {type: "field"} }, sum: { displayName: "Sum", default: "", type: "text", help: "Field name of values to sum after group by.", - addBlock: {type: "column"}, + addBlock: {type: "field"}, required: "" }, min: { @@ -433,7 +441,7 @@ validationTypeOptionsMap.set("groupBy", { default: "", type: "text", help: "Field name to find minimum value after group by.", - addBlock: {type: "column"}, + addBlock: {type: "field"}, required: "" }, max: { @@ -441,7 +449,7 @@ validationTypeOptionsMap.set("groupBy", { default: "", type: "text", help: "Field name to find maximum value after group by.", - addBlock: {type: "column"}, + addBlock: {type: "field"}, required: "" }, average: { @@ -449,7 +457,7 @@ validationTypeOptionsMap.set("groupBy", { default: "", type: "text", help: "Field name to find average value after group by.", - addBlock: {type: "column"}, + addBlock: {type: "field"}, required: "" }, standardDeviation: { @@ -457,7 +465,7 @@ validationTypeOptionsMap.set("groupBy", { default: "", type: "text", help: "Field name to find standard deviation value after group by.", - addBlock: {type: "column"}, + addBlock: {type: "field"}, required: "" }, }); @@ -472,7 +480,7 @@ validationTypeOptionsMap.set("upstream", { help: "Name of upstream data generation task." }, addBlock: {type: "validation"}, - joinColumns: { + joinFields: { displayName: "Join Field(s)", default: "", type: "text", @@ -494,7 +502,7 @@ validationTypeOptionsMap.set("upstream", { required: "" } }); -validationTypeOptionsMap.set("columnNames", { +validationTypeOptionsMap.set("fieldNames", { ...defaultValidationOptions, countEqual: { displayName: "Count Equal", @@ -883,7 +891,7 @@ dataSourcePropertiesMap.set("csv", { displayName: "Partition By", default: "", type: "text", - help: "Column name(s) to partition by (comma separated).", + help: "Field name(s) to partition by (comma separated).", override: "true" } } @@ -910,7 +918,7 @@ dataSourcePropertiesMap.set("delta", { displayName: "Partition By", default: "", type: "text", - help: "Column name(s) to partition by (comma separated).", + help: "Field name(s) to partition by (comma separated).", override: "true" } } @@ -975,7 +983,7 @@ dataSourcePropertiesMap.set("iceberg", { displayName: "Partition By", default: "", type: "text", - help: "Column name(s) to partition by (comma separated).", + help: "Field name(s) to partition by (comma separated).", override: "true" } } @@ -1002,7 +1010,7 @@ dataSourcePropertiesMap.set("json", { displayName: "Partition By", default: "", type: "text", - help: "Column name(s) to partition by (comma separated).", + help: "Field name(s) to partition by (comma separated).", override: "true" } } @@ -1091,7 +1099,7 @@ dataSourcePropertiesMap.set("orc", { displayName: "Partition By", default: "", type: "text", - help: "Column name(s) to partition by (comma separated).", + help: "Field name(s) to partition by (comma separated).", override: "true" } } @@ -1118,7 +1126,7 @@ dataSourcePropertiesMap.set("parquet", { displayName: "Partition By", default: "", type: "text", - help: "Column name(s) to partition by (comma 
separated).", + help: "Field name(s) to partition by (comma separated).", override: "true" } } diff --git a/app/src/main/resources/ui/helper-configuration.js b/app/src/main/resources/ui/helper-configuration.js index adb92ed7..bb710df3 100644 --- a/app/src/main/resources/ui/helper-configuration.js +++ b/app/src/main/resources/ui/helper-configuration.js @@ -65,7 +65,10 @@ export function getConfiguration() { } else { options.set(option.getAttribute("configuration"), option.value); } - mappedConfiguration.set(baseConfig, options); + + if (baseConfig) { + mappedConfiguration.set(baseConfig, options); + } } }); return mappedConfiguration; diff --git a/app/src/main/resources/ui/helper-foreign-keys.js b/app/src/main/resources/ui/helper-foreign-keys.js index 574fb61d..2b421eb0 100644 --- a/app/src/main/resources/ui/helper-foreign-keys.js +++ b/app/src/main/resources/ui/helper-foreign-keys.js @@ -1,6 +1,6 @@ /* Foreign keys section based off tasks created. -Ability to choose task name and columns. Define custom relationships. +Ability to choose task name and fields. Define custom relationships. - One to one - One to many - Transformations @@ -53,8 +53,8 @@ async function createForeignKeyLinksFromPlan(newForeignKey, foreignKey, linkType foreignKeyLinkSources.insertBefore(newForeignKeyLink, foreignKeyLinkSources.lastChild); let updatedForeignKeyTaskName = $(newForeignKeyLink).find(`select.foreign-key-${linkType}-link`).selectpicker("val", fkLink.taskName); dispatchEvent(updatedForeignKeyTaskName, "change"); - let updatedForeignKeyColumns = $(newForeignKeyLink).find(`input.foreign-key-${linkType}-link`).val(fkLink.columns); - dispatchEvent(updatedForeignKeyColumns, "input"); + let updatedForeignKeyFields = $(newForeignKeyLink).find(`input.foreign-key-${linkType}-link`).val(fkLink.fields); + dispatchEvent(updatedForeignKeyFields, "input"); //also add in other options if (fkLink.options) { for (let [key, value] of Object.entries(fkLink.options)) { @@ -79,8 +79,8 @@ export async function createForeignKeysFromPlan(respJson) { if (foreignKey.source) { let updatedTaskName = $(newForeignKey).find("select.foreign-key-source").selectpicker("val", foreignKey.source.taskName); dispatchEvent(updatedTaskName, "change"); - let updatedColumns = $(newForeignKey).find("input.foreign-key-source").val(foreignKey.source.columns); - dispatchEvent(updatedColumns, "input"); + let updatedFields = $(newForeignKey).find("input.foreign-key-source").val(foreignKey.source.fields); + dispatchEvent(updatedFields, "input"); //also add in other options console.log(foreignKey.source.options); if (foreignKey.source.options) { @@ -104,12 +104,12 @@ export async function createForeignKeysFromPlan(respJson) { } } -function getForeignKeyLinksToArray(foreignKeyContainer, className) { +function getForeignKeyLinksToArray(taskToDataSource, foreignKeyContainer, className) { let mainContainer = $(foreignKeyContainer).find(className); let foreignKeyLinks = $(mainContainer).find(".foreign-key-input-container"); let foreignKeyLinksArray = []; for (let foreignKeyLink of foreignKeyLinks) { - let foreignKeyLinkDetails = getForeignKeyDetail(foreignKeyLink); + let foreignKeyLinkDetails = getForeignKeyDetail(taskToDataSource, foreignKeyLink); if (Object.keys(foreignKeyLinkDetails).length !== 0) { foreignKeyLinksArray.push(foreignKeyLinkDetails); } @@ -117,14 +117,14 @@ function getForeignKeyLinksToArray(foreignKeyContainer, className) { return foreignKeyLinksArray; } -export function getForeignKeys() { +export function getForeignKeys(taskToDataSource) { 
let foreignKeyContainers = Array.from(document.querySelectorAll(".foreign-key-container").values()); return foreignKeyContainers.map(fkContainer => { let fkSource = $(fkContainer).find(".foreign-key-main-source"); - let fkSourceDetails = getForeignKeyDetail(fkSource[0]); - let fkGenerationLinkArray = getForeignKeyLinksToArray(fkContainer, ".foreign-key-generation-link-sources"); - let fkDeleteLinkArray = getForeignKeyLinksToArray(fkContainer, ".foreign-key-delete-link-sources"); - return {source: fkSourceDetails, generationLinks: fkGenerationLinkArray, deleteLinks: fkDeleteLinkArray}; + let fkSourceDetails = getForeignKeyDetail(taskToDataSource, fkSource[0]); + let fkGenerationLinkArray = getForeignKeyLinksToArray(taskToDataSource, fkContainer, ".foreign-key-generation-link-sources"); + let fkDeleteLinkArray = getForeignKeyLinksToArray(taskToDataSource, fkContainer, ".foreign-key-delete-link-sources"); + return {source: fkSourceDetails, generate: fkGenerationLinkArray, delete: fkDeleteLinkArray}; }); } @@ -201,18 +201,18 @@ async function createForeignKeyInput(index, name) { foreignKeyContainer.setAttribute("class", "foreign-key-input-container m-1"); let foreignKey = document.createElement("div"); foreignKey.setAttribute("class", `row m-1 align-items-center ${name}-source`); - // input is task name -> column(s) + // input is task name -> field(s) let taskNameSelect = createSelect(`${name}-${index}`, "Task", `selectpicker form-control input-field ${name}`, "Select a task..."); let taskNameCol = document.createElement("div"); taskNameCol.setAttribute("class", "col"); taskNameCol.append(taskNameSelect); - let columnNamesInput = createInput(`${name}-column-${index}`, "Columns", `form-control input-field is-invalid ${name}`, "text", ""); - columnNamesInput.setAttribute("required", ""); - createFieldValidationCheck(columnNamesInput); - let columnNameFloating = createFormFloating("Column(s)", columnNamesInput); + let fieldNamesInput = createInput(`${name}-field-${index}`, "Fields", `form-control input-field is-invalid ${name}`, "text", ""); + fieldNamesInput.setAttribute("required", ""); + createFieldValidationCheck(fieldNamesInput); + let fieldNameFloating = createFormFloating("Field(s)", fieldNamesInput); - foreignKey.append(taskNameCol, columnNameFloating); + foreignKey.append(taskNameCol, fieldNameFloating); //when task name is selected, offer input to define sub data source if not defined //(i.e. schema and table for Postgres task with no schema and table defined, only offer table if schema is defined in data source) //for a http data source, endpoint is not part of the data source @@ -260,10 +260,14 @@ async function createForeignKeyInput(index, name) { return foreignKeyContainer; } -function getForeignKeyDetail(element) { +function getForeignKeyDetail(taskToDataSource, element) { let taskName = $(element).find("select[aria-label=Task]").val(); - let columns = $(element).find("input[aria-label=Columns]").val(); - let baseForeignKey = {taskName: taskName, columns: columns}; + let fields = $(element).find("input[aria-label=Fields]").val(); + + let fieldsArray = fields.includes(",") ? 
fields.split(",") : Array(fields); + let dataSource = taskToDataSource[taskName]; + + let baseForeignKey = {dataSource: dataSource,step: taskName, fields: fieldsArray}; let overrideConnectionOptions = getOverrideConnectionOptionsAsMap(element); if (Object.keys(overrideConnectionOptions).length > 0) { baseForeignKey["options"] = overrideConnectionOptions; diff --git a/app/src/main/resources/ui/helper-generation.js b/app/src/main/resources/ui/helper-generation.js index 46506927..87cd6628 100644 --- a/app/src/main/resources/ui/helper-generation.js +++ b/app/src/main/resources/ui/helper-generation.js @@ -19,7 +19,7 @@ export function incFields() { async function createGenerationFields(dataSourceFields, manualSchema) { let allCollapsedAccordionButton = $(document).find(".accordion-button.collapsed"); allCollapsedAccordionButton.click(); - for (const field of dataSourceFields.optFields) { + for (const field of dataSourceFields) { numFields += 1; let newField = await createNewField(numFields, "generation"); $(manualSchema).find(".accordion").first().append(newField); @@ -38,11 +38,11 @@ async function createGenerationFields(dataSourceFields, manualSchema) { } } // there are nested fields - if (field.nested && field.nested.optFields) { + if (field.fields && field.fields.length > 0) { let newFieldBox = $(newField).find(".card"); // let newFieldBox = createManualContainer(numFields, "generation", "struct-schema"); // $(newField).find(".accordion-body").append(newFieldBox); - await createGenerationFields(field.nested, newFieldBox); + await createGenerationFields(field.fields, newFieldBox); } } let collapseShow = $(document).find(".accordion-button.collapse.show"); @@ -52,9 +52,9 @@ async function createGenerationFields(dataSourceFields, manualSchema) { export async function createGenerationElements(dataSource, newDataSource, numDataSources) { let dataSourceGenContainer = $(newDataSource).find("#data-source-generation-config-container"); - // check if there is auto schema defined + // TODO check if there is auto schema defined // check if there is auto schema from metadata source defined - if (dataSource.fields && dataSource.fields.optMetadataSource) { + if (dataSource.options["metadataSourceName"]) { $(dataSourceGenContainer).find("[id^=auto-from-metadata-source-generation-checkbox]").prop("checked", true); let autoFromMetadataSchema = await createAutoFromMetadataSourceContainer(numDataSources); $(dataSourceGenContainer).find(".manual").after(autoFromMetadataSchema); @@ -62,7 +62,7 @@ export async function createGenerationElements(dataSource, newDataSource, numDat await createAutoFromMetadata(autoFromMetadataSchema, dataSource); } // check if there is manual schema defined - if (dataSource.fields && dataSource.fields.optFields && dataSource.fields.optFields.length > 0) { + if (dataSource.fields && dataSource.fields.length > 0) { let manualSchema = createManualContainer(numFields, "generation"); dataSourceGenContainer[0].insertBefore(manualSchema, dataSourceGenContainer[0].lastElementChild); $(dataSourceGenContainer).find("[id^=manual-generation-checkbox]").prop("checked", true); @@ -134,4 +134,36 @@ export function getGeneration(dataSource, currentDataSource) { dataGenerationInfo["optFields"] = Object.values(dataFieldsWithAttributes); } currentDataSource["fields"] = dataGenerationInfo; +} + +export function getGenerationYaml(dataSource, currentTask) { + // check which checkboxes are enabled: auto, auto with external, manual + let isAutoChecked = 
$(dataSource).find("[id^=auto-generation-checkbox]").is(":checked"); + let isAutoFromMetadataChecked = $(dataSource).find("[id^=auto-from-metadata-source-generation-checkbox]").is(":checked"); + let isManualChecked = $(dataSource).find("[id^=manual-generation-checkbox]").is(":checked"); + currentTask["options"] = {}; + + if (isAutoChecked) { + // need to enable data generation within data source options + currentTask["options"]["enableDataGeneration"] = "true"; + } + + if (isAutoFromMetadataChecked) { + let dataSourceGenerationContainer = $(dataSource).find("[id^=data-source-generation-config-container]")[0]; + let dataSourceAutoSchemaContainer = $(dataSourceGenerationContainer).find("[class~=data-source-auto-from-metadata-container]")[0]; + let metadataConnectionName = $(dataSourceAutoSchemaContainer).find("select[class~=metadata-connection-name]").val(); + $(dataSourceAutoSchemaContainer).find("input[class~=metadata-source-property]").toArray() + .forEach(opt => { + if (opt.value !== "") { + currentTask["options"][opt.getAttribute("aria-label")] = opt.value; + } + }); + currentTask["options"]["metadataSourceName"] = metadataConnectionName; + } + // get top level manual fields + if (isManualChecked) { + let dataSourceSchemaContainer = $(dataSource).find("[id^=data-source-schema-container]")[0]; + let dataFieldsWithAttributes = getGenerationSchema(dataSourceSchemaContainer); + currentTask["fields"] = Object.values(dataFieldsWithAttributes); + } } \ No newline at end of file diff --git a/app/src/main/resources/ui/helper-record-count.js b/app/src/main/resources/ui/helper-record-count.js index f300935a..b46a05a7 100644 --- a/app/src/main/resources/ui/helper-record-count.js +++ b/app/src/main/resources/ui/helper-record-count.js @@ -17,9 +17,9 @@ export function createRecordCount(index) { recordCountHeader.innerText = "Record count"; let recordCountRow = document.createElement("div"); recordCountRow.setAttribute("class", "record-count-row"); - // have 3 columns + // have 3 fields // - total -> number or random between min max - // - per column -> number or random between min max + // - per field -> number or random between min max // - estimated number of record let estimatedRecordCountContainer = document.createElement("div"); estimatedRecordCountContainer.setAttribute("class", "col"); @@ -27,15 +27,15 @@ export function createRecordCount(index) { estimatedRecordCount.innerHTML = "Estimate number of records: 1000"; estimatedRecordCountContainer.append(estimatedRecordCount); let baseRecordRadio = createBaseRecordCountContainer(index); - let perColumnContainer = createPerColumnCountContainer(index, estimatedRecordCountContainer); + let perFieldContainer = createPerFieldCountContainer(index, estimatedRecordCountContainer); let advancedButton = createButton("record-count-advanced-" + index, "Advanced", "btn btn-secondary m-1", "Advanced"); advancedButton.setAttribute("data-bs-toggle", "collapse"); - advancedButton.setAttribute("data-bs-target", "#" + perColumnContainer.getAttribute("id")); + advancedButton.setAttribute("data-bs-target", "#" + perFieldContainer.getAttribute("id")); advancedButton.setAttribute("aria-expanded", "false"); - advancedButton.setAttribute("aria-controls", perColumnContainer.getAttribute("id")); + advancedButton.setAttribute("aria-controls", perFieldContainer.getAttribute("id")); - recordCountRow.append(baseRecordRadio, advancedButton, perColumnContainer); - 
$(recordCountRow).find("input[type=radio].base-record-count-radio,input[type=radio].per-column-record-count-radio").change(function () { + recordCountRow.append(baseRecordRadio, advancedButton, perFieldContainer); + $(recordCountRow).find("input[type=radio].base-record-count-radio,input[type=radio].per-field-record-count-radio").change(function () { let newEstimate = estimateRecordCount(recordCountRow)["estimateRecords"]; estimatedRecordCount.innerHTML = "Estimate number of records: " + newEstimate + ""; }); @@ -56,21 +56,21 @@ export function createCountElementsFromPlan(dataSource, newDataSource) { $(newDataSource).find("[id^=base-record-count]").val(dsCount.records); } - if (dsCount.perColumnNames) { - $(newDataSource).find("[id^=per-column-names]").val(dsCount.perColumnNames.join(",")); - if (dsCount.perColumnRecordsMin && dsCount.perColumnRecordsMax) { + if (dsCount.perFieldNames) { + $(newDataSource).find("[id^=per-field-names]").val(dsCount.perFieldNames.join(",")); + if (dsCount.perFieldRecordsMin && dsCount.perFieldRecordsMax) { $(newDataSource).find(".per-unique-set-of-values-between").prop("checked", true); - $(newDataSource).find("[id^=per-column-min-record-count]").val(dsCount.perColumnRecordsMin); - $(newDataSource).find("[id^=per-column-max-record-count]").val(dsCount.perColumnRecordsMax); + $(newDataSource).find("[id^=per-field-min-record-count]").val(dsCount.perFieldRecordsMin); + $(newDataSource).find("[id^=per-field-max-record-count]").val(dsCount.perFieldRecordsMax); } else { $(newDataSource).find(".per-unique-set-of-values").prop("checked", true); - $(newDataSource).find("[id^=per-column-record-count]").val(dsCount.perColumnRecords); + $(newDataSource).find("[id^=per-field-record-count]").val(dsCount.perFieldRecords); } - $(newDataSource).find("[id^=per-column-distribution-select]").selectpicker("val", dsCount.perColumnRecordsDistribution); - let updatedPerColumnDistribution = $(newDataSource).find("[id^=per-column-distribution-select]"); - dispatchEvent(updatedPerColumnDistribution, "change"); - if (dsCount.perColumnRecordsDistribution === "exponential") { - $(newDataSource).find("[id^=per-column-distribution-rate]").val(dsCount.perColumnRecordsDistributionRateParam); + $(newDataSource).find("[id^=per-field-distribution-select]").selectpicker("val", dsCount.perFieldRecordsDistribution); + let updatedPerFieldDistribution = $(newDataSource).find("[id^=per-field-distribution-select]"); + dispatchEvent(updatedPerFieldDistribution, "change"); + if (dsCount.perFieldRecordsDistribution === "exponential") { + $(newDataSource).find("[id^=per-field-distribution-rate]").val(dsCount.perFieldRecordsDistributionRateParam); } } } @@ -83,73 +83,73 @@ export function getRecordCount(dataSource, currentDataSource) { currentDataSource["count"] = recordCountSummary; } -function createPerColumnCountContainer(index, estimatedRecordCountContainer) { - let perColumnRecordCol = createRecordCountInput(index, "per-column-record-count", "Records", "2"); - let perColumnMinCol = createRecordCountInput(index, "per-column-min-record-count", "Min", "1"); - let perColumnMaxCol = createRecordCountInput(index, "per-column-max-record-count", "Max", "2"); - let perColumnBetweenContainer = document.createElement("div"); - perColumnBetweenContainer.setAttribute("class", "row g-1"); - perColumnBetweenContainer.append(perColumnMinCol, perColumnMaxCol); - let perColumnOptions = [{text: "None"}, { +function createPerFieldCountContainer(index, estimatedRecordCountContainer) { + let perFieldRecordCol = 
createRecordCountInput(index, "per-field-record-count", "Records", "2"); + let perFieldMinCol = createRecordCountInput(index, "per-field-min-record-count", "Min", "1"); + let perFieldMaxCol = createRecordCountInput(index, "per-field-max-record-count", "Max", "2"); + let perFieldBetweenContainer = document.createElement("div"); + perFieldBetweenContainer.setAttribute("class", "row g-1"); + perFieldBetweenContainer.append(perFieldMinCol, perFieldMaxCol); + let perFieldOptions = [{text: "None"}, { text: "Per unique set of values", - child: perColumnRecordCol - }, {text: "Per unique set of values between", child: perColumnBetweenContainer}]; - let perColumnRadio = createRadioButtons(index, "per-column-record-count-radio", perColumnOptions, "col-6"); - // above per column radio is choice of columns - let perColumnText = createInput(`per-column-names-${index}`, "Column(s)", "form-control input-field record-count-field", "text", ""); - let perColumnFormFloating = createFormFloating("Column(s)", perColumnText); - // per column distribution alongside radio buttons - let perColumnDistributionSelect = createSelect(`per-column-distribution-select-${index}`, "Distribution", "selectpicker form-control input-field record-count-distribution-field col", "Select data distribution..."); + child: perFieldRecordCol + }, {text: "Per unique set of values between", child: perFieldBetweenContainer}]; + let perFieldRadio = createRadioButtons(index, "per-field-record-count-radio", perFieldOptions, "col-6"); + // above per field radio is choice of fields + let perFieldText = createInput(`per-field-names-${index}`, "Field(s)", "form-control input-field record-count-field", "text", ""); + let perFieldFormFloating = createFormFloating("Field(s)", perFieldText); + // per field distribution alongside radio buttons + let perFieldDistributionSelect = createSelect(`per-field-distribution-select-${index}`, "Distribution", "selectpicker form-control input-field record-count-distribution-field col", "Select data distribution..."); ["Uniform", "Exponential", "Normal"].forEach(dist => { let option = document.createElement("option"); option.setAttribute("value", dist.toLowerCase()); option.innerText = dist; - perColumnDistributionSelect.append(option); + perFieldDistributionSelect.append(option); }); - let perColumnOptionsRow = document.createElement("div"); - perColumnOptionsRow.setAttribute("class", "row g-3 m-1 align-items-center"); - perColumnOptionsRow.append(perColumnRadio, perColumnDistributionSelect); - $(perColumnDistributionSelect).selectpicker("val", "uniform"); + let perFieldOptionsRow = document.createElement("div"); + perFieldOptionsRow.setAttribute("class", "row g-3 m-1 align-items-center"); + perFieldOptionsRow.append(perFieldRadio, perFieldDistributionSelect); + $(perFieldDistributionSelect).selectpicker("val", "uniform"); - let perColumnDistributionRateParam = createInput(`per-column-distribution-rate-param-${index}`, "Rate Parameter", "form-control input-field record-count-distribution-field", "number", "1.0"); - perColumnDistributionRateParam.setAttribute("min", "0"); - perColumnDistributionRateParam.setAttribute("step", "0.00000001"); - let formFloatingRate = createFormFloating("Rate Parameter", perColumnDistributionRateParam); - perColumnDistributionSelect.addEventListener("change", (event) => { + let perFieldDistributionRateParam = createInput(`per-field-distribution-rate-param-${index}`, "Rate Parameter", "form-control input-field record-count-distribution-field", "number", "1.0"); + 
perFieldDistributionRateParam.setAttribute("min", "0");
+    perFieldDistributionRateParam.setAttribute("step", "0.00000001");
+    let formFloatingRate = createFormFloating("Rate Parameter", perFieldDistributionRateParam);
+    perFieldDistributionSelect.addEventListener("change", (event) => {
         if (event.target.value === "exponential") {
             // add extra input for rate parameter
-            perColumnOptionsRow.append(formFloatingRate);
-        } else if (perColumnOptionsRow.contains(formFloatingRate)) {
+            perFieldOptionsRow.append(formFloatingRate);
+        } else if (perFieldOptionsRow.contains(formFloatingRate)) {
             // check if rate parameter exists, if it does, remove it
-            perColumnOptionsRow.removeChild(formFloatingRate);
+            perFieldOptionsRow.removeChild(formFloatingRate);
         }
     });
 
-    let columnInputRow = document.createElement("div");
-    columnInputRow.setAttribute("class", "row g-3 m-1 align-items-center");
-    let columnInputHelpDiv = createFormText(perColumnFormFloating.getAttribute("id"), "Choose which column(s) to use for creating multiple records each unique group of values.", "span");
-    columnInputHelpDiv.setAttribute("class", "col-6");
-    columnInputRow.append(perColumnFormFloating, columnInputHelpDiv);
-
-    let perColumnInnerContainer = document.createElement("div");
-    perColumnInnerContainer.setAttribute("class", "card card-body");
-    if (index === 1 || perColumnInnerContainer.childElementCount === 0) { //TODO should only put if first task in UI
-        let perColumnExampleButton = createButton("per-column-example-button", "per-column-example", "btn btn-info", "Example");
-        perColumnExampleButton.setAttribute("data-bs-toggle", "modal");
-        perColumnExampleButton.setAttribute("data-bs-target", "#perColumnExampleModal");
-        let perColumnHelpText = document.createElement("div");
-        perColumnHelpText.innerHTML = "Generate multiple records per set of unique column value(s). " + perColumnExampleButton.outerHTML;
-        perColumnInnerContainer.append(perColumnHelpText);
+    let fieldInputRow = document.createElement("div");
+    fieldInputRow.setAttribute("class", "row g-3 m-1 align-items-center");
+    let fieldInputHelpDiv = createFormText(perFieldFormFloating.getAttribute("id"), "Choose which field(s) to use for creating multiple records for each unique group of values.", "span");
+    fieldInputHelpDiv.setAttribute("class", "col-6");
+    fieldInputRow.append(perFieldFormFloating, fieldInputHelpDiv);
+
+    let perFieldInnerContainer = document.createElement("div");
+    perFieldInnerContainer.setAttribute("class", "card card-body");
+    if (index === 1 || perFieldInnerContainer.childElementCount === 0) { //TODO should only put if first task in UI
+        let perFieldExampleButton = createButton("per-field-example-button", "per-field-example", "btn btn-info", "Example");
+        perFieldExampleButton.setAttribute("data-bs-toggle", "modal");
+        perFieldExampleButton.setAttribute("data-bs-target", "#perFieldExampleModal");
+        let perFieldHelpText = document.createElement("div");
+        perFieldHelpText.innerHTML = "Generate multiple records per set of unique field value(s).
" + perFieldExampleButton.outerHTML; + perFieldInnerContainer.append(perFieldHelpText); } - // TODO when perColumnText is empty, disable checkbox for per column - let perColumnContainer = document.createElement("div"); - perColumnContainer.setAttribute("id", "count-advanced-collapse-" + index); - perColumnContainer.setAttribute("class", "collapse"); - perColumnInnerContainer.append(columnInputRow, perColumnOptionsRow, estimatedRecordCountContainer); - perColumnContainer.append(perColumnInnerContainer); - return perColumnContainer; + // TODO when perFieldText is empty, disable checkbox for per field + let perFieldContainer = document.createElement("div"); + perFieldContainer.setAttribute("id", "count-advanced-collapse-" + index); + perFieldContainer.setAttribute("class", "collapse"); + perFieldInnerContainer.append(fieldInputRow, perFieldOptionsRow, estimatedRecordCountContainer); + perFieldContainer.append(perFieldInnerContainer); + return perFieldContainer; } function createBaseRecordCountContainer(index) { @@ -181,34 +181,37 @@ function estimateRecordCount(recordCountRow) { recordCountSummary["records"] = baseRecordCount; } - let perColumnCheck = $(recordCountRow).find("input.per-column-record-count-radio:checked").parent().find(".record-count-field"); - let perColumnCount; - if (perColumnCheck.length > 1) { - let minPerCol = Number($(perColumnCheck).filter("input[aria-label=Min]").val()); - let maxPerCol = Number($(perColumnCheck).filter("input[aria-label=Max]").val()); - perColumnCount = (maxPerCol + minPerCol) / 2; - recordCountSummary["perColumnRecordsMin"] = minPerCol; - recordCountSummary["perColumnRecordsMax"] = maxPerCol; - } else if (perColumnCheck.length === 1) { - perColumnCount = Number(perColumnCheck.val()); - recordCountSummary["perColumnRecords"] = perColumnCount; + let perFieldCheck = $(recordCountRow).find("input.per-field-record-count-radio:checked").parent().find(".record-count-field"); + let perFieldCount; + if (perFieldCheck.length > 1) { + let minPerCol = Number($(perFieldCheck).filter("input[aria-label=Min]").val()); + let maxPerCol = Number($(perFieldCheck).filter("input[aria-label=Max]").val()); + perFieldCount = (maxPerCol + minPerCol) / 2; + recordCountSummary["perFieldRecordsMin"] = minPerCol; + recordCountSummary["perFieldRecordsMax"] = maxPerCol; + } else if (perFieldCheck.length === 1) { + perFieldCount = Number(perFieldCheck.val()); + recordCountSummary["perFieldRecords"] = perFieldCount; } else { - perColumnCount = 1; + perFieldCount = 1; } - if (perColumnCheck.length >= 1) { - let perColumNames = $(recordCountRow).find("[id^=per-column-names]").val(); - recordCountSummary["perColumnNames"] = perColumNames ? perColumNames.split(",") : []; + if (perFieldCheck.length >= 1) { + let perColumNames = $(recordCountRow).find("[id^=per-field-names]").val(); + recordCountSummary["perFieldNames"] = perColumNames ? 
perColumNames.split(",") : []; } - recordCountSummary["perColumnRecordsDistribution"] = $(recordCountRow).find("[id^=per-column-distribution-select]").val(); - recordCountSummary["perColumnRecordsDistributionRateParam"] = $(recordCountRow).find("[id^=per-column-distribution-rate-param]").val(); + let perFieldDist = $(recordCountRow).find("[id^=per-field-distribution-select]").val(); + if (perFieldDist !== "uniform") { + recordCountSummary["perFieldRecordsDistribution"] = perFieldDist; + } + recordCountSummary["perFieldRecordsDistributionRateParam"] = $(recordCountRow).find("[id^=per-field-distribution-rate-param]").val(); - recordCountSummary["estimateRecords"] = baseRecordCount * perColumnCount; + recordCountSummary["estimateRecords"] = baseRecordCount * perFieldCount; return recordCountSummary; } function createRecordCountInput(index, name, label, value) { let recordCountInput = createInput(`${name}-${index}`, label, "form-control input-field record-count-field", "number", value); - let radioGroup = name.startsWith("per-column") ? `per-column-count-${index}` : `base-record-count-${index}`; + let radioGroup = name.startsWith("per-field") ? `per-field-count-${index}` : `base-record-count-${index}`; recordCountInput.setAttribute("radioGroup", radioGroup); recordCountInput.setAttribute("min", "0"); return createFormFloating(label, recordCountInput); diff --git a/app/src/main/resources/ui/helper-validation.js b/app/src/main/resources/ui/helper-validation.js index ebf3e045..462f52b1 100644 --- a/app/src/main/resources/ui/helper-validation.js +++ b/app/src/main/resources/ui/helper-validation.js @@ -1,7 +1,7 @@ /* Different types of validation: -- Basic column -- Dataset (column names, row count) +- Basic field +- Dataset (field names, row count) - Group by/aggregate - Upstream - External source (great expectations) @@ -28,91 +28,71 @@ export function incValidations() { } function createGroupByValidationFromPlan(newValidation, validationOpts, validation) { - let updatedGroupByCols = $(newValidation).find("[aria-label=GroupByColumns]").val(validationOpts.groupByColumns) + let updatedGroupByCols = $(newValidation).find("[aria-label=GroupByFields]").val(validation.groupByFields) dispatchEvent(updatedGroupByCols, "input"); // can be nested validations - if (validation.nested && validation.nested.validations) { - for (let nestedValidation of validation.nested.validations) { - numValidations += 1; - let dataValidationContainer = $(newValidation).find("[id^=data-validation-container]")[0]; - let metadata = Object.create(validationTypeOptionsMap.get("groupBy")[nestedValidation.options["aggType"]]); - metadata["default"] = nestedValidation.options["aggCol"]; - addNewDataTypeAttribute(nestedValidation.options["aggType"], metadata, `groupBy-validation-${numValidations}`, "data-validation-field", dataValidationContainer); - let aggregationRow = $(dataValidationContainer).find(".data-source-validation-container-nested-validation").last().find(".row").first(); + let dataValidationContainer = $(newValidation).find("[id^=data-validation-container]")[0]; + let metadata = Object.create(validationTypeOptionsMap.get("groupBy")[validation.aggType]); + metadata["default"] = validation.aggField; + addNewDataTypeAttribute(validation.aggType, metadata, `groupBy-validation-${numValidations}`, "data-validation-field", dataValidationContainer); + let aggregationRow = $(dataValidationContainer).find(".data-source-validation-container-nested-validation").last().find(".row").first(); - if (nestedValidation.options) { - for 
(let [optKey, optVal] of Object.entries(nestedValidation.options)) { - if (optKey !== "aggType" && optKey !== "aggCol") { - createNewValidateAttribute(optKey, "column", optVal, aggregationRow); - } - } - } - } - } + addFieldValidations(validation, aggregationRow); } -function createNewValidateAttribute(optKey, validationType, optVal, mainContainer) { +function createNewValidateAttribute(optKey, validationType, optVal, checked, mainContainer) { numValidations += 1; - let baseKey = optKey; - if (optKey.startsWith("not")) { - baseKey = optKey.charAt(3).toLowerCase() + optKey.slice(4); - } else if (optKey.startsWith("equalOr")) { - baseKey = optKey.charAt(7).toLowerCase() + optKey.slice(8); - } - - // if it is 'notEqual' or `equalOrLessThan`, need to ensure checkbox is checked - if (optKey === "notNull") { - let baseOptions = Object.create(validationTypeOptionsMap.get(validationType)[optKey]); - addNewDataTypeAttribute(optKey, baseOptions, `data-validation-container-${numValidations}-${optKey}`, "data-validation-field", mainContainer); - } else { - let baseOptions = Object.create(validationTypeOptionsMap.get(validationType)[baseKey]); - if (baseKey !== optKey) baseOptions.group.checked = "true"; - baseOptions["default"] = optVal; - addNewDataTypeAttribute(baseKey, baseOptions, `data-validation-container-${numValidations}-${optKey}`, "data-validation-field", mainContainer); - } + let baseOptions = Object.create(validationTypeOptionsMap.get(validationType)[optKey]); + if (checked) baseOptions.group.checked = "true"; + if (optVal) baseOptions["default"] = optVal; + addNewDataTypeAttribute(optKey, baseOptions, `data-validation-container-${numValidations}-${optKey}`, "data-validation-field", mainContainer); document.getElementById(`data-validation-container-${numValidations}-${optKey}`).dispatchEvent(new Event("input")); } -async function createValidationsFromDataSource(dataSource, manualValidation) { - for (const validation of dataSource.optValidations) { +async function createValidationsFromDataSource(validations, validationOpts, manualValidation) { + for (const validation of validations) { numValidations += 1; let newValidation = await createNewField(numValidations, "validation"); $(manualValidation).children(".accordion").append(newValidation); - let updatedValidationType = $(newValidation).find("select[class~=validation-type]").selectpicker("val", validation.type); - dispatchEvent(updatedValidationType, "change"); - let validationOpts = validation.options; let mainContainer = $(newValidation).find("[id^=data-validation-container]")[0]; - if (validation.type === "column" && validationOpts.field) { - let updatedValidationCol = $(newValidation).find("[aria-label=Field]").val(validationOpts.field); + if (validation.field && validation.validation) { + let updatedValidationType = $(newValidation).find("select[class~=validation-type]").selectpicker("val", "field"); + dispatchEvent(updatedValidationType, "change"); + let updatedValidationCol = $(newValidation).find("[aria-label=Field]").val(validation.field); dispatchEvent(updatedValidationCol, "input"); - } else if (validation.type === "groupBy" && validationOpts.groupByColumns) { + addFieldValidations(validation, mainContainer); + } else if (validation.groupByFields && validation.validation) { + let updatedValidationType = $(newValidation).find("select[class~=validation-type]").selectpicker("val", "groupBy"); + dispatchEvent(updatedValidationType, "change"); createGroupByValidationFromPlan(newValidation, validationOpts, validation); - } else if 
-        } else if (validation.type === "upstream" && validationOpts.upstreamTaskName) {
-            let updatedUpstreamTaskName = $(newValidation).find("[aria-label=UpstreamTaskName]").val(validationOpts.upstreamTaskName);
+        } else if (validation.upstreamTaskName && validation.validation) {
+            let updatedValidationType = $(newValidation).find("select[class~=validation-type]").selectpicker("val", "upstream");
+            dispatchEvent(updatedValidationType, "change");
+            let updatedUpstreamTaskName = $(newValidation).find("[aria-label=UpstreamTaskName]").val(validation.upstreamTaskName);
             dispatchEvent(updatedUpstreamTaskName, "input");
-            // can be nested validations
-
-            if (validation.nested && validation.nested.validations) {
-                let nestedManualValidation = $(newValidation).find(".data-source-validation-container-nested-validation").first();
-                await createValidationsFromDataSource(validation.nested, nestedManualValidation);
+            // update joinFields, joinType or joinExpr
+            createNewValidateAttribute("joinFields", "upstream", validation.joinFields, false, mainContainer);
+            createNewValidateAttribute("joinType", "upstream", validation.joinType, false, mainContainer);
+            if (validation.joinExpr) {
+                createNewValidateAttribute("joinExpr", "upstream", validation.joinExpr, false, mainContainer);
             }
-        }
-        //otherwise it is column name validation which doesn't have any default options
-        for (const [optKey, optVal] of Object.entries(validationOpts)) {
-            if (optKey !== "groupByColumns" && optKey !== "column" && optKey !== "field" && optKey !== "upstreamTaskName") {
-                createNewValidateAttribute(optKey, validation.type, optVal, mainContainer);
+            if (validation.validation && validation.validation.length > 0) {
+                let nestedManualValidation = $(newValidation).find(".data-source-validation-container-nested-validation").first();
+                await createValidationsFromDataSource(validation.validation, validationOpts, nestedManualValidation);
             }
         }
+        //otherwise it is field name validation which doesn't have any default options
     }
 }
 
 export async function createValidationFromPlan(dataSource, newDataSource, numDataSources) {
     let dataSourceValidationContainer = $(newDataSource).find("#data-source-validation-config-container");
-    if (dataSource.validations && dataSource.validations.optMetadataSource) {
+    if (dataSource.validations && dataSource.options["metadataSourceName"]) {
         $(dataSourceValidationContainer).find("[id^=auto-from-metadata-source-validation-checkbox]").prop("checked", true);
         let autoFromMetadataSchema = await createAutoFromMetadataSourceContainer(numDataSources);
         $(dataSourceValidationContainer).find(".manual").after(autoFromMetadataSchema);
@@ -120,17 +100,34 @@ export async function createValidationFromPlan(dataSource, newDataSource, numDat
         await createAutoFromMetadata(autoFromMetadataSchema, dataSource);
     }
 
-    if (dataSource.validations && dataSource.validations.optValidations && dataSource.validations.optValidations.length > 0) {
+    if (dataSource.validations && dataSource.validations.length > 0) {
         let manualValidation = createManualContainer(numValidations, "validation");
         let dataSourceGenContainer = $(newDataSource).find("#data-source-validation-config-container");
         dataSourceGenContainer.append(manualValidation);
         $(dataSourceGenContainer).find("[id^=manual-validation-checkbox]").prop("checked", true);
-        await createValidationsFromDataSource(dataSource.validations, manualValidation);
+        await createValidationsFromDataSource(dataSource.validations, dataSource.options, manualValidation);
+    }
+}
+
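+// Renders stored field-level rules back into the UI. Example input (an assumption inferred
+// from this reader, not a documented contract):
+//   {field: "amount", validation: [{type: "lessThan", value: "100", strictly: "false"}]}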
+function addFieldValidations(validation, container) {
+    if (validation.validation && validation.validation.length > 0) {
+        for (const valid of validation.validation) {
+            let key = valid.type;
+            let value = valid.value;
+            let checked = false;
+            if (valid.negate) {
+                checked = true;
+            } else if (valid.strictly === false || valid.strictly === "false") {
+                // non-strict comparisons are stored as strictly=false and map back to a checked checkbox
+                checked = true;
+            }
+            createNewValidateAttribute(key, "field", value, checked, container);
+        }
+    }
 }
 
 function getValidationsFromContainer(dataSourceValidations, visitedValidations) {
+    let aggLabels = Array("sum", "average", "max", "min", "standardDeviation", "median", "mode", "count");
     let dataValidationContainers = findNextLevelNodesByClass($(dataSourceValidations), ["data-validation-container"]);
     return dataValidationContainers.map(validation => {
         let validationAttributes = findNextLevelNodesByClass($(validation), "data-validation-field", ["card", "data-validation-container", "data-source-validation-container-nested-validation"]);
@@ -147,56 +144,69 @@ function getValidationsFromContainer(dataSourceValidations, visitedValidations)
             // nested fields can be defined for upstream and groupBy
             let nestedValidations = Array.from(validation.querySelectorAll(".data-source-validation-container-nested-validation").values());
             let allNestedValidations = [];
+
             for (let nestedValidation of nestedValidations) {
                 let currNested = getValidationsFromContainer(nestedValidation, visitedValidations);
+                if (!jQuery.isEmptyObject(currNested)) {
+                    currNested.forEach(n => {
+                        if (!jQuery.isEmptyObject(n)) {
+                            if (fieldValue === "upstream") {
+                                if (options["validation"]) {
+                                    options["validation"].push(n);
+                                } else {
+                                    options["validation"] = [n];
+                                }
+                            } else if (fieldValue === "groupBy") {
+                                Object.entries(n).forEach(o => options[o[0]] = o[1]);
+                            }
+                        }
+                    });
+                }
                 allNestedValidations.push(currNested);
             }
             options[label] = fieldValue;
-            options["nested"] = {validations: allNestedValidations.flat().filter(o => !jQuery.isEmptyObject(o))};
-        } else if (label === "sum" || label === "average" || label === "max" || label === "min" || label === "standardDeviation" || label === "count") {
-            // then we need to set the type as column and set the column name
-            options["type"] = "column";
-            let currOpts = (options["options"] || new Map());
-            currOpts.set("aggType", label);
-            currOpts.set("aggCol", fieldValue);
-            options["options"] = currOpts;
-        } else if (label === "name" || label === "type") {
+        } else if (aggLabels.includes(label)) {
+            options["aggType"] = label;
+            options["aggField"] = fieldValue;
+        } else if (label === "name" || label === "field" || label === "upstreamTaskName" || label === "joinType" || label === "joinExpr" || label === "description" || label === "errorThreshold") {
             options[label] = fieldValue;
+        } else if (label === "joinFields" || label === "groupByFields") {
+            options[label] = fieldValue.includes(",") ? fieldValue.split(",") : [fieldValue];
+        } else if (label === "type") {
+            // validation type is taken from the select element itself, nothing extra to record
+        } else {
             let currOpts = (options["options"] || new Map());
+            currOpts.set("type", label);
+            currOpts.set("value", fieldValue);
+            //TODO need to map the validation type params to key -> value pairs
             // need to check if it is part of input group
             let checkbox = $(attr).closest(".input-group").find(".form-check-input");
             if (checkbox && checkbox.length > 0) {
                 if (checkbox[0].checked) {
-                    // then we need to get the opposite of the label (i.e. equal -> notEqual)
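+                    // sketch of the intended replacement mapping (inferred from the cases below):
+                    // a checked "equal" is stored as {type: "equal", negate: "true"}, a checked
+                    // "lessThan" as {type: "lessThan", strictly: "false"}, i.e. less than or equal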
                     switch (label) {
-                        case "equal":
-                        case "contains":
-                        case "between":
-                        case "in":
-                        case "matches":
-                        case "startsWith":
-                        case "endsWith":
-                        case "null":
-                        case "size":
-                            let oppositeLabel = "not" + label.charAt(0).toUpperCase() + label.slice(1);
-                            currOpts.set(oppositeLabel, fieldValue);
-                            break;
                         case "lessThan":
                         case "greaterThan":
                         case "lessThanSize":
                         case "greaterThanSize":
-                            let equalLabel = "equalOr" + label.charAt(0).toUpperCase() + label.slice(1);
-                            currOpts.set(equalLabel, fieldValue);
+                        case "isDecreasing":
+                        case "isIncreasing":
+                            currOpts.set("strictly", "false");
+                            break;
+                        default:
+                            currOpts.set("negate", "true");
                             break;
                     }
+                }
+            }
+            if (currOpts.size > 0) {
+                if (options["validation"]) {
+                    options["validation"].push(currOpts);
                 } else {
-                    currOpts.set(label, fieldValue);
+                    options["validation"] = [currOpts];
                 }
-            } else {
-                currOpts.set(label, fieldValue);
             }
-            options["options"] = currOpts;
         }
         return options;
     } else {
@@ -206,41 +216,38 @@ function getValidationsFromContainer(dataSourceValidations, visitedValidations)
     });
 }
 
-export function getValidations(dataSource, currentDataSource) {
-    let dataValidationInfo = {};
+export function getValidations(dataSource, currentValidation) {
     // check which checkboxes are enabled: auto, auto with external, manual
     let isAutoChecked = $(dataSource).find("[id^=auto-validation-checkbox]").is(":checked");
     let isAutoFromMetadataChecked = $(dataSource).find("[id^=auto-from-metadata-source-validation-checkbox]").is(":checked");
     let isManualChecked = $(dataSource).find("[id^=manual-validation-checkbox]").is(":checked");
+    currentValidation["options"] = {};
 
     if (isAutoChecked) {
         // need to enable data generation within data source options
-        currentDataSource["options"] = {enableDataValidation: "true"};
+        currentValidation["options"]["enableDataValidation"] = "true";
     } else if (isAutoFromMetadataChecked) {
-        let dataSourceAutoSchemaContainer = $(dataSource).find("[class~=data-source-auto-from-metadata-container]")[0];
+        let dataSourceValidationContainer = $(dataSource).find("[id^=data-source-validation-config-container]")[0];
+        let dataSourceAutoSchemaContainer = $(dataSourceValidationContainer).find("[class~=data-source-auto-from-metadata-container]")[0];
         let metadataConnectionName = $(dataSourceAutoSchemaContainer).find("select[class~=metadata-connection-name]").val();
-        let metadataConnectionOptions = $(dataSourceAutoSchemaContainer).find("input[class~=metadata-source-property]").toArray()
+        $(dataSourceAutoSchemaContainer).find("input[class~=metadata-source-property]").toArray()
             .reduce(function (map, option) {
                 if (option.value !== "") {
-                    map[option.getAttribute("aria-label")] = option.value;
+                    currentValidation["options"][option.getAttribute("aria-label")] = option.value;
                 }
                 return map;
             }, {});
-        dataValidationInfo["optMetadataSource"] = {
-            name: metadataConnectionName,
-            overrideOptions: metadataConnectionOptions
-        };
+        currentValidation["options"]["metadataSourceName"] = metadataConnectionName;
    } else if (isManualChecked) {
         // get top level validation container
         let dataSourceValidations = $(dataSource).find("[id^=data-source-validation-container]")[0];
         let visitedValidations = new Set();
         let dataValidationsWithAttributes = getValidationsFromContainer(dataSourceValidations, visitedValidations);
-        dataValidationInfo["optValidations"] = Object.values(dataValidationsWithAttributes);
+        currentValidation["validations"] = Object.values(dataValidationsWithAttributes);
     }
-    currentDataSource["validations"] = dataValidationInfo;
 }
 
-export function addColumnValidationBlock(newAttributeRow, mainContainer, attributeContainerId, inputClass) {
+export function addFieldValidationBlock(newAttributeRow, mainContainer, attributeContainerId, inputClass) {
     numValidations += 1;
     let cardDiv = document.createElement("div");
     cardDiv.setAttribute("class", "card m-1 data-source-validation-container-nested-validation");
@@ -250,9 +257,9 @@ export function addColumnValidationBlock(newAttributeRow, mainContainer, attribu
     cardDiv.append(cardBody);
     mainContainer.append(cardDiv);
 
-    // column validation applied after group by
+    // field validation applied after group by
     let {buttonWithMenuDiv, addAttributeButton, menu} = createButtonWithMenu(mainContainer);
-    addItemsToAttributeMenu(validationTypeOptionsMap.get("column"), menu);
+    addItemsToAttributeMenu(validationTypeOptionsMap.get("field"), menu);
     newAttributeRow.append(buttonWithMenuDiv);
     let closeButton = createCloseButton(cardDiv);
     newAttributeRow.append(closeButton);
@@ -260,7 +267,7 @@
         let attribute = event.target.getAttribute("value");
         // check if attribute already exists
         if ($(newAttributeRow).find(`[aria-label=${attribute}]`).length === 0) {
-            let validationMetadata = validationTypeOptionsMap.get("column")[attribute];
+            let validationMetadata = validationTypeOptionsMap.get("field")[attribute];
             addNewDataTypeAttribute(attribute, validationMetadata, `${attributeContainerId}-${attribute}`, inputClass, newAttributeRow);
         }
     });
diff --git a/app/src/main/resources/ui/history/history.js b/app/src/main/resources/ui/history/history.js
index 522e5b57..13b7b04a 100644
--- a/app/src/main/resources/ui/history/history.js
+++ b/app/src/main/resources/ui/history/history.js
@@ -62,8 +62,8 @@ fetch("http://localhost:9898/run/history", {
     for (const runUpdatesById of planHistoryByIdValues) {
         let runUpdates = runUpdatesById.runs;
         let latestRunUpdate = runUpdates[runUpdates.length - 1];
-        latestRunUpdate["createdTs"] = latestRunUpdate["createdTs"].replace("T", " ").replace(/\+.*/, "");
-        latestRunUpdate["updatedTs"] = latestRunUpdate["updatedTs"].replace("T", " ").replace(/\+.*/, "");
+        latestRunUpdate["createdTs"] = new Date(latestRunUpdate["createdTs"]).toISOString();
+        latestRunUpdate["updatedTs"] = new Date(latestRunUpdate["updatedTs"]).toISOString();
         let reportHref = `http://localhost:9898/report/${latestRunUpdate["id"]}/index.html`;
         latestRunUpdate["reportLink"] = latestRunUpdate["reportLink"] === "" ? "" : `<a href="${reportHref}">Report</a>`;
         let generationSummary = Array.from(latestRunUpdate["generationSummary"])
diff --git a/app/src/main/resources/ui/index.html b/app/src/main/resources/ui/index.html
index c3caae45..e15335fc 100644
--- a/app/src/main/resources/ui/index.html
+++ b/app/src/main/resources/ui/index.html
@@ -85,7 +85,7 @@

-                    Define relationships between columns across any task to ensure values remain consistent.
+                    Define relationships between fields across any task to ensure values remain consistent.
@@ -126,11 +126,11 @@
 [index.html markup stripped in extraction; only the visible text "Relationship Exa" survives from this hunk]
-