[CDAP-15361] Add Schema handling in Wrangler #645

Merged 6 commits on Jul 25, 2023
Changes from 2 commits
17 changes: 16 additions & 1 deletion wrangler-api/src/main/java/io/cdap/wrangler/api/Executor.java
@@ -16,9 +16,11 @@

package io.cdap.wrangler.api;

import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.wrangler.api.annotations.PublicEvolving;

import java.io.Serializable;
import javax.annotation.Nullable;

/**
* A interface defining the wrangle Executor in the wrangling {@link RecipePipeline}.
@@ -80,5 +82,18 @@ O execute(I rows, ExecutorContext context)
* correct at this phase of invocation.
*/
void destroy();
}

/**
* This method is used to get the updated schema of the data after the directive's transformation has been applied.
* @implNote By default, returns a null and the schema is inferred from the data when necessary.
* <p>For consistent handling, override for directives that perform column renames,
* column data type changes or column additions with specific schemas.</p>
* @param inputSchema input {@link Schema} of the data before transformation
* @return output {@link Schema} of the transformed data
*/
@Nullable
default Schema getOutputSchema(Schema inputSchema) {
// no op
return null;
}
}
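For illustration, a directive that renames a column could override this new hook instead of relying on row-based inference. The following is a minimal sketch, not part of this diff: the class, the hard-coded column names, and the fallback behaviour are hypothetical, and a real directive would also implement the remaining Directive/Executor methods.

import io.cdap.cdap.api.data.schema.Schema;

import java.util.List;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

public class RenameOutputSchemaSketch {

  // What a rename directive ("fname" -> "first_name") could return from
  // getOutputSchema(inputSchema) so the executor does not have to infer the schema from rows.
  @Nullable
  public static Schema getOutputSchema(@Nullable Schema inputSchema) {
    if (inputSchema == null || inputSchema.getFields() == null) {
      return null; // fall back to inference from the transformed rows
    }
    List<Schema.Field> fields = inputSchema.getFields().stream()
        .map(f -> "fname".equals(f.getName()) ? Schema.Field.of("first_name", f.getSchema()) : f)
        .collect(Collectors.toList());
    return Schema.recordOf("output", fields);
  }

  public static void main(String[] args) {
    Schema input = Schema.recordOf("input",
        Schema.Field.of("fname", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("age", Schema.of(Schema.Type.INT)));
    System.out.println(getOutputSchema(input)); // prints a record with first_name and age
  }
}
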
RecipePipelineExecutor.java
@@ -18,12 +18,14 @@

import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.data.schema.Schema.Field;
import io.cdap.wrangler.api.Directive;
import io.cdap.wrangler.api.DirectiveExecutionException;
import io.cdap.wrangler.api.ErrorRecord;
import io.cdap.wrangler.api.ErrorRowException;
import io.cdap.wrangler.api.Executor;
import io.cdap.wrangler.api.ExecutorContext;
import io.cdap.wrangler.api.Pair;
import io.cdap.wrangler.api.RecipeException;
import io.cdap.wrangler.api.RecipeParser;
import io.cdap.wrangler.api.RecipePipeline;
@@ -32,11 +34,16 @@
import io.cdap.wrangler.api.TransientVariableScope;
import io.cdap.wrangler.utils.RecordConvertor;
import io.cdap.wrangler.utils.RecordConvertorException;
import io.cdap.wrangler.utils.SchemaConverter;
import io.cdap.wrangler.utils.TransientStoreKeys;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
@@ -45,9 +52,11 @@
public final class RecipePipelineExecutor implements RecipePipeline<Row, StructuredRecord, ErrorRecord> {

private static final Logger LOG = LoggerFactory.getLogger(RecipePipelineExecutor.class);
private static final String TEMP_SCHEMA_FIELD_NAME = "temporarySchemaField";

private final ErrorRecordCollector collector = new ErrorRecordCollector();
private final RecordConvertor convertor = new RecordConvertor();
private final SchemaConverter generator = new SchemaConverter();
private final RecipeParser recipeParser;
private final ExecutorContext context;
private List<Directive> directives;
@@ -112,6 +121,12 @@ public List<Row> execute(List<Row> rows) throws RecipeException {
context.getTransientStore().reset(TransientVariableScope.LOCAL);
}

// Initialize schema with input schema from TransientStore if running in service env (design-time) / testing env
boolean designTime = context.getEnvironment() != null &&
context.getEnvironment().equals(ExecutorContext.Environment.SERVICE) ||
context.getEnvironment().equals(ExecutorContext.Environment.TESTING);
Schema schema = designTime ? context.getTransientStore().get(TransientStoreKeys.INPUT_SCHEMA) : null;

List<Row> cumulativeRows = rows.subList(i, i + 1);
directiveIndex = 0;
try {
@@ -122,14 +137,26 @@ public List<Row> execute(List<Row> rows) throws RecipeException {
if (cumulativeRows.size() < 1) {
break;
}
if (designTime && schema != null) {
Schema directiveOutputSchema = directive.getOutputSchema(schema);
schema = directiveOutputSchema != null ? directiveOutputSchema
: generateOutputSchema(schema, cumulativeRows);
Review comment (Contributor): We should introduce a feature flag for this, with default enabled. If the schema generator fails on some customer schema, there should be a way to disable it.
}
} catch (ReportErrorAndProceed e) {
messages.add(String.format("%s (ecode: %d)", e.getMessage(), e.getCode()));
collector
.add(new ErrorRecord(rows.subList(i, i + 1).get(0), String.join(",", messages), e.getCode(), true));
cumulativeRows = new ArrayList<>();
break;
} catch (RecordConvertorException e) {
throw new RecipeException("Error while generating schema: " + e.getMessage(), e);
}
}
if (designTime && schema != null) {
Schema previousRowSchema = context.getTransientStore().get(TransientStoreKeys.OUTPUT_SCHEMA);
schema = previousRowSchema != null ? getSchemaUnion(previousRowSchema, schema) : schema;
context.getTransientStore().set(TransientVariableScope.GLOBAL, TransientStoreKeys.OUTPUT_SCHEMA, schema);
}
results.addAll(cumulativeRows);
} catch (ErrorRowException e) {
messages.add(String.format("%s", e.getMessage()));
@@ -161,4 +188,55 @@ private List<Directive> getDirectives() throws RecipeException {
}
return directives;
}

private Schema generateOutputSchema(Schema inputSchema, List<Row> output) throws RecordConvertorException {
Map<String, Schema> outputFieldMap = new LinkedHashMap<>();
for (Row row : output) {
for (Pair<String, Object> rowField : row.getFields()) {
String fieldName = rowField.getFirst();
Object fieldValue = rowField.getSecond();

Schema existing = inputSchema.getField(fieldName) != null ? inputSchema.getField(fieldName).getSchema() : null;
Schema generated = fieldValue != null && !isValidSchemaForValue(existing, fieldValue) ?
generator.getSchema(fieldValue, fieldName) : null;

if (generated != null) {
outputFieldMap.put(fieldName, generated);
} else if (existing != null) {
outputFieldMap.put(fieldName, existing);
}
}
}
List<Field> outputFields = outputFieldMap.entrySet().stream()
.map(e -> Schema.Field.of(e.getKey(), e.getValue()))
.collect(Collectors.toList());
return Schema.recordOf("output", outputFields);
}

// Checks whether the provided input schema is of valid type for given object
private boolean isValidSchemaForValue(@Nullable Schema schema, Object value) throws RecordConvertorException {
if (schema == null) {
return false;
}
Schema generated = generator.getSchema(value, TEMP_SCHEMA_FIELD_NAME);
generated = generated.isNullable() ? generated.getNonNullable() : generated;
schema = schema.isNullable() ? schema.getNonNullable() : schema;
return generated.getType().equals(schema.getType());
}

// Gets the union of fields in two schemas while maintaining insertion order and uniqueness of fields. If the same
// field exists with two different schemas, the second schema overwrites first one
private Schema getSchemaUnion(Schema first, Schema second) {
Map<String, Schema> fieldMap = new LinkedHashMap<>();
for (Field field : first.getFields()) {
fieldMap.put(field.getName(), field.getSchema());
}
for (Field field : second.getFields()) {
fieldMap.put(field.getName(), field.getSchema());
}
List<Field> outputFields = fieldMap.entrySet().stream()
.map(e -> Schema.Field.of(e.getKey(), e.getValue()))
.collect(Collectors.toList());
return Schema.recordOf("union", outputFields);
}
}
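To make the union semantics above concrete, the following standalone sketch (not part of the PR; the class name and sample fields are made up) exercises the same insertion-ordered, last-writer-wins merge against the CDAP Schema API. A LinkedHashMap keeps the order in which field names are first seen, while a later put() for the same name replaces the schema, which is why a field present in both inputs ends up with the schema from the second one.

import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.data.schema.Schema.Field;

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class SchemaUnionSketch {

  // Same merge logic as RecipePipelineExecutor#getSchemaUnion.
  static Schema union(Schema first, Schema second) {
    Map<String, Schema> fieldMap = new LinkedHashMap<>();
    for (Field field : first.getFields()) {
      fieldMap.put(field.getName(), field.getSchema());
    }
    // Same-named fields from the second schema overwrite those from the first.
    for (Field field : second.getFields()) {
      fieldMap.put(field.getName(), field.getSchema());
    }
    List<Field> fields = fieldMap.entrySet().stream()
        .map(e -> Field.of(e.getKey(), e.getValue()))
        .collect(Collectors.toList());
    return Schema.recordOf("union", fields);
  }

  public static void main(String[] args) {
    Schema a = Schema.recordOf("a",
        Field.of("id", Schema.of(Schema.Type.LONG)),
        Field.of("name", Schema.of(Schema.Type.STRING)));
    Schema b = Schema.recordOf("b",
        Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Field.of("age", Schema.of(Schema.Type.INT)));
    // Field order in the result: id, name, age; "name" takes the nullable schema from b.
    System.out.println(union(a, b));
  }
}
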
TransientStoreKeys.java
@@ -0,0 +1,29 @@
/*
* Copyright © 2023 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.wrangler.utils;

/**
* TransientStoreKeys for storing Workspace schema in TransientStore
*/
public final class TransientStoreKeys {
public static final String INPUT_SCHEMA = "ws_input_schema";
public static final String OUTPUT_SCHEMA = "ws_output_schema";

private TransientStoreKeys() {
throw new AssertionError("Cannot instantiate a static utility class.");
}
}
TestingPipelineContext.java
@@ -24,46 +24,27 @@
import io.cdap.wrangler.api.ExecutorContext;
import io.cdap.wrangler.api.TransientStore;
import io.cdap.wrangler.proto.Contexts;
import org.apache.commons.collections.map.HashedMap;

import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
* This class {@link TestingPipelineContext} is a runtime context that is provided for each
* {@link Executor} execution.
*/
class TestingPipelineContext implements ExecutorContext {
private StageMetrics metrics;
private String name;
private TransientStore store;
private Map<String, String> properties;

TestingPipelineContext() {
properties = new HashedMap();
public class TestingPipelineContext implements ExecutorContext {
private final StageMetrics metrics;
private final String name;
private final TransientStore store;
private final Map<String, String> properties;

public TestingPipelineContext() {
name = "testing";
properties = new HashMap<>();
store = new DefaultTransientStore();
}

/**
* @return Environment this context is prepared for.
*/
@Override
public Environment getEnvironment() {
return Environment.TESTING;
}

@Override
public String getNamespace() {
return Contexts.SYSTEM;
}

/**
* @return Measurements context.
*/
@Override
public StageMetrics getMetrics() {
return new StageMetrics() {
metrics = new StageMetrics() {
@Override
public void count(String s, int i) {

@@ -96,12 +77,33 @@ public Map<String, String> getTags() {
};
}

/**
* @return Environment this context is prepared for.
*/
@Override
public Environment getEnvironment() {
return Environment.TESTING;
}

@Override
public String getNamespace() {
return Contexts.SYSTEM;
}

/**
* @return Measurements context.
*/
@Override
public StageMetrics getMetrics() {
return metrics;
}

/**
* @return Context name.
*/
@Override
public String getContextName() {
return "testing";
return name;
}

/**
4 changes: 2 additions & 2 deletions wrangler-core/src/test/java/io/cdap/wrangler/TestingRig.java
@@ -60,7 +60,7 @@ private TestingRig() {
*/
public static List<Row> execute(String[] recipe, List<Row> rows)
throws RecipeException, DirectiveParseException, DirectiveLoadException {
return execute(recipe, rows, null);
return execute(recipe, rows, new TestingPipelineContext());
}

public static List<Row> execute(String[] recipe, List<Row> rows, ExecutorContext context)
@@ -83,7 +83,7 @@ public static List<Row> execute(String[] recipe, List<Row> rows, ExecutorContext
*/
public static Pair<List<Row>, List<Row>> executeWithErrors(String[] recipe, List<Row> rows)
throws RecipeException, DirectiveParseException, DirectiveLoadException, DirectiveNotFoundException {
return executeWithErrors(recipe, rows, null);
return executeWithErrors(recipe, rows, new TestingPipelineContext());
}

public static Pair<List<Row>, List<Row>> executeWithErrors(String[] recipe, List<Row> rows, ExecutorContext context)
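Putting the pieces together, a test could seed the design-time input schema, run a recipe through TestingRig with the now-public TestingPipelineContext, and read the propagated output schema back from the transient store. The following is a rough sketch under stated assumptions, not a test from this PR: it assumes it sits next to TestingRig and TestingPipelineContext in the test sources, and the recipe, column names, and class name are illustrative.

import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.wrangler.api.ExecutorContext;
import io.cdap.wrangler.api.Row;
import io.cdap.wrangler.api.TransientVariableScope;
import io.cdap.wrangler.utils.TransientStoreKeys;

import java.util.Collections;
import java.util.List;

public class SchemaPropagationSketch {

  public static void main(String[] args) throws Exception {
    ExecutorContext context = new TestingPipelineContext();

    // Seed the design-time input schema, as the Wrangler service would before execution.
    Schema input = Schema.recordOf("input",
        Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
    context.getTransientStore().set(TransientVariableScope.GLOBAL,
        TransientStoreKeys.INPUT_SCHEMA, input);

    // Illustrative recipe: rename the only column.
    String[] recipe = new String[] { "rename :body :message" };
    List<Row> rows = Collections.singletonList(new Row("body", "hello"));
    TestingRig.execute(recipe, rows, context);

    // After execution the executor publishes the declared or inferred output schema.
    Schema output = context.getTransientStore().get(TransientStoreKeys.OUTPUT_SCHEMA);
    System.out.println(output);
  }
}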