diff --git a/pom.xml b/pom.xml index b43b5c99d..172d11f52 100644 --- a/pom.xml +++ b/pom.xml @@ -492,7 +492,7 @@ integration-test - verify + diff --git a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsCsv.feature b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsCsv.feature index fa59cb54c..7cb04063d 100644 --- a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsCsv.feature +++ b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsCsv.feature @@ -13,7 +13,7 @@ # the License. @Wrangler -Feature: Wrangler - Run time scenarios +Feature: Wrangler - Run time scenarios for parse csv @BQ_SOURCE_CSV_TEST @BQ_SINK_TEST Scenario: To verify User is able to run a pipeline using parse csv directive diff --git a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsExcel.feature b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsExcel.feature new file mode 100644 index 000000000..2ea6d6734 --- /dev/null +++ b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsExcel.feature @@ -0,0 +1,42 @@ +# Copyright © 2023 Cask Data, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +@Wrangler +Feature: Wrangler - Run time scenarios for parse as excel + + @GCS_SOURCE_TEST @BQ_SINK_TEST + Scenario: To verify User is able to run a pipeline using parse excel directive + Given Open Datafusion Project to configure pipeline + Then Click on the Plus Green Button to import the pipelines + Then Select the file for importing the pipeline for the plugin "Directive_parse_excel" + Then Navigate to the properties page of plugin: "GCSFile" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "path" with value: "gcsSourceBucket" + Then Click on the Get Schema button + Then Click on the Validate button + Then Close the Plugin Properties page + Then Navigate to the properties page of plugin: "BigQuery" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "table" with value: "bqTargetTable" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Click on the Validate button + Then Close the Plugin Properties page + Then Rename the pipeline + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Close the pipeline logs + Then Validate The Data From BQ To BQ With Actual And Expected File for: "ExpectedDirective_parse_excel" diff --git a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsJson.feature b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsJson.feature new file mode 100644 index 000000000..6bd164d72 --- /dev/null +++ b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsJson.feature @@ -0,0 +1,43 @@ +# Copyright © 2023 Cask Data, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +@Wrangler +Feature: Wrangler - Run time scenarios for parse as json + + @BQ_SOURCE_JSON_TEST @BQ_SINK_TEST + Scenario: To verify User is able to run a pipeline using parse json directive + Given Open Datafusion Project to configure pipeline + Then Click on the Plus Green Button to import the pipelines + Then Select the file for importing the pipeline for the plugin "Directive_parse_json" + Then Navigate to the properties page of plugin: "BigQueryTable" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Replace input plugin property: "table" with value: "bqSourceTable" + Then Click on the Get Schema button + Then Click on the Validate button + Then Close the Plugin Properties page + Then Navigate to the properties page of plugin: "BigQuery2" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "table" with value: "bqTargetTable" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Click on the Validate button + Then Close the Plugin Properties page + Then Rename the pipeline + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Close the pipeline logs + Then Validate The Data From BQ To BQ With Actual And Expected File for: "ExpectedDirective_parse_json" diff --git a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsXmlToJson.feature b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsXmlToJson.feature new file mode 100644 index 000000000..a3164ef67 --- /dev/null +++ b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsXmlToJson.feature @@ -0,0 +1,43 @@ +# Copyright © 2023 Cask Data, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +@Wrangler +Feature: Wrangler - Run time scenarios for parse as XmlToJson + + @BQ_SOURCE_XML_TEST @BQ_SINK_TEST + Scenario: To verify User is able to run a pipeline using parse XmlToJson directive + Given Open Datafusion Project to configure pipeline + Then Click on the Plus Green Button to import the pipelines + Then Select the file for importing the pipeline for the plugin "Directive_parse_xml" + Then Navigate to the properties page of plugin: "BigQueryTable" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Replace input plugin property: "table" with value: "bqSourceTable" + Then Click on the Get Schema button + Then Click on the Validate button + Then Close the Plugin Properties page + Then Navigate to the properties page of plugin: "BigQuery2" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "table" with value: "bqTargetTable" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Click on the Validate button + Then Close the Plugin Properties page + Then Rename the pipeline + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Close the pipeline logs + Then Validate The Data From BQ To BQ With Actual And Expected File for: "ExpectedDirective_parse_xml" diff --git a/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java b/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java index 40c60c665..3871b43f5 100644 --- a/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java +++ b/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java @@ -24,7 +24,6 @@ import io.cdap.e2e.utils.StorageClient; import io.cucumber.java.After; import io.cucumber.java.Before; -import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.StringUtils; import org.junit.Assert; import stepsdesign.BeforeActions; @@ -34,16 +33,14 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; -import java.sql.SQLException; import java.util.NoSuchElementException; import java.util.UUID; -import static io.cdap.e2e.pages.locators.CdfGCSLocators.filePath; - /** * Setup BQ for Wrangler tests. */ public class TestSetupHooks { + public static String gcsSourceBucketName = StringUtils.EMPTY; @Before(order = 1, value = "@BQ_SINK_TEST") public static void setTempTargetBQTableName() { @@ -74,8 +71,58 @@ public static void deleteTempTargetBQTable() throws IOException, InterruptedExce @Before(order = 1, value = "@BQ_SOURCE_CSV_TEST") public static void createTempSourceBQTable() throws IOException, InterruptedException { createSourceBQTableWithQueries(PluginPropertyUtils.pluginProp("CreateBQTableQueryFileCsv"), - PluginPropertyUtils.pluginProp("InsertBQDataQueryFileCsv")); + PluginPropertyUtils.pluginProp("InsertBQDataQueryFileCsv")); + } + @Before(order = 1, value = "@BQ_SOURCE_JSON_TEST") + public static void createTempSourceBQTableJson() throws IOException, InterruptedException { + createSourceBQTableWithQueries(PluginPropertyUtils.pluginProp("CreateBQTableQueryFileJson"), + PluginPropertyUtils.pluginProp("InsertBQDataQueryFileJson")); + } + @Before(order = 1, value = "@BQ_SOURCE_XML_TEST") + public static void createTempSourceBQTableXml() throws IOException, InterruptedException { + createSourceBQTableWithQueries(PluginPropertyUtils.pluginProp("CreateBQDataQueryFileXml"), + PluginPropertyUtils.pluginProp("InsertBQDataQueryFileXml")); + } + @After(order = 1, value = "@BQ_SOURCE_TEST") + public static void deleteTempSourceBQTable() throws IOException, InterruptedException { + String bqSourceTable = PluginPropertyUtils.pluginProp("bqSourceTable"); + BigQueryClient.dropBqQuery(bqSourceTable); + BeforeActions.scenario.write("BQ source Table " + bqSourceTable + " deleted successfully"); + PluginPropertyUtils.removePluginProp("bqSourceTable"); } + @Before(order = 1, value = "@GCS_SOURCE_TEST") + public static void createBucketWithEXCELFile() throws IOException, URISyntaxException { + gcsSourceBucketName = createGCSBucketWithFile(PluginPropertyUtils.pluginProp("testFile")); + PluginPropertyUtils.addPluginProp("gcsSourceBucket", "gs://" + gcsSourceBucketName + "/" + + PluginPropertyUtils.pluginProp("testFile")); + BeforeActions.scenario.write("GCS source bucket1 name - " + gcsSourceBucketName); + } + private static String createGCSBucketWithFile(String filePath) throws IOException, URISyntaxException { + String bucketName = StorageClient.createBucket("e2e-test-" + UUID.randomUUID()).getName(); + StorageClient.uploadObject(bucketName, filePath, filePath); + return bucketName; + } + @After(order = 1, value = "@GCS_SOURCE_TEST") + public static void deleteSourceBucketWithFile() { + deleteGCSBucket(gcsSourceBucketName); + gcsSourceBucketName = StringUtils.EMPTY; + } + private static void deleteGCSBucket(String bucketName) { + try { + for (Blob blob : StorageClient.listObjects(bucketName).iterateAll()) { + StorageClient.deleteObject(bucketName, blob.getName()); + } + StorageClient.deleteBucket(bucketName); + BeforeActions.scenario.write("Deleted GCS Bucket " + bucketName); + } catch (StorageException | IOException e) { + if (e.getMessage().contains("The specified bucket does not exist")) { + BeforeActions.scenario.write("GCS Bucket " + bucketName + " does not exist."); + } else { + Assert.fail(e.getMessage()); + } + } + } + private static void createSourceBQTableWithQueries(String bqCreateTableQueryFile, String bqInsertDataQueryFile) throws IOException, InterruptedException { diff --git a/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_excel b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_excel new file mode 100644 index 000000000..3c3ae5154 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_excel @@ -0,0 +1,2 @@ +{"copiedname":"very","id":0,"name":"very","phone":"8838.0","rollno":"3.0","uniquenum":"very,0"} +{"copiedname":"hello","id":2,"name":"hell","phone":"12345.0","rollno":"1.0","uniquenum":"hello,2"} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_json b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_json new file mode 100644 index 000000000..b8cac65cb --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_json @@ -0,0 +1,3 @@ +{"Body":"hello abc","copied":{"first":"Root","last":"joy"},"desc":"nick, hello abc","id":22,"json_age":"{\"json_id\":22,\"copied\":{\"first\":\"Root\",\"last\":\"joy\"},\"json_age\":1,\"json_name\":{\"first\":\"Root\",\"last\":\"joy\"},\"json_pet\":\"testing\",\"json_id_json_name\":\"22,{\\\"first\\\":\\\"Root\\\",\\\"last\\\":\\\"joy\\\"}\",\"body\":\"hello abc\",\"desc\":\"nick, hello abc\"}","json_id_json_name":"22,{\"first\":\"Root\",\"last\":\"joy\"}","json_name":{"first":"Root","last":"joy"},"json_pet":"testing"} +{"Body":"hello def","copied":{"first":"dded","last":"share"},"desc":"hello, hello def","id":23,"json_age":"{\"json_id\":23,\"copied\":{\"first\":\"dded\",\"last\":\"share\"},\"json_age\":2,\"json_name\":{\"first\":\"dded\",\"last\":\"share\"},\"json_pet\":\"testing\",\"json_id_json_name\":\"23,{\\\"first\\\":\\\"dded\\\",\\\"last\\\":\\\"share\\\"}\",\"body\":\"hello def\",\"desc\":\"hello, hello def\"}","json_id_json_name":"23,{\"first\":\"dded\",\"last\":\"share\"}","json_name":{"first":"dded","last":"share"},"json_pet":"testing"} +{"Body":"hello ghi","copied":{"first":"Root","last":"Joltie"},"desc":"doms, hello ghi","id":24,"json_age":"{\"json_id\":24,\"copied\":{\"first\":\"Root\",\"last\":\"Joltie\"},\"json_age\":3,\"json_name\":{\"first\":\"Root\",\"last\":\"Joltie\"},\"json_pet\":\"testing\",\"json_id_json_name\":\"24,{\\\"first\\\":\\\"Root\\\",\\\"last\\\":\\\"Joltie\\\"}\",\"body\":\"hello ghi\",\"desc\":\"doms, hello ghi\"}","json_id_json_name":"24,{\"first\":\"Root\",\"last\":\"Joltie\"}","json_name":{"first":"Root","last":"Joltie"},"json_pet":"testing"} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_xmltojson b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_xmltojson new file mode 100644 index 000000000..2f9d3ba92 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_xmltojson @@ -0,0 +1,6 @@ +{"Email":"abc01@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"abc01"},"email_porter":["abc","mail","com"],"id":"1","xmldata_note":{"body":"Dont forget me this week!","from":"Tani","heading":"Reminder","to":"Tove"}} +{"Email":"def02@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"def02"},"email_porter":["def","mail","com"],"id":"2","xmldata_note":{"body":"Dont forget us this holiday!","from":"joy","heading":"Reminder","to":"Tove"}} +{"Email":"ghi03@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"ghi03"},"email_porter":["ghi","mail","com"],"id":"3","xmldata_note":{"body":"Dont forget him this weekend!","from":"shree","heading":"Reminder","to":"Tove"}} +{"Email":"abc01@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"abc01"},"email_porter":["abc","mail","com"],"id":"abc","xmldata_note":{"body":"Dont forget me this week!","from":"Tani","heading":"Reminder","to":"Tove"}} +{"Email":"def02@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"def02"},"email_porter":["def","mail","com"],"id":"def","xmldata_note":{"body":"Dont forget us this holiday!","from":"joy","heading":"Reminder","to":"Tove"}} +{"Email":"ghi03@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"ghi03"},"email_porter":["ghi","mail","com"],"id":"ghi","xmldata_note":{"body":"Dont forget him this weekend!","from":"shree","heading":"Reminder","to":"Tove"}} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt new file mode 100644 index 000000000..a711921e2 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt @@ -0,0 +1 @@ +create table `DATASET.TABLE_NAME` (email STRING, xmldata STRING) diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt new file mode 100644 index 000000000..0dc9608ce --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt @@ -0,0 +1,5 @@ +INSERT INTO DATASET.TABLE_NAME (email,xmldata) +VALUES +('abc01@mail.com',' Tove Tani Reminder Dont forget me this week! '), +('def02@mail.com',' Tove joy Reminder Dont forget us this holiday! '), +('ghi03@mail.com',' Tove shree Reminder Dont forget him this weekend! '); diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt new file mode 100644 index 000000000..dc9fa7d17 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt @@ -0,0 +1,6 @@ +INSERT INTO DATASET.TABLE_NAME (body,json) +VALUES +(' hello abc', '{"id": 1, "name": {"first": "Root", "last": "joy"}, "age": 22, "pet": "nick", "height": 5.8}'), +('hello def', '{"id": 2, "name": {"first": "dded", "last": "share"}, "age": 23, "pet": "hello", "height": 6.8}'), +('hello ghi', '{"id": 3, "name": {"first": "Root", "last": "Joltie"}, "age": 24, "pet": "doms", "height": 7.8}'); + diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt new file mode 100644 index 000000000..be6b585ea --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt @@ -0,0 +1 @@ +create table `DATASET.TABLE_NAME` (body STRING, json STRING) \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/test1.xlsx b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/test1.xlsx new file mode 100644 index 000000000..adaa5291b Binary files /dev/null and b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/test1.xlsx differ diff --git a/wrangler-transform/src/e2e-test/resources/pluginParameters.properties b/wrangler-transform/src/e2e-test/resources/pluginParameters.properties index 8ea4bf26e..e2672ad68 100644 --- a/wrangler-transform/src/e2e-test/resources/pluginParameters.properties +++ b/wrangler-transform/src/e2e-test/resources/pluginParameters.properties @@ -1,10 +1,18 @@ #json file path +Directive_parse_json=testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json +Directive_parse_xml=testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json +Directive_parse_excel=testData/Wrangler/parse_excel_wrangler_copy-cdap-data-pipeline.json Directive_parse_csv=testData/Wrangler\ /parse_csv_wrangle-cdap-data-pipeline.json bqSourceTable=dummy sourcePath=example/hello.csv gcsSourceBucket=dummy +testFile=BQtesdata/BigQuery/test1.xlsx #bq queries file path +CreateBQTableQueryFileJson=BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt +InsertBQDataQueryFileJson=BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt +CreateBQDataQueryFileXml=BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt +InsertBQDataQueryFileXml=BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt CreateBQTableQueryFileCsv=BQtesdata/BigQuery/BigQueryCreateTableQueryCsv.txt InsertBQDataQueryFileCsv=BQtesdata/BigQuery/BigQueryInsertDataQueryCsv.txt @@ -13,4 +21,7 @@ projectId=cdf-athena dataset=test_automation dataset2=Wrangler #expectedBQFiles +ExpectedDirective_parse_json=BQValidationExpectedFiles/Directive_parse_json +ExpectedDirective_parse_xml=BQValidationExpectedFiles/Directive_parse_xmltojson +ExpectedDirective_parse_excel=BQValidationExpectedFiles/Directive_parse_excel ExpectedDirective_parse_csv=BQValidationExpectedFiles/Directive_parse_csv diff --git a/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_excel_wrangler_copy-cdap-data-pipeline.json b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_excel_wrangler_copy-cdap-data-pipeline.json new file mode 100644 index 000000000..614f18fdf --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_excel_wrangler_copy-cdap-data-pipeline.json @@ -0,0 +1,186 @@ +{ + "name": "parse_excel_wrangler_copy", + "description": "Data Pipeline Application", + "artifact": { + "name": "cdap-data-pipeline", + "version": "6.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "config": { + "resources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "driverResources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "connections": [ + { + "from": "GCSFile", + "to": "Wrangler" + }, + { + "from": "Wrangler", + "to": "BigQuery" + } + ], + "postActions": [], + "properties": {}, + "processTimingEnabled": true, + "stageLoggingEnabled": true, + "stages": [ + { + "name": "GCSFile", + "plugin": { + "name": "GCSFile", + "type": "batchsource", + "label": "GCSFile", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "format": "blob", + "path": "gs://00000000-e2e-0014a44f-81be-4501-8360-0ddca1c39789/test1.xlsx", + "fileEncoding": "UTF-8", + "useConnection": "false", + "referenceName": "test", + "schema": "{\"type\":\"record\",\"name\":\"blob\",\"fields\":[{\"name\":\"body\",\"type\":\"bytes\"}]}", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "sampleSize": "1000", + "filenameOnly": "false", + "recursive": "false", + "ignoreNonExistingFolders": "false", + "encrypted": "false" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"blob\",\"fields\":[{\"name\":\"body\",\"type\":\"bytes\"}]}" + } + ], + "id": "GCSFile", + "type": "batchsource", + "label": "GCSFile", + "icon": "fa-plug", + "$$hashKey": "object:475", + "isPluginAvailable": true, + "_uiPosition": { + "left": "496px", + "top": "343px" + } + }, + { + "name": "Wrangler", + "plugin": { + "name": "Wrangler", + "type": "transform", + "label": "Wrangler", + "artifact": { + "name": "wrangler-transform", + "version": "4.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "directives": "parse-as-excel :body '0' true\ncopy name copiedname\nmerge name bkd uniquenum ','\nrename bkd rollno\ndrop fwd\nswap id rollno\nsplit-to-rows :name 'o'\nfilter-rows-on condition-false rollno !~ '2.0'", + "field": "*", + "precondition": "false", + "workspaceId": "667f9e85-6c36-4d38-ad48-ef85db7a04a2", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}", + "on-error": "fail-pipeline" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "GCSFile", + "schema": "{\"type\":\"record\",\"name\":\"blob\",\"fields\":[{\"name\":\"body\",\"type\":\"bytes\"}]}" + } + ], + "id": "Wrangler", + "type": "transform", + "label": "Wrangler", + "icon": "icon-DataPreparation", + "$$hashKey": "object:476", + "isPluginAvailable": true, + "_uiPosition": { + "left": "796px", + "top": "343px" + } + }, + { + "name": "BigQuery", + "plugin": { + "name": "BigQueryTable", + "type": "batchsink", + "label": "BigQuery", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "dataset": "Wrangler", + "table": "extab34", + "operation": "insert", + "truncateTable": "false", + "allowSchemaRelaxation": "false", + "location": "US", + "createPartitionedTable": "false", + "partitioningType": "TIME", + "partitionFilterRequired": "false", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "Wrangler", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "id": "BigQuery", + "type": "batchsink", + "label": "BigQuery", + "icon": "fa-plug", + "$$hashKey": "object:477", + "isPluginAvailable": true, + "_uiPosition": { + "left": "1096px", + "top": "343px" + } + } + ], + "schedule": "0 1 */1 * *", + "engine": "spark", + "numOfRecordsPreview": 100, + "rangeRecordsPreview": { + "min": 1, + "max": "5000" + }, + "description": "Data Pipeline Application", + "maxConcurrentRuns": 1, + "pushdownEnabled": false, + "transformationPushdown": {} + }, + "version": "2dd12daa-5395-11ee-9dac-000000d0cf32" +} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json new file mode 100644 index 000000000..ccc37ccef --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json @@ -0,0 +1,180 @@ +{ + "name": "parse_json_Wrangle", + "description": "Data Pipeline Application", + "artifact": { + "name": "cdap-data-pipeline", + "version": "6.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "config": { + "resources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "driverResources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "connections": [ + { + "from": "BigQueryTable", + "to": "Wrangler" + }, + { + "from": "Wrangler", + "to": "BigQuery2" + } + ], + "postActions": [], + "properties": {}, + "processTimingEnabled": true, + "stageLoggingEnabled": true, + "stages": [ + { + "name": "BigQueryTable", + "plugin": { + "name": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "dataset": "Wrangler", + "table": "jstab", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Body\",\"type\":\"string\"},{\"name\":\"json\",\"type\":\"string\"}]}", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "enableQueryingViews": "false" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Body\",\"type\":\"string\"},{\"name\":\"json\",\"type\":\"string\"}]}" + } + ], + "id": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "icon": "fa-plug", + "$$hashKey": "object:518", + "isPluginAvailable": true, + "_uiPosition": { + "left": "496px", + "top": "327px" + } + }, + { + "name": "Wrangler", + "plugin": { + "name": "Wrangler", + "type": "transform", + "label": "Wrangler", + "artifact": { + "name": "wrangler-transform", + "version": "4.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "directives": "parse-as-json :json 1\nltrim :Body\nset-column :desc concat(json_pet, \", \", body)\ncopy :json_name :copied\nswap :json_id :json_age\nmerge :json_id :json_name :json_id_json_name ,\nmask-number :json_pet 'testing'\ndrop json_height\nwrite-as-json-map :json_age\nrename json_id id", + "field": "*", + "precondition": "false", + "workspaceId": "d496b8e4-ca6f-4d38-9877-d76bb52c218e", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}", + "on-error": "fail-pipeline" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "BigQueryTable", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Body\",\"type\":\"string\"},{\"name\":\"json\",\"type\":\"string\"}]}" + } + ], + "id": "Wrangler", + "type": "transform", + "label": "Wrangler", + "icon": "icon-DataPreparation", + "$$hashKey": "object:519", + "isPluginAvailable": true, + "_uiPosition": { + "left": "796px", + "top": "327px" + } + }, + { + "name": "BigQuery2", + "plugin": { + "name": "BigQueryTable", + "type": "batchsink", + "label": "BigQuery2", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "dataset": "Wrangler", + "table": "jstabsupd", + "operation": "insert", + "truncateTable": "false", + "allowSchemaRelaxation": "false", + "location": "US", + "createPartitionedTable": "false", + "partitioningType": "TIME", + "partitionFilterRequired": "false", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "Wrangler", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "id": "BigQuery2", + "type": "batchsink", + "label": "BigQuery2", + "icon": "fa-plug", + "$$hashKey": "object:520", + "isPluginAvailable": true, + "_uiPosition": { + "left": "1096px", + "top": "327px" + } + } + ], + "schedule": "0 1 */1 * *", + "engine": "spark", + "numOfRecordsPreview": 100, + "rangeRecordsPreview": { + "min": 1, + "max": "5000" + }, + "description": "Data Pipeline Application", + "maxConcurrentRuns": 1, + "pushdownEnabled": false, + "transformationPushdown": {} + }, + "version": "516e13c0-4c05-11ee-b70a-0000001ecd5c" +} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json new file mode 100644 index 000000000..044e982e8 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json @@ -0,0 +1,362 @@ +{ + "name": "parse_xmltojson_wrangle", + "description": "Data Pipeline Application", + "artifact": { + "name": "cdap-data-pipeline", + "version": "6.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "config": { + "resources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "driverResources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "connections": [ + { + "from": "BigQueryTable", + "to": "Wrangler" + }, + { + "from": "Wrangler", + "to": "BigQuery2" + } + ], + "postActions": [], + "properties": {}, + "processTimingEnabled": true, + "stageLoggingEnabled": true, + "stages": [ + { + "name": "BigQueryTable", + "plugin": { + "name": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "dataset": "Wrangler", + "table": "xmltabs", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Email\",\"type\":\"string\"},{\"name\":\"xmldata\",\"type\":\"string\"}]}", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "enableQueryingViews": "false" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Email\",\"type\":\"string\"},{\"name\":\"xmldata\",\"type\":\"string\"}]}" + } + ], + "id": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "icon": "fa-plug", + "$$hashKey": "object:615", + "isPluginAvailable": true, + "_uiPosition": { + "left": "496px", + "top": "343px" + }, + "_backendProperties": { + "schema": { + "name": "schema", + "description": "The schema of the table to read.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "viewMaterializationDataset": { + "name": "viewMaterializationDataset", + "description": "The dataset in the specified project where the temporary table should be created. Defaults to the same dataset in which the table is located.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "enableQueryingViews": { + "name": "enableQueryingViews", + "description": "Whether to allow querying views. Since BigQuery views are not materialized by default, querying them may have a performance overhead.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "serviceAccountJSON": { + "name": "serviceAccountJSON", + "description": "Content of the service account file.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "partitionTo": { + "name": "partitionTo", + "description": "It's inclusive partition end date. It should be a String with format \"yyyy-MM-dd\". This value is ignored if the table does not support partitioning.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "cmekKey": { + "name": "cmekKey", + "description": "The GCP customer managed encryption key (CMEK) name used to encrypt data written to any bucket, dataset or table created by the plugin. If the bucket, dataset or table already exists, this is ignored. More information can be found at https://cloud.google.com/data-fusion/docs/how-to/customer-managed-encryption-keys", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "useConnection": { + "name": "useConnection", + "description": "Whether to use an existing connection.", + "type": "boolean", + "required": false, + "macroSupported": false, + "macroEscapingEnabled": false, + "children": [] + }, + "project": { + "name": "project", + "description": "Google Cloud Project ID. It can be found on the Dashboard in the Google Cloud Platform Console.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "viewMaterializationProject": { + "name": "viewMaterializationProject", + "description": "The project name where the temporary table should be created. Defaults to the same project in which the table is located.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "filter": { + "name": "filter", + "description": "The WHERE clause filters out rows by evaluating each row against boolean expression, and discards all rows that do not return TRUE (that is, rows that return FALSE or NULL).", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "bucket": { + "name": "bucket", + "description": "The Google Cloud Storage bucket to store temporary data in. Cloud Storage data will be deleted after it is loaded into BigQuery. If it is not provided, a unique bucket will be automatically created and then deleted after the run finishes. The service account must have permission to create buckets in the configured project.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "partitionFrom": { + "name": "partitionFrom", + "description": "It's inclusive partition start date. It should be a String with format \"yyyy-MM-dd\". This value is ignored if the table does not support partitioning.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "serviceFilePath": { + "name": "serviceFilePath", + "description": "Path on the local file system of the service account key used for authorization. Can be set to 'auto-detect' when running on a Dataproc cluster. When running on other clusters, the file must be present on every node in the cluster.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "serviceAccountType": { + "name": "serviceAccountType", + "description": "Service account type, file path where the service account is located or the JSON content of the service account.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "connection": { + "name": "connection", + "description": "The existing connection to use.", + "type": "bigqueryconnectorconfig", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [ + "serviceAccountJSON", + "serviceFilePath", + "project", + "serviceAccountType", + "datasetProject" + ] + }, + "datasetProject": { + "name": "datasetProject", + "description": "The project the dataset belongs to. This is only required if the dataset is not in the same project that the BigQuery job will run in. If no value is given, it will default to the configured project ID.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "dataset": { + "name": "dataset", + "description": "The dataset to write to. A dataset is contained within a specific project. Datasets are top-level containers that are used to organize and control access to tables and views.", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "table": { + "name": "table", + "description": "The table to read from. A table contains individual records organized in rows. Each record is composed of columns (also called fields). Every table is defined by a schema that describes the column names, data types, and other information.", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "referenceName": { + "name": "referenceName", + "description": "This will be used to uniquely identify this source for lineage, annotating metadata, etc.", + "type": "string", + "required": false, + "macroSupported": false, + "macroEscapingEnabled": false, + "children": [] + } + }, + "description": "This source reads the entire contents of a BigQuery table. BigQuery is Google's serverless, highly scalable, enterprise data warehouse.Data is first written to a temporary location on Google Cloud Storage, then read into the pipeline from there.", + "selected": false + }, + { + "name": "Wrangler", + "plugin": { + "name": "Wrangler", + "type": "transform", + "label": "Wrangler", + "artifact": { + "name": "wrangler-transform", + "version": "4.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "directives": "parse-xml-to-json :xmldata 1\nsplit-email :email\ntext-distance block email email_account distance\ntext-metric longest-common-subsequence email email_account distance2\nwrite-as-json-object :email_domain distance,email_account\nstemming :email\nsplit-to-rows :email_account '0'\nrename :email_account id", + "field": "*", + "precondition": "false", + "workspaceId": "f076fbe0-b7f8-47bc-8cc1-2a40e04177e1", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}", + "on-error": "fail-pipeline" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "BigQueryTable", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Email\",\"type\":\"string\"},{\"name\":\"xmldata\",\"type\":\"string\"}]}" + } + ], + "id": "Wrangler", + "type": "transform", + "label": "Wrangler", + "icon": "icon-DataPreparation", + "$$hashKey": "object:616", + "isPluginAvailable": true, + "_uiPosition": { + "left": "796px", + "top": "343px" + }, + "selected": false + }, + { + "name": "BigQuery2", + "plugin": { + "name": "BigQueryTable", + "type": "batchsink", + "label": "BigQuery2", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "dataset": "Wrangler", + "table": "xmltabupd", + "operation": "insert", + "truncateTable": "false", + "allowSchemaRelaxation": "false", + "location": "US", + "createPartitionedTable": "false", + "partitioningType": "TIME", + "partitionFilterRequired": "false", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "Wrangler", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + ], + "id": "BigQuery2", + "type": "batchsink", + "label": "BigQuery2", + "icon": "fa-plug", + "$$hashKey": "object:617", + "isPluginAvailable": true, + "_uiPosition": { + "left": "1096px", + "top": "343px" + }, + "selected": false + } + ], + "schedule": "0 1 */1 * *", + "engine": "spark", + "numOfRecordsPreview": 100, + "rangeRecordsPreview": { + "min": 1, + "max": "5000" + }, + "maxConcurrentRuns": 1, + "pushdownEnabled": false, + "transformationPushdown": {} + }, + "version": "fe20077f-4e31-11ee-b01d-000000d34db8" +} \ No newline at end of file