diff --git a/wrangler-core/src/main/java/io/cdap/directives/xml/XmlToJson.java b/wrangler-core/src/main/java/io/cdap/directives/xml/XmlToJson.java
index 8a3ed89ff..513a54d17 100644
--- a/wrangler-core/src/main/java/io/cdap/directives/xml/XmlToJson.java
+++ b/wrangler-core/src/main/java/io/cdap/directives/xml/XmlToJson.java
@@ -38,6 +38,7 @@
 import io.cdap.wrangler.api.parser.Numeric;
 import io.cdap.wrangler.api.parser.TokenType;
 import io.cdap.wrangler.api.parser.UsageDefinition;
+import org.apache.commons.lang.StringUtils;
 import org.json.JSONException;
 import org.json.XML;
 
@@ -52,9 +53,11 @@
 @Description("Parses a XML document to JSON representation.")
 public class XmlToJson implements Directive, Lineage {
   public static final String NAME = "parse-xml-to-json";
+  public static final String ARG_KEEP_STRING = "keep-string";
   // Column within the input row that needs to be parsed as Json
   private String col;
   private int depth;
+  private boolean keepString;
   private final Gson gson = new Gson();
 
   @Override
@@ -62,6 +65,7 @@ public UsageDefinition define() {
     UsageDefinition.Builder builder = UsageDefinition.builder(NAME);
     builder.define("column", TokenType.COLUMN_NAME);
     builder.define("depth", TokenType.NUMERIC, Optional.TRUE);
+    builder.define(ARG_KEEP_STRING, TokenType.BOOLEAN, Optional.TRUE);
     return builder.build();
   }
 
@@ -73,6 +77,12 @@ public void initialize(Arguments args) throws DirectiveParseException {
     } else {
       this.depth = Integer.MAX_VALUE;
     }
+
+    if (args.contains(ARG_KEEP_STRING) &&
+      StringUtils.isNotEmpty(args.value(ARG_KEEP_STRING).value().toString())) {
+      this.keepString = Boolean.parseBoolean(args.value(ARG_KEEP_STRING).value().toString());
+    }
+
   }
 
   @Override
@@ -93,7 +103,7 @@ public List<Row> execute(List<Row> rows, ExecutorContext context) throws Directi
 
         try {
           if (object instanceof String) {
-            JsonObject element = gson.fromJson(XML.toJSONObject((String) object).toString(),
+            JsonObject element = gson.fromJson(XML.toJSONObject((String) object, this.keepString).toString(),
                                                JsonElement.class).getAsJsonObject();
             JsParser.jsonFlatten(element, col, 1, depth, row);
             row.remove(idx);
diff --git a/wrangler-core/src/test/java/io/cdap/directives/parser/XmlToJsonTest.java b/wrangler-core/src/test/java/io/cdap/directives/parser/XmlToJsonTest.java
new file mode 100644
index 000000000..2d08228a8
--- /dev/null
+++ b/wrangler-core/src/test/java/io/cdap/directives/parser/XmlToJsonTest.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2024 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.directives.parser;
+
+import io.cdap.directives.xml.XmlToJson;
+import io.cdap.wrangler.TestingRig;
+import io.cdap.wrangler.api.Row;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Tests {@link XmlToJson}
+ */
+public class XmlToJsonTest {
+  @Test
+  public void testAutoConversionOfStringField() throws Exception {
+    String[] directives = new String[] {
+      "copy body body_1 true",
+      "copy body body_2 true",
+      "copy body body_3 true",
+      "parse-xml-to-json body_1 1",
+      "parse-xml-to-json body_2 1 false",
+      "parse-xml-to-json body_3 1 true"
+    };
+
+    List<Row> rows = Arrays.asList(
+      new Row("body",
+        "<Data><tagid>303246306303E8</tagid></Data>")
+    );
+
+    rows = TestingRig.execute(directives, rows);
+    Assert.assertEquals(1, rows.size());
+    Assert.assertEquals("{\"tagid\":3.03246306303E19}", rows.get(0).getValue("body_1_Data").toString());
+    Assert.assertEquals("{\"tagid\":3.03246306303E19}", rows.get(0).getValue("body_2_Data").toString());
+    Assert.assertEquals("{\"tagid\":\"303246306303E8\"}", rows.get(0).getValue("body_3_Data").toString());
+  }
+}
diff --git a/wrangler-docs/directives/parse-xml-to-json.md b/wrangler-docs/directives/parse-xml-to-json.md
index 031633786..beb136b0c 100644
--- a/wrangler-docs/directives/parse-xml-to-json.md
+++ b/wrangler-docs/directives/parse-xml-to-json.md
@@ -8,11 +8,13 @@ transforms the XML into a JSON document, simplifying further parsing using the
 
 ## Syntax
 ```
-parse-xml-to-json <column-name> [<depth>]
+parse-xml-to-json <column-name> [<depth>] [<keep-string>]
 ```
 
 * `<column-name>` is the name of the column in the record that is an XML document.
 * `<depth>` indicates the depth at which the XML document parsing should terminate processing.
+* `<keep-string>` An OPTIONAL boolean value that if true, then values will not be coerced into boolean or numeric values and will instead be left as strings. (as per `org.json.XML` rules)
+  The default value is `false`
 
 ## Usage Notes
 