Skip to content

Commit

Permalink
Merge pull request #695 from data-integrations/CDAP-20934_keepstrings…
Browse files Browse the repository at this point in the history
…_xmltojson

CDAP-20934 - adding new param to configure if string value needs to co…
  • Loading branch information
sahusanket committed Jan 23, 2024
2 parents 7042ab9 + 8144096 commit a122b5e
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
<aws.sdk.version>1.11.133</aws.sdk.version>
<bigquery.connector.hadoop2.version>0.10.2-hadoop2</bigquery.connector.hadoop2.version>
<bouncycastle.version>1.56</bouncycastle.version>
<cdap.version>6.10.0-SNAPSHOT</cdap.version>
<cdap.version>6.10.0</cdap.version>
<chlorine.version>1.1.5</chlorine.version>
<commons.validator.version>1.6</commons.validator.version>
<commons-io.version>2.5</commons-io.version>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import io.cdap.wrangler.api.parser.Numeric;
import io.cdap.wrangler.api.parser.TokenType;
import io.cdap.wrangler.api.parser.UsageDefinition;
import org.apache.commons.lang.StringUtils;
import org.json.JSONException;
import org.json.XML;

Expand All @@ -52,16 +53,19 @@
@Description("Parses a XML document to JSON representation.")
public class XmlToJson implements Directive, Lineage {
public static final String NAME = "parse-xml-to-json";
public static final String ARG_KEEP_STRING = "keep-string";
// Column within the input row that needs to be parsed as Json
private String col;
private int depth;
private boolean keepString;
private final Gson gson = new Gson();

/**
 * Builds the usage definition for the {@link #NAME} directive.
 *
 * <p>Arguments are registered in this order: {@code column} (a column name),
 * {@code depth} (numeric, flagged {@code Optional.TRUE}) and
 * {@link #ARG_KEEP_STRING} (boolean, flagged {@code Optional.TRUE}).
 *
 * @return the usage definition produced by the builder
 */
@Override
public UsageDefinition define() {
  UsageDefinition.Builder usage = UsageDefinition.builder(NAME);
  // Registration order is kept exactly as declared: column, depth, keep-string.
  usage.define("column", TokenType.COLUMN_NAME);
  usage.define("depth", TokenType.NUMERIC, Optional.TRUE);
  usage.define(ARG_KEEP_STRING, TokenType.BOOLEAN, Optional.TRUE);
  return usage.build();
}

Expand All @@ -73,6 +77,12 @@ public void initialize(Arguments args) throws DirectiveParseException {
} else {
this.depth = Integer.MAX_VALUE;
}

if (args.contains(ARG_KEEP_STRING) &&
StringUtils.isNotEmpty(args.value(ARG_KEEP_STRING).value().toString())) {
this.keepString = Boolean.parseBoolean(args.value(ARG_KEEP_STRING).value().toString());
}

}

@Override
Expand All @@ -93,7 +103,7 @@ public List<Row> execute(List<Row> rows, ExecutorContext context) throws Directi

try {
if (object instanceof String) {
JsonObject element = gson.fromJson(XML.toJSONObject((String) object).toString(),
JsonObject element = gson.fromJson(XML.toJSONObject((String) object, this.keepString).toString(),
JsonElement.class).getAsJsonObject();
JsParser.jsonFlatten(element, col, 1, depth, row);
row.remove(idx);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright © 2024 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.directives.parser;

import io.cdap.directives.xml.XmlToJson;
import io.cdap.wrangler.TestingRig;
import io.cdap.wrangler.api.Row;
import org.junit.Assert;
import org.junit.Test;

import java.util.Arrays;
import java.util.List;

/**
 * Tests {@link XmlToJson}, covering the optional keep-string argument of the
 * {@code parse-xml-to-json} directive.
 */
public class XmlToJsonTest {

  /**
   * Runs {@code parse-xml-to-json} on three copies of the same XML body: once with
   * only a depth, once with the third argument set to {@code false}, and once with
   * it set to {@code true}.
   *
   * <p>The first two runs are expected to yield {@code tagid} as a number, while the
   * run with {@code true} is expected to keep the original text as a string.
   */
  @Test
  public void testAutoConversionOfStringField() throws Exception {
    // Expected JSON when the value is coerced to a number vs. kept as a string.
    final String coerced = "{\"tagid\":3.03246306303E19}";
    final String preserved = "{\"tagid\":\"303246306303E8\"}";

    String[] recipe = new String[] {
      "copy body body_1 true",
      "copy body body_2 true",
      "copy body body_3 true",
      "parse-xml-to-json body_1 1",
      "parse-xml-to-json body_2 1 false",
      "parse-xml-to-json body_3 1 true"
    };

    Row input = new Row("body",
      "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><Data><tagid>303246306303E8</tagid></Data>");

    List<Row> results = TestingRig.execute(recipe, Arrays.asList(input));
    Assert.assertEquals(1, results.size());

    Row parsed = results.get(0);
    Assert.assertEquals(coerced, parsed.getValue("body_1_Data").toString());
    Assert.assertEquals(coerced, parsed.getValue("body_2_Data").toString());
    Assert.assertEquals(preserved, parsed.getValue("body_3_Data").toString());
  }
}
4 changes: 3 additions & 1 deletion wrangler-docs/directives/parse-xml-to-json.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ transforms the XML into a JSON document, simplifying further parsing using the

## Syntax
```
parse-xml-to-json <column-name> [<depth>]
parse-xml-to-json <column-name> [<depth>] [<keep-strings>]
```

* `<column-name>` is the name of the column in the record that is an XML document.
* `<depth>` indicates the depth at which the XML document parsing should terminate processing.
* `<keep-strings>` is an optional boolean value. When `true`, values are not coerced into boolean or numeric values and are instead left as strings (as per `org.json.XML` rules).
The default value is `false`.


## Usage Notes
Expand Down

0 comments on commit a122b5e

Please sign in to comment.