From f98c02fe4a7cdd9ff63fa87d1e18b8bf619d547d Mon Sep 17 00:00:00 2001 From: Tim De Pauw Date: Thu, 5 Sep 2024 14:18:18 +0200 Subject: [PATCH 1/2] Support ECMAScript \x unescaping in StringEscapeUtils --- .../commons/text/StringEscapeUtils.java | 45 ++++++++++------ .../commons/text/translate/HexUnescaper.java | 51 +++++++++++++++++++ .../commons/text/StringEscapeUtilsTest.java | 1 + 3 files changed, 82 insertions(+), 15 deletions(-) create mode 100644 src/main/java/org/apache/commons/text/translate/HexUnescaper.java diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java index 1a6bd9b70d..3c8e2e1a87 100644 --- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -32,6 +32,7 @@ import org.apache.commons.text.translate.NumericEntityEscaper; import org.apache.commons.text.translate.NumericEntityUnescaper; import org.apache.commons.text.translate.OctalUnescaper; +import org.apache.commons.text.translate.HexUnescaper; import org.apache.commons.text.translate.UnicodeUnescaper; import org.apache.commons.text.translate.UnicodeUnpairedSurrogateRemover; @@ -374,14 +375,13 @@ public int translate(final CharSequence input, final int index, final Writer wri ); } - /** - * Translator object for unescaping escaped Java. - * - * While {@link #unescapeJava(String)} is the expected method of use, this - * object allows the Java unescaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator UNESCAPE_JAVA; + private static final CharSequenceTranslator OCTAL_JAVA_TRANSLATOR = new OctalUnescaper(); + + private static final CharSequenceTranslator UNICODE_JAVA_TRANSLATOR = new UnicodeUnescaper(); + + private static final CharSequenceTranslator CTRL_CHARS_JAVA_TRANSLATOR = new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE); + + private static final CharSequenceTranslator UNESCAPE_JAVA_TRANSLATOR; static { final Map unescapeJavaMap = new HashMap<>(); @@ -389,14 +389,23 @@ public int translate(final CharSequence input, final int index, final Writer wri unescapeJavaMap.put("\\\"", "\""); unescapeJavaMap.put("\\'", "'"); unescapeJavaMap.put("\\", StringUtils.EMPTY); - UNESCAPE_JAVA = new AggregateTranslator( - new OctalUnescaper(), // .between('\1', '\377'), - new UnicodeUnescaper(), - new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE), - new LookupTranslator(Collections.unmodifiableMap(unescapeJavaMap)) - ); + UNESCAPE_JAVA_TRANSLATOR = new LookupTranslator(Collections.unmodifiableMap(unescapeJavaMap)); } + /** + * Translator object for unescaping escaped Java. + * + * While {@link #unescapeJava(String)} is the expected method of use, this + * object allows the Java unescaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator UNESCAPE_JAVA = new AggregateTranslator( + OCTAL_JAVA_TRANSLATOR, + UNICODE_JAVA_TRANSLATOR, + CTRL_CHARS_JAVA_TRANSLATOR, + UNESCAPE_JAVA_TRANSLATOR + ); + /** * Translator object for unescaping escaped EcmaScript. * @@ -404,7 +413,13 @@ public int translate(final CharSequence input, final int index, final Writer wri * object allows the EcmaScript unescaping functionality to be used * as the foundation for a custom translator. */ - public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA; + public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = new AggregateTranslator( + new HexUnescaper(), + OCTAL_JAVA_TRANSLATOR, + UNICODE_JAVA_TRANSLATOR, + CTRL_CHARS_JAVA_TRANSLATOR, + UNESCAPE_JAVA_TRANSLATOR + ); /** * Translator object for unescaping escaped Json. diff --git a/src/main/java/org/apache/commons/text/translate/HexUnescaper.java b/src/main/java/org/apache/commons/text/translate/HexUnescaper.java new file mode 100644 index 0000000000..2287d71e0c --- /dev/null +++ b/src/main/java/org/apache/commons/text/translate/HexUnescaper.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.translate; + +import java.io.IOException; +import java.io.Writer; + +/** + * Translates escaped ASCII values of the form \\x\[0-9A-Fa-f][0-9A-Fa-f] back to ASCII. + */ +public class HexUnescaper extends CharSequenceTranslator { + + /** + * {@inheritDoc} + */ + @Override + public int translate(final CharSequence input, final int index, final Writer writer) throws IOException { + if (input.charAt(index) == '\\' && index + 1 < input.length() && input.charAt(index + 1) == 'x') { + if (index + 4 <= input.length()) { + // Get 2 hex digits + final CharSequence hex = input.subSequence(index + 2, index + 4); + + try { + final int value = Integer.parseInt(hex.toString(), 16); + writer.write((char) value); + } catch (final NumberFormatException nfe) { + throw new IllegalArgumentException("Unable to parse ASCII value: " + hex, nfe); + } + return 4; + } + throw new IllegalArgumentException("Less than 2 hex digits in ASCII value: '" + + input.subSequence(index, input.length()) + + "' due to end of CharSequence"); + } + return 0; + } +} diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java index a3f664983d..2c6488d223 100644 --- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java +++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java @@ -456,6 +456,7 @@ public void testUnescapeEcmaScript() { assertEquals("", StringEscapeUtils.unescapeEcmaScript("")); assertEquals(">", StringEscapeUtils.unescapeEcmaScript(">")); + assertEquals("a=b", StringEscapeUtils.unescapeEcmaScript("a\\x3Db")); } @Test From 9c2da486f4424b0809f0e5913a1ec40892126201 Mon Sep 17 00:00:00 2001 From: Tim De Pauw Date: Tue, 10 Sep 2024 12:59:05 +0200 Subject: [PATCH 2/2] Fix Checkstyle validation --- .../org/apache/commons/text/StringEscapeUtils.java | 14 +++++++++++++- .../commons/text/translate/HexUnescaper.java | 8 +++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java index 3c8e2e1a87..dec384fc61 100644 --- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -27,12 +27,12 @@ import org.apache.commons.text.translate.CharSequenceTranslator; import org.apache.commons.text.translate.CsvTranslators; import org.apache.commons.text.translate.EntityArrays; +import org.apache.commons.text.translate.HexUnescaper; import org.apache.commons.text.translate.JavaUnicodeEscaper; import org.apache.commons.text.translate.LookupTranslator; import org.apache.commons.text.translate.NumericEntityEscaper; import org.apache.commons.text.translate.NumericEntityUnescaper; import org.apache.commons.text.translate.OctalUnescaper; -import org.apache.commons.text.translate.HexUnescaper; import org.apache.commons.text.translate.UnicodeUnescaper; import org.apache.commons.text.translate.UnicodeUnpairedSurrogateRemover; @@ -375,12 +375,24 @@ public int translate(final CharSequence input, final int index, final Writer wri ); } + /** + * Translator for octal escapes. + */ private static final CharSequenceTranslator OCTAL_JAVA_TRANSLATOR = new OctalUnescaper(); + /** + * Translator for Unicode escapes. + */ private static final CharSequenceTranslator UNICODE_JAVA_TRANSLATOR = new UnicodeUnescaper(); + /** + * Translator for Java control characters. + */ private static final CharSequenceTranslator CTRL_CHARS_JAVA_TRANSLATOR = new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE); + /** + * Translator for Java escapes that aren't control characters. + */ private static final CharSequenceTranslator UNESCAPE_JAVA_TRANSLATOR; static { diff --git a/src/main/java/org/apache/commons/text/translate/HexUnescaper.java b/src/main/java/org/apache/commons/text/translate/HexUnescaper.java index 2287d71e0c..c9168b36ea 100644 --- a/src/main/java/org/apache/commons/text/translate/HexUnescaper.java +++ b/src/main/java/org/apache/commons/text/translate/HexUnescaper.java @@ -29,10 +29,12 @@ public class HexUnescaper extends CharSequenceTranslator { */ @Override public int translate(final CharSequence input, final int index, final Writer writer) throws IOException { + final int prefixLength = 2; // "\\x".length() + final int escapeLength = 4; // "\\xHH".length() if (input.charAt(index) == '\\' && index + 1 < input.length() && input.charAt(index + 1) == 'x') { - if (index + 4 <= input.length()) { + if (index + escapeLength <= input.length()) { // Get 2 hex digits - final CharSequence hex = input.subSequence(index + 2, index + 4); + final CharSequence hex = input.subSequence(index + prefixLength, index + escapeLength); try { final int value = Integer.parseInt(hex.toString(), 16); @@ -40,7 +42,7 @@ public int translate(final CharSequence input, final int index, final Writer wri } catch (final NumberFormatException nfe) { throw new IllegalArgumentException("Unable to parse ASCII value: " + hex, nfe); } - return 4; + return escapeLength; } throw new IllegalArgumentException("Less than 2 hex digits in ASCII value: '" + input.subSequence(index, input.length())