Apache commons-text is no longer a runtime dependency

rbri · rbri · commit 43e07565deb1 · 2025-01-02T08:18:19.000+01:00
diff --git a/pom.xml b/pom.xml
@@ -1257,17 +1257,6 @@
             <artifactId>commons-lang3</artifactId>
             <version>3.17.0</version>
         </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-text</artifactId>
-            <version>1.13.0</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.apache.commons</groupId>
-                    <artifactId>commons-lang3</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
         <dependency>
             <groupId>commons-io</groupId>
             <artifactId>commons-io</artifactId>
@@ -1381,6 +1370,19 @@
             </exclusions>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+            <version>1.13.0</version>
+            <scope>test</scope>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.commons</groupId>
+                    <artifactId>commons-lang3</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
         <dependency>
             <groupId>commons-fileupload</groupId>
             <artifactId>commons-fileupload</artifactId>
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
@@ -8,6 +8,9 @@
 
     <body>
         <release version="4.8.0" date="January xx, 2025" description="Bugfixes">
+            <action type="update" dev="rbri">
+                Apache commons-text is no longer a runtime dependency.
+            </action>
             <action type="update" dev="RhinoTeam">
                 core-js: Extract function calling out of the interpreter loop for performance.
             </action>
diff --git a/src/main/java/org/htmlunit/html/HtmlTextArea.java b/src/main/java/org/htmlunit/html/HtmlTextArea.java
@@ -21,7 +21,6 @@
 import java.util.Map;
 
 import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.text.StringEscapeUtils;
 import org.htmlunit.SgmlPage;
 import org.htmlunit.html.impl.SelectableTextInput;
 import org.htmlunit.html.impl.SelectableTextSelectionDelegate;
@@ -436,7 +435,7 @@ protected void printXml(final String indent, final PrintWriter printWriter) {
         printOpeningTagContentAsXml(printWriter);
 
         printWriter.print(">");
-        printWriter.print(StringEscapeUtils.escapeXml10(getText()));
+        printWriter.print(org.htmlunit.util.StringUtils.escapeXml(getText()));
         printWriter.print("</textarea>");
     }
 
diff --git a/src/main/java/org/htmlunit/util/StringUtils.java b/src/main/java/org/htmlunit/util/StringUtils.java
@@ -100,11 +100,7 @@ public static boolean startsWithIgnoreCase(final String s, final String expected
     }
 
     /**
-     * Escapes the characters '&lt;', '&gt;' and '&amp;' into their XML entity equivalents. Note that
-     * sometimes we have to use this method instead of
-     * {@link org.apache.commons.lang3.StringEscapeUtils#escapeXml(String)} or
-     * {@link org.apache.commons.lang3.StringEscapeUtils#escapeHtml4(String)} because those methods
-     * escape some unicode characters as well.
+     * Escapes the characters '&lt;', '&gt;' and '&amp;' into their XML entity equivalents.
      *
      * @param s the string to escape
      * @return the escaped form of the specified string
@@ -114,6 +110,83 @@ public static String escapeXmlChars(final String s) {
                 replaceEach(s, new String[] {"&", "<", ">"}, new String[] {"&amp;", "&lt;", "&gt;"});
     }
 
+    /**
+     * Escapes the string to be used as XML 1.0 content by replacing the
+     * characters '&quot;', '&amp;', '&apos;', '&lt;', and '&gt;' with their XML entity equivalents.
+     * Code points that XML 1.0 does not allow are removed; valid supplementary characters are kept.
+     *
+     * @param text the text to escape (may be {@code null})
+     * @return the escaped text; {@code text} itself if nothing changed, {@code null} for {@code null}
+     */
+    public static String escapeXml(final String text) {
+        if (text == null) {
+            return null;
+        }
+
+        StringBuilder escaped = null;
+        final int max = text.length();
+        int readOffset = 0;
+
+        for (int i = 0; i < max; i++) {
+            final int codepoint = Character.codePointAt(text, i);
+            final boolean codepointValid = supportedByXML10(codepoint);
+
+            if (!codepointValid
+                    || codepoint == '<'
+                    || codepoint == '>'
+                    || codepoint == '&'
+                    || codepoint == '\''
+                    || codepoint == '"') {
+                // replacement required; the builder is created lazily
+                if (escaped == null) {
+                    escaped = new StringBuilder(max);
+                }
+                if (i > readOffset) {
+                    escaped.append(text, readOffset, i);
+                }
+                if (Character.charCount(codepoint) > 1) {
+                    i++;
+                }
+                readOffset = i + 1;
+
+                // skip
+                if (!codepointValid) {
+                    continue;
+                }
+
+                if (codepoint == '<') {
+                    escaped.append("&lt;");
+                }
+                else if (codepoint == '>') {
+                    escaped.append("&gt;");
+                }
+                else if (codepoint == '&') {
+                    escaped.append("&amp;");
+                }
+                else if (codepoint == '\'') {
+                    escaped.append("&apos;");
+                }
+                else if (codepoint == '\"') {
+                    escaped.append("&quot;");
+                }
+            }
+            else if (Character.charCount(codepoint) > 1) {
+                // valid supplementary character: step over its low surrogate so it is not dropped
+                i++;
+            }
+        }
+
+        if (escaped == null) {
+            return text;
+        }
+
+        if (max > readOffset) {
+            escaped.append(text, readOffset, max);
+        }
+
+        return escaped.toString();
+    }
+
     /**
      * Escape the string to be used as attribute value.
      * Only {@code <}, {@code &} and {@code "} have to be escaped (see
@@ -122,33 +195,96 @@ public static String escapeXmlChars(final String s) {
      * @return the escaped value
      */
     public static String escapeXmlAttributeValue(final String attValue) {
-        final int len = attValue.length();
-        StringBuilder sb = null;
-        for (int i = len - 1; i >= 0; --i) {
-            final char c = attValue.charAt(i);
-            String replacement = null;
-            if (c == '<') {
-                replacement = "&lt;";
-            }
-            else if (c == '&') {
-                replacement = "&amp;";
-            }
-            else if (c == '\"') {
-                replacement = "&quot;";
-            }
+        if (attValue == null) {
+            return null;
+        }
+
+        StringBuilder escaped = null;
+        final int max = attValue.length();
+        int readOffset = 0;
+
+        for (int i = 0; i < max; i++) {
+            final int codepoint = Character.codePointAt(attValue, i);
+            final boolean codepointValid = supportedByXML10(codepoint);
+
+            if (!codepointValid
+                    || codepoint == '<'
+                    || codepoint == '&'
+                    || codepoint == '"') {
+                // replacement required; the builder is created lazily
+                if (escaped == null) {
+                    escaped = new StringBuilder(max);
+                }
+
+                if (i > readOffset) {
+                    escaped.append(attValue, readOffset, i);
+                }
+
+                if (Character.charCount(codepoint) > 1) {
+                    i++;
+                }
+                readOffset = i + 1;
+
+                // skip
+                if (!codepointValid) {
+                    continue;
+                }
 
-            if (replacement != null) {
-                if (sb == null) {
-                    sb = new StringBuilder(attValue);
+                if (codepoint == '<') {
+                    escaped.append("&lt;");
+                }
+                else if (codepoint == '&') {
+                    escaped.append("&amp;");
+                }
+                else if (codepoint == '\"') {
+                    escaped.append("&quot;");
                 }
-                sb.replace(i, i + 1, replacement);
             }
+            else if (Character.charCount(codepoint) > 1) {
+                // valid supplementary character: step over its low surrogate so it is not dropped
+                i++;
+            }
         }
 
-        if (sb != null) {
-            return sb.toString();
+        if (escaped == null) {
+            return attValue;
+        }
+
+        if (max > readOffset) {
+            escaped.append(attValue, readOffset, max);
         }
-        return attValue;
+
+        return escaped.toString();
+    }
+
+    /**
+     * Checks whether the given code point is a legal XML 1.0 character.
+     * XML 1.0 does not allow most control characters or unpaired Unicode surrogate
+     * code points; only the following ranges are accepted:
+     * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+     *
+     * @param codepoint the Unicode code point to check
+     * @return {@code true} if the code point is inside one of the ranges above
+     */
+    private static boolean supportedByXML10(final int codepoint) {
+        if (codepoint < 0x20) {
+            // below the space character only tab, line feed and carriage return are accepted
+            return codepoint == 0x9 || codepoint == 0xA || codepoint == 0xD;
+        }
+        if (codepoint <= 0xD7FF) {
+            return true;
+        }
+
+        // surrogate block
+        if (codepoint < 0xE000) {
+            return false;
+        }
+        if (codepoint <= 0xFFFD) {
+            return true;
+        }
+
+        // #xFFFE and #xFFFF are excluded, as is anything above the documented upper bound
+        return codepoint >= 0x10000 && codepoint <= 0x10FFFF;
     }
 
     /**
diff --git a/src/test/java/org/htmlunit/util/StringUtilsTest.java b/src/test/java/org/htmlunit/util/StringUtilsTest.java