Skip to content

Commit

Permalink
Apache commons-text is no longer a runtime dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
rbri committed Jan 2, 2025
1 parent 7d0d2f9 commit 43e0756
Show file tree
Hide file tree
Showing 5 changed files with 306 additions and 41 deletions.
24 changes: 13 additions & 11 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1257,17 +1257,6 @@
<artifactId>commons-lang3</artifactId>
<version>3.17.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.13.0</version>
<exclusions>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
Expand Down Expand Up @@ -1381,6 +1370,19 @@
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.13.0</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>commons-fileupload</groupId>
<artifactId>commons-fileupload</artifactId>
Expand Down
3 changes: 3 additions & 0 deletions src/changes/changes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

<body>
<release version="4.8.0" date="January xx, 2025" description="Bugfixes">
<action type="update" dev="rbri">
Apache commons-text is no longer a runtime dependency.
</action>
<action type="update" dev="RhinoTeam">
core-js: Extract function calling out of the interpreter loop for performance.
</action>
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/org/htmlunit/html/HtmlTextArea.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.htmlunit.SgmlPage;
import org.htmlunit.html.impl.SelectableTextInput;
import org.htmlunit.html.impl.SelectableTextSelectionDelegate;
Expand Down Expand Up @@ -436,7 +435,7 @@ protected void printXml(final String indent, final PrintWriter printWriter) {
printOpeningTagContentAsXml(printWriter);

printWriter.print(">");
printWriter.print(StringEscapeUtils.escapeXml10(getText()));
printWriter.print(org.htmlunit.util.StringUtils.escapeXml(getText()));
printWriter.print("</textarea>");
}

Expand Down
188 changes: 162 additions & 26 deletions src/main/java/org/htmlunit/util/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,7 @@ public static boolean startsWithIgnoreCase(final String s, final String expected
}

/**
* Escapes the characters '&lt;', '&gt;' and '&amp;' into their XML entity equivalents. Note that
* sometimes we have to use this method instead of
* {@link org.apache.commons.lang3.StringEscapeUtils#escapeXml(String)} or
* {@link org.apache.commons.lang3.StringEscapeUtils#escapeHtml4(String)} because those methods
* escape some unicode characters as well.
* Escapes the characters '&lt;', '&gt;' and '&amp;' into their XML entity equivalents.
*
* @param s the string to escape
* @return the escaped form of the specified string
Expand All @@ -114,6 +110,83 @@ public static String escapeXmlChars(final String s) {
replaceEach(s, new String[] {"&", "<", ">"}, new String[] {"&amp;", "&lt;", "&gt;"});
}

/**
 * Escapes the string to be used as XML 1.0 content by replacing the
 * characters '&quot;', '&amp;', '&apos;', '&lt;', and '&gt;' with their XML entity equivalents.
 * Code points rejected by {@code supportedByXML10(int)} are removed from the result.
 * The input is processed code point by code point, therefore characters outside the
 * Basic Multilingual Plane (encoded as surrogate pairs) are preserved as a whole.
 *
 * @param text the text to escape (might be {@code null})
 * @return the escaped text, {@code null} if the input was {@code null};
 *         the given instance itself is returned if nothing had to be changed
 */
public static String escapeXml(final String text) {
    if (text == null) {
        return null;
    }

    final int max = text.length();

    // created lazily; stays null as long as the input can be returned unchanged
    StringBuilder escaped = null;

    // start of the pending run of chars that are copied without any change
    int readOffset = 0;

    int i = 0;
    while (i < max) {
        final int codepoint = Character.codePointAt(text, i);

        // always step over the whole code point; stepping by a single char would
        // present the low surrogate of a valid pair as a separate (invalid) code point
        final int next = i + Character.charCount(codepoint);

        // null: keep as it is, empty: drop the code point, otherwise: the entity
        final String replacement;
        if (!supportedByXML10(codepoint)) {
            replacement = "";
        }
        else if (codepoint == '<') {
            replacement = "&lt;";
        }
        else if (codepoint == '>') {
            replacement = "&gt;";
        }
        else if (codepoint == '&') {
            replacement = "&amp;";
        }
        else if (codepoint == '\'') {
            replacement = "&apos;";
        }
        else if (codepoint == '"') {
            replacement = "&quot;";
        }
        else {
            replacement = null;
        }

        if (replacement != null) {
            if (escaped == null) {
                escaped = new StringBuilder(max);
            }

            // flush the unchanged chars collected so far
            escaped.append(text, readOffset, i);
            escaped.append(replacement);
            readOffset = next;
        }

        i = next;
    }

    if (escaped == null) {
        return text;
    }

    // flush the unchanged tail
    escaped.append(text, readOffset, max);
    return escaped.toString();
}

/**
* Escape the string to be used as attribute value.
* Only {@code <}, {@code &} and {@code "} have to be escaped (see
Expand All @@ -122,33 +195,96 @@ public static String escapeXmlChars(final String s) {
* @return the escaped value
*/
public static String escapeXmlAttributeValue(final String attValue) {
final int len = attValue.length();
StringBuilder sb = null;
for (int i = len - 1; i >= 0; --i) {
final char c = attValue.charAt(i);
String replacement = null;
if (c == '<') {
replacement = "&lt;";
}
else if (c == '&') {
replacement = "&amp;";
}
else if (c == '\"') {
replacement = "&quot;";
}
if (attValue == null) {
return null;
}

StringBuilder escaped = null;

final int offset = 0;
final int max = attValue.length();

int readOffset = offset;

for (int i = offset; i < max; i++) {
final int codepoint = Character.codePointAt(attValue, i);
final boolean codepointValid = supportedByXML10(codepoint);

if (!codepointValid
|| codepoint == '<'
|| codepoint == '&'
|| codepoint == '"') {

// replacement required
if (escaped == null) {
escaped = new StringBuilder(max);
}

if (i > readOffset) {
escaped.append(attValue, readOffset, i);
}

if (Character.charCount(codepoint) > 1) {
i++;
}
readOffset = i + 1;

// skip
if (!codepointValid) {
continue;
}

if (replacement != null) {
if (sb == null) {
sb = new StringBuilder(attValue);
if (codepoint == '<') {
escaped.append("&lt;");
}
else if (codepoint == '&') {
escaped.append("&amp;");
}
else if (codepoint == '\"') {
escaped.append("&quot;");
}
sb.replace(i, i + 1, replacement);
}
}

if (sb != null) {
return sb.toString();
if (escaped == null) {
return attValue;
}

if (max > readOffset) {
escaped.append(attValue, readOffset, max);
}
return attValue;

return escaped.toString();
}

/*
 * XML 1.0 does not allow control characters or unpaired Unicode surrogate codepoints.
 * Returns true only for code points that fit in the following ranges:
 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 * Everything else (including negative values and values above #x10FFFF) is not supported.
 */
private static boolean supportedByXML10(final int codepoint) {
    if (codepoint < 0x20) {
        // from the control chars only tab, line feed and carriage return are allowed
        return codepoint == 0x9 || codepoint == 0xA || codepoint == 0xD;
    }
    if (codepoint <= 0xD7FF) {
        return true;
    }

    // [#xD800-#xDFFF] surrogates
    if (codepoint < 0xE000) {
        return false;
    }
    if (codepoint <= 0xFFFD) {
        return true;
    }

    // #xFFFE and #xFFFF
    if (codepoint < 0x10000) {
        return false;
    }

    // supplementary planes; larger values are outside of the listed ranges
    return codepoint <= 0x10FFFF;
}

/**
Expand Down
Loading

0 comments on commit 43e0756

Please sign in to comment.