Skip to content

Commit 43e0756

Browse files
committed
Apache commons-text is no longer a runtime dependency
1 parent 7d0d2f9 commit 43e0756

File tree

5 files changed

+306
-41
lines changed

5 files changed

+306
-41
lines changed

pom.xml

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,17 +1257,6 @@
12571257
<artifactId>commons-lang3</artifactId>
12581258
<version>3.17.0</version>
12591259
</dependency>
1260-
<dependency>
1261-
<groupId>org.apache.commons</groupId>
1262-
<artifactId>commons-text</artifactId>
1263-
<version>1.13.0</version>
1264-
<exclusions>
1265-
<exclusion>
1266-
<groupId>org.apache.commons</groupId>
1267-
<artifactId>commons-lang3</artifactId>
1268-
</exclusion>
1269-
</exclusions>
1270-
</dependency>
12711260
<dependency>
12721261
<groupId>commons-io</groupId>
12731262
<artifactId>commons-io</artifactId>
@@ -1381,6 +1370,19 @@
13811370
</exclusions>
13821371
</dependency>
13831372

1373+
<dependency>
1374+
<groupId>org.apache.commons</groupId>
1375+
<artifactId>commons-text</artifactId>
1376+
<version>1.13.0</version>
1377+
<scope>test</scope>
1378+
<exclusions>
1379+
<exclusion>
1380+
<groupId>org.apache.commons</groupId>
1381+
<artifactId>commons-lang3</artifactId>
1382+
</exclusion>
1383+
</exclusions>
1384+
</dependency>
1385+
13841386
<dependency>
13851387
<groupId>commons-fileupload</groupId>
13861388
<artifactId>commons-fileupload</artifactId>

src/changes/changes.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
<body>
1010
<release version="4.8.0" date="January xx, 2025" description="Bugfixes">
11+
<action type="update" dev="rbri">
12+
Apache commons-text is no longer a runtime dependency.
13+
</action>
1114
<action type="update" dev="RhinoTeam">
1215
core-js: Extract function calling out of the interpreter loop for performance.
1316
</action>

src/main/java/org/htmlunit/html/HtmlTextArea.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import java.util.Map;
2222

2323
import org.apache.commons.lang3.StringUtils;
24-
import org.apache.commons.text.StringEscapeUtils;
2524
import org.htmlunit.SgmlPage;
2625
import org.htmlunit.html.impl.SelectableTextInput;
2726
import org.htmlunit.html.impl.SelectableTextSelectionDelegate;
@@ -436,7 +435,7 @@ protected void printXml(final String indent, final PrintWriter printWriter) {
436435
printOpeningTagContentAsXml(printWriter);
437436

438437
printWriter.print(">");
439-
printWriter.print(StringEscapeUtils.escapeXml10(getText()));
438+
printWriter.print(org.htmlunit.util.StringUtils.escapeXml(getText()));
440439
printWriter.print("</textarea>");
441440
}
442441

src/main/java/org/htmlunit/util/StringUtils.java

Lines changed: 162 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,7 @@ public static boolean startsWithIgnoreCase(final String s, final String expected
100100
}
101101

102102
/**
103-
* Escapes the characters '&lt;', '&gt;' and '&amp;' into their XML entity equivalents. Note that
104-
* sometimes we have to use this method instead of
105-
* {@link org.apache.commons.lang3.StringEscapeUtils#escapeXml(String)} or
106-
* {@link org.apache.commons.lang3.StringEscapeUtils#escapeHtml4(String)} because those methods
107-
* escape some unicode characters as well.
103+
* Escapes the characters '&lt;', '&gt;' and '&amp;' into their XML entity equivalents.
108104
*
109105
* @param s the string to escape
110106
* @return the escaped form of the specified string
@@ -114,6 +110,83 @@ public static String escapeXmlChars(final String s) {
114110
replaceEach(s, new String[] {"&", "<", ">"}, new String[] {"&amp;", "&lt;", "&gt;"});
115111
}
116112

113+
/**
114+
* Escape the string to be used as XML 1.0 content by replacing the
115+
* characters '&quot;', '&amp;', '&apos;', '&lt;', and '&gt;' with their XML entity equivalents.
* Characters that are not valid in XML 1.0 are dropped from the result.
116+
* @param text the text to escape
117+
* @return the escaped value
118+
*/
119+
public static String escapeXml(final String text) {
120+
if (text == null) {
121+
return null;
122+
}
123+
124+
StringBuilder escaped = null;
125+
126+
final int offset = 0;
127+
final int max = text.length();
128+
129+
int readOffset = offset;
130+
131+
for (int i = offset; i < max; i++) {
132+
final int codepoint = Character.codePointAt(text, i);
133+
final boolean codepointValid = supportedByXML10(codepoint);
134+
135+
if (!codepointValid
136+
|| codepoint == '<'
137+
|| codepoint == '>'
138+
|| codepoint == '&'
139+
|| codepoint == '\''
140+
|| codepoint == '"') {
141+
142+
// replacement required
143+
if (escaped == null) {
144+
escaped = new StringBuilder(max);
145+
}
146+
147+
if (i > readOffset) {
148+
escaped.append(text, readOffset, i);
149+
}
150+
151+
if (Character.charCount(codepoint) > 1) {
152+
i++;
153+
}
154+
readOffset = i + 1;
155+
156+
// skip
157+
if (!codepointValid) {
158+
continue;
159+
}
160+
161+
if (codepoint == '<') {
162+
escaped.append("&lt;");
163+
}
164+
else if (codepoint == '>') {
165+
escaped.append("&gt;");
166+
}
167+
else if (codepoint == '&') {
168+
escaped.append("&amp;");
169+
}
170+
else if (codepoint == '\'') {
171+
escaped.append("&apos;");
172+
}
173+
else if (codepoint == '\"') {
174+
escaped.append("&quot;");
175+
}
176+
}
177+
}
178+
179+
if (escaped == null) {
180+
return text;
181+
}
182+
183+
if (max > readOffset) {
184+
escaped.append(text, readOffset, max);
185+
}
186+
187+
return escaped.toString();
188+
}
189+
117190
/**
118191
* Escape the string to be used as attribute value.
119192
* Only {@code <}, {@code &} and {@code "} have to be escaped (see
@@ -122,33 +195,96 @@ public static String escapeXmlChars(final String s) {
122195
* @return the escaped value
123196
*/
124197
public static String escapeXmlAttributeValue(final String attValue) {
125-
final int len = attValue.length();
126-
StringBuilder sb = null;
127-
for (int i = len - 1; i >= 0; --i) {
128-
final char c = attValue.charAt(i);
129-
String replacement = null;
130-
if (c == '<') {
131-
replacement = "&lt;";
132-
}
133-
else if (c == '&') {
134-
replacement = "&amp;";
135-
}
136-
else if (c == '\"') {
137-
replacement = "&quot;";
138-
}
198+
if (attValue == null) {
199+
return null;
200+
}
201+
202+
StringBuilder escaped = null;
203+
204+
final int offset = 0;
205+
final int max = attValue.length();
206+
207+
int readOffset = offset;
208+
209+
for (int i = offset; i < max; i++) {
210+
final int codepoint = Character.codePointAt(attValue, i);
211+
final boolean codepointValid = supportedByXML10(codepoint);
212+
213+
if (!codepointValid
214+
|| codepoint == '<'
215+
|| codepoint == '&'
216+
|| codepoint == '"') {
217+
218+
// replacement required
219+
if (escaped == null) {
220+
escaped = new StringBuilder(max);
221+
}
222+
223+
if (i > readOffset) {
224+
escaped.append(attValue, readOffset, i);
225+
}
226+
227+
if (Character.charCount(codepoint) > 1) {
228+
i++;
229+
}
230+
readOffset = i + 1;
231+
232+
// skip
233+
if (!codepointValid) {
234+
continue;
235+
}
139236

140-
if (replacement != null) {
141-
if (sb == null) {
142-
sb = new StringBuilder(attValue);
237+
if (codepoint == '<') {
238+
escaped.append("&lt;");
239+
}
240+
else if (codepoint == '&') {
241+
escaped.append("&amp;");
242+
}
243+
else if (codepoint == '\"') {
244+
escaped.append("&quot;");
143245
}
144-
sb.replace(i, i + 1, replacement);
145246
}
146247
}
147248

148-
if (sb != null) {
149-
return sb.toString();
249+
if (escaped == null) {
250+
return attValue;
251+
}
252+
253+
if (max > readOffset) {
254+
escaped.append(attValue, readOffset, max);
150255
}
151-
return attValue;
256+
257+
return escaped.toString();
258+
}
259+
260+
/*
261+
* XML 1.0 does not allow control characters or unpaired Unicode surrogate codepoints.
262+
* We will remove characters that do not fit in the following ranges:
263+
* #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
264+
*/
265+
private static boolean supportedByXML10(final int codepoint) {
266+
if (codepoint < 0x20) {
267+
return codepoint == 0x9 || codepoint == 0xA || codepoint == 0xD;
268+
}
269+
if (codepoint <= 0xD7FF) {
270+
return true;
271+
}
272+
273+
if (codepoint < 0xE000) {
274+
return false;
275+
}
276+
if (codepoint <= 0xFFFD) {
277+
return true;
278+
}
279+
280+
if (codepoint < 0x10000) {
281+
return false;
282+
}
283+
if (codepoint <= 0x10FFFF) {
284+
return true;
285+
}
286+
287+
return true;
152288
}
153289

154290
/**

0 commit comments

Comments
 (0)