Skip to content

Commit e94e0cc

Browse files
committed
Improved detection of XML/XHtml content when no content type header is set (#663)
1 parent c6d7da9 commit e94e0cc

File tree

4 files changed

+141
-4
lines changed

4 files changed

+141
-4
lines changed

src/changes/changes.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
<body>
1010
<release version="3.8.0" date="November xx, 2023" description="Chrome/Edge 119, Bugfixes">
11+
<action type="fix" dev="rbri" issue="#663">
12+
Improved detection of XML/XHtml content when no content type header is set.
13+
</action>
1114
<action type="update" dev="rbri" issue="#658">
1215
Internal method HtmlInput.setType(String, boolean) renamed to changeType(String, boolean) and
1316
a return value added. Please check the javadoc if you use this.

src/main/java/org/htmlunit/DefaultPageCreator.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ public static PageType determinePageType(final WebResponse webResponse) throws I
168168
return determinePageType(MimeType.TEXT_PLAIN);
169169
}
170170

171+
// looks a bit strange but correct
172+
// if the content starts with a BOM, browsers handle it as a text page
171173
if (startsWith(bytes, markerUTF8_) || startsWith(bytes, markerUTF16BE_)
172174
|| startsWith(bytes, markerUTF16LE_)) {
173175
return determinePageType(MimeType.TEXT_PLAIN);
@@ -178,6 +180,11 @@ public static PageType determinePageType(final WebResponse webResponse) throws I
178180
}
179181

180182
final String asAsciiString = new String(bytes, StandardCharsets.US_ASCII).trim().toUpperCase(Locale.ROOT);
183+
184+
if (asAsciiString.startsWith("<?XML")) {
185+
return determinePageType(MimeType.TEXT_XML);
186+
}
187+
181188
for (final String htmlPattern : htmlPatterns) {
182189
try {
183190
if ('<' == asAsciiString.charAt(0)) {

src/main/java/org/htmlunit/util/XmlUtils.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ public static Document buildDocument(final WebResponse webResponse)
134134
webResponse.getContentCharset());
135135

136136
// we have to do the blank input check and the parsing in one step
137-
final TrackBlankContentReader tracker = new TrackBlankContentReader(reader);
137+
final TrackBlankContentAndSkipLeadingWhitespaceReader tracker
138+
= new TrackBlankContentAndSkipLeadingWhitespaceReader(reader);
138139

139140
final InputSource source = new InputSource(tracker);
140141
final DocumentBuilder builder = factory.newDocumentBuilder();
@@ -155,11 +156,11 @@ public static Document buildDocument(final WebResponse webResponse)
155156
/**
156157
* Helper for memory and performance optimization.
157158
*/
158-
private static final class TrackBlankContentReader extends Reader {
159+
private static final class TrackBlankContentAndSkipLeadingWhitespaceReader extends Reader {
159160
private final Reader reader_;
160161
private boolean wasBlank_ = true;
161162

162-
TrackBlankContentReader(final Reader characterStream) {
163+
TrackBlankContentAndSkipLeadingWhitespaceReader(final Reader characterStream) {
163164
reader_ = characterStream;
164165
}
165166

@@ -174,13 +175,18 @@ public void close() throws IOException {
174175

175176
@Override
176177
public int read(final char[] cbuf, final int off, final int len) throws IOException {
177-
final int result = reader_.read(cbuf, off, len);
178+
int result = reader_.read(cbuf, off, len);
178179

179180
if (wasBlank_ && result > -1) {
180181
for (int i = 0; i < result; i++) {
181182
final char ch = cbuf[off + i];
182183
if (!Character.isWhitespace(ch)) {
183184
wasBlank_ = false;
185+
if (i > 0) {
186+
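// skip the leading whitespace; NOTE(review): chars were read at cbuf[off + i], so the copy source should be off + i and the length result - i - confirm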
187+
System.arraycopy(cbuf, i, cbuf, off, len - i);
188+
result -= i;
189+
}
184190
break;
185191
}
186192
}

src/test/java/org/htmlunit/DefaultPageCreatorTest.java

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
package org.htmlunit;
1616

1717
import java.io.IOException;
18+
import java.io.OutputStream;
19+
import java.io.OutputStreamWriter;
1820
import java.io.Writer;
1921
import java.util.HashMap;
2022
import java.util.Map;
@@ -104,6 +106,68 @@ protected void doGet(final HttpServletRequest request, final HttpServletResponse
104106
}
105107
}
106108

109+
/**
110+
* @throws Exception if the test fails
111+
*/
112+
@Test
113+
public void noContentTypeXhtml() throws Exception {
114+
final Map<String, Class<? extends Servlet>> servlets = new HashMap<>();
115+
servlets.put("/test", NoContentTypeXhtmlServlet.class);
116+
startWebServer("./", null, servlets);
117+
118+
final WebClient client = getWebClient();
119+
final XHtmlPage page = client.getPage(URL_FIRST + "test");
120+
assertNotNull(page);
121+
}
122+
123+
/**
124+
* Servlet for {@link #noContentTypeXhtml()}.
125+
*/
126+
public static class NoContentTypeXhtmlServlet extends HttpServlet {
127+
/** {@inheritDoc} */
128+
@Override
129+
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws IOException {
130+
final Writer writer = response.getWriter();
131+
writer.write("<?xml version=\"1.0\" encoding=\"utf-8\" ?>\r\n"
132+
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" "
133+
+ "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\r\n"
134+
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"lt\" lang=\"lt\">\r\n"
135+
+ "<body>Hello World</body>\r\n"
136+
+ "</html>");
137+
}
138+
}
139+
140+
/**
141+
* @throws Exception if the test fails
142+
*/
143+
@Test
144+
public void noContentTypeXhtmlLeadingBlank() throws Exception {
145+
final Map<String, Class<? extends Servlet>> servlets = new HashMap<>();
146+
servlets.put("/test", NoContentTypeXhtmlLeadingBlankServlet.class);
147+
startWebServer("./", null, servlets);
148+
149+
final WebClient client = getWebClient();
150+
final XHtmlPage page = client.getPage(URL_FIRST + "test");
151+
assertNotNull(page);
152+
}
153+
154+
/**
155+
* Servlet for {@link #noContentTypeXhtmlLeadingBlank()}.
156+
*/
157+
public static class NoContentTypeXhtmlLeadingBlankServlet extends HttpServlet {
158+
/** {@inheritDoc} */
159+
@Override
160+
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws IOException {
161+
final Writer writer = response.getWriter();
162+
writer.write(" <?xml version=\"1.0\" encoding=\"utf-8\" ?>\r\n"
163+
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" "
164+
+ "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\r\n"
165+
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"lt\" lang=\"lt\">\r\n"
166+
+ "<body>Hello World</body>\r\n"
167+
+ "</html>");
168+
}
169+
}
170+
107171
/**
108172
* @throws Exception if the test fails
109173
*/
@@ -260,4 +324,61 @@ protected void doGet(final HttpServletRequest request, final HttpServletResponse
260324
writer.write("<html><head><title>\u00d3</title></head><body></body></html>");
261325
}
262326
}
327+
328+
/**
329+
* @throws Exception if the test fails
330+
*/
331+
@Test
332+
public void noContentTypeBomUtf8() throws Exception {
333+
final Map<String, Class<? extends Servlet>> servlets = new HashMap<>();
334+
servlets.put("/test", NoContentTypeBomUtf8Servlet.class);
335+
startWebServer("./", null, servlets);
336+
337+
final WebClient client = getWebClient();
338+
final TextPage page = client.getPage(URL_FIRST + "test");
339+
assertNotNull(page);
340+
}
341+
342+
/**
343+
* Servlet for {@link #noContentTypeBomUtf8()}.
344+
*/
345+
public static class NoContentTypeBomUtf8Servlet extends HttpServlet {
346+
/** {@inheritDoc} */
347+
@Override
348+
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws IOException {
349+
final Writer writer = response.getWriter();
350+
writer.write("\u00ef\u00bb\u00bf<html><head></head><body></body></html>");
351+
}
352+
}
353+
354+
/**
355+
* @throws Exception if the test fails
356+
*/
357+
@Test
358+
public void noContentTypeBomUtf16() throws Exception {
359+
final Map<String, Class<? extends Servlet>> servlets = new HashMap<>();
360+
servlets.put("/test", NoContentTypeBomUtf16Servlet.class);
361+
startWebServer("./", null, servlets);
362+
363+
final WebClient client = getWebClient();
364+
final TextPage page = client.getPage(URL_FIRST + "test");
365+
assertNotNull(page);
366+
}
367+
368+
/**
369+
* Servlet for {@link #noContentTypeBomUtf16()}.
370+
*/
371+
public static class NoContentTypeBomUtf16Servlet extends HttpServlet {
372+
/** {@inheritDoc} */
373+
@Override
374+
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws IOException {
375+
final OutputStream output = response.getOutputStream();
376+
output.write('\u00fe');
377+
output.write('\u00ff');
378+
output.flush();
379+
final Writer writer = new OutputStreamWriter(output, "UTF16");
380+
writer.write("<html><head></head><body></body></html>");
381+
writer.flush();
382+
}
383+
}
263384
}

0 commit comments

Comments
 (0)