From 2e648118930f97808b9e28653d6f0c25560023ac Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 6 Aug 2024 17:04:26 +1000 Subject: [PATCH] Added SimpleBufferedInput Replaces uses of BufferedInputStreams Advantages: - can recycle the byte[] buffer; so significant reduction in GC load - if consumer is reading into an array and there is no mark, no need to allocate a buffer - doesn't aim to support multi-threaded reads, so no syncs or locking Also, reduced the DefaultBufferSize to 8K from 32K --- .../internal/ControllableInputStream.java | 14 +- .../org/jsoup/internal/SharedConstants.java | 2 +- .../jsoup/internal/SimpleBufferedInput.java | 132 ++++++++++++++++++ 3 files changed, 142 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/jsoup/internal/SimpleBufferedInput.java diff --git a/src/main/java/org/jsoup/internal/ControllableInputStream.java b/src/main/java/org/jsoup/internal/ControllableInputStream.java index 96cbd15c0e..720dffa01c 100644 --- a/src/main/java/org/jsoup/internal/ControllableInputStream.java +++ b/src/main/java/org/jsoup/internal/ControllableInputStream.java @@ -20,7 +20,7 @@ */ // reimplemented from ConstrainableInputStream for JDK21 - extending BufferedInputStream will pin threads during read public class ControllableInputStream extends FilterInputStream { - private final BufferedInputStream buff; + private final InputStream buff; private final boolean capped; private final int maxSize; private long startTime; @@ -35,7 +35,7 @@ public class ControllableInputStream extends FilterInputStream { private int contentLength = -1; private int readPos = 0; // amount read; can be reset() - private ControllableInputStream(BufferedInputStream in, int maxSize) { + private ControllableInputStream(InputStream in, int maxSize) { super(in); Validate.isTrue(maxSize >= 0); buff = in; @@ -54,12 +54,13 @@ private ControllableInputStream(BufferedInputStream in, int maxSize) { * @return a controllable input stream */ public static ControllableInputStream wrap(InputStream in, int bufferSize, int maxSize) { + // bufferSize currently unused; consider implementing as a min size in the SoftPool recycler if (in instanceof ControllableInputStream) return (ControllableInputStream) in; else if (in instanceof BufferedInputStream) - return new ControllableInputStream((BufferedInputStream) in, maxSize); + return new ControllableInputStream(in, maxSize); else - return new ControllableInputStream(new BufferedInputStream(in, bufferSize), maxSize); + return new ControllableInputStream(new SimpleBufferedInput(in), maxSize); } @Override @@ -173,6 +174,9 @@ private boolean expired() { } public BufferedInputStream inputStream() { - return buff; + // called via HttpConnection.Response.bodyStream(), needs an OG BufferedInputStream + if (buff instanceof BufferedInputStream) + return (BufferedInputStream) buff; // if originally supplied a BIS in .wrap() + else return new BufferedInputStream(buff); } } diff --git a/src/main/java/org/jsoup/internal/SharedConstants.java b/src/main/java/org/jsoup/internal/SharedConstants.java index 8e8520cf62..6fdf05e76d 100644 --- a/src/main/java/org/jsoup/internal/SharedConstants.java +++ b/src/main/java/org/jsoup/internal/SharedConstants.java @@ -10,7 +10,7 @@ public final class SharedConstants { public static final String RangeKey = "jsoup.start"; public static final String EndRangeKey = "jsoup.end"; - public static final int DefaultBufferSize = 1024 * 32; + public static final int DefaultBufferSize = 8 * 1024; public static final String[] FormSubmitTags = { "input", "keygen", "object", "select", "textarea" diff --git a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java new file mode 100644 index 0000000000..f477b5beb8 --- /dev/null +++ b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java @@ -0,0 +1,132 @@ +package org.jsoup.internal; + +import org.jsoup.helper.Validate; +import org.jspecify.annotations.Nullable; + +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; + +import static org.jsoup.internal.SharedConstants.DefaultBufferSize; + +/** + A simple implemented of a buffered input stream, in which we can control the byte[] buffer to recycle it. Not safe for + use between threads; no sync or locks. The buffer is borrowed on initial demand in fill. */ +class SimpleBufferedInput extends FilterInputStream { + static final int BufferSize = DefaultBufferSize; + static final SoftPool BufferPool = new SoftPool<>(() -> new byte[BufferSize]); + + byte @Nullable [] byteBuf; // the byte buffer; recycled via SoftPool. Created in fill if required + int bufPos; + int bufLength; + int bufMark = -1; + + SimpleBufferedInput(InputStream in) { + super(in); + } + + @Override + public int read() throws IOException { + if (bufPos >= bufLength) { + fill(); + if (bufPos >= bufLength) + return -1; + } + return getBuf()[bufPos++] & 0xff; + } + + @Override + public int read(byte[] dest, int offset, int desiredLen) throws IOException { + Validate.notNull(dest); + if (offset < 0 || desiredLen < 0 || desiredLen > dest.length - offset) { + throw new IndexOutOfBoundsException(); + } else if (desiredLen == 0) { + return 0; + } + + int bufAvail = bufLength - bufPos; + if (bufAvail <= 0) { + if (desiredLen >= BufferSize && bufMark < 0) { + // We can skip creating / copying into a local buffer; just pass through + return in.read(dest, offset, desiredLen); + } + fill(); + bufAvail = bufLength - bufPos; + } + + int read = Math.min(bufAvail, desiredLen); + if (read <= 0) { + return -1; + } + + System.arraycopy(getBuf(), bufPos, dest, offset, read); + bufPos += read; + return read; + } + + private void fill() throws IOException { + if (byteBuf == null) { // get one on first demand + byteBuf = BufferPool.borrow(); + } + + if (bufMark < 0) { // no mark, can lose buffer (assumes we've read to bufLen) + bufPos = 0; + } else if (bufPos >= BufferSize) { // no room left in buffer + if (bufMark > 0) { // can throw away early part of the buffer + int size = bufPos - bufMark; + System.arraycopy(byteBuf, bufMark, byteBuf, 0, size); + bufPos = size; + bufMark = 0; + } else { // invalidate mark + bufMark = -1; + bufPos = 0; + } + } + bufLength = bufPos; + int read = in.read(byteBuf, bufPos, byteBuf.length - bufPos); + if (read > 0) { + bufLength = read + bufPos; + while (byteBuf.length - bufLength > 0) { // read in more if we have space, without blocking + if (in.available() < 1) break; + read = in.read(byteBuf, bufLength, byteBuf.length - bufLength); + if (read <= 0) break; + bufLength += read; + } + } + } + + byte[] getBuf() { + Validate.notNull(byteBuf); + return byteBuf; + } + + @Override + public int available() throws IOException { + if (byteBuf != null && bufLength - bufPos > 0) + return bufLength - bufPos; // doesn't include those in.available(), but mostly used as a block test + return in.available(); + } + + @Override + public void mark(int readlimit) { + if (readlimit > BufferSize) { + throw new IllegalArgumentException("Read-ahead limit is greater than buffer size"); + } + bufMark = bufPos; + } + + @Override + public void reset() throws IOException { + if (bufMark < 0) + throw new IOException("Resetting to invalid mark"); + bufPos = bufMark; + } + + @Override + public void close() throws IOException { + super.close(); + if (byteBuf == null) return; // already closed, or never allocated + BufferPool.release(byteBuf); // return the buffer to the pool + byteBuf = null; // NPE further attempts to read + } +}