From d8ee7296c0f926e398ab444c1b25f17484a8907f Mon Sep 17 00:00:00 2001
From: sebthom
Date: Mon, 19 Aug 2024 18:16:30 +0200
Subject: [PATCH] fix: DocumentInputStream does not handle surrogate pairs correctly

---
Reviewer notes (free-form commentary, not part of the commit):

  Summary: adds CharsInputStream, an InputStream that pulls chars by index,
  encodes them chunk-wise through a CharsetEncoder (UTF-8 when no charset is
  given), keeps high/low surrogate pairs together and substitutes U+FFFD for a
  high surrogate that is not followed by a low surrogate. DocumentInputStream
  now extends it and takes the charset from the document's ITextFileBuffer
  encoding when one is available. ContentTypeHelper only streams the document
  when buffer.getDocument() is non-null.

  NOTE(review): in refillBuffer() a low surrogate that is NOT preceded by a
  high surrogate goes through the plain charBuffer.put(nextChar) branch, and
  encoder error results are rethrown. Presumably such input surfaces as an
  IOException instead of U+FFFD - confirm, and consider replacing it too.

  NOTE(review): the result of encoder.encode(charBuffer, byteBuffer, false) is
  checked with isError() only, never isOverflow(); byteBuffer is allocated as
  bufferSize * 4 bytes. That looks sufficient for UTF-8 - verify for other
  charsets that can arrive via buffer.getEncoding().

  NOTE(review): line breaks and indentation of this patch file were
  reconstructed (tabs assumed); if a hunk is rejected because of whitespace,
  retry with whitespace-insensitive matching.

 .../ui/internal/utils/CharsInputStream.java   | 220 ++++++++++++++++++
 .../ui/internal/utils/ContentTypeHelper.java  |   2 +-
 .../internal/utils/DocumentInputStream.java   |  84 +++----
 .../internal/utils/CharsInputStreamTest.java  | 125 ++++++++++
 4 files changed, 375 insertions(+), 56 deletions(-)
 create mode 100644 org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java
 create mode 100644 org.eclipse.tm4e.ui/src/test/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStreamTest.java

diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java
new file mode 100644
index 000000000..3ac479758
--- /dev/null
+++ b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java
@@ -0,0 +1,220 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.tm4e.ui.internal.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+import java.util.function.IntSupplier;
+
+import org.eclipse.jdt.annotation.Nullable;
+
+class CharsInputStream extends InputStream {
+	@FunctionalInterface
+	interface CharsSupplier {
+		char charAt(int index) throws Exception;
+	}
+
+	enum EncoderState {
+		ENCODING,
+		FLUSHING,
+		DONE
+	}
+
+	/** 512 surrogate character pairs */
+	private static final int DEFAULT_BUFFER_SIZE = 512;
+	private static final int EOF = -1;
+
+	private final int bufferSize;
+	private final CharBuffer charBuffer;
+	private final ByteBuffer byteBuffer;
+	private final CharsetEncoder encoder;
+	private EncoderState encoderState = EncoderState.ENCODING;
+
+	private int charIndex = 0;
+	private final CharsSupplier chars;
+	private final IntSupplier charsLength;
+
+	CharsInputStream(final CharSequence chars) {
+		this(chars, null);
+	}
+
+	CharsInputStream(final CharSequence chars, final @Nullable Charset charset) {
+		this(chars, charset, DEFAULT_BUFFER_SIZE);
+	}
+
+	CharsInputStream(final CharSequence chars, final @Nullable Charset charset, final int bufferSize) {
+		this(chars::charAt, chars::length, charset, bufferSize);
+	}
+
+	CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength) {
+		this(chars, charsLength, null);
+	}
+
+	/**
+	 * @param chars function to access indexed chars.
+	 * @param charsLength function to get the number of indexed chars provided by the chars parameter.
+	 */
+	CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset) {
+		this(chars, charsLength, charset, DEFAULT_BUFFER_SIZE);
+	}
+
+	/**
+	 * @param chars function to access indexed chars.
+	 * @param charsLength function to get the number of indexed chars provided by the chars parameter.
+	 * @param bufferSize number of surrogate character pairs to encode at once.
+	 */
+	CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset, final int bufferSize) {
+		if (bufferSize < 1)
+			throw new IllegalArgumentException("[bufferSize] must be 1 or larger");
+		encoder = (charset == null ? StandardCharsets.UTF_8 : charset).newEncoder();
+
+		this.bufferSize = bufferSize;
+		charBuffer = CharBuffer.allocate(bufferSize * 2); // buffer for 2 chars (high/low surrogate)
+		byteBuffer = ByteBuffer.allocate(bufferSize * 4); // buffer for one UTF character (up to 4 bytes)
+		byteBuffer.flip();
+		charBuffer.flip();
+
+		this.chars = chars;
+		this.charsLength = charsLength;
+	}
+
+	@Override
+	public int available() {
+		final int remaining = byteBuffer.remaining();
+		return remaining == 0 ? charsLength.getAsInt() - charIndex : remaining;
+	}
+
+	private boolean flushEncoder() throws IOException {
+		if (encoderState == EncoderState.DONE)
+			return false;
+
+		if (encoderState == EncoderState.ENCODING) {
+			encoderState = EncoderState.FLUSHING;
+		}
+
+		// flush
+		byteBuffer.clear();
+		final CoderResult result = encoder.flush(byteBuffer);
+		byteBuffer.flip();
+
+		if (result.isOverflow()) // byteBuffer too small
+			return true;
+
+		if (result.isError()) {
+			result.throwException();
+		}
+
+		encoderState = EncoderState.DONE;
+		return byteBuffer.hasRemaining();
+	}
+
+	@Override
+	public int read() throws IOException {
+		if (!byteBuffer.hasRemaining() && !refillBuffer())
+			return EOF;
+		return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
+	}
+
+	@Override
+	public int read(final byte[] buf, final int off, final int bytesToRead) throws IOException {
+		Objects.checkFromIndexSize(off, bytesToRead, buf.length);
+		if (bytesToRead == 0)
+			return 0;
+
+		int bytesRead = 0;
+		int bytesReadable = byteBuffer.remaining();
+
+		while (bytesRead < bytesToRead) {
+			if (bytesReadable == 0) {
+				if (refillBuffer()) {
+					bytesReadable = byteBuffer.remaining();
+				} else
+					return bytesRead == 0 ? EOF : bytesRead;
+			}
+
+			final int bytesToReadNow = Math.min(bytesToRead - bytesRead, bytesReadable);
+			byteBuffer.get(buf, off + bytesRead, bytesToReadNow);
+			bytesRead += bytesToReadNow;
+			bytesReadable -= bytesToReadNow;
+		}
+
+		return bytesRead;
+	}
+
+	private boolean refillBuffer() throws IOException {
+		if (encoderState == EncoderState.DONE)
+			return false;
+
+		if (encoderState == EncoderState.FLUSHING)
+			return flushEncoder();
+
+		final int charsLen = charsLength.getAsInt();
+
+		// if EOF is reached transition to flushing
+		if (charIndex >= charsLen) {
+			// finalize encoding before switching to flushing
+			byteBuffer.clear();
+			final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
+			byteBuffer.flip();
+			if (result.isError()) {
+				result.throwException();
+			}
+			return flushEncoder();
+		}
+
+		try {
+			charBuffer.clear();
+			for (int i = 0; i < bufferSize && charIndex < charsLen; i++) {
+				final char nextChar = chars.charAt(charIndex++);
+				if (Character.isHighSurrogate(nextChar)) { // handle surrogate pairs
+					if (charIndex < charsLen) {
+						final char lowSurrogate = chars.charAt(charIndex);
+						if (Character.isLowSurrogate(lowSurrogate)) {
+							charIndex++;
+							charBuffer.put(nextChar);
+							charBuffer.put(lowSurrogate);
+						} else {
+							// missing low surrogate - fallback to replacement character
+							charBuffer.put('\uFFFD');
+						}
+					} else {
+						// missing low surrogate - fallback to replacement character
+						charBuffer.put('\uFFFD');
+						break;
+					}
+				} else {
+					charBuffer.put(nextChar);
+				}
+			}
+			charBuffer.flip();
+
+			// encode chars into bytes
+			byteBuffer.clear();
+			final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
+			byteBuffer.flip();
+			if (result.isError()) {
+				result.throwException();
+			}
+		} catch (final Exception ex) {
+			throw new IOException(ex);
+		}
+
+		return true;
+	}
+}
diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java
index 43cc25410..877179036 100644
--- a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java
+++ b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java
@@ -112,7 +112,7 @@ private static ContentTypeInfo getContentTypes(final ITextFileBuffer buffer) {
 			if (bufferContentType != null) {
 				contentTypes.add(bufferContentType);
 			}
-			if (buffer.isDirty()) {
+			if (buffer.isDirty() && buffer.getDocument() != null) {
 				// Buffer is dirty (content of the filesystem is not synch with
 				// the editor content), use IDocument content.
 				try (var input = new DocumentInputStream(buffer.getDocument())) {
diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java
index 69bc716d3..c5934133a 100644
--- a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java
+++ b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java
@@ -1,71 +1,45 @@
 /*******************************************************************************
- * Copyright (c) 2005, 2008 IBM Corporation and others.
- * All rights reserved. This program and the accompanying materials
- * are made available under the terms of the Eclipse Public License v1.0
- * which accompanies this distribution, and is available at
- * http://www.eclipse.org/legal/epl-v10.html
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
  *
  * Contributors:
- * IBM Corporation - initial API and implementation
- * QNX Software System
- * Sebastian Thomschke - implement read(byte[], int, int)
+ * Sebastian Thomschke - initial implementation
  *******************************************************************************/
 package org.eclipse.tm4e.ui.internal.utils;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Objects;
+import java.nio.charset.Charset;
 
-import org.eclipse.jface.text.BadLocationException;
+import org.eclipse.core.filebuffers.FileBuffers;
+import org.eclipse.core.filebuffers.ITextFileBuffer;
+import org.eclipse.core.filebuffers.ITextFileBufferManager;
+import org.eclipse.jdt.annotation.Nullable;
 import org.eclipse.jface.text.IDocument;
+import org.eclipse.tm4e.ui.TMUIPlugin;
 
-/**
- * Input stream which reads from a document
- */
-final class DocumentInputStream extends InputStream {
-
-	private final IDocument doc;
-	private int pos = 0;
-
-	DocumentInputStream(final IDocument document) {
-		doc = document;
-	}
-
-	@Override
-	public int read(final byte[] buff, final int buffOffset, final int len) throws IOException {
-		Objects.checkFromIndexSize(buffOffset, len, buff.length);
+final class DocumentInputStream extends CharsInputStream {
 
-		if (len == 0)
-			return 0;
-
-		final var docLen = doc.getLength();
-		if (pos >= docLen)
-			return -1;
-
-		var bytesRead = -1;
+	private static @Nullable Charset getCharset(final IDocument document) {
+		final ITextFileBufferManager bufferManager = FileBuffers.getTextFileBufferManager();
+		if (bufferManager == null)
+			return null;
+		final ITextFileBuffer buffer = bufferManager.getTextFileBuffer(document);
+		if (buffer == null)
+			return null;
 		try {
-			buff[buffOffset] = (byte) doc.getChar(pos++);
-			bytesRead = 1;
-			while (bytesRead < len) {
-				if (pos >= docLen) {
-					break;
-				}
-				buff[buffOffset + bytesRead++] = (byte) doc.getChar(pos++);
-			}
-		} catch (final BadLocationException ex) {
-			// ignore
+			final String charsetName = buffer.getEncoding();
+			if (charsetName != null)
+				return Charset.forName(charsetName);
+		} catch (final Exception ex) {
+			TMUIPlugin.logError(ex);
 		}
-		return bytesRead;
+		return null;
 	}
 
-	@Override
-	public int read() throws IOException {
-		try {
-			if (pos < doc.getLength())
-				return doc.getChar(pos++) & 0xFF;
-		} catch (final BadLocationException ex) {
-			// ignore
-		}
-		return -1;
+	DocumentInputStream(final IDocument doc) {
+		super(doc::getChar, doc::getLength, getCharset(doc));
 	}
 }
diff --git a/org.eclipse.tm4e.ui/src/test/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStreamTest.java b/org.eclipse.tm4e.ui/src/test/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStreamTest.java
new file mode 100644
index 000000000..0c0d88015
--- /dev/null
+++ b/org.eclipse.tm4e.ui/src/test/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStreamTest.java
@@ -0,0 +1,125 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.tm4e.ui.internal.utils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.junit.Test;
+
+public class CharsInputStreamTest {
+	private static final String TEST_ASCII = "Hello, World!";
+
+	private static final String EMOJI = "😊";
+	private static final int EMOJI_BYTES_LEN = EMOJI.getBytes(UTF_8).length;
+	private static final String JAPANESE = "こんにちは";
+	private static final String TEST_UNICODE = EMOJI + JAPANESE;
+	private static final int TEST_UNICODE_BYTES_LEN = TEST_UNICODE.getBytes(UTF_8).length;
+
+	@Test
+	public void testAvailable() throws IOException {
+		try (var is = new CharsInputStream(TEST_ASCII)) {
+			assertEquals(TEST_ASCII.length(), is.available());
+			final byte[] buffer = new byte[4];
+			is.read(buffer);
+			assertEquals(TEST_ASCII.length() - 4, is.available());
+			is.readAllBytes();
+			assertEquals(0, is.available());
+		}
+
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			assertTrue(is.available() > 0);
+			is.read(new byte[10]);
+			assertTrue(is.available() > 0);
+			is.readAllBytes();
+			assertEquals(0, is.available());
+		}
+	}
+
+	@Test
+	public void testEndOfStream() throws IOException {
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			is.skip(Long.MAX_VALUE);
+			assertEquals(-1, is.read());
+		}
+	}
+
+	@Test
+	public void testReadEachByte() throws IOException {
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			final var bytesRead = new ArrayList<Byte>();
+			int b;
+			while ((b = is.read()) != -1) {
+				bytesRead.add((byte) b);
+			}
+
+			final byte[] byteArray = new byte[bytesRead.size()];
+			for (int i = 0; i < bytesRead.size(); i++) {
+				byteArray[i] = bytesRead.get(i);
+			}
+			assertEquals(TEST_UNICODE, new String(byteArray, UTF_8));
+		}
+	}
+
+	@Test
+	public void testReadIntoByteArray() throws IOException {
+		final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
+
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			final int bytesRead = is.read(buffer, 0, buffer.length);
+
+			assertEquals(TEST_UNICODE_BYTES_LEN, bytesRead);
+			assertEquals(TEST_UNICODE, new String(buffer, 0, bytesRead, UTF_8));
+		}
+	}
+
+	@Test
+	public void testSkip() throws IOException {
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			// skip emoji
+			final long skipped = is.skip(EMOJI_BYTES_LEN);
+			assertEquals(EMOJI_BYTES_LEN, skipped);
+
+			final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
+			final int bytesRead = is.read(japanese);
+
+			assertEquals(JAPANESE, new String(japanese, 0, bytesRead, UTF_8));
+		}
+	}
+
+	@Test
+	public void testHighSurrogateAtEndOfInput() throws IOException {
+		final char[] invalidSequence = { 'A', '\uD800' }; // valid char followed by an isolated high surrogate
+		try (var is = new CharsInputStream(new String(invalidSequence), UTF_8)) {
+			final byte[] result = is.readAllBytes();
+			final String output = new String(result, UTF_8);
+
+			// the high surrogate at the end should be replaced by the Unicode replacement char
+			assertEquals("A\uFFFD", output);
+		}
+	}
+
+	@Test
+	public void testHighSurrogateWithoutLowSurrogate() throws IOException {
+		final char[] invalidSequence = { '\uD800', 'A' }; // \uD800 is a high surrogate, followed by 'A'
+		try (var is = new CharsInputStream(new String(invalidSequence), UTF_8)) {
+			final byte[] result = is.readAllBytes();
+			final String output = new String(result, UTF_8);
+
+			// the invalid surrogate pair should be replaced by the Unicode replacement char
+			assertEquals("\uFFFD" + "A", output);
+		}
+	}
+}