Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: DocumentInputStream does not handle surrogate pairs #786

Merged
merged 1 commit into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
/*******************************************************************************
* Copyright (c) 2024 Sebastian Thomschke and others.
* This program and the accompanying materials are made
* available under the terms of the Eclipse Public License 2.0
* which is available at https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Sebastian Thomschke - initial implementation
*******************************************************************************/
package org.eclipse.tm4e.ui.internal.utils;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.function.IntSupplier;

import org.eclipse.jdt.annotation.Nullable;

/**
 * {@link InputStream} that lazily encodes an indexed sequence of chars into bytes using a given
 * {@link Charset} (UTF-8 by default).
 * <p>
 * Chars are pulled from the source in chunks of at most <code>bufferSize</code> code points. Surrogate
 * pairs are never split across two chunks. Unpaired surrogates (high or low) are replaced with the
 * Unicode replacement character U+FFFD before encoding.
 * <p>
 * Characters that cannot be encoded by the charset result in an {@link IOException}.
 * This class is not thread-safe.
 */
class CharsInputStream extends InputStream {

    /** Indexed access to the chars to encode; may throw any exception, which is rethrown wrapped in an {@link IOException}. */
    @FunctionalInterface
    interface CharsSupplier {
        char charAt(int index) throws Exception;
    }

    enum EncoderState {
        /** chars are still being read from the source and encoded */
        ENCODING,
        /** end of input was signaled to the encoder, its internal state is being flushed */
        FLUSHING,
        /** the encoder is completely flushed, no more bytes will be produced */
        DONE
    }

    /** Default number of code points (a surrogate pair counts as one) encoded per refill. */
    private static final int DEFAULT_BUFFER_SIZE = 512;
    private static final int EOF = -1;
    private static final char REPLACEMENT_CHAR = '\uFFFD';

    private final int bufferSize;
    private final CharBuffer charBuffer;
    private final ByteBuffer byteBuffer;
    private final CharsetEncoder encoder;
    private EncoderState encoderState = EncoderState.ENCODING;

    private int charIndex = 0;
    private final CharsSupplier chars;
    private final IntSupplier charsLength;

    CharsInputStream(final CharSequence chars) {
        this(chars, null);
    }

    CharsInputStream(final CharSequence chars, final @Nullable Charset charset) {
        this(chars, charset, DEFAULT_BUFFER_SIZE);
    }

    CharsInputStream(final CharSequence chars, final @Nullable Charset charset, final int bufferSize) {
        this(chars::charAt, chars::length, charset, bufferSize);
    }

    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength) {
        this(chars, charsLength, null);
    }

    /**
     * @param chars function to access indexed chars.
     * @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
     * @param charset the charset to encode with, <code>null</code> selects UTF-8.
     */
    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset) {
        this(chars, charsLength, charset, DEFAULT_BUFFER_SIZE);
    }

    /**
     * @param chars function to access indexed chars.
     * @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
     * @param charset the charset to encode with, <code>null</code> selects UTF-8.
     * @param bufferSize number of code points (a surrogate pair counts as one) to encode at once.
     * @throws IllegalArgumentException if <code>bufferSize</code> is smaller than 1.
     */
    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset, final int bufferSize) {
        if (bufferSize < 1)
            throw new IllegalArgumentException("[bufferSize] must be 1 or larger");
        encoder = (charset == null ? StandardCharsets.UTF_8 : charset).newEncoder();

        this.bufferSize = bufferSize;
        // every code point may occupy 2 chars (high/low surrogate)
        charBuffer = CharBuffer.allocate(bufferSize * 2);
        // Size the byte buffer for the worst case of the selected charset. maxBytesPerChar() also accounts
        // for content-independent prefixes such as the UTF-16 byte-order mark, which a fixed
        // "4 bytes per code point" estimate does not cover.
        final int worstCaseBytes = (int) Math.ceil(charBuffer.capacity() * (double) encoder.maxBytesPerChar());
        byteBuffer = ByteBuffer.allocate(Math.max(bufferSize * 4, worstCaseBytes));
        // both buffers start out empty in "read mode"
        byteBuffer.flip();
        charBuffer.flip();

        this.chars = chars;
        this.charsLength = charsLength;
    }

    /**
     * @return the number of already encoded bytes if there are any, otherwise the number of chars that
     *         still have to be encoded (an estimate only, as one char may be encoded to several bytes).
     */
    @Override
    public int available() {
        final int remaining = byteBuffer.remaining();
        if (remaining > 0)
            return remaining;
        // clamp to 0 in case the underlying char sequence shrank in the meantime
        return Math.max(0, charsLength.getAsInt() - charIndex);
    }

    /**
     * Flushes the internal state of the encoder into the byte buffer.
     *
     * @return <code>true</code> if this method should be called again or bytes are available in the byte buffer.
     */
    private boolean flushEncoder() throws IOException {
        if (encoderState == EncoderState.DONE)
            return false;

        if (encoderState == EncoderState.ENCODING) {
            encoderState = EncoderState.FLUSHING;
        }

        // flush
        byteBuffer.clear();
        final CoderResult result = encoder.flush(byteBuffer);
        byteBuffer.flip();

        if (result.isOverflow()) // byteBuffer too small, stay in FLUSHING state and continue on the next call
            return true;

        if (result.isError()) {
            result.throwException();
        }

        encoderState = EncoderState.DONE;
        return byteBuffer.hasRemaining();
    }

    @Override
    public int read() throws IOException {
        // loop because a refill is allowed to succeed without producing bytes
        while (!byteBuffer.hasRemaining()) {
            if (!refillBuffer())
                return EOF;
        }
        return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
    }

    @Override
    public int read(final byte[] buf, final int off, final int bytesToRead) throws IOException {
        Objects.checkFromIndexSize(off, bytesToRead, buf.length);
        if (bytesToRead == 0)
            return 0;

        int bytesRead = 0;
        int bytesReadable = byteBuffer.remaining();

        while (bytesRead < bytesToRead) {
            if (bytesReadable == 0) {
                if (refillBuffer()) {
                    bytesReadable = byteBuffer.remaining();
                } else
                    return bytesRead == 0 ? EOF : bytesRead;
            }

            final int bytesToReadNow = Math.min(bytesToRead - bytesRead, bytesReadable);
            byteBuffer.get(buf, off + bytesRead, bytesToReadNow);
            bytesRead += bytesToReadNow;
            bytesReadable -= bytesToReadNow;
        }

        return bytesRead;
    }

    /**
     * Encodes the next chunk of chars into the (fully consumed) byte buffer.
     *
     * @return <code>false</code> if the end of the stream is reached, <code>true</code> if the byte buffer
     *         was refilled (it may still be empty, in which case this method has to be called again).
     */
    private boolean refillBuffer() throws IOException {
        if (encoderState == EncoderState.DONE)
            return false;

        if (encoderState == EncoderState.FLUSHING)
            return flushEncoder();

        // chars left over from a previous overflow must be encoded before new ones are loaded
        final boolean needsInput = !charBuffer.hasRemaining();
        int charsLen = 0;
        if (needsInput) {
            charsLen = charsLength.getAsInt();
            // if EOF is reached transition to flushing
            if (charIndex >= charsLen)
                return finishEncoding();
        }

        try {
            if (needsInput) {
                fillCharBuffer(charsLen);
            }

            // encode chars into bytes
            byteBuffer.clear();
            final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
            byteBuffer.flip();
            if (result.isError()) {
                result.throwException();
            }
            if (result.isUnderflow() && charBuffer.hasRemaining()) {
                // Not expected since only complete surrogate pairs are buffered; discard the
                // remainder so that it cannot stall the stream.
                charBuffer.position(charBuffer.limit());
            }
        } catch (final Exception ex) {
            // leave the char buffer in a well-defined (empty, readable) state
            charBuffer.clear();
            charBuffer.flip();
            throw new IOException(ex);
        }

        return true;
    }

    /**
     * Signals the end of input to the encoder and starts flushing it.
     *
     * @return <code>true</code> if bytes are available or flushing has to continue.
     */
    private boolean finishEncoding() throws IOException {
        byteBuffer.clear();
        final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
        byteBuffer.flip();
        if (result.isError()) {
            result.throwException();
        }
        if (byteBuffer.hasRemaining()) {
            // hand out the bytes produced by the final encode step first, flush on the next refill
            encoderState = EncoderState.FLUSHING;
            return true;
        }
        return flushEncoder();
    }

    /**
     * Loads up to <code>bufferSize</code> code points starting at <code>charIndex</code> into the char
     * buffer and flips it for reading. Unpaired surrogates are replaced with U+FFFD.
     *
     * @param charsLen the current number of chars provided by the source.
     */
    private void fillCharBuffer(final int charsLen) throws Exception {
        charBuffer.clear();
        for (int i = 0; i < bufferSize && charIndex < charsLen; i++) {
            final char nextChar = chars.charAt(charIndex++);
            if (Character.isHighSurrogate(nextChar)) { // handle surrogate pairs
                if (charIndex < charsLen) {
                    final char lowSurrogate = chars.charAt(charIndex);
                    if (Character.isLowSurrogate(lowSurrogate)) {
                        charIndex++;
                        charBuffer.put(nextChar).put(lowSurrogate);
                        continue;
                    }
                }
                // missing low surrogate - fallback to replacement character
                charBuffer.put(REPLACEMENT_CHAR);
            } else if (Character.isLowSurrogate(nextChar)) {
                // low surrogate without preceding high surrogate - fallback to replacement character
                charBuffer.put(REPLACEMENT_CHAR);
            } else {
                charBuffer.put(nextChar);
            }
        }
        charBuffer.flip();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ private static ContentTypeInfo getContentTypes(final ITextFileBuffer buffer) {
if (bufferContentType != null) {
contentTypes.add(bufferContentType);
}
if (buffer.isDirty()) {
if (buffer.isDirty() && buffer.getDocument() != null) {
// Buffer is dirty (the content on the filesystem is not in sync with
// the editor content), so use the IDocument content.
try (var input = new DocumentInputStream(buffer.getDocument())) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,71 +1,45 @@
/*******************************************************************************
* Copyright (c) 2005, 2008 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
* Copyright (c) 2024 Sebastian Thomschke and others.
* This program and the accompanying materials are made
* available under the terms of the Eclipse Public License 2.0
* which is available at https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* IBM Corporation - initial API and implementation
* QNX Software System
* Sebastian Thomschke - implement read(byte[], int, int)
* Sebastian Thomschke - initial implementation
*******************************************************************************/
package org.eclipse.tm4e.ui.internal.utils;

import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;
import java.nio.charset.Charset;

import org.eclipse.jface.text.BadLocationException;
import org.eclipse.core.filebuffers.FileBuffers;
import org.eclipse.core.filebuffers.ITextFileBuffer;
import org.eclipse.core.filebuffers.ITextFileBufferManager;
import org.eclipse.jdt.annotation.Nullable;
import org.eclipse.jface.text.IDocument;
import org.eclipse.tm4e.ui.TMUIPlugin;

/**
* Input stream which reads from a document
*/
final class DocumentInputStream extends InputStream {

private final IDocument doc;
private int pos = 0;

DocumentInputStream(final IDocument document) {
doc = document;
}

@Override
public int read(final byte[] buff, final int buffOffset, final int len) throws IOException {
Objects.checkFromIndexSize(buffOffset, len, buff.length);
final class DocumentInputStream extends CharsInputStream {

if (len == 0)
return 0;

final var docLen = doc.getLength();
if (pos >= docLen)
return -1;

var bytesRead = -1;
private static @Nullable Charset getCharset(final IDocument document) {
final ITextFileBufferManager bufferManager = FileBuffers.getTextFileBufferManager();
if (bufferManager == null)
return null;
final ITextFileBuffer buffer = bufferManager.getTextFileBuffer(document);
if (buffer == null)
return null;
try {
buff[buffOffset] = (byte) doc.getChar(pos++);
bytesRead = 1;
while (bytesRead < len) {
if (pos >= docLen) {
break;
}
buff[buffOffset + bytesRead++] = (byte) doc.getChar(pos++);
}
} catch (final BadLocationException ex) {
// ignore
final String charsetName = buffer.getEncoding();
if (charsetName != null)
return Charset.forName(charsetName);
} catch (final Exception ex) {
TMUIPlugin.logError(ex);
}
return bytesRead;
return null;
}

@Override
public int read() throws IOException {
try {
if (pos < doc.getLength())
return doc.getChar(pos++) & 0xFF;
} catch (final BadLocationException ex) {
// ignore
}
return -1;
DocumentInputStream(final IDocument doc) {
super(doc::getChar, doc::getLength, getCharset(doc));
}
}
Loading
Loading