Skip to content

Commit d5f020e

Browse files
committed
fix: DocumentInputStream does not handle surrogate pairs correctly
1 parent 27b13e3 commit d5f020e

4 files changed

Lines changed: 375 additions & 56 deletions

File tree

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/*******************************************************************************
2+
* Copyright (c) 2024 Sebastian Thomschke and others.
3+
* This program and the accompanying materials are made
4+
* available under the terms of the Eclipse Public License 2.0
5+
* which is available at https://www.eclipse.org/legal/epl-2.0/
6+
*
7+
* SPDX-License-Identifier: EPL-2.0
8+
*
9+
* Contributors:
10+
* Sebastian Thomschke - initial implementation
11+
*******************************************************************************/
12+
package org.eclipse.tm4e.ui.internal.utils;
13+
14+
import java.io.IOException;
15+
import java.io.InputStream;
16+
import java.nio.ByteBuffer;
17+
import java.nio.CharBuffer;
18+
import java.nio.charset.Charset;
19+
import java.nio.charset.CharsetEncoder;
20+
import java.nio.charset.CoderResult;
21+
import java.nio.charset.StandardCharsets;
22+
import java.util.Objects;
23+
import java.util.function.IntSupplier;
24+
25+
import org.eclipse.jdt.annotation.Nullable;
26+
27+
class CharsInputStream extends InputStream {
28+
@FunctionalInterface
29+
interface CharsSupplier {
30+
char charAt(int index) throws Exception;
31+
}
32+
33+
enum EncoderState {
34+
ENCODING,
35+
FLUSHING,
36+
DONE
37+
}
38+
39+
/** 512 surrogate character pairs */
40+
private static final int DEFAULT_BUFFER_SIZE = 512;
41+
private static final int EOF = -1;
42+
43+
private final int bufferSize;
44+
private final CharBuffer charBuffer;
45+
private final ByteBuffer byteBuffer;
46+
private final CharsetEncoder encoder;
47+
private EncoderState encoderState = EncoderState.ENCODING;
48+
49+
private int charIndex = 0;
50+
private final CharsSupplier chars;
51+
private final IntSupplier charsLength;
52+
53+
CharsInputStream(final CharSequence chars) {
54+
this(chars, null);
55+
}
56+
57+
CharsInputStream(final CharSequence chars, final @Nullable Charset charset) {
58+
this(chars, charset, DEFAULT_BUFFER_SIZE);
59+
}
60+
61+
CharsInputStream(final CharSequence chars, final @Nullable Charset charset, final int bufferSize) {
62+
this(chars::charAt, chars::length, charset, bufferSize);
63+
}
64+
65+
CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength) {
66+
this(chars, charsLength, null);
67+
}
68+
69+
/**
70+
* @param chars function to access indexed chars.
71+
* @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
72+
*/
73+
CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset) {
74+
this(chars, charsLength, charset, DEFAULT_BUFFER_SIZE);
75+
}
76+
77+
/**
78+
* @param chars function to access indexed chars.
79+
* @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
80+
* @param bufferSize number of surrogate character pairs to encode at once.
81+
*/
82+
CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset, final int bufferSize) {
83+
if (bufferSize < 1)
84+
throw new IllegalArgumentException("[bufferSize] must be 1 or larger");
85+
encoder = (charset == null ? StandardCharsets.UTF_8 : charset).newEncoder();
86+
87+
this.bufferSize = bufferSize;
88+
charBuffer = CharBuffer.allocate(bufferSize * 2); // buffer for 2 chars (high/low surrogate)
89+
byteBuffer = ByteBuffer.allocate(bufferSize * 4); // buffer for one UTF character (up to 4 bytes)
90+
byteBuffer.flip();
91+
charBuffer.flip();
92+
93+
this.chars = chars;
94+
this.charsLength = charsLength;
95+
}
96+
97+
@Override
98+
public int available() {
99+
final int remaining = byteBuffer.remaining();
100+
return remaining == 0 ? charsLength.getAsInt() - charIndex : remaining;
101+
}
102+
103+
private boolean flushEncoder() throws IOException {
104+
if (encoderState == EncoderState.DONE)
105+
return false;
106+
107+
if (encoderState == EncoderState.ENCODING) {
108+
encoderState = EncoderState.FLUSHING;
109+
}
110+
111+
// flush
112+
byteBuffer.clear();
113+
final CoderResult result = encoder.flush(byteBuffer);
114+
byteBuffer.flip();
115+
116+
if (result.isOverflow()) // byteBuffer too small
117+
return true;
118+
119+
if (result.isError()) {
120+
result.throwException();
121+
}
122+
123+
encoderState = EncoderState.DONE;
124+
return byteBuffer.hasRemaining();
125+
}
126+
127+
@Override
128+
public int read() throws IOException {
129+
if (!byteBuffer.hasRemaining() && !refillBuffer())
130+
return EOF;
131+
return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
132+
}
133+
134+
@Override
135+
public int read(final byte[] buf, final int off, final int bytesToRead) throws IOException {
136+
Objects.checkFromIndexSize(off, bytesToRead, buf.length);
137+
if (bytesToRead == 0)
138+
return 0;
139+
140+
int bytesRead = 0;
141+
int bytesReadable = byteBuffer.remaining();
142+
143+
while (bytesRead < bytesToRead) {
144+
if (bytesReadable == 0) {
145+
if (refillBuffer()) {
146+
bytesReadable = byteBuffer.remaining();
147+
} else
148+
return bytesRead == 0 ? EOF : bytesRead;
149+
}
150+
151+
final int bytesToReadNow = Math.min(bytesToRead - bytesRead, bytesReadable);
152+
byteBuffer.get(buf, off + bytesRead, bytesToReadNow);
153+
bytesRead += bytesToReadNow;
154+
bytesReadable -= bytesToReadNow;
155+
}
156+
157+
return bytesRead;
158+
}
159+
160+
private boolean refillBuffer() throws IOException {
161+
if (encoderState == EncoderState.DONE)
162+
return false;
163+
164+
if (encoderState == EncoderState.FLUSHING)
165+
return flushEncoder();
166+
167+
final int charsLen = charsLength.getAsInt();
168+
169+
// if EOF is reached transition to flushing
170+
if (charIndex >= charsLen) {
171+
// finalize encoding before switching to flushing
172+
byteBuffer.clear();
173+
final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
174+
byteBuffer.flip();
175+
if (result.isError()) {
176+
result.throwException();
177+
}
178+
return flushEncoder();
179+
}
180+
181+
try {
182+
charBuffer.clear();
183+
for (int i = 0; i < bufferSize && charIndex < charsLen; i++) {
184+
final char nextChar = chars.charAt(charIndex++);
185+
if (Character.isHighSurrogate(nextChar)) { // handle surrogate pairs
186+
if (charIndex < charsLen) {
187+
final char lowSurrogate = chars.charAt(charIndex);
188+
if (Character.isLowSurrogate(lowSurrogate)) {
189+
charIndex++;
190+
charBuffer.put(nextChar);
191+
charBuffer.put(lowSurrogate);
192+
} else {
193+
// missing low surrogate - fallback to replacement character
194+
charBuffer.put('\uFFFD');
195+
}
196+
} else {
197+
// missing low surrogate - fallback to replacement character
198+
charBuffer.put('\uFFFD');
199+
break;
200+
}
201+
} else {
202+
charBuffer.put(nextChar);
203+
}
204+
}
205+
charBuffer.flip();
206+
207+
// encode chars into bytes
208+
byteBuffer.clear();
209+
final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
210+
byteBuffer.flip();
211+
if (result.isError()) {
212+
result.throwException();
213+
}
214+
} catch (final Exception ex) {
215+
throw new IOException(ex);
216+
}
217+
218+
return true;
219+
}
220+
}

org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ private static ContentTypeInfo getContentTypes(final ITextFileBuffer buffer) {
112112
if (bufferContentType != null) {
113113
contentTypes.add(bufferContentType);
114114
}
115-
if (buffer.isDirty()) {
115+
if (buffer.isDirty() && buffer.getDocument() != null) {
116116
// Buffer is dirty (content of the filesystem is not synch with
117117
// the editor content), use IDocument content.
118118
try (var input = new DocumentInputStream(buffer.getDocument())) {
Lines changed: 29 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,71 +1,45 @@
11
/*******************************************************************************
2-
* Copyright (c) 2005, 2008 IBM Corporation and others.
3-
* All rights reserved. This program and the accompanying materials
4-
* are made available under the terms of the Eclipse Public License v1.0
5-
* which accompanies this distribution, and is available at
6-
* http://www.eclipse.org/legal/epl-v10.html
2+
* Copyright (c) 2024 Sebastian Thomschke and others.
3+
* This program and the accompanying materials are made
4+
* available under the terms of the Eclipse Public License 2.0
5+
* which is available at https://www.eclipse.org/legal/epl-2.0/
6+
*
7+
* SPDX-License-Identifier: EPL-2.0
78
*
89
* Contributors:
9-
* IBM Corporation - initial API and implementation
10-
* QNX Software System
11-
* Sebastian Thomschke - implement read(byte[], int, int)
10+
* Sebastian Thomschke - initial implementation
1211
*******************************************************************************/
1312
package org.eclipse.tm4e.ui.internal.utils;
1413

15-
import java.io.IOException;
16-
import java.io.InputStream;
17-
import java.util.Objects;
14+
import java.nio.charset.Charset;
1815

19-
import org.eclipse.jface.text.BadLocationException;
16+
import org.eclipse.core.filebuffers.FileBuffers;
17+
import org.eclipse.core.filebuffers.ITextFileBuffer;
18+
import org.eclipse.core.filebuffers.ITextFileBufferManager;
19+
import org.eclipse.jdt.annotation.Nullable;
2020
import org.eclipse.jface.text.IDocument;
21+
import org.eclipse.tm4e.ui.TMUIPlugin;
2122

22-
/**
23-
* Input stream which reads from a document
24-
*/
25-
final class DocumentInputStream extends InputStream {
26-
27-
private final IDocument doc;
28-
private int pos = 0;
29-
30-
DocumentInputStream(final IDocument document) {
31-
doc = document;
32-
}
33-
34-
@Override
35-
public int read(final byte[] buff, final int buffOffset, final int len) throws IOException {
36-
Objects.checkFromIndexSize(buffOffset, len, buff.length);
23+
final class DocumentInputStream extends CharsInputStream {
3724

38-
if (len == 0)
39-
return 0;
40-
41-
final var docLen = doc.getLength();
42-
if (pos >= docLen)
43-
return -1;
44-
45-
var bytesRead = -1;
25+
private static @Nullable Charset getCharset(final IDocument document) {
26+
final ITextFileBufferManager bufferManager = FileBuffers.getTextFileBufferManager();
27+
if (bufferManager == null)
28+
return null;
29+
final ITextFileBuffer buffer = bufferManager.getTextFileBuffer(document);
30+
if (buffer == null)
31+
return null;
4632
try {
47-
buff[buffOffset] = (byte) doc.getChar(pos++);
48-
bytesRead = 1;
49-
while (bytesRead < len) {
50-
if (pos >= docLen) {
51-
break;
52-
}
53-
buff[buffOffset + bytesRead++] = (byte) doc.getChar(pos++);
54-
}
55-
} catch (final BadLocationException ex) {
56-
// ignore
33+
final String charsetName = buffer.getEncoding();
34+
if (charsetName != null)
35+
return Charset.forName(charsetName);
36+
} catch (final Exception ex) {
37+
TMUIPlugin.logError(ex);
5738
}
58-
return bytesRead;
39+
return null;
5940
}
6041

61-
@Override
62-
public int read() throws IOException {
63-
try {
64-
if (pos < doc.getLength())
65-
return doc.getChar(pos++) & 0xFF;
66-
} catch (final BadLocationException ex) {
67-
// ignore
68-
}
69-
return -1;
42+
DocumentInputStream(final IDocument doc) {
43+
super(doc::getChar, doc::getLength, getCharset(doc));
7044
}
7145
}

0 commit comments

Comments
 (0)