Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.parquet.io.ParquetEncodingException;
Expand Down Expand Up @@ -268,14 +267,16 @@ public String toString() {
return "Binary{\"" + toStringUsingUTF8() + "\"}";
}

private static final ThreadLocal<CharsetEncoder> ENCODER =
ThreadLocal.withInitial(StandardCharsets.UTF_8::newEncoder);

private static ByteBuffer encodeUTF8(CharSequence value) {
try {
return ENCODER.get().encode(CharBuffer.wrap(value));
// Use a fresh encoder per call rather than a static ThreadLocal initialized with a lambda
// (UTF_8::newEncoder): that lambda's class is loaded by the application ClassLoader and can
// keep it from being unloaded in long-lived pooled threads, leaking Metaspace (GH-3398).
// The encoder also preserves strict CodingErrorAction.REPORT, so malformed UTF-16 fails
// fast instead of being silently replaced (as String#getBytes(UTF_8) would).
return StandardCharsets.UTF_8.newEncoder().encode(CharBuffer.wrap(value));
} catch (CharacterCodingException e) {
throw new ParquetEncodingException("UTF-8 not supported.", e);
throw new ParquetEncodingException("Failed to encode CharSequence as UTF-8.", e);
}
}
Comment on lines 270 to 281

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems a valid concern, is it? @steveloughran @LuciferYang

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirmed. getBytes(UTF_8) replaces malformed input instead of throwing, so it wasn't behavior-preserving and the catch wasn't dead code.

Went with your suggestion and now use a fresh CharsetEncoder per call: it still drops the ThreadLocal/lambda behind the leak, while newEncoder() keeps the default CodingErrorAction.REPORT, so unpaired surrogates still fail fast with a ParquetEncodingException.

Also added tests (valid encoding + malformed-UTF-16 rejection), corrected the misleading "UTF-8 not supported." message, and updated the PR description to match.

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.api.TestBinary.BinaryFactory.BinaryAndOriginal;
import org.junit.Test;

Expand Down Expand Up @@ -314,4 +317,36 @@ public void testGet2BytesLittleEndianWrongLength() {
// expected
}
}

@Test
public void testFromCharSequenceEncodesValidUtf8() {
// Cover ASCII, multi-byte BMP, a supplementary code point (valid surrogate pair) and empty.
assertFromCharSequenceEncodesUtf8("test-123-é中"); // ASCII + U+00E9 (2-byte) + U+4E2D (3-byte)
assertFromCharSequenceEncodesUtf8("😀"); // U+1F600, valid surrogate pair (4-byte)
assertFromCharSequenceEncodesUtf8(""); // empty
}

private static void assertFromCharSequenceEncodesUtf8(String value) {
// fromCharSequence routes any CharSequence (here a StringBuilder) through FromCharSequenceBinary.
// For valid input the strict encoder must match String#getBytes(UTF_8), so this is a genuine
// cross-check, not a circular assertion.
Binary binary = Binary.fromCharSequence(new StringBuilder(value));
assertArrayEquals(value.getBytes(StandardCharsets.UTF_8), binary.getBytes());
}

@Test
public void testFromCharSequenceRejectsMalformedUtf16() {
// An unpaired high surrogate is invalid UTF-16. FromCharSequenceBinary must fail fast
// rather than silently substituting a replacement byte (as String#getBytes(UTF_8) would).
CharSequence value = new StringBuilder().append('a').append('\uD800').append('b');
try {
Binary.fromCharSequence(value);
fail("Should have thrown an exception for malformed UTF-16 input");
} catch (ParquetEncodingException e) {
// Lock in that the cause is a UTF-8 coding error, not an unrelated failure of the same type.
assertTrue(
"expected a CharacterCodingException cause but was " + e.getCause(),
e.getCause() instanceof CharacterCodingException);
}
}
}