apache · wgtmac · Jun 8, 2026 · Jun 8, 2026 · wgtmac · Jun 8, 2026
diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
@@ -27,7 +27,6 @@
 import java.nio.ByteOrder;
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
-import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import org.apache.parquet.io.ParquetEncodingException;
@@ -268,14 +267,16 @@ public String toString() {
       return "Binary{\"" + toStringUsingUTF8() + "\"}";
     }
 
-    private static final ThreadLocal<CharsetEncoder> ENCODER =
-        ThreadLocal.withInitial(StandardCharsets.UTF_8::newEncoder);
-
     private static ByteBuffer encodeUTF8(CharSequence value) {
       try {
-        return ENCODER.get().encode(CharBuffer.wrap(value));
+        // Use a fresh encoder per call rather than a static ThreadLocal initialized with a lambda
+        // (UTF_8::newEncoder): that lambda's class is loaded by the application ClassLoader and can
+        // keep it from being unloaded in long-lived pooled threads, leaking Metaspace (GH-3398).
+        // The encoder also preserves strict CodingErrorAction.REPORT, so malformed UTF-16 fails
+        // fast instead of being silently replaced (as String#getBytes(UTF_8) would).
+        return StandardCharsets.UTF_8.newEncoder().encode(CharBuffer.wrap(value));
       } catch (CharacterCodingException e) {
-        throw new ParquetEncodingException("UTF-8 not supported.", e);
+        throw new ParquetEncodingException("Failed to encode CharSequence as UTF-8.", e);
       }
     }
   }

diff --git a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java
@@ -30,7 +30,10 @@
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
+import org.apache.parquet.io.ParquetEncodingException;
 import org.apache.parquet.io.api.TestBinary.BinaryFactory.BinaryAndOriginal;
 import org.junit.Test;
 
@@ -314,4 +317,36 @@ public void testGet2BytesLittleEndianWrongLength() {
       // expected
     }
   }
+
+  @Test
+  public void testFromCharSequenceEncodesValidUtf8() {
+    // Cover ASCII, multi-byte BMP, a supplementary code point (valid surrogate pair) and empty.
+    assertFromCharSequenceEncodesUtf8("test-123-é中"); // ASCII + U+00E9 (2-byte) + U+4E2D (3-byte)
+    assertFromCharSequenceEncodesUtf8("😀"); // U+1F600, valid surrogate pair (4-byte)
+    assertFromCharSequenceEncodesUtf8(""); // empty
+  }
+
+  private static void assertFromCharSequenceEncodesUtf8(String value) {
+    // fromCharSequence routes any CharSequence (here a StringBuilder) through FromCharSequenceBinary.
+    // For valid input the strict encoder must match String#getBytes(UTF_8), so this is a genuine
+    // cross-check, not a circular assertion.
+    Binary binary = Binary.fromCharSequence(new StringBuilder(value));
+    assertArrayEquals(value.getBytes(StandardCharsets.UTF_8), binary.getBytes());
+  }
+
+  @Test
+  public void testFromCharSequenceRejectsMalformedUtf16() {
+    // An unpaired high surrogate is invalid UTF-16. FromCharSequenceBinary must fail fast
+    // rather than silently substituting a replacement byte (as String#getBytes(UTF_8) would).
+    CharSequence value = new StringBuilder().append('a').append('\uD800').append('b');
+    try {
+      Binary.fromCharSequence(value);
+      fail("Should have thrown an exception for malformed UTF-16 input");
+    } catch (ParquetEncodingException e) {
+      // Lock in that the cause is a UTF-8 coding error, not an unrelated failure of the same type.
+      assertTrue(
+          "expected a CharacterCodingException cause but was " + e.getCause(),
+          e.getCause() instanceof CharacterCodingException);
+    }
+  }
 }