Skip to content

Commit 2346fdb

Browse files
authored
GH-3464: Improve DeltaByteArrayWriter.writeBytes (#3465)
* GH-3464 Improve `DeltaByteArrayWriter.writeBytes` to avoid unnecessary allocation and scalar prefix comparison
* GH-3464 Add regression test
* Update DeltaByteArrayWriter.java
1 parent e251102 commit 2346fdb

2 files changed

Lines changed: 30 additions & 6 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package org.apache.parquet.column.values.deltastrings;
2020

21+
import java.util.Arrays;
2122
import org.apache.parquet.bytes.ByteBufferAllocator;
2223
import org.apache.parquet.bytes.BytesInput;
2324
import org.apache.parquet.column.Encoding;
@@ -88,12 +89,14 @@ public String memUsageString(String prefix) {
8889

8990
@Override
9091
public void writeBytes(Binary v) {
91-
int i = 0;
92-
byte[] vb = v.getBytes();
93-
int length = previous.length < vb.length ? previous.length : vb.length;
94-
// find the number of matching prefix bytes between this value and the previous one
95-
for (i = 0; (i < length) && (previous[i] == vb[i]); i++)
96-
;
92+
byte[] vb = v.isBackingBytesReused() ? v.getBytes() : v.getBytesUnsafe();
93+
int length = Math.min(previous.length, vb.length);
94+
// Find the number of matching prefix bytes between this value and the previous one.
95+
// Arrays.mismatch (Java 9+) is backed by a HotSpot intrinsic that can use SIMD instructions on supported platforms.
96+
int i = Arrays.mismatch(previous, 0, length, vb, 0, length);
97+
if (i < 0) {
98+
i = length; // all bytes in the common range matched
99+
}
97100
prefixLengthWriter.writeInteger(i);
98101
suffixWriter.writeBytes(v.slice(i, vb.length - i));
99102
previous = vb;

parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.values.deltastrings;
2020

2121
import java.io.IOException;
22+
import java.nio.charset.StandardCharsets;
2223
import org.apache.parquet.bytes.ByteBufferInputStream;
2324
import org.apache.parquet.bytes.DirectByteBufferAllocator;
2425
import org.apache.parquet.column.values.Utils;
@@ -128,4 +129,24 @@ public void testWriterReset() throws Exception {
128129

129130
assertReadWrite(writer, new DeltaByteArrayReader(), values);
130131
}
132+
133+
@Test
134+
public void testReusedBackingArrayRegression() throws Exception {
135+
DeltaByteArrayWriter writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator());
136+
DeltaByteArrayReader reader = new DeltaByteArrayReader();
137+
138+
byte[] buffer = "parquet-000".getBytes(StandardCharsets.UTF_8);
139+
writer.writeBytes(Binary.fromReusedByteArray(buffer));
140+
141+
System.arraycopy("parquet-111".getBytes(StandardCharsets.UTF_8), 0, buffer, 0, buffer.length);
142+
writer.writeBytes(Binary.fromReusedByteArray(buffer));
143+
144+
System.arraycopy("parquet-222".getBytes(StandardCharsets.UTF_8), 0, buffer, 0, buffer.length);
145+
writer.writeBytes(Binary.fromReusedByteArray(buffer));
146+
147+
Binary[] decoded = Utils.readData(reader, writer.getBytes().toInputStream(), 3);
148+
Assert.assertEquals(Binary.fromString("parquet-000"), decoded[0]);
149+
Assert.assertEquals(Binary.fromString("parquet-111"), decoded[1]);
150+
Assert.assertEquals(Binary.fromString("parquet-222"), decoded[2]);
151+
}
131152
}

0 commit comments

Comments
 (0)