Skip to content

Commit 725fbc5

Browse files
dongjoon-hyun authored and williamhyun committed
ORC-1961: Support orc.compression.zstd.strategy
### What changes were proposed in this pull request?

This PR aims to support `orc.compression.zstd.strategy`.

### Why are the changes needed?

To allow a user to choose a proper strategy based on their data.

https://facebook.github.io/zstd/zstd_manual.html#Chapter5

```
typedef enum { ZSTD_fast=1,
               ZSTD_dfast=2,
               ZSTD_greedy=3,
               ZSTD_lazy=4,
               ZSTD_lazy2=5,
               ZSTD_btlazy2=6,
               ZSTD_btopt=7,
               ZSTD_btultra=8,
               ZSTD_btultra2=9
               /* note : new strategies _might_ be added in the future.
                         Only the order (from fast to strong) is guaranteed */
} ZSTD_strategy;
```

### How was this patch tested?

Pass the CIs.

```
$ cd java
$ mvn package -DskipTests -Pbenchmark
$ cd bench
$ time java -Dorc.compression.zstd.strategy=1 -jar core/target/orc-benchmarks-core-*-uber.jar generate data -d sales -c zstd -f orc
...
54.51s user 1.28s system 103% cpu 53.984 total
$ time java -Dorc.compression.zstd.strategy=9 -jar core/target/orc-benchmarks-core-*-uber.jar generate data -d sales -c zstd -f orc
...
148.21s user 1.75s system 101% cpu 2:28.13 total
```

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #2338 from dongjoon-hyun/ORC-1961.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: William Hyun <william@apache.org>
1 parent 09693fe commit 725fbc5

4 files changed

Lines changed: 34 additions & 6 deletions

File tree

java/core/src/java/org/apache/orc/OrcConf.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ public enum OrcConf {
8080
"hive.exec.orc.compression.zstd.windowlog", 0,
8181
"Set the maximum allowed back-reference distance for "
8282
+ "ZStandard codec, expressed as power of 2."),
83+
COMPRESSION_ZSTD_STRATEGY("orc.compression.zstd.strategy",
84+
"hive.exec.orc.compression.zstd.strategy", 0,
85+
"Define the compression strategy to use with ZStandard codec "
86+
+ "while writing data. The valid range is 0~9."),
8387
BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance",
8488
"hive.exec.orc.block.padding.tolerance", 0.05,
8589
"Define the tolerance for block padding as a decimal fraction of\n" +

java/core/src/java/org/apache/orc/OrcFile.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ public static BloomFilterVersion fromString(String s) {
429429
public static class ZstdCompressOptions {
430430
private int compressionZstdLevel;
431431
private int compressionZstdWindowLog;
432+
private int compressionZstdStrategy;
432433

433434
public int getCompressionZstdLevel() {
434435
return compressionZstdLevel;
@@ -445,6 +446,14 @@ public int getCompressionZstdWindowLog() {
445446
public void setCompressionZstdWindowLog(int compressionZstdWindowLog) {
446447
this.compressionZstdWindowLog = compressionZstdWindowLog;
447448
}
449+
450+
public int getCompressionZstdStrategy() {
451+
return compressionZstdStrategy;
452+
}
453+
454+
public void setCompressionZstdStrategy(int compressionZstdStrategy) {
455+
this.compressionZstdStrategy = compressionZstdStrategy;
456+
}
448457
}
449458

450459
/**
@@ -520,6 +529,8 @@ protected WriterOptions(Properties tableProperties, Configuration conf) {
520529
OrcConf.COMPRESSION_ZSTD_LEVEL.getInt(tableProperties, conf));
521530
zstdCompressOptions.setCompressionZstdWindowLog(
522531
OrcConf.COMPRESSION_ZSTD_WINDOWLOG.getInt(tableProperties, conf));
532+
zstdCompressOptions.setCompressionZstdStrategy(
533+
OrcConf.COMPRESSION_ZSTD_STRATEGY.getInt(tableProperties, conf));
523534

524535
paddingTolerance =
525536
OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf);

java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ public PhysicalFsWriter(FSDataOutputStream outputStream,
121121
if (zstdCompressOptions != null) {
122122
options.setLevel(zstdCompressOptions.getCompressionZstdLevel());
123123
options.setWindowLog(zstdCompressOptions.getCompressionZstdWindowLog());
124+
options.setStrategy(zstdCompressOptions.getCompressionZstdStrategy());
124125
}
125126
}
126127
compress.withCodec(codec, tempOptions);

java/core/src/java/org/apache/orc/impl/ZstdCodec.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ public class ZstdCodec implements CompressionCodec, DirectDecompressionCodec {
2929
private ZstdOptions zstdOptions = null;
3030
private ZstdCompressCtx zstdCompressCtx = null;
3131

32-
public ZstdCodec(int level, int windowLog) {
33-
this.zstdOptions = new ZstdOptions(level, windowLog);
32+
public ZstdCodec(int level, int windowLog, int strategy) {
33+
this.zstdOptions = new ZstdOptions(level, windowLog, strategy);
3434
}
3535

3636
public ZstdCodec() {
37-
this(3, 0);
37+
this(3, 0, 0);
3838
}
3939

4040
public ZstdOptions getZstdOptions() {
@@ -57,15 +57,17 @@ protected static byte[] getBuffer(int size) {
5757
static class ZstdOptions implements Options {
5858
private int level;
5959
private int windowLog;
60+
private int strategy;
6061

61-
ZstdOptions(int level, int windowLog) {
62+
ZstdOptions(int level, int windowLog, int strategy) {
6263
this.level = level;
6364
this.windowLog = windowLog;
65+
this.strategy = strategy;
6466
}
6567

6668
@Override
6769
public ZstdOptions copy() {
68-
return new ZstdOptions(level, windowLog);
70+
return new ZstdOptions(level, windowLog, strategy);
6971
}
7072

7173
@Override
@@ -123,6 +125,13 @@ public ZstdOptions setLevel(int newValue) {
123125
return this;
124126
}
125127

128+
public ZstdOptions setStrategy(int newValue) {
129+
// https://facebook.github.io/zstd/zstd_manual.html#Chapter5
130+
// Although the value is between 1 and 9 and 0 means `use default`, ZStd can change it.
131+
strategy = newValue;
132+
return this;
133+
}
134+
126135
@Override
127136
public ZstdOptions setData(DataKind newValue) {
128137
return this; // We don't support setting DataKind in ZstdCodec.
@@ -136,19 +145,21 @@ public boolean equals(Object o) {
136145
ZstdOptions that = (ZstdOptions) o;
137146

138147
if (level != that.level) return false;
148+
if (strategy != that.strategy) return false;
139149
return windowLog == that.windowLog;
140150
}
141151

142152
@Override
143153
public int hashCode() {
144154
int result = level;
145155
result = 31 * result + windowLog;
156+
result = 31 * result + strategy;
146157
return result;
147158
}
148159
}
149160

150161
private static final ZstdOptions DEFAULT_OPTIONS =
151-
new ZstdOptions(3, 0);
162+
new ZstdOptions(3, 0, 0);
152163

153164
@Override
154165
public Options getDefaultOptions() {
@@ -183,6 +194,7 @@ public boolean compress(ByteBuffer in, ByteBuffer out,
183194
zstdCompressCtx.setLevel(zso.level);
184195
zstdCompressCtx.setLong(zso.windowLog);
185196
zstdCompressCtx.setChecksum(false);
197+
zstdCompressCtx.setStrategy(zso.strategy);
186198

187199
try {
188200
byte[] compressed = getBuffer((int) Zstd.compressBound(inBytes));

0 commit comments

Comments
 (0)