Skip to content
This repository was archived by the owner on May 2, 2025. It is now read-only.

Commit 66e0c4e

Browse files
authored
apacheGH-3070: Add Variant logical type annotation to parquet-java (apache#3072)
1 parent 00b6bab commit 66e0c4e

8 files changed

Lines changed: 255 additions & 5 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,14 @@ protected LogicalTypeAnnotation fromString(List<String> params) {
5656
return listType();
5757
}
5858
},
59+
VARIANT {
60+
@Override
61+
protected LogicalTypeAnnotation fromString(List<String> params) {
62+
Preconditions.checkArgument(
63+
params.size() == 1, "Expecting only spec version for variant annotation args: %s", params);
64+
return variantType(Byte.parseByte(params.get(0)));
65+
}
66+
},
5967
STRING {
6068
@Override
6169
protected LogicalTypeAnnotation fromString(List<String> params) {
@@ -269,6 +277,10 @@ public static ListLogicalTypeAnnotation listType() {
269277
return ListLogicalTypeAnnotation.INSTANCE;
270278
}
271279

280+
public static VariantLogicalTypeAnnotation variantType(byte specVersion) {
281+
return new VariantLogicalTypeAnnotation(specVersion);
282+
}
283+
272284
public static EnumLogicalTypeAnnotation enumType() {
273285
return EnumLogicalTypeAnnotation.INSTANCE;
274286
}
@@ -1128,6 +1140,49 @@ public int hashCode() {
11281140
}
11291141
}
11301142

1143+
public static class VariantLogicalTypeAnnotation extends LogicalTypeAnnotation {
1144+
private byte specVersion;
1145+
1146+
private VariantLogicalTypeAnnotation(byte specVersion) {
1147+
this.specVersion = specVersion;
1148+
}
1149+
1150+
@Override
1151+
public OriginalType toOriginalType() {
1152+
// No OriginalType for Variant
1153+
return null;
1154+
}
1155+
1156+
@Override
1157+
public <T> Optional<T> accept(LogicalTypeAnnotationVisitor<T> logicalTypeAnnotationVisitor) {
1158+
return logicalTypeAnnotationVisitor.visit(this);
1159+
}
1160+
1161+
@Override
1162+
LogicalTypeToken getType() {
1163+
return LogicalTypeToken.VARIANT;
1164+
}
1165+
1166+
public byte getSpecVersion() {
1167+
return this.specVersion;
1168+
}
1169+
1170+
@Override
1171+
protected String typeParametersAsString() {
1172+
return "(" + specVersion + ")";
1173+
}
1174+
1175+
@Override
1176+
public boolean equals(Object obj) {
1177+
if (!(obj instanceof VariantLogicalTypeAnnotation)) {
1178+
return false;
1179+
}
1180+
1181+
VariantLogicalTypeAnnotation other = (VariantLogicalTypeAnnotation) obj;
1182+
return specVersion == other.specVersion;
1183+
}
1184+
}
1185+
11311186
/**
11321187
* Implement this interface to visit a logical type annotation in the schema.
11331188
* The default implementation for each logical type specific visitor method is empty.
@@ -1152,6 +1207,10 @@ default Optional<T> visit(ListLogicalTypeAnnotation listLogicalType) {
11521207
return empty();
11531208
}
11541209

1210+
default Optional<T> visit(VariantLogicalTypeAnnotation variantLogicalType) {
1211+
return empty();
1212+
}
1213+
11551214
default Optional<T> visit(EnumLogicalTypeAnnotation enumLogicalType) {
11561215
return empty();
11571216
}

parquet-column/src/main/java/org/apache/parquet/schema/MessageTypeParser.java

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,36 @@ private static void addGroupType(Tokenizer st, Repetition r, GroupBuilder<?> bui
118118
String name = st.nextToken();
119119

120120
// Read annotation, if any.
121+
String annotation = null;
121122
t = st.nextToken();
122-
OriginalType originalType = null;
123123
if (t.equalsIgnoreCase("(")) {
124-
originalType = OriginalType.valueOf(st.nextToken());
125-
childBuilder.as(originalType);
126-
check(st.nextToken(), ")", "original type ended by )", st);
124+
t = st.nextToken();
125+
if (isLogicalType(t)) {
126+
LogicalTypeAnnotation.LogicalTypeToken logicalType = LogicalTypeAnnotation.LogicalTypeToken.valueOf(t);
127+
t = st.nextToken();
128+
List<String> tokens = new ArrayList<>();
129+
if ("(".equals(t)) {
130+
while (!")".equals(t)) {
131+
if (!(",".equals(t) || "(".equals(t) || ")".equals(t))) {
132+
tokens.add(t);
133+
}
134+
t = st.nextToken();
135+
}
136+
t = st.nextToken();
137+
}
138+
139+
LogicalTypeAnnotation logicalTypeAnnotation = logicalType.fromString(tokens);
140+
childBuilder.as(logicalTypeAnnotation);
141+
annotation = logicalTypeAnnotation.toString();
142+
} else {
143+
// Try to parse as OriginalType
144+
OriginalType originalType = OriginalType.valueOf(t);
145+
childBuilder.as(originalType);
146+
annotation = originalType.toString();
147+
t = st.nextToken();
148+
}
149+
150+
check(t, ")", "logical type ended by )", st);
127151
t = st.nextToken();
128152
}
129153
if (t.equals("=")) {
@@ -134,7 +158,7 @@ private static void addGroupType(Tokenizer st, Repetition r, GroupBuilder<?> bui
134158
addGroupTypeFields(t, st, childBuilder);
135159
} catch (IllegalArgumentException e) {
136160
throw new IllegalArgumentException(
137-
"problem reading type: type = group, name = " + name + ", original type = " + originalType, e);
161+
"problem reading type: type = group, name = " + name + ", annotation = " + annotation, e);
138162
}
139163

140164
childBuilder.named(name);

parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
import static org.junit.Assert.assertEquals;
5656

5757
import org.apache.parquet.schema.GroupType;
58+
import org.apache.parquet.schema.LogicalTypeAnnotation;
5859
import org.apache.parquet.schema.MessageType;
5960
import org.apache.parquet.schema.MessageTypeParser;
6061
import org.apache.parquet.schema.OriginalType;
@@ -447,4 +448,30 @@ public void testEmbeddedAnnotations() {
447448
MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
448449
assertEquals(expected, reparsed);
449450
}
451+
452+
@Test
453+
public void testVARIANTAnnotation() {
454+
String message = "message Message {\n"
455+
+ " required group aVariant (VARIANT(2)) {\n"
456+
+ " required binary metadata;\n"
457+
+ " required binary value;\n"
458+
+ " }\n"
459+
+ "}\n";
460+
461+
MessageType expected = buildMessage()
462+
.requiredGroup()
463+
.as(LogicalTypeAnnotation.variantType((byte) 2))
464+
.required(BINARY)
465+
.named("metadata")
466+
.required(BINARY)
467+
.named("value")
468+
.named("aVariant")
469+
.named("Message");
470+
471+
MessageType parsed = parseMessageType(message);
472+
473+
assertEquals(expected, parsed);
474+
MessageType reparsed = parseMessageType(parsed.toString());
475+
assertEquals(expected, reparsed);
476+
}
450477
}

parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
5151
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
5252
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
53+
import static org.junit.Assert.assertEquals;
5354

5455
import java.util.ArrayList;
5556
import java.util.List;
@@ -1414,6 +1415,52 @@ public void testTimestampLogicalTypeWithUTCParameter() {
14141415
Assert.assertEquals(nonUtcMicrosExpected, nonUtcMicrosActual);
14151416
}
14161417

1418+
@Test
1419+
public void testVariantLogicalType() {
1420+
byte specVersion = 1;
1421+
String name = "variant_field";
1422+
GroupType variantExpected = new GroupType(
1423+
REQUIRED,
1424+
name,
1425+
LogicalTypeAnnotation.variantType(specVersion),
1426+
new PrimitiveType(REQUIRED, BINARY, "metadata"),
1427+
new PrimitiveType(REQUIRED, BINARY, "value"));
1428+
1429+
GroupType variantActual = Types.buildGroup(REQUIRED)
1430+
.addFields(
1431+
Types.required(BINARY).named("metadata"),
1432+
Types.required(BINARY).named("value"))
1433+
.as(LogicalTypeAnnotation.variantType(specVersion))
1434+
.named(name);
1435+
1436+
assertEquals(variantExpected, variantActual);
1437+
}
1438+
1439+
@Test
1440+
public void testVariantLogicalTypeWithShredded() {
1441+
byte specVersion = 1;
1442+
String name = "variant_field";
1443+
GroupType variantExpected = new GroupType(
1444+
REQUIRED,
1445+
name,
1446+
LogicalTypeAnnotation.variantType(specVersion),
1447+
new PrimitiveType(REQUIRED, BINARY, "metadata"),
1448+
new PrimitiveType(OPTIONAL, BINARY, "value"),
1449+
new PrimitiveType(OPTIONAL, BINARY, "typed_value", LogicalTypeAnnotation.stringType()));
1450+
1451+
GroupType variantActual = Types.buildGroup(REQUIRED)
1452+
.addFields(
1453+
Types.required(BINARY).named("metadata"),
1454+
Types.optional(BINARY).named("value"),
1455+
Types.optional(BINARY)
1456+
.as(LogicalTypeAnnotation.stringType())
1457+
.named("typed_value"))
1458+
.as(LogicalTypeAnnotation.variantType(specVersion))
1459+
.named(name);
1460+
1461+
assertEquals(variantExpected, variantActual);
1462+
}
1463+
14171464
@Test(expected = IllegalArgumentException.class)
14181465
public void testDecimalLogicalTypeWithDeprecatedScaleMismatch() {
14191466
Types.required(BINARY)

parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
4242
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
4343
import static org.junit.Assert.assertEquals;
44+
import static org.junit.Assert.assertNull;
45+
import static org.junit.Assert.assertTrue;
4446

4547
import java.util.concurrent.Callable;
4648
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
@@ -473,6 +475,59 @@ public void testFloat16LogicalType() {
473475
.toString());
474476
}
475477

478+
@Test
479+
public void testVariantLogicalType() {
480+
byte specVersion = 1;
481+
String name = "variant_field";
482+
GroupType variant = new GroupType(
483+
REQUIRED,
484+
name,
485+
LogicalTypeAnnotation.variantType(specVersion),
486+
Types.required(BINARY).named("metadata"),
487+
Types.required(BINARY).named("value"));
488+
489+
assertEquals(
490+
"required group variant_field (VARIANT(1)) {\n"
491+
+ " required binary metadata;\n"
492+
+ " required binary value;\n"
493+
+ "}",
494+
variant.toString());
495+
496+
LogicalTypeAnnotation annotation = variant.getLogicalTypeAnnotation();
497+
assertEquals(LogicalTypeAnnotation.LogicalTypeToken.VARIANT, annotation.getType());
498+
assertNull(annotation.toOriginalType());
499+
assertTrue(annotation instanceof LogicalTypeAnnotation.VariantLogicalTypeAnnotation);
500+
assertEquals(specVersion, ((LogicalTypeAnnotation.VariantLogicalTypeAnnotation) annotation).getSpecVersion());
501+
}
502+
503+
@Test
504+
public void testVariantLogicalTypeWithShredded() {
505+
byte specVersion = 1;
506+
507+
String name = "variant_field";
508+
GroupType variant = new GroupType(
509+
REQUIRED,
510+
name,
511+
LogicalTypeAnnotation.variantType(specVersion),
512+
Types.required(BINARY).named("metadata"),
513+
Types.optional(BINARY).named("value"),
514+
Types.optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("typed_value"));
515+
516+
assertEquals(
517+
"required group variant_field (VARIANT(1)) {\n"
518+
+ " required binary metadata;\n"
519+
+ " optional binary value;\n"
520+
+ " optional binary typed_value (STRING);\n"
521+
+ "}",
522+
variant.toString());
523+
524+
LogicalTypeAnnotation annotation = variant.getLogicalTypeAnnotation();
525+
assertEquals(LogicalTypeAnnotation.LogicalTypeToken.VARIANT, annotation.getType());
526+
assertNull(annotation.toOriginalType());
527+
assertTrue(annotation instanceof LogicalTypeAnnotation.VariantLogicalTypeAnnotation);
528+
assertEquals(specVersion, ((LogicalTypeAnnotation.VariantLogicalTypeAnnotation) annotation).getSpecVersion());
529+
}
530+
476531
/**
477532
* A convenience method to avoid a large number of @Test(expected=...) tests
478533
*

parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ public static LogicalType DECIMAL(int scale, int precision) {
3232
return LogicalType.DECIMAL(new DecimalType(scale, precision));
3333
}
3434

35+
public static LogicalType VARIANT(byte specificationVersion) {
36+
VariantType type = new VariantType();
37+
type.setSpecification_version(specificationVersion);
38+
return LogicalType.VARIANT(type);
39+
}
40+
3541
public static final LogicalType UTF8 = LogicalType.STRING(new StringType());
3642
public static final LogicalType MAP = LogicalType.MAP(new MapType());
3743
public static final LogicalType LIST = LogicalType.LIST(new ListType());

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@
103103
import org.apache.parquet.format.Type;
104104
import org.apache.parquet.format.TypeDefinedOrder;
105105
import org.apache.parquet.format.Uncompressed;
106+
import org.apache.parquet.format.VariantType;
106107
import org.apache.parquet.format.XxHash;
107108
import org.apache.parquet.hadoop.metadata.BlockMetaData;
108109
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
@@ -514,6 +515,11 @@ public Optional<LogicalType> visit(LogicalTypeAnnotation.UnknownLogicalTypeAnnot
514515
public Optional<LogicalType> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
515516
return of(LogicalTypes.UNKNOWN);
516517
}
518+
519+
@Override
520+
public Optional<LogicalType> visit(LogicalTypeAnnotation.VariantLogicalTypeAnnotation variantLogicalType) {
521+
return of(LogicalTypes.VARIANT(variantLogicalType.getSpecVersion()));
522+
}
517523
}
518524

519525
private void addRowGroup(
@@ -1177,6 +1183,9 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) {
11771183
return LogicalTypeAnnotation.uuidType();
11781184
case FLOAT16:
11791185
return LogicalTypeAnnotation.float16Type();
1186+
case VARIANT:
1187+
VariantType variant = type.getVARIANT();
1188+
return LogicalTypeAnnotation.variantType(variant.getSpecification_version());
11801189
default:
11811190
throw new RuntimeException("Unknown logical type " + type);
11821191
}

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType;
4242
import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType;
4343
import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType;
44+
import static org.apache.parquet.schema.LogicalTypeAnnotation.variantType;
4445
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
4546
import static org.junit.Assert.assertEquals;
4647
import static org.junit.Assert.assertFalse;
@@ -1589,6 +1590,28 @@ public void testMapConvertedTypeReadWrite() throws Exception {
15891590
verifyMapMessageType(messageType, "map");
15901591
}
15911592

1593+
@Test
1594+
public void testVariantLogicalType() {
1595+
byte specVersion = 1;
1596+
MessageType expected = Types.buildMessage()
1597+
.requiredGroup()
1598+
.as(variantType(specVersion))
1599+
.required(PrimitiveTypeName.BINARY)
1600+
.named("metadata")
1601+
.required(PrimitiveTypeName.BINARY)
1602+
.named("value")
1603+
.named("v")
1604+
.named("example");
1605+
1606+
ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
1607+
List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
1608+
MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
1609+
assertEquals(expected, schema);
1610+
LogicalTypeAnnotation logicalType = schema.getType("v").getLogicalTypeAnnotation();
1611+
assertEquals(LogicalTypeAnnotation.variantType(specVersion), logicalType);
1612+
assertEquals(specVersion, ((LogicalTypeAnnotation.VariantLogicalTypeAnnotation) logicalType).getSpecVersion());
1613+
}
1614+
15921615
private void verifyMapMessageType(final MessageType messageType, final String keyValueName) throws IOException {
15931616
Path file = new Path(temporaryFolder.newFolder("verifyMapMessageType").getPath(), keyValueName + ".parquet");
15941617

0 commit comments

Comments
 (0)