From 6bf7f8e497192423a546f8632cbbb8d93d63c12b Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 20 May 2026 17:56:12 +0800 Subject: [PATCH 01/11] [format] Add paimon-mosaic-format module with reader, writer, stats extraction and tests Introduces the Mosaic file format integration for Paimon with: - MosaicRecordsReader: row-group level predicate filtering using statistics, column projection, and correct returnedPosition tracking - MosaicRecordsWriter: BundleFormatWriter with writerMetadata() support for in-memory stats capture (avoids re-reading files on object stores) - MosaicSimpleStatsExtractor: stats extraction from file or writerMetadata, with SimpleColStatsCollector integration - MosaicObjects: byte[] to Paimon object conversion for all supported types - Comprehensive test suite (6 test classes covering unit and integration tests) Co-Authored-By: Claude Opus 4.6 --- paimon-mosaic-format/pom.xml | 86 ++++++ .../format/mosaic/MosaicFileFormat.java | 220 +++++++++++++++ .../mosaic/MosaicFileFormatFactory.java | 38 +++ .../format/mosaic/MosaicInputFileAdapter.java | 64 +++++ .../paimon/format/mosaic/MosaicObjects.java | 98 +++++++ .../format/mosaic/MosaicReaderFactory.java | 60 ++++ .../format/mosaic/MosaicRecordsReader.java | 243 ++++++++++++++++ .../format/mosaic/MosaicRecordsWriter.java | 183 ++++++++++++ .../mosaic/MosaicSimpleStatsExtractor.java | 147 ++++++++++ .../format/mosaic/MosaicWriterFactory.java | 44 +++ .../format/mosaic/MosaicWriterMetadata.java | 43 +++ ...org.apache.paimon.format.FileFormatFactory | 16 ++ .../format/mosaic/MosaicFileFormatTest.java | 124 +++++++++ .../mosaic/MosaicFormatReadWriteTest.java | 146 ++++++++++ .../format/mosaic/MosaicObjectsTest.java | 165 +++++++++++ .../format/mosaic/MosaicReaderWriterTest.java | 260 ++++++++++++++++++ .../MosaicSimpleStatsExtractorTest.java | 90 ++++++ .../mosaic/MosaicWriterMetadataTest.java | 194 +++++++++++++ pom.xml | 1 + 19 files changed, 2222 insertions(+) create mode 100644 
paimon-mosaic-format/pom.xml create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java create mode 100644 paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java create mode 100644 paimon-mosaic-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory create mode 100644 paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java create mode 100644 paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java create mode 100644 paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java create mode 100644 paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java create mode 100644 paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java create mode 100644 paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java diff --git a/paimon-mosaic-format/pom.xml 
b/paimon-mosaic-format/pom.xml new file mode 100644 index 000000000000..a9182c998795 --- /dev/null +++ b/paimon-mosaic-format/pom.xml @@ -0,0 +1,86 @@ + + + + 4.0.0 + + + paimon-parent + org.apache.paimon + 1.5-SNAPSHOT + + + paimon-mosaic-format + Paimon : Mosaic Format + + + + org.apache.paimon.mosaic + mosaic-writer + 0.1.0-SNAPSHOT + + + + org.apache.paimon + paimon-arrow + ${project.version} + + + + org.apache.paimon + paimon-common + ${project.version} + provided + + + + org.apache.paimon + paimon-core + ${project.version} + provided + + + + + + org.apache.paimon + paimon-common + ${project.version} + test-jar + test + + + + org.apache.paimon + paimon-test-utils + ${project.version} + test + + + + org.apache.paimon + paimon-core + ${project.version} + test-jar + test + + + diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java new file mode 100644 index 000000000000..b8a95e69aeda --- /dev/null +++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.format.SimpleStatsExtractor; +import org.apache.paimon.predicate.Predicate; +import org.apache.paimon.statistics.SimpleColStatsCollector; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.BigIntType; +import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobType; +import org.apache.paimon.types.BooleanType; +import org.apache.paimon.types.CharType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataTypeVisitor; +import org.apache.paimon.types.DateType; +import org.apache.paimon.types.DecimalType; +import org.apache.paimon.types.DoubleType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.LocalZonedTimestampType; +import org.apache.paimon.types.MapType; +import org.apache.paimon.types.MultisetType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.SmallIntType; +import org.apache.paimon.types.TimeType; +import org.apache.paimon.types.TimestampType; +import org.apache.paimon.types.TinyIntType; +import org.apache.paimon.types.VarBinaryType; +import org.apache.paimon.types.VarCharType; +import org.apache.paimon.types.VariantType; +import org.apache.paimon.types.VectorType; + +import javax.annotation.Nullable; + +import java.util.List; +import java.util.Optional; + +/** Mosaic {@link FileFormat}. 
*/
+public class MosaicFileFormat extends FileFormat {
+
+    static {
+        // System properties are JVM-wide, so this affects every Arrow user in the process.
+        // NOTE(review): presumably turns off Arrow's checked memory access for speed - confirm.
+        System.setProperty("arrow.enable_unsafe_memory_access", "true");
+    }
+
+    private final FileFormatFactory.FormatContext formatContext;
+
+    /**
+     * Creates the format under the identifier {@code "mosaic"}.
+     *
+     * @param formatContext context handed on to every writer factory this format creates
+     */
+    public MosaicFileFormat(FileFormatFactory.FormatContext formatContext) {
+        super("mosaic");
+        this.formatContext = formatContext;
+    }
+
+    /** Creates a reader factory that reads {@code projectedRowType} and filters by predicates. */
+    @Override
+    public FormatReaderFactory createReaderFactory(
+            RowType dataSchemaRowType,
+            RowType projectedRowType,
+            @Nullable List<Predicate> predicates) {
+        return new MosaicReaderFactory(dataSchemaRowType, projectedRowType, predicates);
+    }
+
+    /** Creates a writer factory for rows of the given type. */
+    @Override
+    public FormatWriterFactory createWriterFactory(RowType type) {
+        return new MosaicWriterFactory(type, formatContext);
+    }
+
+    /**
+     * Validates that every field of {@code rowType} can be stored in a Mosaic file.
+     *
+     * @throws UnsupportedOperationException if the type contains VARIANT, BLOB, MULTISET or MAP,
+     *     either directly, inside a nested row, or as an array element
+     */
+    @Override
+    public void validateDataFields(RowType rowType) {
+        rowType.accept(new MosaicRowTypeVisitor());
+    }
+
+    /** Always returns a {@link MosaicSimpleStatsExtractor} for the given type and collectors. */
+    @Override
+    public Optional<SimpleStatsExtractor> createStatsExtractor(
+            RowType type, SimpleColStatsCollector.Factory[] statsCollectors) {
+        return Optional.of(new MosaicSimpleStatsExtractor(type, statsCollectors));
+    }
+
+    /**
+     * Type visitor used for validation: supported types return {@code null}, unsupported types
+     * throw {@link UnsupportedOperationException}.
+     */
+    static class MosaicRowTypeVisitor implements DataTypeVisitor<Void> {
+
+        @Override
+        public Void visit(CharType charType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(VarCharType varCharType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(BooleanType booleanType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(BinaryType binaryType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(VarBinaryType varBinaryType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(DecimalType decimalType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(TinyIntType tinyIntType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(SmallIntType smallIntType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(IntType intType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(BigIntType bigIntType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(FloatType floatType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(DoubleType doubleType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(DateType dateType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(TimeType timeType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(TimestampType timestampType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(LocalZonedTimestampType localZonedTimestampType) {
+            return null;
+        }
+
+        @Override
+        public Void visit(VariantType variantType) {
+            throw new UnsupportedOperationException(
+                    "Mosaic file format does not support type VARIANT");
+        }
+
+        @Override
+        public Void visit(BlobType blobType) {
+            throw new UnsupportedOperationException(
+                    "Mosaic file format does not support type BLOB");
+        }
+
+        @Override
+        public Void visit(ArrayType arrayType) {
+            // Validate the element type too, so that e.g. ARRAY<MAP<..>> is rejected just like a
+            // top-level MAP instead of slipping through validation.
+            arrayType.getElementType().accept(this);
+            return null;
+        }
+
+        @Override
+        public Void visit(VectorType vectorType) {
+            // NOTE(review): the element type is not inspected here - confirm that vector element
+            // types can never be one of the unsupported types.
+            return null;
+        }
+
+        @Override
+        public Void visit(MultisetType multisetType) {
+            throw new UnsupportedOperationException(
+                    "Mosaic file format does not support type MULTISET");
+        }
+
+        @Override
+        public Void visit(MapType mapType) {
+            throw new UnsupportedOperationException("Mosaic file format does not support type MAP");
+        }
+
+        @Override
+        public Void visit(RowType rowType) {
+            // Recurse so unsupported types nested inside rows are rejected as well.
+            for (DataField field : rowType.getFields()) {
+                field.type().accept(this);
+            }
+            return null;
+        }
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java
new file mode 100644
index 000000000000..782faba3e8f9
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.format.FileFormat;
+import org.apache.paimon.format.FileFormatFactory;
+
+/** Factory to create {@link MosaicFileFormat}. */
+public class MosaicFileFormatFactory implements FileFormatFactory {
+
+    /** Identifier this factory reports from {@link #identifier()}. */
+    public static final String IDENTIFIER = "mosaic";
+
+    /** Returns {@link #IDENTIFIER}. */
+    @Override
+    public String identifier() {
+        return IDENTIFIER;
+    }
+
+    /** Builds a new {@link MosaicFileFormat} around the given context. */
+    @Override
+    public FileFormat create(FormatContext formatContext) {
+        return new MosaicFileFormat(formatContext);
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java
new file mode 100644
index 000000000000..8806afb697fb
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.mosaic.InputFile; + +import java.io.EOFException; +import java.io.IOException; + +/** + * Adapts Paimon's {@link FileIO} to Mosaic's {@link InputFile} interface. + * + *

Thread-safe: creates a new stream per call since Mosaic may invoke concurrently.
+ */
+public class MosaicInputFileAdapter implements InputFile {
+
+    private final FileIO fileIO;
+    private final Path path;
+
+    /**
+     * @param fileIO file system access used to open the file
+     * @param path location of the Mosaic file to read
+     */
+    public MosaicInputFileAdapter(FileIO fileIO, Path path) {
+        this.fileIO = fileIO;
+        this.path = path;
+    }
+
+    /**
+     * Fills {@code buffer[offset, offset + length)} with the file bytes starting at {@code
+     * position}, using a stream that is opened and closed within this call.
+     *
+     * @throws EOFException if the file ends before {@code length} bytes were read
+     * @throws IOException if opening, seeking or reading fails
+     */
+    @Override
+    public void readFully(long position, byte[] buffer, int offset, int length) throws IOException {
+        try (SeekableInputStream stream = fileIO.newInputStream(path)) {
+            stream.seek(position);
+            // A single read() may return fewer bytes than requested, so loop until done.
+            for (int cursor = offset, pending = length; pending > 0; ) {
+                int count = stream.read(buffer, cursor, pending);
+                if (count < 0) {
+                    throw new EOFException(
+                            "Reached end of file while reading "
+                                    + path
+                                    + " at position "
+                                    + position);
+                }
+                cursor += count;
+                pending -= count;
+            }
+        }
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java
new file mode 100644
index 000000000000..695a1920f8da
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.Decimal;
+import org.apache.paimon.data.Timestamp;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DecimalType;
+import org.apache.paimon.types.TimestampType;
+
+import javax.annotation.Nullable;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+/** Converts Mosaic's byte[] statistics to Paimon objects. */
+public class MosaicObjects {
+
+    /**
+     * Decodes one serialized min/max statistic into the Paimon object for {@code dataType}.
+     *
+     * <p>Fixed-width values are read as little-endian. Decimals are read as a little-endian
+     * two's-complement unscaled value. Timestamps are read as a long counting milliseconds
+     * (precision up to 3), microseconds (precision up to 6) or nanoseconds (higher precision)
+     * since the epoch.
+     *
+     * @param bytes serialized statistic, may be {@code null}
+     * @param dataType Paimon type of the column the statistic belongs to
+     * @return the decoded value, or {@code null} if {@code bytes} is {@code null} or empty or the
+     *     type has no conversion here
+     */
+    @Nullable
+    public static Object convertStatsValue(byte[] bytes, DataType dataType) {
+        if (bytes == null || bytes.length == 0) {
+            return null;
+        }
+        ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
+        switch (dataType.getTypeRoot()) {
+            case BOOLEAN:
+                return bytes[0] != 0;
+            case TINYINT:
+                return bytes[0];
+            case SMALLINT:
+                return buf.getShort();
+            case INTEGER:
+            case DATE:
+            case TIME_WITHOUT_TIME_ZONE:
+                return buf.getInt();
+            case BIGINT:
+                return buf.getLong();
+            case FLOAT:
+                return buf.getFloat();
+            case DOUBLE:
+                return buf.getDouble();
+            case CHAR:
+            case VARCHAR:
+                return BinaryString.fromBytes(bytes);
+            case BINARY:
+            case VARBINARY:
+                return bytes;
+            case DECIMAL:
+                DecimalType decimalType = (DecimalType) dataType;
+                // BigInteger expects big-endian input, hence the byte reversal.
+                BigInteger unscaled = new BigInteger(reverseBytes(bytes));
+                BigDecimal decimal = new BigDecimal(unscaled, decimalType.getScale());
+                return Decimal.fromBigDecimal(
+                        decimal, decimalType.getPrecision(), decimalType.getScale());
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+                TimestampType tsType = (TimestampType) dataType;
+                long tsValue = buf.getLong();
+                if (tsType.getPrecision() <= 3) {
+                    return Timestamp.fromEpochMillis(tsValue);
+                } else if (tsType.getPrecision() <= 6) {
+                    return Timestamp.fromMicros(tsValue);
+                } else {
+                    // floorDiv/floorMod keep the nano-of-millisecond part in [0, 999_999] for
+                    // pre-epoch (negative) values; '/' and '%' would yield a negative remainder.
+                    return Timestamp.fromEpochMillis(
+                            Math.floorDiv(tsValue, 1_000_000L),
+                            (int) Math.floorMod(tsValue, 1_000_000L));
+                }
+            default:
+                return null;
+        }
+    }
+
+    private
static byte[] reverseBytes(byte[] bytes) {
+        // Returns a new array with the bytes in opposite order; the input is left untouched.
+        int size = bytes.length;
+        byte[] flipped = new byte[size];
+        for (int src = size - 1, dst = 0; src >= 0; src--, dst++) {
+            flipped[dst] = bytes[src];
+        }
+        return flipped;
+    }
+
+    private MosaicObjects() {}
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java
new file mode 100644
index 000000000000..5b39c867e290
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.format.FormatReaderFactory;
+import org.apache.paimon.predicate.Predicate;
+import org.apache.paimon.reader.FileRecordReader;
+import org.apache.paimon.types.RowType;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.util.List;
+
+/** A factory to create Mosaic reader.
*/
+public class MosaicReaderFactory implements FormatReaderFactory {
+
+    private final RowType dataSchemaRowType;
+    private final RowType projectedRowType;
+    @Nullable private final List<Predicate> predicates;
+
+    /**
+     * @param dataSchemaRowType row type passed to the reader as the data schema
+     * @param projectedRowType row type of the columns the reader should return
+     * @param predicates optional filters passed on to the reader, may be {@code null}
+     */
+    public MosaicReaderFactory(
+            RowType dataSchemaRowType,
+            RowType projectedRowType,
+            @Nullable List<Predicate> predicates) {
+        this.dataSchemaRowType = dataSchemaRowType;
+        this.projectedRowType = projectedRowType;
+        this.predicates = predicates;
+    }
+
+    /** Opens a {@link MosaicRecordsReader} over the file described by {@code context}. */
+    @Override
+    public FileRecordReader<InternalRow> createReader(Context context) throws IOException {
+        return new MosaicRecordsReader(
+                new MosaicInputFileAdapter(context.fileIO(), context.filePath()),
+                context.fileSize(),
+                dataSchemaRowType,
+                projectedRowType,
+                predicates,
+                context.filePath());
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java
new file mode 100644
index 000000000000..76101378f0ca
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.arrow.reader.ArrowBatchReader; +import org.apache.paimon.data.GenericArray; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.Path; +import org.apache.paimon.mosaic.ColumnStatistics; +import org.apache.paimon.mosaic.InputFile; +import org.apache.paimon.mosaic.MosaicReader; +import org.apache.paimon.predicate.Predicate; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.RowType; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +import static org.apache.paimon.format.mosaic.MosaicObjects.convertStatsValue; + +/** File reader for Mosaic format. 
*/
+public class MosaicRecordsReader implements FileRecordReader<InternalRow> {
+
+    private final MosaicReader reader;
+    private final ArrowBatchReader arrowBatchReader;
+    // null means "read every column of the file"; see computeColumnIndices.
+    private final int[] columnIndices;
+    private final Path filePath;
+    private final BufferAllocator allocator;
+    private final int numRowGroups;
+    private final RowType dataSchemaRowType;
+    private final Schema fileSchema;
+    @Nullable private final List<Predicate> predicates;
+
+    private int currentRowGroup;
+    // Position of the last row handed out (or skipped); -1 until the first row.
+    private long returnedPosition = -1;
+    private VectorSchemaRoot currentVsr;
+
+    /**
+     * Opens the Mosaic file and prepares the column projection.
+     *
+     * <p>If any step fails, the already opened native reader and the allocator are closed before
+     * the exception propagates, so a failed construction does not leak Arrow memory.
+     *
+     * @throws IllegalArgumentException if a projected field does not exist in the file schema
+     */
+    public MosaicRecordsReader(
+            InputFile inputFile,
+            long fileSize,
+            RowType dataSchemaRowType,
+            RowType projectedRowType,
+            @Nullable List<Predicate> predicates,
+            Path filePath) {
+        this.filePath = filePath;
+        this.dataSchemaRowType = dataSchemaRowType;
+        this.predicates = predicates;
+        this.allocator = new RootAllocator();
+
+        MosaicReader opened = null;
+        try {
+            opened = MosaicReader.open(inputFile, fileSize, allocator);
+            this.reader = opened;
+            this.fileSchema = opened.getSchema();
+            this.columnIndices = computeColumnIndices(fileSchema, projectedRowType);
+            this.numRowGroups = opened.numRowGroups();
+            this.currentRowGroup = 0;
+            this.arrowBatchReader = new ArrowBatchReader(projectedRowType, true);
+        } catch (Exception e) {
+            // Release in reverse order of acquisition; keep the original failure as primary.
+            if (opened != null) {
+                try {
+                    opened.close();
+                } catch (Exception suppressed) {
+                    e.addSuppressed(suppressed);
+                }
+            }
+            try {
+                allocator.close();
+            } catch (Exception suppressed) {
+                e.addSuppressed(suppressed);
+            }
+            throw e;
+        }
+    }
+
+    /**
+     * Returns an iterator over the next row group that passes the statistics filter, or {@code
+     * null} when all row groups are consumed. Skipped row groups still advance the position.
+     */
+    @Nullable
+    @Override
+    public FileRecordIterator<InternalRow> readBatch() throws IOException {
+        while (currentRowGroup < numRowGroups) {
+            int numRows = reader.rowGroupNumRows(currentRowGroup);
+            if (!matchesRowGroup(currentRowGroup, numRows)) {
+                returnedPosition += numRows;
+                currentRowGroup++;
+                continue;
+            }
+
+            // Free the previous batch before reading the next one.
+            releaseCurrentVsr();
+
+            VectorSchemaRoot vsr;
+            if (columnIndices != null) {
+                vsr = reader.readRowGroup(currentRowGroup, columnIndices, allocator);
+            } else {
+                vsr = reader.readRowGroup(currentRowGroup, allocator);
+            }
+            currentRowGroup++;
+            this.currentVsr = vsr;
+
+            Iterator<InternalRow> rows = arrowBatchReader.readBatch(vsr).iterator();
+
+            return new FileRecordIterator<InternalRow>() {
+                @Override
+                public long returnedPosition() {
+                    return returnedPosition;
+                }
+
+                @Override
+                public Path filePath() {
+                    return filePath;
+                }
+
+                @Nullable
+                @Override
+                public InternalRow next() {
+                    if (rows.hasNext()) {
+                        returnedPosition++;
+                        return rows.next();
+                    }
+                    return null;
+                }
+
+                @Override
+                public void releaseBatch() {
+                    releaseCurrentVsr();
+                }
+            };
+        }
+        return null;
+    }
+
+    /**
+     * Tests the row group's min/max/null-count statistics against every predicate.
+     *
+     * @return {@code false} only if some predicate rejects the statistics; {@code true} when
+     *     there are no predicates or the row group has no statistics
+     */
+    private boolean matchesRowGroup(int rowGroupIndex, long rowCount) {
+        if (predicates == null || predicates.isEmpty()) {
+            return true;
+        }
+
+        List<ColumnStatistics> statsList = reader.getRowGroupStatistics(rowGroupIndex);
+        if (statsList.isEmpty()) {
+            return true;
+        }
+
+        // Statistics are laid out by position in the data schema, not in the file schema.
+        int fieldCount = dataSchemaRowType.getFieldCount();
+        GenericRow minValues = new GenericRow(fieldCount);
+        GenericRow maxValues = new GenericRow(fieldCount);
+        long[] nullCounts = new long[fieldCount];
+
+        for (ColumnStatistics stats : statsList) {
+            int fileColIdx = stats.getColumnIndex();
+            if (fileColIdx < 0 || fileColIdx >= fileSchema.getFields().size()) {
+                continue;
+            }
+            // Map the file column to the data schema by name; unknown columns are ignored.
+            String colName = fileSchema.getFields().get(fileColIdx).getName();
+            int schemaIdx = findFieldIndex(dataSchemaRowType, colName);
+            if (schemaIdx < 0) {
+                continue;
+            }
+
+            nullCounts[schemaIdx] = stats.getNullCount();
+            if (stats.hasMinMax()) {
+                DataType dataType = dataSchemaRowType.getFields().get(schemaIdx).type();
+                Object min = convertStatsValue(stats.getMin(), dataType);
+                Object max = convertStatsValue(stats.getMax(), dataType);
+                minValues.setField(schemaIdx, min);
+                maxValues.setField(schemaIdx, max);
+            }
+        }
+
+        for (Predicate predicate : predicates) {
+            if (!predicate.test(rowCount, minValues, maxValues, new GenericArray(nullCounts))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Returns the index of the field called {@code name} in {@code rowType}, or -1. */
+    private static int findFieldIndex(RowType rowType, String name) {
+        List<DataField> fields = rowType.getFields();
+        for (int i = 0; i < fields.size(); i++) {
+            if (fields.get(i).name().equals(name)) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    /** Closes the batch currently held, if any. */
+    private void releaseCurrentVsr() {
+        if (currentVsr != null) {
+            currentVsr.close();
+            currentVsr = null;
+        }
+    }
+
+    /** Releases the current batch, the native reader and the allocator, in that order. */
+    @Override
+    public void close() throws IOException {
+        // Nested finally blocks so a failure in one step cannot skip the later releases.
+        try {
+            releaseCurrentVsr();
+        } finally {
+            try {
+                reader.close();
+            } finally {
+                allocator.close();
+            }
+        }
+    }
+
+    /**
+     * Maps each projected field to its column position in the file schema.
+     *
+     * @return the positions in projection order, or {@code null} when the projection has as many
+     *     fields as the file, in which case all columns are read
+     * @throws IllegalArgumentException if a projected field is missing from the file schema
+     */
+    @Nullable
+    private static int[] computeColumnIndices(Schema fileSchema, RowType projectedRowType) {
+        List<Field> fileFields = fileSchema.getFields();
+        if (fileFields.size() == projectedRowType.getFieldCount()) {
+            return null;
+        }
+
+        int[] indices = new int[projectedRowType.getFieldCount()];
+        for (int i = 0; i < projectedRowType.getFieldCount(); i++) {
+            String name = projectedRowType.getFields().get(i).name();
+            int pos = findArrowFieldIndex(fileFields, name);
+            if (pos < 0) {
+                throw new IllegalArgumentException(
+                        "Projected field '" + name + "' not found in Mosaic file schema");
+            }
+            indices[i] = pos;
+        }
+        return indices;
+    }
+
+    /** Returns the index of the Arrow field called {@code name}, or -1. */
+    private static int findArrowFieldIndex(List<Field> fields, String name) {
+        for (int i = 0; i < fields.size(); i++) {
+            if (fields.get(i).getName().equals(name)) {
+                return i;
+            }
+        }
+        return -1;
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java
new file mode 100644
index 000000000000..723f483d0863
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.arrow.ArrowBundleRecords; +import org.apache.paimon.arrow.vector.ArrowFormatWriter; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.BundleFormatWriter; +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.io.BundleRecords; +import org.apache.paimon.mosaic.ColumnStatistics; +import org.apache.paimon.mosaic.MosaicWriter; +import org.apache.paimon.mosaic.WriterOptions; +import org.apache.paimon.types.RowType; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.Schema; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; + +/** Mosaic records writer. 
*/
+public class MosaicRecordsWriter implements BundleFormatWriter {
+
+    private final ArrowFormatWriter arrowFormatWriter;
+    private final MosaicWriter nativeWriter;
+    private final BufferAllocator allocator;
+    // Populated by close(); null until then.
+    @Nullable private MosaicWriterMetadata metadata;
+
+    /**
+     * Creates the Arrow staging writer and the native Mosaic writer on top of {@code
+     * outputStream}.
+     *
+     * <p>If construction fails part-way, the Arrow writer (if already created) and the allocator
+     * are closed before the exception propagates, so no Arrow memory is leaked.
+     */
+    public MosaicRecordsWriter(
+            OutputStream outputStream,
+            RowType rowType,
+            FileFormatFactory.FormatContext formatContext) {
+        this.allocator = new RootAllocator();
+
+        ArrowFormatWriter stagingWriter = null;
+        try {
+            int writeBatchSize = formatContext.writeBatchSize();
+            long writeBatchMemory = formatContext.writeBatchMemory().getBytes();
+
+            stagingWriter =
+                    new ArrowFormatWriter(
+                            rowType, writeBatchSize, true, allocator, writeBatchMemory);
+
+            Schema arrowSchema = stagingWriter.getVectorSchemaRoot().getSchema();
+            WriterOptions options = new WriterOptions().zstdLevel(formatContext.zstdLevel());
+
+            this.nativeWriter = new MosaicWriter(outputStream, arrowSchema, options, allocator);
+            this.arrowFormatWriter = stagingWriter;
+        } catch (Exception e) {
+            // Release in reverse order of acquisition; keep the original failure as primary.
+            if (stagingWriter != null) {
+                try {
+                    stagingWriter.close();
+                } catch (Exception suppressed) {
+                    e.addSuppressed(suppressed);
+                }
+            }
+            try {
+                allocator.close();
+            } catch (Exception suppressed) {
+                e.addSuppressed(suppressed);
+            }
+            throw e;
+        }
+    }
+
+    /**
+     * Buffers one row; when the Arrow batch is full it is flushed and the row is retried once.
+     *
+     * @throws RuntimeException if the row still does not fit into an empty batch
+     */
+    @Override
+    public void addElement(InternalRow internalRow) throws IOException {
+        if (!arrowFormatWriter.write(internalRow)) {
+            flush();
+            if (!arrowFormatWriter.write(internalRow)) {
+                throw new RuntimeException("Failed to write row to Mosaic file");
+            }
+        }
+    }
+
+    /**
+     * Writes a bundle. Arrow bundles are passed to the native writer directly, after flushing
+     * rows buffered so far; other bundles are written row by row.
+     */
+    @Override
+    public void writeBundle(BundleRecords bundleRecords) throws IOException {
+        if (bundleRecords instanceof ArrowBundleRecords) {
+            // Flush first so previously buffered rows keep their order ahead of the bundle.
+            flush();
+            nativeWriter.write(((ArrowBundleRecords) bundleRecords).getVectorSchemaRoot());
+        } else {
+            for (InternalRow row : bundleRecords) {
+                addElement(row);
+            }
+        }
+    }
+
+    /** Compares the native writer's size estimate to {@code targetSize} when a check is due. */
+    @Override
+    public boolean reachTargetSize(boolean suggestedCheck, long targetSize) throws IOException {
+        if (!suggestedCheck) {
+            return false;
+        }
+        return nativeWriter.estimatedFileSize() >= targetSize;
+    }
+
+    /**
+     * Flushes buffered rows, captures the row-group statistics, then closes the native writer,
+     * the Arrow writer and the allocator. Every step runs even if an earlier one failed; the
+     * first failure is rethrown with the later ones attached as suppressed.
+     */
+    @Override
+    public void close() throws IOException {
+        Throwable throwable = null;
+
+        try {
+            flush();
+        } catch (Throwable t) {
+            throwable = t;
+        }
+
+        try {
+            collectMetadata();
+        } catch (Throwable t) {
+            throwable = addSuppressed(throwable, t);
+        }
+
+        try {
+            nativeWriter.close();
+        } catch (Throwable t) {
+            throwable = addSuppressed(throwable, t);
+        }
+
+        try {
+            arrowFormatWriter.close();
+        } catch (Throwable t) {
+            throwable = addSuppressed(throwable, t);
+        }
+
+        try {
+            allocator.close();
+        } catch (Throwable t) {
+            throwable = addSuppressed(throwable, t);
+        }
+
+        if (throwable != null) {
+            rethrow(throwable);
+        }
+    }
+
+    /** Returns the metadata captured by {@link #close()}, or {@code null} before that. */
+    @Nullable
+    @Override
+    public Object writerMetadata() {
+        return metadata;
+    }
+
+    /** Copies the per-row-group column statistics out of the native writer. */
+    private void collectMetadata() {
+        int numRowGroups = nativeWriter.numRowGroups();
+        List<List<ColumnStatistics>> allStats = new ArrayList<>(numRowGroups);
+        for (int i = 0; i < numRowGroups; i++) {
+            allStats.add(nativeWriter.getRowGroupStatistics(i));
+        }
+        this.metadata = new MosaicWriterMetadata(numRowGroups, allStats);
+    }
+
+    /** Hands buffered rows, if any, to the native writer and resets the Arrow buffer. */
+    private void flush() {
+        arrowFormatWriter.flush();
+        if (!arrowFormatWriter.empty()) {
+            VectorSchemaRoot vsr = arrowFormatWriter.getVectorSchemaRoot();
+            nativeWriter.write(vsr);
+        }
+        arrowFormatWriter.reset();
+    }
+
+    /** Returns the primary failure, attaching {@code suppressed} to it if one exists already. */
+    private static Throwable addSuppressed(Throwable throwable, Throwable suppressed) {
+        if (throwable == null) {
+            return suppressed;
+        }
+        throwable.addSuppressed(suppressed);
+        return throwable;
+    }
+
+    /** Rethrows as-is when the type allows it, otherwise wrapped in an {@link IOException}. */
+    private static void rethrow(Throwable throwable) throws IOException {
+        if (throwable instanceof IOException) {
+            throw (IOException) throwable;
+        }
+        if (throwable instanceof RuntimeException) {
+            throw (RuntimeException) throwable;
+        }
+        if (throwable instanceof Error) {
+            throw (Error) throwable;
+        }
+        throw new IOException(throwable);
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java
new file mode 100644
index 000000000000..4c70d473b0ba
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.format.SimpleColStats; +import org.apache.paimon.format.SimpleStatsExtractor; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.mosaic.ColumnStatistics; +import org.apache.paimon.mosaic.MosaicReader; +import org.apache.paimon.statistics.SimpleColStatsCollector; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.Pair; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; + +import javax.annotation.Nullable; + +import java.util.List; + +import static org.apache.paimon.format.mosaic.MosaicObjects.convertStatsValue; + +/** Extracts statistics from Mosaic file metadata. 
+ *
+ * <p>The per-row-group column statistics are merged into one {@link SimpleColStats} per field and
+ * then passed through the configured {@link SimpleColStatsCollector}s.
+ */
+public class MosaicSimpleStatsExtractor implements SimpleStatsExtractor {
+
+    private final RowType rowType;
+    private final SimpleColStatsCollector.Factory[] statsCollectors;
+
+    public MosaicSimpleStatsExtractor(
+            RowType rowType, SimpleColStatsCollector.Factory[] statsCollectors) {
+        this.rowType = rowType;
+        this.statsCollectors = statsCollectors;
+    }
+
+    /** Opens the file at {@code path} and merges the row-group statistics it reports. */
+    @Override
+    public SimpleColStats[] extract(FileIO fileIO, Path path, long length) {
+        MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path);
+        try (BufferAllocator allocator = new RootAllocator();
+                MosaicReader reader = MosaicReader.open(inputFile, length, allocator)) {
+            return extractFromStats(reader.numRowGroups(), reader::getRowGroupStatistics);
+        }
+    }
+
+    /**
+     * Uses the statistics carried by {@code writerMetadata} when it is a {@link
+     * MosaicWriterMetadata}; otherwise falls back to {@link #extract(FileIO, Path, long)}.
+     */
+    @Override
+    public SimpleColStats[] extract(
+            FileIO fileIO, Path path, long length, @Nullable Object writerMetadata) {
+        if (writerMetadata instanceof MosaicWriterMetadata) {
+            MosaicWriterMetadata meta = (MosaicWriterMetadata) writerMetadata;
+            return extractFromStats(meta.numRowGroups(), meta::getRowGroupStatistics);
+        }
+        return extract(fileIO, path, length);
+    }
+
+    /** Extracts the merged statistics together with the total row count of the file. */
+    @Override
+    public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(
+            FileIO fileIO, Path path, long length) {
+        MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path);
+        try (BufferAllocator allocator = new RootAllocator();
+                MosaicReader reader = MosaicReader.open(inputFile, length, allocator)) {
+            int numRowGroups = reader.numRowGroups();
+            SimpleColStats[] stats =
+                    extractFromStats(numRowGroups, reader::getRowGroupStatistics);
+
+            long rowCount = 0;
+            for (int rg = 0; rg < numRowGroups; rg++) {
+                // Release each row group right after counting it instead of keeping all of them
+                // alive until the allocator is closed.
+                // Assumes readRowGroup returns a caller-owned VectorSchemaRoot - TODO confirm.
+                // NOTE(review): this reads every row group only to count rows; prefer a
+                // metadata-only row count if MosaicReader offers one.
+                try (org.apache.arrow.vector.VectorSchemaRoot rowGroup =
+                        reader.readRowGroup(rg, allocator)) {
+                    rowCount += rowGroup.getRowCount();
+                }
+            }
+            return Pair.of(stats, new FileInfo(rowCount));
+        }
+    }
+
+    /**
+     * Merges the statistics of all row groups: null counts are summed, and the smallest min and
+     * largest max are kept per column. Statistics whose column index is outside the row type are
+     * ignored.
+     *
+     * <p>Min/max values that are not {@link Comparable} (a {@code byte[]}, for example) cannot be
+     * merged across row groups; as soon as a merge would be needed for such a column, its min and
+     * max are reported as unknown ({@code null}) instead of failing with a ClassCastException.
+     */
+    private SimpleColStats[] extractFromStats(
+            int numRowGroups, RowGroupStatsProvider statsProvider) {
+        int fieldCount = rowType.getFieldCount();
+        Object[] minValues = new Object[fieldCount];
+        Object[] maxValues = new Object[fieldCount];
+        long[] nullCounts = new long[fieldCount];
+        // Columns whose min/max had to be dropped because the values could not be compared.
+        boolean[] unmergeable = new boolean[fieldCount];
+
+        for (int rg = 0; rg < numRowGroups; rg++) {
+            List<ColumnStatistics> stats = statsProvider.getRowGroupStatistics(rg);
+            for (ColumnStatistics stat : stats) {
+                int colIdx = stat.getColumnIndex();
+                if (colIdx < 0 || colIdx >= fieldCount) {
+                    continue;
+                }
+
+                nullCounts[colIdx] += stat.getNullCount();
+
+                // NOTE(review): a row group without min/max is skipped, so the merged range only
+                // covers the row groups that report one - confirm hasMinMax() is false only for
+                // all-null columns.
+                if (unmergeable[colIdx] || !stat.hasMinMax()) {
+                    continue;
+                }
+
+                DataType dataType = rowType.getFields().get(colIdx).type();
+                Object min = convertStatsValue(stat.getMin(), dataType);
+                Object max = convertStatsValue(stat.getMax(), dataType);
+
+                if (!canMerge(minValues[colIdx], min) || !canMerge(maxValues[colIdx], max)) {
+                    unmergeable[colIdx] = true;
+                    minValues[colIdx] = null;
+                    maxValues[colIdx] = null;
+                    continue;
+                }
+
+                if (min != null
+                        && (minValues[colIdx] == null || compare(min, minValues[colIdx]) < 0)) {
+                    minValues[colIdx] = min;
+                }
+                if (max != null
+                        && (maxValues[colIdx] == null || compare(max, maxValues[colIdx]) > 0)) {
+                    maxValues[colIdx] = max;
+                }
+            }
+        }
+
+        SimpleColStatsCollector[] collectors = SimpleColStatsCollector.create(statsCollectors);
+        SimpleColStats[] result = new SimpleColStats[fieldCount];
+        for (int i = 0; i < fieldCount; i++) {
+            SimpleColStats fieldStats =
+                    new SimpleColStats(minValues[i], maxValues[i], nullCounts[i]);
+            result[i] = collectors[i].convert(fieldStats);
+        }
+        return result;
+    }
+
+    /** A comparison is only needed when both values exist; it then requires a Comparable. */
+    private static boolean canMerge(@Nullable Object current, @Nullable Object candidate) {
+        return current == null || candidate == null || candidate instanceof Comparable;
+    }
+
+    /** Compares two values of the same column; callers check {@link #canMerge} first. */
+    @SuppressWarnings("unchecked")
+    private static int compare(Object left, Object right) {
+        return ((Comparable<Object>) left).compareTo(right);
+    }
+
+    /** Supplies the column statistics of one row group, from a reader or from writer metadata. */
+    @FunctionalInterface
+    private interface RowGroupStatsProvider {
+        List<ColumnStatistics> getRowGroupStatistics(int rowGroupIndex);
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java
new file mode 100644
index 000000000000..dd7bb72f98e5
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.types.RowType; + +import java.io.IOException; + +/** A factory to create Mosaic {@link FormatWriter}. 
+ *
+ * <p>Every writer created by this factory shares the row type and format context given to the
+ * constructor.
+ */
+public class MosaicWriterFactory implements FormatWriterFactory {
+
+    private final RowType rowType;
+    private final FileFormatFactory.FormatContext formatContext;
+
+    public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext formatContext) {
+        this.rowType = rowType;
+        this.formatContext = formatContext;
+    }
+
+    /**
+     * Creates a {@link MosaicRecordsWriter} that writes to {@code out}.
+     *
+     * <p>The {@code compression} argument is not used here; only the row type and the format
+     * context are passed on to the writer.
+     */
+    @Override
+    public FormatWriter create(PositionOutputStream out, String compression) throws IOException {
+        return new MosaicRecordsWriter(out, rowType, formatContext);
+    }
+}
diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java
new file mode 100644
index 000000000000..ac4c9906c852
--- /dev/null
+++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.mosaic.ColumnStatistics;
+
+import java.util.List;
+
+/** In-memory metadata captured from MosaicWriter after close.
+ *
+ * <p>Holds the column statistics of every row group, indexed by row group.
+ */
+public class MosaicWriterMetadata {
+
+    private final int numRowGroups;
+    private final List<List<ColumnStatistics>> rowGroupStats;
+
+    /**
+     * @param numRowGroups number of row groups that were written
+     * @param rowGroupStats column statistics per row group, in row-group order
+     */
+    public MosaicWriterMetadata(int numRowGroups, List<List<ColumnStatistics>> rowGroupStats) {
+        this.numRowGroups = numRowGroups;
+        this.rowGroupStats = rowGroupStats;
+    }
+
+    /** Returns the number of row groups given at construction time. */
+    public int numRowGroups() {
+        return numRowGroups;
+    }
+
+    /** Returns the column statistics stored for the row group at {@code rowGroupIndex}. */
+    public List<ColumnStatistics> getRowGroupStatistics(int rowGroupIndex) {
+        return rowGroupStats.get(rowGroupIndex);
+    }
+}
diff --git a/paimon-mosaic-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory b/paimon-mosaic-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory
new file mode 100644
index 000000000000..bc955c493506
--- /dev/null
+++ b/paimon-mosaic-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +org.apache.paimon.format.mosaic.MosaicFileFormatFactory diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java new file mode 100644 index 000000000000..8e53164e8627 --- /dev/null +++ b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Unit tests for {@link MosaicFileFormat} and {@link MosaicFileFormatFactory}. 
+ */
+class MosaicFileFormatTest {
+
+    @Test
+    void testFactoryIdentifier() {
+        assertThat(new MosaicFileFormatFactory().identifier()).isEqualTo("mosaic");
+    }
+
+    @Test
+    void testFactoryCreate() {
+        FileFormatFactory.FormatContext context = newContext();
+        MosaicFileFormatFactory factory = new MosaicFileFormatFactory();
+        assertThat(factory.create(context)).isInstanceOf(MosaicFileFormat.class);
+    }
+
+    @Test
+    void testCreateReaderFactory() {
+        RowType rowType = intStringRowType();
+        FormatReaderFactory readerFactory =
+                createFormat().createReaderFactory(rowType, rowType, new ArrayList<>());
+        assertThat(readerFactory).isInstanceOf(MosaicReaderFactory.class);
+    }
+
+    @Test
+    void testCreateWriterFactory() {
+        FormatWriterFactory writerFactory =
+                createFormat().createWriterFactory(intStringRowType());
+        assertThat(writerFactory).isInstanceOf(MosaicWriterFactory.class);
+    }
+
+    @Test
+    void testValidateDataFieldsSupported() {
+        RowType supportedTypes =
+                DataTypes.ROW(
+                        DataTypes.INT(),
+                        DataTypes.BIGINT(),
+                        DataTypes.STRING(),
+                        DataTypes.DOUBLE(),
+                        DataTypes.FLOAT(),
+                        DataTypes.BOOLEAN(),
+                        DataTypes.DATE(),
+                        DataTypes.TIMESTAMP(3),
+                        DataTypes.DECIMAL(10, 2),
+                        DataTypes.BYTES());
+        // Passes as long as validation does not throw.
+        createFormat().validateDataFields(supportedTypes);
+    }
+
+    @Test
+    void testValidateDataFieldsMapUnsupported() {
+        assertRejected(DataTypes.ROW(DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())), "MAP");
+    }
+
+    @Test
+    void testValidateDataFieldsMultisetUnsupported() {
+        assertRejected(DataTypes.ROW(DataTypes.MULTISET(DataTypes.STRING())), "MULTISET");
+    }
+
+    @Test
+    void testCreateStatsExtractor() {
+        org.apache.paimon.statistics.SimpleColStatsCollector.Factory[] collectors =
+                new org.apache.paimon.statistics.SimpleColStatsCollector.Factory[] {
+                    org.apache.paimon.statistics.SimpleColStatsCollector.from("full"),
+                    org.apache.paimon.statistics.SimpleColStatsCollector.from("full")
+                };
+        assertThat(createFormat().createStatsExtractor(intStringRowType(), collectors))
+                .isPresent();
+    }
+
+    /** Asserts that validation rejects {@code rowType} with a message naming the type. */
+    private static void assertRejected(RowType rowType, String typeName) {
+        MosaicFileFormat format = createFormat();
+        assertThatThrownBy(() -> format.validateDataFields(rowType))
+                .isInstanceOf(UnsupportedOperationException.class)
+                .hasMessageContaining(typeName);
+    }
+
+    /** Two-column row type (INT, STRING) shared by the factory tests. */
+    private static RowType intStringRowType() {
+        return DataTypes.ROW(DataTypes.INT(), DataTypes.STRING());
+    }
+
+    private static FileFormatFactory.FormatContext newContext() {
+        return new FileFormatFactory.FormatContext(new Options(), 1024, 1024);
+    }
+
+    private static MosaicFileFormat createFormat() {
+        return new MosaicFileFormat(newContext());
+    }
+}
diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java
new file mode 100644
index 000000000000..f3ce3a9042a6
--- /dev/null
+++ b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.Decimal;
+import org.apache.paimon.data.GenericArray;
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.data.Timestamp;
+import org.apache.paimon.format.FileFormat;
+import org.apache.paimon.format.FileFormatFactory;
+import org.apache.paimon.format.FormatReadWriteTest;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import org.junit.jupiter.api.BeforeAll;
+
+import java.math.BigDecimal;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+/** Round-trip read/write tests for Mosaic format. */
+class MosaicFormatReadWriteTest extends FormatReadWriteTest {
+
+    /** Number of leading scalar columns that {@link #validateFullTypesResult} checks for null. */
+    private static final int SCALAR_FIELD_COUNT = 14;
+
+    MosaicFormatReadWriteTest() {
+        super("mosaic");
+    }
+
+    /** Skips the whole class when the native library class cannot be loaded. */
+    @BeforeAll
+    static void checkNativeLibrary() {
+        assumeTrue(isNativeAvailable(), "Mosaic native library not available");
+    }
+
+    @Override
+    protected FileFormat fileFormat() {
+        FileFormatFactory.FormatContext context =
+                new FileFormatFactory.FormatContext(new Options(), 1024, 1024);
+        return new MosaicFileFormat(context);
+    }
+
+    @Override
+    public String compression() {
+        return "zstd";
+    }
+
+    @Override
+    public boolean supportNestedReadPruning() {
+        return false;
+    }
+
+    /** Schema of the full-types test: 14 scalar columns followed by an array and a row. */
+    @Override
+    protected RowType rowTypeForFullTypesTest() {
+        RowType nestedRow =
+                DataTypes.ROW(
+                        DataTypes.FIELD(0, "a", DataTypes.INT()),
+                        DataTypes.FIELD(1, "b", DataTypes.STRING()));
+        return RowType.builder()
+                .field("f_int", DataTypes.INT().notNull())
+                .field("f_string", DataTypes.STRING())
+                .field("f_double", DataTypes.DOUBLE().notNull())
+                .field("f_boolean", DataTypes.BOOLEAN())
+                .field("f_tinyint", DataTypes.TINYINT())
+                .field("f_smallint", DataTypes.SMALLINT())
+                .field("f_bigint", DataTypes.BIGINT())
+                .field("f_float", DataTypes.FLOAT())
+                .field("f_binary", DataTypes.BYTES())
+                .field("f_date", DataTypes.DATE())
+                .field("f_timestamp3", DataTypes.TIMESTAMP(3))
+                .field("f_timestamp6", DataTypes.TIMESTAMP(6))
+                .field("f_decimal_5_2", DataTypes.DECIMAL(5, 2))
+                .field("f_decimal_20_0", DataTypes.DECIMAL(20, 0))
+                .field("f_array", DataTypes.ARRAY(DataTypes.INT()))
+                .field("f_row", nestedRow)
+                .build();
+    }
+
+    /** One row with a value for every column of {@link #rowTypeForFullTypesTest()}. */
+    @Override
+    protected GenericRow expectedRowForFullTypesTest() {
+        GenericRow nested = GenericRow.of(100, BinaryString.fromString("nested"));
+        GenericArray array = new GenericArray(new int[] {1, 2, 3});
+        return GenericRow.of(
+                42,
+                BinaryString.fromString("hello mosaic"),
+                3.14d,
+                true,
+                (byte) 7,
+                (short) 256,
+                9876543210L,
+                1.5f,
+                new byte[] {1, 2, 3},
+                18000,
+                Timestamp.fromEpochMillis(1700000000000L),
+                Timestamp.fromMicros(1700000000000000L),
+                Decimal.fromBigDecimal(new BigDecimal("123.45"), 5, 2),
+                Decimal.fromBigDecimal(new BigDecimal("12345678901234567890"), 20, 0),
+                array,
+                nested);
+    }
+
+    /**
+     * Compares the 14 scalar columns of {@code actual} and {@code expected}; the trailing array
+     * and row columns are not compared here.
+     */
+    @Override
+    protected void validateFullTypesResult(InternalRow actual, InternalRow expected) {
+        // Null-ness first: wherever the expected value is null, the actual one must be null too.
+        for (int pos = 0; pos < SCALAR_FIELD_COUNT; pos++) {
+            if (!expected.isNullAt(pos)) {
+                continue;
+            }
+            assertThat(actual.isNullAt(pos)).isTrue();
+        }
+
+        assertThat(actual.getInt(0)).isEqualTo(expected.getInt(0));
+        assertThat(actual.getString(1)).isEqualTo(expected.getString(1));
+        assertThat(actual.getDouble(2)).isEqualTo(expected.getDouble(2));
+        assertThat(actual.getBoolean(3)).isEqualTo(expected.getBoolean(3));
+        assertThat(actual.getByte(4)).isEqualTo(expected.getByte(4));
+        assertThat(actual.getShort(5)).isEqualTo(expected.getShort(5));
+        assertThat(actual.getLong(6)).isEqualTo(expected.getLong(6));
+        assertThat(actual.getFloat(7)).isEqualTo(expected.getFloat(7));
+        assertThat(actual.getBinary(8)).isEqualTo(expected.getBinary(8));
+        assertThat(actual.getInt(9)).isEqualTo(expected.getInt(9));
+        assertThat(actual.getTimestamp(10, 3)).isEqualTo(expected.getTimestamp(10, 3));
+        assertThat(actual.getTimestamp(11, 6)).isEqualTo(expected.getTimestamp(11, 6));
+        assertThat(actual.getDecimal(12, 5, 2)).isEqualTo(expected.getDecimal(12, 5, 2));
+        assertThat(actual.getDecimal(13, 20, 0)).isEqualTo(expected.getDecimal(13, 20, 0));
+    }
+
+    /** Returns whether the Mosaic {@code NativeLib} class can be loaded. */
+    private static boolean isNativeAvailable() {
+        boolean available;
+        try {
+            Class.forName("org.apache.paimon.mosaic.NativeLib");
+            available = true;
+        } catch (Throwable t) {
+            available = false;
+        }
+        return available;
+    }
+}
diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java
new file mode 100644
index 000000000000..91806b99ea26
--- /dev/null
+++ b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.mosaic;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.Decimal;
+import org.apache.paimon.data.Timestamp;
+import org.apache.paimon.types.DataTypes;
+
+import org.junit.jupiter.api.Test;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Unit tests for {@link MosaicObjects}.
+ *
+ * <p>Multi-byte numeric inputs are fed in little-endian byte order, built with {@link
+ * #toLE(ByteBuffer)}.
+ */
+class MosaicObjectsTest {
+
+    @Test
+    void testNullBytes() {
+        assertThat(MosaicObjects.convertStatsValue(null, DataTypes.INT())).isNull();
+    }
+
+    @Test
+    void testEmptyBytes() {
+        assertThat(MosaicObjects.convertStatsValue(new byte[0], DataTypes.INT())).isNull();
+    }
+
+    @Test
+    void testBoolean() {
+        assertThat(MosaicObjects.convertStatsValue(new byte[] {1}, DataTypes.BOOLEAN()))
+                .isEqualTo(true);
+        assertThat(MosaicObjects.convertStatsValue(new byte[] {0}, DataTypes.BOOLEAN()))
+                .isEqualTo(false);
+    }
+
+    @Test
+    void testTinyInt() {
+        assertThat(MosaicObjects.convertStatsValue(new byte[] {42}, DataTypes.TINYINT()))
+                .isEqualTo((byte) 42);
+        assertThat(MosaicObjects.convertStatsValue(new byte[] {(byte) -1}, DataTypes.TINYINT()))
+                .isEqualTo((byte) -1);
+    }
+
+    @Test
+    void testSmallInt() {
+        byte[] bytes = toLE(ByteBuffer.allocate(2).putShort((short) 1234));
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.SMALLINT()))
+                .isEqualTo((short) 1234);
+    }
+
+    @Test
+    void testInt() {
+        byte[] bytes = toLE(ByteBuffer.allocate(4).putInt(123456));
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.INT())).isEqualTo(123456);
+    }
+
+    @Test
+    void testIntNegative() {
+        byte[] bytes = toLE(ByteBuffer.allocate(4).putInt(-999));
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.INT())).isEqualTo(-999);
+    }
+
+    @Test
+    void testBigInt() {
+        byte[] bytes = toLE(ByteBuffer.allocate(8).putLong(9876543210L));
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.BIGINT()))
+                .isEqualTo(9876543210L);
+    }
+
+    @Test
+    void testFloat() {
+        byte[] bytes = toLE(ByteBuffer.allocate(4).putFloat(3.14f));
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.FLOAT())).isEqualTo(3.14f);
+    }
+
+    @Test
+    void testDouble() {
+        byte[] bytes = toLE(ByteBuffer.allocate(8).putDouble(2.718281828));
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.DOUBLE()))
+                .isEqualTo(2.718281828);
+    }
+
+    @Test
+    void testVarChar() {
+        // Explicit charset: the no-arg getBytes() depends on the platform default.
+        byte[] bytes = "hello".getBytes(StandardCharsets.UTF_8);
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.STRING()))
+                .isEqualTo(BinaryString.fromString("hello"));
+    }
+
+    @Test
+    void testBinary() {
+        byte[] bytes = new byte[] {1, 2, 3, 4, 5};
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.BYTES())).isEqualTo(bytes);
+    }
+
+    @Test
+    void testDate() {
+        byte[] bytes = toLE(ByteBuffer.allocate(4).putInt(18000));
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.DATE())).isEqualTo(18000);
+    }
+
+    @Test
+    void testTimestampMillis() {
+        long millis = 1700000000000L;
+        byte[] bytes = toLE(ByteBuffer.allocate(8).putLong(millis));
+        Object result = MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP(3));
+        assertThat(result).isEqualTo(Timestamp.fromEpochMillis(millis));
+    }
+
+    @Test
+    void testTimestampMicros() {
+        long micros = 1700000000000000L;
+        byte[] bytes = toLE(ByteBuffer.allocate(8).putLong(micros));
+        Object result = MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP(6));
+        assertThat(result).isEqualTo(Timestamp.fromMicros(micros));
+    }
+
+    @Test
+    void testDecimal() {
+        // 0x03E8 little-endian = unscaled 1000, i.e. 10.00 at scale 2.
+        byte[] leBytes = new byte[] {(byte) 0xE8, 0x03, 0, 0, 0, 0, 0, 0};
+        Object result = MosaicObjects.convertStatsValue(leBytes, DataTypes.DECIMAL(10, 2));
+        assertThat(result).isInstanceOf(Decimal.class);
+        Decimal decimal = (Decimal) result;
+        // Compare the full value, not just the integer part, so a wrong fraction is caught too.
+        assertThat(decimal.toBigDecimal()).isEqualByComparingTo("10.00");
+    }
+
+    @Test
+    void testUnsupportedTypeReturnsNull() {
+        byte[] bytes = new byte[] {1, 2, 3};
+        assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.ARRAY(DataTypes.INT())))
+                .isNull();
+    }
+
+    /**
+     * Re-encodes the 2, 4 or 8 byte big-endian value held by {@code bigEndianBuf} in little-endian
+     * byte order.
+     *
+     * @throws IllegalArgumentException for any other width, instead of silently returning zeros
+     */
+    private static byte[] toLE(ByteBuffer bigEndianBuf) {
+        byte[] beBytes = bigEndianBuf.array();
+        ByteBuffer beBuf = ByteBuffer.wrap(beBytes).order(ByteOrder.BIG_ENDIAN);
+        ByteBuffer leBuf = ByteBuffer.allocate(beBytes.length).order(ByteOrder.LITTLE_ENDIAN);
+        switch (beBytes.length) {
+            case 2:
+                leBuf.putShort(beBuf.getShort());
+                break;
+            case 4:
+                leBuf.putInt(beBuf.getInt());
+                break;
+            case 8:
+                leBuf.putLong(beBuf.getLong());
+                break;
+            default:
+                throw new IllegalArgumentException(
+                        "Unsupported value width: " + beBytes.length + " bytes");
+        }
+        return leBuf.array();
+    }
+}
diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java
new file mode 100644
index 000000000000..3050669e977f
--- /dev/null
+++ b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.serializer.InternalRowSerializer; +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.FormatReaderContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.predicate.Predicate; +import org.apache.paimon.predicate.PredicateBuilder; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +/** Integration tests for Mosaic reader and writer. 
*/ +class MosaicReaderWriterTest { + + @TempDir java.nio.file.Path tempDir; + + @BeforeAll + static void checkNativeLibrary() { + assumeTrue(isNativeAvailable(), "Mosaic native library not available"); + } + + @Test + void testWriteAndRead() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + + writeRows( + rowType, + path, + GenericRow.of(1, BinaryString.fromString("hello")), + GenericRow.of(2, BinaryString.fromString("world"))); + + List result = readAll(rowType, rowType, path, null); + assertThat(result).hasSize(2); + assertThat(result.get(0).getInt(0)).isEqualTo(1); + assertThat(result.get(0).getString(1).toString()).isEqualTo("hello"); + assertThat(result.get(1).getInt(0)).isEqualTo(2); + assertThat(result.get(1).getString(1).toString()).isEqualTo("world"); + } + + @Test + void testNullValues() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + + writeRows( + rowType, + path, + GenericRow.of(1, null), + GenericRow.of(null, BinaryString.fromString("test")), + GenericRow.of(null, null)); + + List result = readAll(rowType, rowType, path, null); + assertThat(result).hasSize(3); + assertThat(result.get(0).isNullAt(1)).isTrue(); + assertThat(result.get(1).isNullAt(0)).isTrue(); + assertThat(result.get(2).isNullAt(0)).isTrue(); + assertThat(result.get(2).isNullAt(1)).isTrue(); + } + + @Test + void testColumnProjection() throws IOException { + RowType writeType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING(), DataTypes.DOUBLE()); + RowType readType = DataTypes.ROW(DataTypes.STRING()); + Path path = newPath(); + + writeRows( + writeType, + path, + GenericRow.of(1, BinaryString.fromString("aaa"), 1.1), + GenericRow.of(2, BinaryString.fromString("bbb"), 2.2)); + + List result = readAll(writeType, readType, path, null); + assertThat(result).hasSize(2); + assertThat(result.get(0).getString(0).toString()).isEqualTo("aaa"); + 
assertThat(result.get(1).getString(0).toString()).isEqualTo("bbb"); + } + + @Test + void testLargeDataset() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + + int numRows = 10000; + GenericRow[] rows = new GenericRow[numRows]; + for (int i = 0; i < numRows; i++) { + rows[i] = GenericRow.of(i, BinaryString.fromString("row" + i)); + } + writeRows(rowType, path, rows); + + List result = readAll(rowType, rowType, path, null); + assertThat(result).hasSize(numRows); + assertThat(result.get(0).getInt(0)).isEqualTo(0); + assertThat(result.get(numRows - 1).getInt(0)).isEqualTo(numRows - 1); + } + + @Test + void testRowGroupPredicateFiltering() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT().notNull(), DataTypes.STRING()); + Path path = newPath(); + + int numRows = 10000; + GenericRow[] rows = new GenericRow[numRows]; + for (int i = 0; i < numRows; i++) { + rows[i] = GenericRow.of(i, BinaryString.fromString("v" + i)); + } + writeRows(rowType, path, rows); + + PredicateBuilder builder = new PredicateBuilder(rowType); + Predicate predicate = builder.greaterThan(0, 9000); + List result = + readAll(rowType, rowType, path, Collections.singletonList(predicate)); + + for (InternalRow row : result) { + assertThat(row.getInt(0)).isGreaterThan(9000); + } + } + + @Test + void testReturnedPosition() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + + writeRows( + rowType, + path, + GenericRow.of(1, BinaryString.fromString("a")), + GenericRow.of(2, BinaryString.fromString("b")), + GenericRow.of(3, BinaryString.fromString("c"))); + + MosaicFileFormat format = createFormat(); + FormatReaderFactory readerFactory = format.createReaderFactory(rowType, rowType, null); + LocalFileIO fileIO = new LocalFileIO(); + RecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + 
RecordReader.RecordIterator batch = reader.readBatch(); + assertThat(batch).isNotNull(); + FileRecordIterator fileIter = (FileRecordIterator) batch; + + fileIter.next(); + assertThat(fileIter.returnedPosition()).isEqualTo(0); + fileIter.next(); + assertThat(fileIter.returnedPosition()).isEqualTo(1); + fileIter.next(); + assertThat(fileIter.returnedPosition()).isEqualTo(2); + + reader.close(); + } + + @Test + void testReachTargetSize() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + MosaicFileFormat format = createFormat(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + + LocalFileIO fileIO = new LocalFileIO(); + FormatWriter writer = writerFactory.create(fileIO.newOutputStream(path, false), "zstd"); + + boolean reached = false; + for (int i = 0; i < 100000; i++) { + writer.addElement(GenericRow.of(i, BinaryString.fromString("value_" + i + "_padding"))); + if (writer.reachTargetSize(true, 1024)) { + reached = true; + break; + } + } + writer.close(); + assertThat(reached).isTrue(); + } + + private Path newPath() { + return new Path(tempDir.toUri().toString(), UUID.randomUUID() + ".mosaic"); + } + + private void writeRows(RowType rowType, Path path, GenericRow... 
rows) throws IOException { + MosaicFileFormat format = createFormat(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + LocalFileIO fileIO = new LocalFileIO(); + FormatWriter writer = writerFactory.create(fileIO.newOutputStream(path, false), "zstd"); + for (GenericRow row : rows) { + writer.addElement(row); + } + writer.close(); + } + + private List readAll( + RowType dataType, RowType readType, Path path, List predicates) + throws IOException { + MosaicFileFormat format = createFormat(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataType, readType, predicates); + LocalFileIO fileIO = new LocalFileIO(); + RecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + InternalRowSerializer serializer = new InternalRowSerializer(readType); + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(serializer.copy(row))); + reader.close(); + return result; + } + + private static MosaicFileFormat createFormat() { + return new MosaicFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024)); + } + + private static boolean isNativeAvailable() { + try { + Class.forName("org.apache.paimon.mosaic.NativeLib"); + return true; + } catch (Throwable t) { + return false; + } + } +} diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java new file mode 100644 index 000000000000..78254341ba69 --- /dev/null +++ b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.SimpleColStats; +import org.apache.paimon.format.SimpleColStatsExtractorTest; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.BeforeAll; + +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +/** Tests for {@link MosaicSimpleStatsExtractor}. 
*/ +class MosaicSimpleStatsExtractorTest extends SimpleColStatsExtractorTest { + + @BeforeAll + static void checkNativeLibrary() { + assumeTrue(isNativeAvailable(), "Mosaic native library not available"); + } + + @Override + protected FileFormat createFormat() { + return new MosaicFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024)); + } + + @Override + protected RowType rowType() { + return RowType.builder() + .field("f_boolean", DataTypes.BOOLEAN()) + .field("f_tinyint", DataTypes.TINYINT()) + .field("f_smallint", DataTypes.SMALLINT()) + .field("f_int", DataTypes.INT()) + .field("f_bigint", DataTypes.BIGINT()) + .field("f_float", DataTypes.FLOAT()) + .field("f_double", DataTypes.DOUBLE()) + .field("f_string", DataTypes.STRING()) + .field("f_binary", DataTypes.BYTES()) + .field("f_decimal_5_2", DataTypes.DECIMAL(5, 2)) + .field("f_decimal_20_0", DataTypes.DECIMAL(20, 0)) + .field("f_date", DataTypes.DATE()) + .field("f_timestamp3", DataTypes.TIMESTAMP(3)) + .field("f_timestamp6", DataTypes.TIMESTAMP(6)) + .field("f_array", DataTypes.ARRAY(DataTypes.INT())) + .build(); + } + + @Override + protected String fileCompression() { + return "zstd"; + } + + @Override + protected SimpleColStats regenerate(SimpleColStats stats, DataType type) { + if (type instanceof ArrayType) { + return new SimpleColStats(null, null, stats.nullCount()); + } + return stats; + } + + private static boolean isNativeAvailable() { + try { + Class.forName("org.apache.paimon.mosaic.NativeLib"); + return true; + } catch (Throwable t) { + return false; + } + } +} diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java new file mode 100644 index 000000000000..ac37504e7f38 --- /dev/null +++ b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java @@ -0,0 +1,194 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.mosaic; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.format.SimpleColStats; +import org.apache.paimon.format.SimpleStatsExtractor; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.statistics.SimpleColStatsCollector; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +/** Tests for writer metadata based stats extraction in Mosaic format. 
*/ +class MosaicWriterMetadataTest { + + @TempDir java.nio.file.Path tempDir; + + @BeforeAll + static void checkNativeLibrary() { + assumeTrue(isNativeAvailable(), "Mosaic native library not available"); + } + + @Test + void testWriterMetadataNotNull() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + + FormatWriter writer = createWriter(rowType, path); + writer.addElement(GenericRow.of(1, BinaryString.fromString("hello"))); + writer.addElement(GenericRow.of(2, BinaryString.fromString("world"))); + writer.close(); + + Object metadata = writer.writerMetadata(); + assertThat(metadata).isNotNull(); + assertThat(metadata).isInstanceOf(MosaicWriterMetadata.class); + + MosaicWriterMetadata mosaicMeta = (MosaicWriterMetadata) metadata; + assertThat(mosaicMeta.numRowGroups()).isGreaterThan(0); + } + + @Test + void testStatsFromMetadataMatchesStatsFromFile() throws IOException { + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_bigint", DataTypes.BIGINT()) + .field("f_string", DataTypes.STRING()) + .field("f_double", DataTypes.DOUBLE()) + .build(); + Path path = newPath(); + + FormatWriter writer = createWriter(rowType, path); + for (int i = 0; i < 1000; i++) { + writer.addElement( + GenericRow.of(i, (long) i * 100, BinaryString.fromString("val_" + i), i * 1.1)); + } + writer.close(); + + Object metadata = writer.writerMetadata(); + assertThat(metadata).isNotNull(); + + MosaicFileFormat format = createFormat(); + int fieldCount = rowType.getFieldCount(); + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, fieldCount) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + LocalFileIO fileIO = new LocalFileIO(); + long fileSize = fileIO.getFileSize(path); + + SimpleColStats[] fromFile = 
extractor.extract(fileIO, path, fileSize); + SimpleColStats[] fromMetadata = extractor.extract(fileIO, path, fileSize, metadata); + + assertThat(fromMetadata).isEqualTo(fromFile); + } + + @Test + void testStatsFromMetadataWithNullValues() throws IOException { + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .build(); + Path path = newPath(); + + FormatWriter writer = createWriter(rowType, path); + writer.addElement(GenericRow.of(1, null)); + writer.addElement(GenericRow.of(null, BinaryString.fromString("a"))); + writer.addElement(GenericRow.of(3, BinaryString.fromString("b"))); + writer.close(); + + Object metadata = writer.writerMetadata(); + MosaicFileFormat format = createFormat(); + int fieldCount = rowType.getFieldCount(); + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, fieldCount) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + LocalFileIO fileIO = new LocalFileIO(); + long fileSize = fileIO.getFileSize(path); + + SimpleColStats[] fromMetadata = extractor.extract(fileIO, path, fileSize, metadata); + assertThat(fromMetadata).isNotNull(); + assertThat(fromMetadata[0].nullCount()).isEqualTo(1L); + assertThat(fromMetadata[1].nullCount()).isEqualTo(1L); + } + + @Test + void testFallbackToFileWhenMetadataIsNull() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + + FormatWriter writer = createWriter(rowType, path); + writer.addElement(GenericRow.of(10, BinaryString.fromString("test"))); + writer.close(); + + MosaicFileFormat format = createFormat(); + int fieldCount = rowType.getFieldCount(); + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, fieldCount) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + 
.toArray(SimpleColStatsCollector.Factory[]::new); + + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + LocalFileIO fileIO = new LocalFileIO(); + long fileSize = fileIO.getFileSize(path); + + SimpleColStats[] fromFile = extractor.extract(fileIO, path, fileSize); + SimpleColStats[] fromNull = extractor.extract(fileIO, path, fileSize, null); + + assertThat(fromNull).isEqualTo(fromFile); + } + + private Path newPath() { + return new Path(tempDir.toUri().toString(), UUID.randomUUID() + ".mosaic"); + } + + private FormatWriter createWriter(RowType rowType, Path path) throws IOException { + MosaicFileFormat format = createFormat(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + LocalFileIO fileIO = new LocalFileIO(); + return writerFactory.create(fileIO.newOutputStream(path, false), "zstd"); + } + + private static MosaicFileFormat createFormat() { + return new MosaicFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024)); + } + + private static boolean isNativeAvailable() { + try { + Class.forName("org.apache.paimon.mosaic.NativeLib"); + return true; + } catch (Throwable t) { + return false; + } + } +} diff --git a/pom.xml b/pom.xml index 22fddbb60dde..72ec3a7117ee 100644 --- a/pom.xml +++ b/pom.xml @@ -74,6 +74,7 @@ under the License. paimon-api paimon-lumina paimon-vortex + paimon-mosaic-format paimon-tantivy From 9bd655f6b63aa2cac7fa82f5e1ab3e141cb486e0 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 20 May 2026 20:28:20 +0800 Subject: [PATCH 02/11] [format] Address review comments for paimon-mosaic-format 1. Remove paimon-mosaic-format from root reactor to avoid CI/release dependency on external SNAPSHOT artifact 2. Fix writer close order: close native writer before collecting metadata (MosaicWriter requires close() before stats are accessible) 3. Configure statsColumns with all column indices to enable stats collection 4. 
Fix byte order: use big-endian (matching Mosaic's Value::to_be_bytes()) instead of little-endian for numeric stats decoding 5. Fix extractWithFileInfo() to use rowGroupNumRows() instead of reading full row groups (avoids data reads and Arrow buffer leaks) Co-Authored-By: Claude Opus 4.6 --- .../paimon/format/mosaic/MosaicObjects.java | 13 +------ .../format/mosaic/MosaicRecordsWriter.java | 12 ++++-- .../mosaic/MosaicSimpleStatsExtractor.java | 7 ++-- .../format/mosaic/MosaicObjectsTest.java | 38 ++++++------------- pom.xml | 2 +- 5 files changed, 27 insertions(+), 45 deletions(-) diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java index 695a1920f8da..995434a22276 100644 --- a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java +++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java @@ -30,7 +30,6 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; -import java.nio.ByteOrder; /** Converts Mosaic's byte[] statistics to Paimon objects. 
*/ public class MosaicObjects { @@ -40,7 +39,7 @@ public static Object convertStatsValue(byte[] bytes, DataType dataType) { if (bytes == null || bytes.length == 0) { return null; } - ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN); + ByteBuffer buf = ByteBuffer.wrap(bytes); switch (dataType.getTypeRoot()) { case BOOLEAN: return bytes[0] != 0; @@ -66,7 +65,7 @@ public static Object convertStatsValue(byte[] bytes, DataType dataType) { return bytes; case DECIMAL: DecimalType decimalType = (DecimalType) dataType; - BigInteger unscaled = new BigInteger(reverseBytes(bytes)); + BigInteger unscaled = new BigInteger(bytes); BigDecimal decimal = new BigDecimal(unscaled, decimalType.getScale()); return Decimal.fromBigDecimal( decimal, decimalType.getPrecision(), decimalType.getScale()); @@ -86,13 +85,5 @@ public static Object convertStatsValue(byte[] bytes, DataType dataType) { } } - private static byte[] reverseBytes(byte[] bytes) { - byte[] reversed = new byte[bytes.length]; - for (int i = 0; i < bytes.length; i++) { - reversed[i] = bytes[bytes.length - 1 - i]; - } - return reversed; - } - private MosaicObjects() {} } diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java index 723f483d0863..c88e8ef5a3c4 100644 --- a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java +++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java @@ -62,7 +62,13 @@ public MosaicRecordsWriter( new ArrowFormatWriter(rowType, writeBatchSize, true, allocator, writeBatchMemory); Schema arrowSchema = arrowFormatWriter.getVectorSchemaRoot().getSchema(); - WriterOptions options = new WriterOptions().zstdLevel(formatContext.zstdLevel()); + int numFields = arrowSchema.getFields().size(); + int[] allColumns = new int[numFields]; + for (int i = 0; i < numFields; 
i++) { + allColumns[i] = i; + } + WriterOptions options = + new WriterOptions().zstdLevel(formatContext.zstdLevel()).statsColumns(allColumns); this.nativeWriter = new MosaicWriter(outputStream, arrowSchema, options, allocator); } @@ -108,13 +114,13 @@ public void close() throws IOException { } try { - collectMetadata(); + nativeWriter.close(); } catch (Throwable t) { throwable = addSuppressed(throwable, t); } try { - nativeWriter.close(); + collectMetadata(); } catch (Throwable t) { throwable = addSuppressed(throwable, t); } diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java index 4c70d473b0ba..4a6132caa5bf 100644 --- a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java +++ b/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java @@ -73,14 +73,13 @@ public SimpleColStats[] extract( public Pair extractWithFileInfo( FileIO fileIO, Path path, long length) { MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path); - long rowCount = 0; try (BufferAllocator allocator = new RootAllocator(); MosaicReader reader = MosaicReader.open(inputFile, length, allocator)) { - SimpleColStats[] stats = - extractFromStats(reader.numRowGroups(), reader::getRowGroupStatistics); int numRowGroups = reader.numRowGroups(); + SimpleColStats[] stats = extractFromStats(numRowGroups, reader::getRowGroupStatistics); + long rowCount = 0; for (int rg = 0; rg < numRowGroups; rg++) { - rowCount += reader.readRowGroup(rg, allocator).getRowCount(); + rowCount += reader.rowGroupNumRows(rg); } return Pair.of(stats, new FileInfo(rowCount)); } diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java 
index 91806b99ea26..8fc369a0f9f1 100644 --- a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java +++ b/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java @@ -26,7 +26,6 @@ import org.junit.jupiter.api.Test; import java.nio.ByteBuffer; -import java.nio.ByteOrder; import static org.assertj.core.api.Assertions.assertThat; @@ -61,39 +60,39 @@ void testTinyInt() { @Test void testSmallInt() { - byte[] bytes = toLE(ByteBuffer.allocate(2).putShort((short) 1234)); + byte[] bytes = ByteBuffer.allocate(2).putShort((short) 1234).array(); assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.SMALLINT())) .isEqualTo((short) 1234); } @Test void testInt() { - byte[] bytes = toLE(ByteBuffer.allocate(4).putInt(123456)); + byte[] bytes = ByteBuffer.allocate(4).putInt(123456).array(); assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.INT())).isEqualTo(123456); } @Test void testIntNegative() { - byte[] bytes = toLE(ByteBuffer.allocate(4).putInt(-999)); + byte[] bytes = ByteBuffer.allocate(4).putInt(-999).array(); assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.INT())).isEqualTo(-999); } @Test void testBigInt() { - byte[] bytes = toLE(ByteBuffer.allocate(8).putLong(9876543210L)); + byte[] bytes = ByteBuffer.allocate(8).putLong(9876543210L).array(); assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.BIGINT())) .isEqualTo(9876543210L); } @Test void testFloat() { - byte[] bytes = toLE(ByteBuffer.allocate(4).putFloat(3.14f)); + byte[] bytes = ByteBuffer.allocate(4).putFloat(3.14f).array(); assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.FLOAT())).isEqualTo(3.14f); } @Test void testDouble() { - byte[] bytes = toLE(ByteBuffer.allocate(8).putDouble(2.718281828)); + byte[] bytes = ByteBuffer.allocate(8).putDouble(2.718281828).array(); assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.DOUBLE())) .isEqualTo(2.718281828); } @@ -113,14 +112,14 @@ void testBinary() 
{ @Test void testDate() { - byte[] bytes = toLE(ByteBuffer.allocate(4).putInt(18000)); + byte[] bytes = ByteBuffer.allocate(4).putInt(18000).array(); assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.DATE())).isEqualTo(18000); } @Test void testTimestampMillis() { long millis = 1700000000000L; - byte[] bytes = toLE(ByteBuffer.allocate(8).putLong(millis)); + byte[] bytes = ByteBuffer.allocate(8).putLong(millis).array(); Object result = MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP(3)); assertThat(result).isEqualTo(Timestamp.fromEpochMillis(millis)); } @@ -128,15 +127,16 @@ void testTimestampMillis() { @Test void testTimestampMicros() { long micros = 1700000000000000L; - byte[] bytes = toLE(ByteBuffer.allocate(8).putLong(micros)); + byte[] bytes = ByteBuffer.allocate(8).putLong(micros).array(); Object result = MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP(6)); assertThat(result).isEqualTo(Timestamp.fromMicros(micros)); } @Test void testDecimal() { - byte[] leBytes = new byte[] {(byte) 0xE8, 0x03, 0, 0, 0, 0, 0, 0}; - Object result = MosaicObjects.convertStatsValue(leBytes, DataTypes.DECIMAL(10, 2)); + // 1000 in big-endian two's complement = 0x03E8 + byte[] beBytes = new byte[] {0x03, (byte) 0xE8}; + Object result = MosaicObjects.convertStatsValue(beBytes, DataTypes.DECIMAL(10, 2)); assertThat(result).isInstanceOf(Decimal.class); Decimal decimal = (Decimal) result; assertThat(decimal.toBigDecimal().intValue()).isEqualTo(10); @@ -148,18 +148,4 @@ void testUnsupportedTypeReturnsNull() { assertThat(MosaicObjects.convertStatsValue(bytes, DataTypes.ARRAY(DataTypes.INT()))) .isNull(); } - - private static byte[] toLE(ByteBuffer bigEndianBuf) { - byte[] beBytes = bigEndianBuf.array(); - ByteBuffer leBuf = ByteBuffer.allocate(beBytes.length).order(ByteOrder.LITTLE_ENDIAN); - ByteBuffer beBuf = ByteBuffer.wrap(beBytes).order(ByteOrder.BIG_ENDIAN); - if (beBytes.length == 2) { - leBuf.putShort(beBuf.getShort()); - } else if (beBytes.length == 
4) { - leBuf.putInt(beBuf.getInt()); - } else if (beBytes.length == 8) { - leBuf.putLong(beBuf.getLong()); - } - return leBuf.array(); - } } diff --git a/pom.xml b/pom.xml index 72ec3a7117ee..bc446bb2da3f 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ under the License. paimon-api paimon-lumina paimon-vortex - paimon-mosaic-format + paimon-tantivy From 35d3476996d579bedc8728acf7df1902920e7fb3 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 20 May 2026 21:27:03 +0800 Subject: [PATCH 03/11] Fix comments --- .../pom.xml | 6 ++-- .../format/mosaic/MosaicFileFormat.java | 0 .../mosaic/MosaicFileFormatFactory.java | 0 .../format/mosaic/MosaicInputFileAdapter.java | 0 .../paimon/format/mosaic/MosaicObjects.java | 0 .../format/mosaic/MosaicReaderFactory.java | 0 .../format/mosaic/MosaicRecordsReader.java | 0 .../format/mosaic/MosaicRecordsWriter.java | 0 .../mosaic/MosaicSimpleStatsExtractor.java | 0 .../format/mosaic/MosaicWriterFactory.java | 0 .../format/mosaic/MosaicWriterMetadata.java | 0 ...org.apache.paimon.format.FileFormatFactory | 0 .../format/mosaic/MosaicFileFormatTest.java | 0 .../mosaic/MosaicFormatReadWriteTest.java | 11 +----- .../format/mosaic/MosaicObjectsTest.java | 0 .../format/mosaic/MosaicReaderWriterTest.java | 27 ++++++++++---- .../MosaicSimpleStatsExtractorTest.java | 16 +-------- .../mosaic/MosaicWriterMetadataTest.java | 36 +++++++++++++++++++ pom.xml | 2 +- 19 files changed, 62 insertions(+), 36 deletions(-) rename {paimon-mosaic-format => paimon-mosaic}/pom.xml (94%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java (100%) rename {paimon-mosaic-format => 
paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java (91%) rename {paimon-mosaic-format => paimon-mosaic}/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java (100%) rename {paimon-mosaic-format => paimon-mosaic}/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java (89%) rename {paimon-mosaic-format => paimon-mosaic}/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java (81%) rename {paimon-mosaic-format => paimon-mosaic}/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java (83%) diff --git a/paimon-mosaic-format/pom.xml b/paimon-mosaic/pom.xml similarity index 94% rename from paimon-mosaic-format/pom.xml rename to paimon-mosaic/pom.xml index a9182c998795..95c11789cf95 100644 --- a/paimon-mosaic-format/pom.xml +++ 
b/paimon-mosaic/pom.xml @@ -28,13 +28,13 @@ under the License. 1.5-SNAPSHOT - paimon-mosaic-format + paimon-mosaic Paimon : Mosaic Format - org.apache.paimon.mosaic - mosaic-writer + org.apache.paimon + mosaic 0.1.0-SNAPSHOT diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormatFactory.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java 
b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicReaderFactory.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java diff --git a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java diff --git 
a/paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java similarity index 100% rename from paimon-mosaic-format/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java rename to paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java diff --git a/paimon-mosaic-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory b/paimon-mosaic/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory similarity index 100% rename from paimon-mosaic-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory rename to paimon-mosaic/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java similarity index 100% rename from paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java rename to paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicFileFormatTest.java diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java similarity index 91% rename from paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java rename to paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java index f3ce3a9042a6..41f632b3ee3f 100644 --- a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicFormatReadWriteTest.java @@ -20,7 +20,6 @@ import 
org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Decimal; -import org.apache.paimon.data.GenericArray; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.data.Timestamp; @@ -82,12 +81,6 @@ protected RowType rowTypeForFullTypesTest() { .field("f_timestamp6", DataTypes.TIMESTAMP(6)) .field("f_decimal_5_2", DataTypes.DECIMAL(5, 2)) .field("f_decimal_20_0", DataTypes.DECIMAL(20, 0)) - .field("f_array", DataTypes.ARRAY(DataTypes.INT())) - .field( - "f_row", - DataTypes.ROW( - DataTypes.FIELD(0, "a", DataTypes.INT()), - DataTypes.FIELD(1, "b", DataTypes.STRING()))) .build(); } @@ -107,9 +100,7 @@ protected GenericRow expectedRowForFullTypesTest() { Timestamp.fromEpochMillis(1700000000000L), Timestamp.fromMicros(1700000000000000L), Decimal.fromBigDecimal(new BigDecimal("123.45"), 5, 2), - Decimal.fromBigDecimal(new BigDecimal("12345678901234567890"), 20, 0), - new GenericArray(new int[] {1, 2, 3}), - GenericRow.of(100, BinaryString.fromString("nested"))); + Decimal.fromBigDecimal(new BigDecimal("12345678901234567890"), 20, 0)); } @Override diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java similarity index 100% rename from paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java rename to paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java similarity index 89% rename from paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java rename to paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java index 3050669e977f..c4a55cb43060 100644 --- 
a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java @@ -101,8 +101,13 @@ void testNullValues() throws IOException { @Test void testColumnProjection() throws IOException { - RowType writeType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING(), DataTypes.DOUBLE()); - RowType readType = DataTypes.ROW(DataTypes.STRING()); + RowType writeType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .field("f_double", DataTypes.DOUBLE()) + .build(); + RowType readType = RowType.builder().field("f_string", DataTypes.STRING()).build(); Path path = newPath(); writeRows( @@ -137,7 +142,11 @@ void testLargeDataset() throws IOException { @Test void testRowGroupPredicateFiltering() throws IOException { - RowType rowType = DataTypes.ROW(DataTypes.INT().notNull(), DataTypes.STRING()); + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .build(); Path path = newPath(); int numRows = 10000; @@ -147,14 +156,18 @@ void testRowGroupPredicateFiltering() throws IOException { } writeRows(rowType, path, rows); + // Predicate that cannot match any row group (all values are 0..9999) PredicateBuilder builder = new PredicateBuilder(rowType); - Predicate predicate = builder.greaterThan(0, 9000); + Predicate predicate = builder.greaterThan(0, 99999); List result = readAll(rowType, rowType, path, Collections.singletonList(predicate)); + assertThat(result).isEmpty(); - for (InternalRow row : result) { - assertThat(row.getInt(0)).isGreaterThan(9000); - } + // Predicate that matches the row group (values include range 0..9999) + Predicate matchPredicate = builder.greaterThan(0, 5000); + List matchResult = + readAll(rowType, rowType, path, Collections.singletonList(matchPredicate)); + assertThat(matchResult).hasSize(numRows); } @Test diff --git 
a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java similarity index 81% rename from paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java rename to paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java index 78254341ba69..a303b7bc9571 100644 --- a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java @@ -20,11 +20,8 @@ import org.apache.paimon.format.FileFormat; import org.apache.paimon.format.FileFormatFactory; -import org.apache.paimon.format.SimpleColStats; import org.apache.paimon.format.SimpleColStatsExtractorTest; import org.apache.paimon.options.Options; -import org.apache.paimon.types.ArrayType; -import org.apache.paimon.types.DataType; import org.apache.paimon.types.DataTypes; import org.apache.paimon.types.RowType; @@ -55,14 +52,11 @@ protected RowType rowType() { .field("f_bigint", DataTypes.BIGINT()) .field("f_float", DataTypes.FLOAT()) .field("f_double", DataTypes.DOUBLE()) - .field("f_string", DataTypes.STRING()) - .field("f_binary", DataTypes.BYTES()) + .field("f_string", DataTypes.VARCHAR(100)) .field("f_decimal_5_2", DataTypes.DECIMAL(5, 2)) - .field("f_decimal_20_0", DataTypes.DECIMAL(20, 0)) .field("f_date", DataTypes.DATE()) .field("f_timestamp3", DataTypes.TIMESTAMP(3)) .field("f_timestamp6", DataTypes.TIMESTAMP(6)) - .field("f_array", DataTypes.ARRAY(DataTypes.INT())) .build(); } @@ -71,14 +65,6 @@ protected String fileCompression() { return "zstd"; } - @Override - protected SimpleColStats regenerate(SimpleColStats stats, DataType type) { - if (type instanceof ArrayType) { - return new SimpleColStats(null, null, stats.nullCount()); - } - return stats; - } - 
private static boolean isNativeAvailable() { try { Class.forName("org.apache.paimon.mosaic.NativeLib"); diff --git a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java similarity index 83% rename from paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java rename to paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java index ac37504e7f38..e6d7a8fa9eb4 100644 --- a/paimon-mosaic-format/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java @@ -25,12 +25,14 @@ import org.apache.paimon.format.FormatWriterFactory; import org.apache.paimon.format.SimpleColStats; import org.apache.paimon.format.SimpleStatsExtractor; +import org.apache.paimon.format.SimpleStatsExtractor.FileInfo; import org.apache.paimon.fs.Path; import org.apache.paimon.fs.local.LocalFileIO; import org.apache.paimon.options.Options; import org.apache.paimon.statistics.SimpleColStatsCollector; import org.apache.paimon.types.DataTypes; import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.Pair; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -142,6 +144,40 @@ void testStatsFromMetadataWithNullValues() throws IOException { assertThat(fromMetadata[1].nullCount()).isEqualTo(1L); } + @Test + void testExtractWithFileInfoRowCount() throws IOException { + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .build(); + Path path = newPath(); + + int numRows = 500; + FormatWriter writer = createWriter(rowType, path); + for (int i = 0; i < numRows; i++) { + writer.addElement(GenericRow.of(i, BinaryString.fromString("row_" + i))); + } + writer.close(); + + MosaicFileFormat format = 
createFormat(); + int fieldCount = rowType.getFieldCount(); + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, fieldCount) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + LocalFileIO fileIO = new LocalFileIO(); + long fileSize = fileIO.getFileSize(path); + + Pair result = + extractor.extractWithFileInfo(fileIO, path, fileSize); + assertThat(result.getRight().getRowCount()).isEqualTo(numRows); + assertThat(result.getLeft()).isNotNull(); + assertThat(result.getLeft()).hasSize(fieldCount); + } + @Test void testFallbackToFileWhenMetadataIsNull() throws IOException { RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); diff --git a/pom.xml b/pom.xml index bc446bb2da3f..991c6c487421 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ under the License. paimon-api paimon-lumina paimon-vortex - + paimon-mosaic paimon-tantivy From 7cd45793a8263e40e5ad6c113b342ed51196369c Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 20 May 2026 22:00:51 +0800 Subject: [PATCH 04/11] fix --- .../format/mosaic/MosaicFileFormat.java | 10 ++ .../format/mosaic/MosaicRecordsWriter.java | 41 ++++-- .../mosaic/MosaicSimpleStatsExtractor.java | 37 ++++- .../format/mosaic/MosaicWriterFactory.java | 17 ++- .../format/mosaic/MosaicWriterMetadata.java | 11 +- .../format/mosaic/MosaicReaderWriterTest.java | 19 ++- .../MosaicSimpleStatsExtractorTest.java | 7 +- .../mosaic/MosaicWriterMetadataTest.java | 133 ++++++++++++++++-- 8 files changed, 239 insertions(+), 36 deletions(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java index b8a95e69aeda..b503d0ece31b 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java +++ 
b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java @@ -23,6 +23,8 @@ import org.apache.paimon.format.FormatReaderFactory; import org.apache.paimon.format.FormatWriterFactory; import org.apache.paimon.format.SimpleStatsExtractor; +import org.apache.paimon.options.ConfigOption; +import org.apache.paimon.options.ConfigOptions; import org.apache.paimon.predicate.Predicate; import org.apache.paimon.statistics.SimpleColStatsCollector; import org.apache.paimon.types.ArrayType; @@ -59,6 +61,14 @@ /** Mosaic {@link FileFormat}. */ public class MosaicFileFormat extends FileFormat { + public static final ConfigOption STATS_COLUMNS = + ConfigOptions.key("mosaic.stats-columns") + .stringType() + .defaultValue("") + .withDescription( + "Comma-separated list of column names to collect statistics for. " + + "Empty means no statistics collection."); + static { System.setProperty("arrow.enable_unsafe_memory_access", "true"); } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java index c88e8ef5a3c4..099bb8b9b07e 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java @@ -47,12 +47,15 @@ public class MosaicRecordsWriter implements BundleFormatWriter { private final ArrowFormatWriter arrowFormatWriter; private final MosaicWriter nativeWriter; private final BufferAllocator allocator; + private final List statsColumnNames; @Nullable private MosaicWriterMetadata metadata; public MosaicRecordsWriter( OutputStream outputStream, RowType rowType, - FileFormatFactory.FormatContext formatContext) { + FileFormatFactory.FormatContext formatContext, + List statsColumnNames) { + this.statsColumnNames = statsColumnNames; this.allocator = new RootAllocator(); int writeBatchSize = 
formatContext.writeBatchSize(); @@ -62,19 +65,35 @@ public MosaicRecordsWriter( new ArrowFormatWriter(rowType, writeBatchSize, true, allocator, writeBatchMemory); Schema arrowSchema = arrowFormatWriter.getVectorSchemaRoot().getSchema(); - int numFields = arrowSchema.getFields().size(); - int[] allColumns = new int[numFields]; - for (int i = 0; i < numFields; i++) { - allColumns[i] = i; + WriterOptions options = new WriterOptions().zstdLevel(formatContext.zstdLevel()); + if (!statsColumnNames.isEmpty()) { + int[] statsIndices = resolveColumnIndices(arrowSchema, statsColumnNames); + options.statsColumns(statsIndices); } - WriterOptions options = - new WriterOptions().zstdLevel(formatContext.zstdLevel()).statsColumns(allColumns); this.nativeWriter = new MosaicWriter(outputStream, arrowSchema, options, allocator); } + private static int[] resolveColumnIndices(Schema schema, List columnNames) { + List fields = schema.getFields(); + List indices = new ArrayList<>(); + for (String name : columnNames) { + for (int i = 0; i < fields.size(); i++) { + if (fields.get(i).getName().equals(name)) { + indices.add(i); + break; + } + } + } + int[] result = new int[indices.size()]; + for (int i = 0; i < indices.size(); i++) { + result[i] = indices.get(i); + } + return result; + } + @Override - public void addElement(InternalRow internalRow) throws IOException { + public void addElement(InternalRow internalRow) { if (!arrowFormatWriter.write(internalRow)) { flush(); if (!arrowFormatWriter.write(internalRow)) { @@ -84,7 +103,7 @@ public void addElement(InternalRow internalRow) throws IOException { } @Override - public void writeBundle(BundleRecords bundleRecords) throws IOException { + public void writeBundle(BundleRecords bundleRecords) { if (bundleRecords instanceof ArrowBundleRecords) { flush(); nativeWriter.write(((ArrowBundleRecords) bundleRecords).getVectorSchemaRoot()); @@ -96,7 +115,7 @@ public void writeBundle(BundleRecords bundleRecords) throws IOException { } @Override - 
public boolean reachTargetSize(boolean suggestedCheck, long targetSize) throws IOException { + public boolean reachTargetSize(boolean suggestedCheck, long targetSize) { if (!suggestedCheck) { return false; } @@ -154,7 +173,7 @@ private void collectMetadata() { for (int i = 0; i < numRowGroups; i++) { allStats.add(nativeWriter.getRowGroupStatistics(i)); } - this.metadata = new MosaicWriterMetadata(numRowGroups, allStats); + this.metadata = new MosaicWriterMetadata(numRowGroups, allStats, statsColumnNames); } private void flush() { diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java index 4a6132caa5bf..ad204b6db443 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java @@ -34,7 +34,9 @@ import javax.annotation.Nullable; +import java.util.HashSet; import java.util.List; +import java.util.Set; import static org.apache.paimon.format.mosaic.MosaicObjects.convertStatsValue; @@ -55,7 +57,7 @@ public SimpleColStats[] extract(FileIO fileIO, Path path, long length) { MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path); try (BufferAllocator allocator = new RootAllocator(); MosaicReader reader = MosaicReader.open(inputFile, length, allocator)) { - return extractFromStats(reader.numRowGroups(), reader::getRowGroupStatistics); + return extractFromStats(reader.numRowGroups(), reader::getRowGroupStatistics, null); } } @@ -64,7 +66,9 @@ public SimpleColStats[] extract( FileIO fileIO, Path path, long length, @Nullable Object writerMetadata) { if (writerMetadata instanceof MosaicWriterMetadata) { MosaicWriterMetadata meta = (MosaicWriterMetadata) writerMetadata; - return extractFromStats(meta.numRowGroups(), meta::getRowGroupStatistics); + Set statsFieldIndices = 
resolveStatsFieldIndices(meta.statsColumnNames()); + return extractFromStats( + meta.numRowGroups(), meta::getRowGroupStatistics, statsFieldIndices); } return extract(fileIO, path, length); } @@ -76,7 +80,8 @@ public Pair extractWithFileInfo( try (BufferAllocator allocator = new RootAllocator(); MosaicReader reader = MosaicReader.open(inputFile, length, allocator)) { int numRowGroups = reader.numRowGroups(); - SimpleColStats[] stats = extractFromStats(numRowGroups, reader::getRowGroupStatistics); + SimpleColStats[] stats = + extractFromStats(numRowGroups, reader::getRowGroupStatistics, null); long rowCount = 0; for (int rg = 0; rg < numRowGroups; rg++) { rowCount += reader.rowGroupNumRows(rg); @@ -87,7 +92,9 @@ public Pair extractWithFileInfo( @SuppressWarnings("unchecked") private SimpleColStats[] extractFromStats( - int numRowGroups, RowGroupStatsProvider statsProvider) { + int numRowGroups, + RowGroupStatsProvider statsProvider, + @Nullable Set statsFieldIndices) { int fieldCount = rowType.getFieldCount(); Object[] minValues = new Object[fieldCount]; Object[] maxValues = new Object[fieldCount]; @@ -132,13 +139,29 @@ private SimpleColStats[] extractFromStats( SimpleColStatsCollector[] collectors = SimpleColStatsCollector.create(statsCollectors); SimpleColStats[] result = new SimpleColStats[fieldCount]; for (int i = 0; i < fieldCount; i++) { - SimpleColStats fieldStats = - new SimpleColStats(minValues[i], maxValues[i], nullCounts[i]); - result[i] = collectors[i].convert(fieldStats); + if (statsFieldIndices != null && !statsFieldIndices.contains(i)) { + result[i] = collectors[i].convert(new SimpleColStats(null, null, null)); + } else { + SimpleColStats fieldStats = + new SimpleColStats(minValues[i], maxValues[i], nullCounts[i]); + result[i] = collectors[i].convert(fieldStats); + } } return result; } + private Set resolveStatsFieldIndices(List statsColumnNames) { + Set indices = new HashSet<>(); + List fieldNames = rowType.getFieldNames(); + for (String name : 
statsColumnNames) { + int idx = fieldNames.indexOf(name); + if (idx >= 0) { + indices.add(idx); + } + } + return indices; + } + @FunctionalInterface private interface RowGroupStatsProvider { List getRowGroupStatistics(int rowGroupIndex); diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java index dd7bb72f98e5..09980ccc3cea 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java @@ -25,20 +25,35 @@ import org.apache.paimon.types.RowType; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; /** A factory to create Mosaic {@link FormatWriter}. */ public class MosaicWriterFactory implements FormatWriterFactory { private final RowType rowType; private final FileFormatFactory.FormatContext formatContext; + private final List statsColumnNames; public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext formatContext) { this.rowType = rowType; this.formatContext = formatContext; + String statsColumnsValue = formatContext.options().get(MosaicFileFormat.STATS_COLUMNS); + if (statsColumnsValue == null || statsColumnsValue.trim().isEmpty()) { + this.statsColumnNames = new ArrayList<>(); + } else { + this.statsColumnNames = + Arrays.stream(statsColumnsValue.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + } } @Override public FormatWriter create(PositionOutputStream out, String compression) throws IOException { - return new MosaicRecordsWriter(out, rowType, formatContext); + return new MosaicRecordsWriter(out, rowType, formatContext, statsColumnNames); } } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java 
b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java index ac4c9906c852..b73b970c0b88 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java @@ -27,10 +27,15 @@ public class MosaicWriterMetadata { private final int numRowGroups; private final List> rowGroupStats; + private final List statsColumnNames; - public MosaicWriterMetadata(int numRowGroups, List> rowGroupStats) { + public MosaicWriterMetadata( + int numRowGroups, + List> rowGroupStats, + List statsColumnNames) { this.numRowGroups = numRowGroups; this.rowGroupStats = rowGroupStats; + this.statsColumnNames = statsColumnNames; } public int numRowGroups() { @@ -40,4 +45,8 @@ public int numRowGroups() { public List getRowGroupStatistics(int rowGroupIndex) { return rowGroupStats.get(rowGroupIndex); } + + public List statsColumnNames() { + return statsColumnNames; + } } diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java index c4a55cb43060..78da0d2500d4 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java @@ -154,7 +154,7 @@ void testRowGroupPredicateFiltering() throws IOException { for (int i = 0; i < numRows; i++) { rows[i] = GenericRow.of(i, BinaryString.fromString("v" + i)); } - writeRows(rowType, path, rows); + writeRows(rowType, path, "f_int", rows); // Predicate that cannot match any row group (all values are 0..9999) PredicateBuilder builder = new PredicateBuilder(rowType); @@ -230,7 +230,12 @@ private Path newPath() { } private void writeRows(RowType rowType, Path path, GenericRow... 
rows) throws IOException { - MosaicFileFormat format = createFormat(); + writeRows(rowType, path, "", rows); + } + + private void writeRows(RowType rowType, Path path, String statsColumns, GenericRow... rows) + throws IOException { + MosaicFileFormat format = createFormat(statsColumns); FormatWriterFactory writerFactory = format.createWriterFactory(rowType); LocalFileIO fileIO = new LocalFileIO(); FormatWriter writer = writerFactory.create(fileIO.newOutputStream(path, false), "zstd"); @@ -259,7 +264,15 @@ private List readAll( } private static MosaicFileFormat createFormat() { - return new MosaicFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024)); + return createFormat(""); + } + + private static MosaicFileFormat createFormat(String statsColumns) { + Options options = new Options(); + if (!statsColumns.isEmpty()) { + options.set(MosaicFileFormat.STATS_COLUMNS, statsColumns); + } + return new MosaicFileFormat(new FileFormatFactory.FormatContext(options, 1024, 1024)); } private static boolean isNativeAvailable() { diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java index a303b7bc9571..89ef34b160b1 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java @@ -39,7 +39,12 @@ static void checkNativeLibrary() { @Override protected FileFormat createFormat() { - return new MosaicFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024)); + Options options = new Options(); + options.set( + MosaicFileFormat.STATS_COLUMNS, + "f_boolean,f_tinyint,f_smallint,f_int,f_bigint,f_float," + + "f_double,f_string,f_decimal_5_2,f_date,f_timestamp3,f_timestamp6"); + return new MosaicFileFormat(new FileFormatFactory.FormatContext(options, 1024, 
1024)); } @Override diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java index e6d7a8fa9eb4..c0e3fe46ddff 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java @@ -60,7 +60,7 @@ void testWriterMetadataNotNull() throws IOException { RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); Path path = newPath(); - FormatWriter writer = createWriter(rowType, path); + FormatWriter writer = createWriter(rowType, path, "f0,f1"); writer.addElement(GenericRow.of(1, BinaryString.fromString("hello"))); writer.addElement(GenericRow.of(2, BinaryString.fromString("world"))); writer.close(); @@ -83,8 +83,9 @@ void testStatsFromMetadataMatchesStatsFromFile() throws IOException { .field("f_double", DataTypes.DOUBLE()) .build(); Path path = newPath(); + String statsColumns = "f_int,f_bigint,f_string,f_double"; - FormatWriter writer = createWriter(rowType, path); + FormatWriter writer = createWriter(rowType, path, statsColumns); for (int i = 0; i < 1000; i++) { writer.addElement( GenericRow.of(i, (long) i * 100, BinaryString.fromString("val_" + i), i * 1.1)); @@ -94,7 +95,7 @@ void testStatsFromMetadataMatchesStatsFromFile() throws IOException { Object metadata = writer.writerMetadata(); assertThat(metadata).isNotNull(); - MosaicFileFormat format = createFormat(); + MosaicFileFormat format = createFormat(statsColumns); int fieldCount = rowType.getFieldCount(); SimpleColStatsCollector.Factory[] collectors = IntStream.range(0, fieldCount) @@ -119,15 +120,16 @@ void testStatsFromMetadataWithNullValues() throws IOException { .field("f_string", DataTypes.STRING()) .build(); Path path = newPath(); + String statsColumns = "f_int,f_string"; - FormatWriter writer = createWriter(rowType, path); + 
FormatWriter writer = createWriter(rowType, path, statsColumns); writer.addElement(GenericRow.of(1, null)); writer.addElement(GenericRow.of(null, BinaryString.fromString("a"))); writer.addElement(GenericRow.of(3, BinaryString.fromString("b"))); writer.close(); Object metadata = writer.writerMetadata(); - MosaicFileFormat format = createFormat(); + MosaicFileFormat format = createFormat(statsColumns); int fieldCount = rowType.getFieldCount(); SimpleColStatsCollector.Factory[] collectors = IntStream.range(0, fieldCount) @@ -152,15 +154,16 @@ void testExtractWithFileInfoRowCount() throws IOException { .field("f_string", DataTypes.STRING()) .build(); Path path = newPath(); + String statsColumns = "f_int,f_string"; int numRows = 500; - FormatWriter writer = createWriter(rowType, path); + FormatWriter writer = createWriter(rowType, path, statsColumns); for (int i = 0; i < numRows; i++) { writer.addElement(GenericRow.of(i, BinaryString.fromString("row_" + i))); } writer.close(); - MosaicFileFormat format = createFormat(); + MosaicFileFormat format = createFormat(statsColumns); int fieldCount = rowType.getFieldCount(); SimpleColStatsCollector.Factory[] collectors = IntStream.range(0, fieldCount) @@ -178,16 +181,113 @@ void testExtractWithFileInfoRowCount() throws IOException { assertThat(result.getLeft()).hasSize(fieldCount); } + @Test + void testPartialStatsColumnsFromMetadata() throws IOException { + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .field("f_double", DataTypes.DOUBLE()) + .build(); + Path path = newPath(); + String statsColumns = "f_int"; + + FormatWriter writer = createWriter(rowType, path, statsColumns); + writer.addElement(GenericRow.of(1, BinaryString.fromString("a"), 1.0)); + writer.addElement(GenericRow.of(null, BinaryString.fromString("b"), 2.0)); + writer.addElement(GenericRow.of(3, null, null)); + writer.close(); + + Object metadata = writer.writerMetadata(); + 
assertThat(metadata).isInstanceOf(MosaicWriterMetadata.class); + MosaicWriterMetadata mosaicMeta = (MosaicWriterMetadata) metadata; + assertThat(mosaicMeta.statsColumnNames()).containsExactly("f_int"); + + MosaicFileFormat format = createFormat(statsColumns); + int fieldCount = rowType.getFieldCount(); + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, fieldCount) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + LocalFileIO fileIO = new LocalFileIO(); + long fileSize = fileIO.getFileSize(path); + + SimpleColStats[] fromMetadata = extractor.extract(fileIO, path, fileSize, metadata); + + // f_int has stats: min=1, max=3, nullCount=1 + assertThat(fromMetadata[0].min()).isEqualTo(1); + assertThat(fromMetadata[0].max()).isEqualTo(3); + assertThat(fromMetadata[0].nullCount()).isEqualTo(1L); + + // f_string and f_double have no stats (not in statsColumns) + assertThat(fromMetadata[1].min()).isNull(); + assertThat(fromMetadata[1].max()).isNull(); + assertThat(fromMetadata[1].nullCount()).isNull(); + assertThat(fromMetadata[2].min()).isNull(); + assertThat(fromMetadata[2].max()).isNull(); + assertThat(fromMetadata[2].nullCount()).isNull(); + } + + @Test + void testStatsOnMiddleColumn() throws IOException { + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .field("f_double", DataTypes.DOUBLE()) + .build(); + Path path = newPath(); + String statsColumns = "f_string"; + + FormatWriter writer = createWriter(rowType, path, statsColumns); + writer.addElement(GenericRow.of(1, BinaryString.fromString("banana"), 1.0)); + writer.addElement(GenericRow.of(2, BinaryString.fromString("apple"), 2.0)); + writer.addElement(GenericRow.of(3, null, 3.0)); + writer.close(); + + Object metadata = writer.writerMetadata(); + MosaicFileFormat format = 
createFormat(statsColumns); + int fieldCount = rowType.getFieldCount(); + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, fieldCount) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + LocalFileIO fileIO = new LocalFileIO(); + long fileSize = fileIO.getFileSize(path); + + SimpleColStats[] fromMetadata = extractor.extract(fileIO, path, fileSize, metadata); + + // f_int has no stats + assertThat(fromMetadata[0].min()).isNull(); + assertThat(fromMetadata[0].max()).isNull(); + assertThat(fromMetadata[0].nullCount()).isNull(); + + // f_string has stats: min="apple", max="banana", nullCount=1 + assertThat(fromMetadata[1].min()).isEqualTo(BinaryString.fromString("apple")); + assertThat(fromMetadata[1].max()).isEqualTo(BinaryString.fromString("banana")); + assertThat(fromMetadata[1].nullCount()).isEqualTo(1L); + + // f_double has no stats + assertThat(fromMetadata[2].min()).isNull(); + assertThat(fromMetadata[2].max()).isNull(); + assertThat(fromMetadata[2].nullCount()).isNull(); + } + @Test void testFallbackToFileWhenMetadataIsNull() throws IOException { RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); Path path = newPath(); + String statsColumns = "f0,f1"; - FormatWriter writer = createWriter(rowType, path); + FormatWriter writer = createWriter(rowType, path, statsColumns); writer.addElement(GenericRow.of(10, BinaryString.fromString("test"))); writer.close(); - MosaicFileFormat format = createFormat(); + MosaicFileFormat format = createFormat(statsColumns); int fieldCount = rowType.getFieldCount(); SimpleColStatsCollector.Factory[] collectors = IntStream.range(0, fieldCount) @@ -208,15 +308,24 @@ private Path newPath() { return new Path(tempDir.toUri().toString(), UUID.randomUUID() + ".mosaic"); } - private FormatWriter createWriter(RowType rowType, Path path) throws IOException { - 
MosaicFileFormat format = createFormat(); + private FormatWriter createWriter(RowType rowType, Path path, String statsColumns) + throws IOException { + MosaicFileFormat format = createFormat(statsColumns); FormatWriterFactory writerFactory = format.createWriterFactory(rowType); LocalFileIO fileIO = new LocalFileIO(); return writerFactory.create(fileIO.newOutputStream(path, false), "zstd"); } private static MosaicFileFormat createFormat() { - return new MosaicFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024)); + return createFormat(""); + } + + private static MosaicFileFormat createFormat(String statsColumns) { + Options options = new Options(); + if (!statsColumns.isEmpty()) { + options.set(MosaicFileFormat.STATS_COLUMNS, statsColumns); + } + return new MosaicFileFormat(new FileFormatFactory.FormatContext(options, 1024, 1024)); } private static boolean isNativeAvailable() { From a7a7beafb7e2ddbdef93235dec5eff8fb60a7c30 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 20 May 2026 22:43:21 +0800 Subject: [PATCH 05/11] fix --- .../mosaic/MosaicSimpleStatsExtractor.java | 5 +- .../mosaic/MosaicWriterMetadataTest.java | 47 +++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java index ad204b6db443..096002dcd372 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java @@ -99,6 +99,7 @@ private SimpleColStats[] extractFromStats( Object[] minValues = new Object[fieldCount]; Object[] maxValues = new Object[fieldCount]; long[] nullCounts = new long[fieldCount]; + Set seenColumns = new HashSet<>(); for (int rg = 0; rg < numRowGroups; rg++) { List stats = statsProvider.getRowGroupStatistics(rg); @@ 
-108,6 +109,7 @@ private SimpleColStats[] extractFromStats( continue; } + seenColumns.add(colIdx); nullCounts[colIdx] += stat.getNullCount(); if (stat.hasMinMax()) { @@ -136,10 +138,11 @@ private SimpleColStats[] extractFromStats( } } + Set trackedColumns = statsFieldIndices != null ? statsFieldIndices : seenColumns; SimpleColStatsCollector[] collectors = SimpleColStatsCollector.create(statsCollectors); SimpleColStats[] result = new SimpleColStats[fieldCount]; for (int i = 0; i < fieldCount; i++) { - if (statsFieldIndices != null && !statsFieldIndices.contains(i)) { + if (!trackedColumns.contains(i)) { result[i] = collectors[i].convert(new SimpleColStats(null, null, null)); } else { SimpleColStats fieldStats = diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java index c0e3fe46ddff..4caf65d66176 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicWriterMetadataTest.java @@ -277,6 +277,53 @@ void testStatsOnMiddleColumn() throws IOException { assertThat(fromMetadata[2].nullCount()).isNull(); } + @Test + void testPartialStatsColumnsFromFile() throws IOException { + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .field("f_double", DataTypes.DOUBLE()) + .build(); + Path path = newPath(); + String statsColumns = "f_string"; + + FormatWriter writer = createWriter(rowType, path, statsColumns); + writer.addElement(GenericRow.of(1, BinaryString.fromString("banana"), 1.0)); + writer.addElement(GenericRow.of(2, BinaryString.fromString("apple"), 2.0)); + writer.addElement(GenericRow.of(3, null, 3.0)); + writer.close(); + + // Extract from file (no writer metadata), simulating fallback path + MosaicFileFormat format = createFormat(statsColumns); + int 
fieldCount = rowType.getFieldCount(); + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, fieldCount) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + LocalFileIO fileIO = new LocalFileIO(); + long fileSize = fileIO.getFileSize(path); + + SimpleColStats[] fromFile = extractor.extract(fileIO, path, fileSize); + + // f_int has no stats in file + assertThat(fromFile[0].min()).isNull(); + assertThat(fromFile[0].max()).isNull(); + assertThat(fromFile[0].nullCount()).isNull(); + + // f_string has stats + assertThat(fromFile[1].min()).isEqualTo(BinaryString.fromString("apple")); + assertThat(fromFile[1].max()).isEqualTo(BinaryString.fromString("banana")); + assertThat(fromFile[1].nullCount()).isEqualTo(1L); + + // f_double has no stats in file + assertThat(fromFile[2].min()).isNull(); + assertThat(fromFile[2].max()).isNull(); + assertThat(fromFile[2].nullCount()).isNull(); + } + @Test void testFallbackToFileWhenMetadataIsNull() throws IOException { RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); From 9ae2aa07c6f72071a804ca8f4bcc1851cec73cc7 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 21 May 2026 09:15:38 +0800 Subject: [PATCH 06/11] fix with new API --- .../format/mosaic/MosaicRecordsReader.java | 81 ++++--------------- .../format/mosaic/MosaicRecordsWriter.java | 24 +----- .../mosaic/MosaicSimpleStatsExtractor.java | 13 +-- .../format/mosaic/MosaicWriterMetadata.java | 7 +- 4 files changed, 31 insertions(+), 94 deletions(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java index 76101378f0ca..e0ae0d4325dd 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java +++ 
b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java @@ -36,14 +36,13 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.types.pojo.Field; -import org.apache.arrow.vector.types.pojo.Schema; import javax.annotation.Nullable; import java.io.IOException; import java.util.Iterator; import java.util.List; +import java.util.Map; import static org.apache.paimon.format.mosaic.MosaicObjects.convertStatsValue; @@ -52,12 +51,10 @@ public class MosaicRecordsReader implements FileRecordReader { private final MosaicReader reader; private final ArrowBatchReader arrowBatchReader; - private final int[] columnIndices; private final Path filePath; private final BufferAllocator allocator; private final int numRowGroups; private final RowType dataSchemaRowType; - private final Schema fileSchema; @Nullable private final List predicates; private int currentRowGroup; @@ -83,8 +80,9 @@ public MosaicRecordsReader( throw e; } - this.fileSchema = reader.getSchema(); - this.columnIndices = computeColumnIndices(fileSchema, projectedRowType); + List projectedNames = projectedRowType.getFieldNames(); + reader.project(projectedNames.toArray(new String[0])); + this.numRowGroups = reader.numRowGroups(); this.currentRowGroup = 0; this.arrowBatchReader = new ArrowBatchReader(projectedRowType, true); @@ -103,12 +101,7 @@ public FileRecordIterator readBatch() throws IOException { releaseCurrentVsr(); - VectorSchemaRoot vsr; - if (columnIndices != null) { - vsr = reader.readRowGroup(currentRowGroup, columnIndices, allocator); - } else { - vsr = reader.readRowGroup(currentRowGroup, allocator); - } + VectorSchemaRoot vsr = reader.readRowGroup(currentRowGroup, allocator); currentRowGroup++; this.currentVsr = vsr; @@ -149,8 +142,8 @@ private boolean matchesRowGroup(int rowGroupIndex, long rowCount) { return true; } - List statsList = 
reader.getRowGroupStatistics(rowGroupIndex); - if (statsList.isEmpty()) { + Map statsMap = reader.getRowGroupStatistics(rowGroupIndex); + if (statsMap.isEmpty()) { return true; } @@ -159,24 +152,21 @@ private boolean matchesRowGroup(int rowGroupIndex, long rowCount) { GenericRow maxValues = new GenericRow(fieldCount); long[] nullCounts = new long[fieldCount]; - for (ColumnStatistics stats : statsList) { - int fileColIdx = stats.getColumnIndex(); - if (fileColIdx < 0 || fileColIdx >= fileSchema.getFields().size()) { - continue; - } - String colName = fileSchema.getFields().get(fileColIdx).getName(); - int schemaIdx = findFieldIndex(dataSchemaRowType, colName); - if (schemaIdx < 0) { + List fields = dataSchemaRowType.getFields(); + for (int i = 0; i < fieldCount; i++) { + String colName = fields.get(i).name(); + ColumnStatistics stats = statsMap.get(colName); + if (stats == null) { continue; } - nullCounts[schemaIdx] = stats.getNullCount(); + nullCounts[i] = stats.getNullCount(); if (stats.hasMinMax()) { - DataType dataType = dataSchemaRowType.getFields().get(schemaIdx).type(); + DataType dataType = fields.get(i).type(); Object min = convertStatsValue(stats.getMin(), dataType); Object max = convertStatsValue(stats.getMax(), dataType); - minValues.setField(schemaIdx, min); - maxValues.setField(schemaIdx, max); + minValues.setField(i, min); + maxValues.setField(i, max); } } @@ -188,16 +178,6 @@ private boolean matchesRowGroup(int rowGroupIndex, long rowCount) { return true; } - private static int findFieldIndex(RowType rowType, String name) { - List fields = rowType.getFields(); - for (int i = 0; i < fields.size(); i++) { - if (fields.get(i).name().equals(name)) { - return i; - } - } - return -1; - } - private void releaseCurrentVsr() { if (currentVsr != null) { currentVsr.close(); @@ -211,33 +191,4 @@ public void close() throws IOException { reader.close(); allocator.close(); } - - @Nullable - private static int[] computeColumnIndices(Schema fileSchema, RowType 
projectedRowType) { - List fileFields = fileSchema.getFields(); - if (fileFields.size() == projectedRowType.getFieldCount()) { - return null; - } - - int[] indices = new int[projectedRowType.getFieldCount()]; - for (int i = 0; i < projectedRowType.getFieldCount(); i++) { - String name = projectedRowType.getFields().get(i).name(); - int pos = findArrowFieldIndex(fileFields, name); - if (pos < 0) { - throw new IllegalArgumentException( - "Projected field '" + name + "' not found in Mosaic file schema"); - } - indices[i] = pos; - } - return indices; - } - - private static int findArrowFieldIndex(List fields, String name) { - for (int i = 0; i < fields.size(); i++) { - if (fields.get(i).getName().equals(name)) { - return i; - } - } - return -1; - } } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java index 099bb8b9b07e..214b00a155c7 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java @@ -40,6 +40,7 @@ import java.io.OutputStream; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** Mosaic records writer. 
*/ public class MosaicRecordsWriter implements BundleFormatWriter { @@ -67,31 +68,12 @@ public MosaicRecordsWriter( Schema arrowSchema = arrowFormatWriter.getVectorSchemaRoot().getSchema(); WriterOptions options = new WriterOptions().zstdLevel(formatContext.zstdLevel()); if (!statsColumnNames.isEmpty()) { - int[] statsIndices = resolveColumnIndices(arrowSchema, statsColumnNames); - options.statsColumns(statsIndices); + options.statsColumns(statsColumnNames.toArray(new String[0])); } this.nativeWriter = new MosaicWriter(outputStream, arrowSchema, options, allocator); } - private static int[] resolveColumnIndices(Schema schema, List columnNames) { - List fields = schema.getFields(); - List indices = new ArrayList<>(); - for (String name : columnNames) { - for (int i = 0; i < fields.size(); i++) { - if (fields.get(i).getName().equals(name)) { - indices.add(i); - break; - } - } - } - int[] result = new int[indices.size()]; - for (int i = 0; i < indices.size(); i++) { - result[i] = indices.get(i); - } - return result; - } - @Override public void addElement(InternalRow internalRow) { if (!arrowFormatWriter.write(internalRow)) { @@ -169,7 +151,7 @@ public Object writerMetadata() { private void collectMetadata() { int numRowGroups = nativeWriter.numRowGroups(); - List> allStats = new ArrayList<>(numRowGroups); + List> allStats = new ArrayList<>(numRowGroups); for (int i = 0; i < numRowGroups; i++) { allStats.add(nativeWriter.getRowGroupStatistics(i)); } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java index 096002dcd372..ca768c13a427 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java @@ -36,6 +36,7 @@ import java.util.HashSet; import java.util.List; +import java.util.Map; 
import java.util.Set; import static org.apache.paimon.format.mosaic.MosaicObjects.convertStatsValue; @@ -96,19 +97,21 @@ private SimpleColStats[] extractFromStats( RowGroupStatsProvider statsProvider, @Nullable Set statsFieldIndices) { int fieldCount = rowType.getFieldCount(); + List fieldNames = rowType.getFieldNames(); Object[] minValues = new Object[fieldCount]; Object[] maxValues = new Object[fieldCount]; long[] nullCounts = new long[fieldCount]; Set seenColumns = new HashSet<>(); for (int rg = 0; rg < numRowGroups; rg++) { - List stats = statsProvider.getRowGroupStatistics(rg); - for (ColumnStatistics stat : stats) { - int colIdx = stat.getColumnIndex(); - if (colIdx < 0 || colIdx >= fieldCount) { + Map statsMap = statsProvider.getRowGroupStatistics(rg); + for (Map.Entry entry : statsMap.entrySet()) { + int colIdx = fieldNames.indexOf(entry.getKey()); + if (colIdx < 0) { continue; } + ColumnStatistics stat = entry.getValue(); seenColumns.add(colIdx); nullCounts[colIdx] += stat.getNullCount(); @@ -167,6 +170,6 @@ private Set resolveStatsFieldIndices(List statsColumnNames) { @FunctionalInterface private interface RowGroupStatsProvider { - List getRowGroupStatistics(int rowGroupIndex); + Map getRowGroupStatistics(int rowGroupIndex); } } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java index b73b970c0b88..cd3149fd4470 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterMetadata.java @@ -21,17 +21,18 @@ import org.apache.paimon.mosaic.ColumnStatistics; import java.util.List; +import java.util.Map; /** In-memory metadata captured from MosaicWriter after close. 
*/ public class MosaicWriterMetadata { private final int numRowGroups; - private final List> rowGroupStats; + private final List> rowGroupStats; private final List statsColumnNames; public MosaicWriterMetadata( int numRowGroups, - List> rowGroupStats, + List> rowGroupStats, List statsColumnNames) { this.numRowGroups = numRowGroups; this.rowGroupStats = rowGroupStats; @@ -42,7 +43,7 @@ public int numRowGroups() { return numRowGroups; } - public List getRowGroupStatistics(int rowGroupIndex) { + public Map getRowGroupStatistics(int rowGroupIndex) { return rowGroupStats.get(rowGroupIndex); } From ca344ffb910e770bf1fa2acb0f103090e80e78b0 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 21 May 2026 10:00:12 +0800 Subject: [PATCH 07/11] fix --- .../format/mosaic/MosaicFileFormat.java | 26 ++++-- .../format/mosaic/MosaicInputFileAdapter.java | 89 ++++++++++++++++++- .../paimon/format/mosaic/MosaicObjects.java | 48 ++++++---- .../format/mosaic/MosaicRecordsReader.java | 28 +++++- .../format/mosaic/MosaicRecordsWriter.java | 9 +- .../mosaic/MosaicSimpleStatsExtractor.java | 13 ++- .../format/mosaic/MosaicWriterFactory.java | 4 +- .../format/mosaic/MosaicObjectsTest.java | 58 ++++++++++++ .../format/mosaic/MosaicReaderWriterTest.java | 61 +++++++++++++ 9 files changed, 297 insertions(+), 39 deletions(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java index b503d0ece31b..306db02352b0 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java @@ -33,7 +33,7 @@ import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; -import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; import org.apache.paimon.types.DataTypeVisitor; import 
org.apache.paimon.types.DateType; import org.apache.paimon.types.DecimalType; @@ -69,6 +69,14 @@ public class MosaicFileFormat extends FileFormat { "Comma-separated list of column names to collect statistics for. " + "Empty means no statistics collection."); + public static final ConfigOption NUM_BUCKETS = + ConfigOptions.key("mosaic.num-buckets") + .intType() + .defaultValue(0) + .withDescription( + "Number of column buckets for parallel IO. " + + "0 means auto (defaults to number of columns)."); + static { System.setProperty("arrow.enable_unsafe_memory_access", "true"); } @@ -95,7 +103,10 @@ public FormatWriterFactory createWriterFactory(RowType type) { @Override public void validateDataFields(RowType rowType) { - rowType.accept(new MosaicRowTypeVisitor()); + MosaicRowTypeVisitor visitor = new MosaicRowTypeVisitor(); + for (DataType fieldType : rowType.getFieldTypes()) { + fieldType.accept(visitor); + } } @Override @@ -200,12 +211,14 @@ public Void visit(BlobType blobType) { @Override public Void visit(ArrayType arrayType) { - return null; + throw new UnsupportedOperationException( + "Mosaic file format does not support type ARRAY"); } @Override public Void visit(VectorType vectorType) { - return null; + throw new UnsupportedOperationException( + "Mosaic file format does not support type VECTOR"); } @Override @@ -221,10 +234,7 @@ public Void visit(MapType mapType) { @Override public Void visit(RowType rowType) { - for (DataField field : rowType.getFields()) { - field.type().accept(this); - } - return null; + throw new UnsupportedOperationException("Mosaic file format does not support type ROW"); } } } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java index 8806afb697fb..df287a3fc347 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java +++ 
b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java @@ -23,27 +23,41 @@ import org.apache.paimon.fs.SeekableInputStream; import org.apache.paimon.mosaic.InputFile; +import java.io.Closeable; import java.io.EOFException; import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.concurrent.Semaphore; /** * Adapts Paimon's {@link FileIO} to Mosaic's {@link InputFile} interface. * - *

Thread-safe: creates a new stream per call since Mosaic may invoke concurrently. + *

Maintains a pool of up to 8 {@link SeekableInputStream} instances. When all 8 are in use, + * callers block until one is returned. Thread-safe: Mosaic may invoke {@link #readFully} + * concurrently from multiple threads. */ -public class MosaicInputFileAdapter implements InputFile { +public class MosaicInputFileAdapter implements InputFile, Closeable { + + private static final int MAX_POOL_SIZE = 8; private final FileIO fileIO; private final Path path; + private final Semaphore semaphore; + private final Deque pool; + private boolean closed; public MosaicInputFileAdapter(FileIO fileIO, Path path) { this.fileIO = fileIO; this.path = path; + this.semaphore = new Semaphore(MAX_POOL_SIZE); + this.pool = new ArrayDeque<>(MAX_POOL_SIZE); } @Override public void readFully(long position, byte[] buffer, int offset, int length) throws IOException { - try (SeekableInputStream in = fileIO.newInputStream(path)) { + SeekableInputStream in = acquire(); + try { in.seek(position); int remaining = length; int off = offset; @@ -59,6 +73,75 @@ public void readFully(long position, byte[] buffer, int offset, int length) thro off += read; remaining -= read; } + release(in); + } catch (Throwable t) { + closeQuietly(in); + semaphore.release(); + throw t; + } + } + + private SeekableInputStream acquire() throws IOException { + try { + semaphore.acquire(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("Interrupted while waiting for stream", e); + } + synchronized (pool) { + if (closed) { + semaphore.release(); + throw new IOException("MosaicInputFileAdapter is closed"); + } + SeekableInputStream in = pool.pollFirst(); + if (in != null) { + return in; + } + } + return fileIO.newInputStream(path); + } + + private void release(SeekableInputStream in) { + synchronized (pool) { + if (!closed) { + pool.addLast(in); + semaphore.release(); + return; + } + } + closeQuietly(in); + semaphore.release(); + } + + @Override + public void close() throws 
IOException { + Deque toClose; + synchronized (pool) { + closed = true; + toClose = new ArrayDeque<>(pool); + pool.clear(); + } + IOException firstException = null; + for (SeekableInputStream in : toClose) { + try { + in.close(); + } catch (IOException e) { + if (firstException == null) { + firstException = e; + } else { + firstException.addSuppressed(e); + } + } + } + if (firstException != null) { + throw firstException; + } + } + + private static void closeQuietly(SeekableInputStream in) { + try { + in.close(); + } catch (IOException ignored) { } } } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java index 995434a22276..54d15c43c09e 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicObjects.java @@ -23,6 +23,7 @@ import org.apache.paimon.data.Timestamp; import org.apache.paimon.types.DataType; import org.apache.paimon.types.DecimalType; +import org.apache.paimon.types.LocalZonedTimestampType; import org.apache.paimon.types.TimestampType; import javax.annotation.Nullable; @@ -36,7 +37,20 @@ public class MosaicObjects { @Nullable public static Object convertStatsValue(byte[] bytes, DataType dataType) { - if (bytes == null || bytes.length == 0) { + if (bytes == null) { + return null; + } + switch (dataType.getTypeRoot()) { + case CHAR: + case VARCHAR: + return BinaryString.fromBytes(bytes); + case BINARY: + case VARBINARY: + return bytes; + default: + break; + } + if (bytes.length == 0) { return null; } ByteBuffer buf = ByteBuffer.wrap(bytes); @@ -57,12 +71,6 @@ public static Object convertStatsValue(byte[] bytes, DataType dataType) { return buf.getFloat(); case DOUBLE: return buf.getDouble(); - case CHAR: - case VARCHAR: - return BinaryString.fromBytes(bytes); - case BINARY: - case VARBINARY: - return bytes; case DECIMAL: DecimalType 
decimalType = (DecimalType) dataType; BigInteger unscaled = new BigInteger(bytes); @@ -70,20 +78,26 @@ public static Object convertStatsValue(byte[] bytes, DataType dataType) { return Decimal.fromBigDecimal( decimal, decimalType.getPrecision(), decimalType.getScale()); case TIMESTAMP_WITHOUT_TIME_ZONE: - TimestampType tsType = (TimestampType) dataType; - long tsValue = buf.getLong(); - if (tsType.getPrecision() <= 3) { - return Timestamp.fromEpochMillis(tsValue); - } else if (tsType.getPrecision() <= 6) { - return Timestamp.fromMicros(tsValue); - } else { - return Timestamp.fromEpochMillis( - tsValue / 1_000_000, (int) (tsValue % 1_000_000)); - } + return convertTimestamp(buf, ((TimestampType) dataType).getPrecision()); + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return convertTimestamp(buf, ((LocalZonedTimestampType) dataType).getPrecision()); default: return null; } } + private static Timestamp convertTimestamp(ByteBuffer buf, int precision) { + if (precision <= 3) { + return Timestamp.fromEpochMillis(buf.getLong()); + } else if (precision <= 6) { + return Timestamp.fromMicros(buf.getLong()); + } else { + // precision 7-9: 12 bytes = i64 millis (BE) + i32 nanos_of_milli (BE) + long millis = buf.getLong(); + int nanosOfMilli = buf.getInt(); + return Timestamp.fromEpochMillis(millis, nanosOfMilli); + } + } + private MosaicObjects() {} } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java index e0ae0d4325dd..24cdcbf05c96 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsReader.java @@ -24,7 +24,6 @@ import org.apache.paimon.data.InternalRow; import org.apache.paimon.fs.Path; import org.apache.paimon.mosaic.ColumnStatistics; -import org.apache.paimon.mosaic.InputFile; import org.apache.paimon.mosaic.MosaicReader; 
import org.apache.paimon.predicate.Predicate; import org.apache.paimon.reader.FileRecordIterator; @@ -36,19 +35,25 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; import javax.annotation.Nullable; import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import static org.apache.paimon.format.mosaic.MosaicObjects.convertStatsValue; /** File reader for Mosaic format. */ public class MosaicRecordsReader implements FileRecordReader { + private final MosaicInputFileAdapter inputFileAdapter; private final MosaicReader reader; private final ArrowBatchReader arrowBatchReader; private final Path filePath; @@ -62,26 +67,40 @@ public class MosaicRecordsReader implements FileRecordReader { private VectorSchemaRoot currentVsr; public MosaicRecordsReader( - InputFile inputFile, + MosaicInputFileAdapter inputFileAdapter, long fileSize, RowType dataSchemaRowType, RowType projectedRowType, @Nullable List predicates, Path filePath) { this.filePath = filePath; + this.inputFileAdapter = inputFileAdapter; this.dataSchemaRowType = dataSchemaRowType; this.predicates = predicates; this.allocator = new RootAllocator(); try { - this.reader = MosaicReader.open(inputFile, fileSize, allocator); + this.reader = MosaicReader.open(inputFileAdapter, fileSize, allocator); } catch (Exception e) { allocator.close(); throw e; } + Schema fileSchema = reader.getSchema(); + Set fileColumnNames = new HashSet<>(); + for (Field field : fileSchema.getFields()) { + fileColumnNames.add(field.getName()); + } List projectedNames = projectedRowType.getFieldNames(); - reader.project(projectedNames.toArray(new String[0])); + List existingColumns = new ArrayList<>(); + for (String name : 
projectedNames) { + if (fileColumnNames.contains(name)) { + existingColumns.add(name); + } + } + if (!existingColumns.isEmpty()) { + reader.project(existingColumns.toArray(new String[0])); + } this.numRowGroups = reader.numRowGroups(); this.currentRowGroup = 0; @@ -190,5 +209,6 @@ public void close() throws IOException { releaseCurrentVsr(); reader.close(); allocator.close(); + inputFileAdapter.close(); } } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java index 214b00a155c7..e495b8a62bd1 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java @@ -55,7 +55,8 @@ public MosaicRecordsWriter( OutputStream outputStream, RowType rowType, FileFormatFactory.FormatContext formatContext, - List statsColumnNames) { + List statsColumnNames, + int numBuckets) { this.statsColumnNames = statsColumnNames; this.allocator = new RootAllocator(); @@ -66,7 +67,11 @@ public MosaicRecordsWriter( new ArrowFormatWriter(rowType, writeBatchSize, true, allocator, writeBatchMemory); Schema arrowSchema = arrowFormatWriter.getVectorSchemaRoot().getSchema(); - WriterOptions options = new WriterOptions().zstdLevel(formatContext.zstdLevel()); + WriterOptions options = + new WriterOptions() + .zstdLevel(formatContext.zstdLevel()) + .numBuckets(numBuckets) + .rowGroupMaxSize(writeBatchMemory); if (!statsColumnNames.isEmpty()) { options.statsColumns(statsColumnNames.toArray(new String[0])); } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java index ca768c13a427..f1f3cdd2d076 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java +++ 
b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java @@ -34,6 +34,7 @@ import javax.annotation.Nullable; +import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -55,10 +56,12 @@ public MosaicSimpleStatsExtractor( @Override public SimpleColStats[] extract(FileIO fileIO, Path path, long length) { - MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path); - try (BufferAllocator allocator = new RootAllocator(); + try (MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path); + BufferAllocator allocator = new RootAllocator(); MosaicReader reader = MosaicReader.open(inputFile, length, allocator)) { return extractFromStats(reader.numRowGroups(), reader::getRowGroupStatistics, null); + } catch (IOException e) { + throw new RuntimeException("Failed to extract stats from " + path, e); } } @@ -77,8 +80,8 @@ public SimpleColStats[] extract( @Override public Pair extractWithFileInfo( FileIO fileIO, Path path, long length) { - MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path); - try (BufferAllocator allocator = new RootAllocator(); + try (MosaicInputFileAdapter inputFile = new MosaicInputFileAdapter(fileIO, path); + BufferAllocator allocator = new RootAllocator(); MosaicReader reader = MosaicReader.open(inputFile, length, allocator)) { int numRowGroups = reader.numRowGroups(); SimpleColStats[] stats = @@ -88,6 +91,8 @@ public Pair extractWithFileInfo( rowCount += reader.rowGroupNumRows(rg); } return Pair.of(stats, new FileInfo(rowCount)); + } catch (IOException e) { + throw new RuntimeException("Failed to extract stats from " + path, e); } } diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java index 09980ccc3cea..20edeeeb1e16 100644 --- 
a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java @@ -36,6 +36,7 @@ public class MosaicWriterFactory implements FormatWriterFactory { private final RowType rowType; private final FileFormatFactory.FormatContext formatContext; private final List statsColumnNames; + private final int numBuckets; public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext formatContext) { this.rowType = rowType; @@ -50,10 +51,11 @@ public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext form .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); } + this.numBuckets = formatContext.options().get(MosaicFileFormat.NUM_BUCKETS); } @Override public FormatWriter create(PositionOutputStream out, String compression) throws IOException { - return new MosaicRecordsWriter(out, rowType, formatContext, statsColumnNames); + return new MosaicRecordsWriter(out, rowType, formatContext, statsColumnNames, numBuckets); } } diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java index 8fc369a0f9f1..e05ed1709c4a 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicObjectsTest.java @@ -142,6 +142,64 @@ void testDecimal() { assertThat(decimal.toBigDecimal().intValue()).isEqualTo(10); } + @Test + void testTimestampNanos() { + long millis = 1700000000123L; + int nanosOfMilli = 456789; + byte[] bytes = ByteBuffer.allocate(12).putLong(millis).putInt(nanosOfMilli).array(); + Object result = MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP(9)); + assertThat(result).isEqualTo(Timestamp.fromEpochMillis(millis, nanosOfMilli)); + } + + @Test + void testTimestampNanosPrecision7() { + long millis = 
1700000000000L; + int nanosOfMilli = 100000; + byte[] bytes = ByteBuffer.allocate(12).putLong(millis).putInt(nanosOfMilli).array(); + Object result = MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP(7)); + assertThat(result).isEqualTo(Timestamp.fromEpochMillis(millis, nanosOfMilli)); + } + + @Test + void testTimestampWithLocalTimeZoneMillis() { + long millis = 1700000000000L; + byte[] bytes = ByteBuffer.allocate(8).putLong(millis).array(); + Object result = + MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)); + assertThat(result).isEqualTo(Timestamp.fromEpochMillis(millis)); + } + + @Test + void testTimestampWithLocalTimeZoneMicros() { + long micros = 1700000000000000L; + byte[] bytes = ByteBuffer.allocate(8).putLong(micros).array(); + Object result = + MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6)); + assertThat(result).isEqualTo(Timestamp.fromMicros(micros)); + } + + @Test + void testTimestampWithLocalTimeZoneNanos() { + long millis = 1700000000123L; + int nanosOfMilli = 456789; + byte[] bytes = ByteBuffer.allocate(12).putLong(millis).putInt(nanosOfMilli).array(); + Object result = + MosaicObjects.convertStatsValue(bytes, DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(9)); + assertThat(result).isEqualTo(Timestamp.fromEpochMillis(millis, nanosOfMilli)); + } + + @Test + void testEmptyStringVarChar() { + Object result = MosaicObjects.convertStatsValue(new byte[0], DataTypes.STRING()); + assertThat(result).isEqualTo(BinaryString.fromString("")); + } + + @Test + void testEmptyBinary() { + Object result = MosaicObjects.convertStatsValue(new byte[0], DataTypes.BYTES()); + assertThat(result).isEqualTo(new byte[0]); + } + @Test void testUnsupportedTypeReturnsNull() { byte[] bytes = new byte[] {1, 2, 3}; diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java index 
78da0d2500d4..6f027aafe27c 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java @@ -203,6 +203,67 @@ void testReturnedPosition() throws IOException { reader.close(); } + @Test + void testProjectionWithMissingColumns() throws IOException { + RowType writeType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .build(); + // Read type has a column that doesn't exist in the file (schema evolution) + RowType readType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_new_col", DataTypes.BIGINT()) + .field("f_string", DataTypes.STRING()) + .build(); + Path path = newPath(); + + writeRows( + writeType, + path, + GenericRow.of(1, BinaryString.fromString("aaa")), + GenericRow.of(2, BinaryString.fromString("bbb"))); + + List result = readAll(writeType, readType, path, null); + assertThat(result).hasSize(2); + assertThat(result.get(0).getInt(0)).isEqualTo(1); + assertThat(result.get(0).isNullAt(1)).isTrue(); + assertThat(result.get(0).getString(2).toString()).isEqualTo("aaa"); + assertThat(result.get(1).getInt(0)).isEqualTo(2); + assertThat(result.get(1).isNullAt(1)).isTrue(); + assertThat(result.get(1).getString(2).toString()).isEqualTo("bbb"); + } + + @Test + void testProjectionAllColumnsMissing() throws IOException { + RowType writeType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .build(); + // Read type has only columns that don't exist in the file + RowType readType = + RowType.builder() + .field("f_new_a", DataTypes.INT()) + .field("f_new_b", DataTypes.STRING()) + .build(); + Path path = newPath(); + + writeRows( + writeType, + path, + GenericRow.of(1, BinaryString.fromString("x")), + GenericRow.of(2, BinaryString.fromString("y"))); + + List result = readAll(writeType, readType, path, null); + 
assertThat(result).hasSize(2); + assertThat(result.get(0).isNullAt(0)).isTrue(); + assertThat(result.get(0).isNullAt(1)).isTrue(); + assertThat(result.get(1).isNullAt(0)).isTrue(); + assertThat(result.get(1).isNullAt(1)).isTrue(); + } + @Test void testReachTargetSize() throws IOException { RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); From 9a2ef6367fd111c29a7e7a7c742dad91e252eda6 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 21 May 2026 10:48:30 +0800 Subject: [PATCH 08/11] fix --- .../format/mosaic/MosaicRecordsWriter.java | 18 ++- .../mosaic/MosaicSimpleStatsExtractor.java | 6 +- .../format/mosaic/MosaicWriterFactory.java | 3 +- .../MosaicSimpleStatsExtractorTest.java | 131 ++++++++++++++++++ 4 files changed, 153 insertions(+), 5 deletions(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java index e495b8a62bd1..7bc454cf0818 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java @@ -56,7 +56,8 @@ public MosaicRecordsWriter( RowType rowType, FileFormatFactory.FormatContext formatContext, List statsColumnNames, - int numBuckets) { + int numBuckets, + String compression) { this.statsColumnNames = statsColumnNames; this.allocator = new RootAllocator(); @@ -69,6 +70,7 @@ public MosaicRecordsWriter( Schema arrowSchema = arrowFormatWriter.getVectorSchemaRoot().getSchema(); WriterOptions options = new WriterOptions() + .compression(resolveCompression(compression)) .zstdLevel(formatContext.zstdLevel()) .numBuckets(numBuckets) .rowGroupMaxSize(writeBatchMemory); @@ -79,6 +81,20 @@ public MosaicRecordsWriter( this.nativeWriter = new MosaicWriter(outputStream, arrowSchema, options, allocator); } + private static int resolveCompression(String compression) { + if (compression == 
null) { + return WriterOptions.COMPRESSION_ZSTD; + } + switch (compression) { + case "none": + // return WriterOptions.COMPRESSION_NONE; + return 0; + case "zstd": + default: + return WriterOptions.COMPRESSION_ZSTD; + } + } + @Override public void addElement(InternalRow internalRow) { if (!arrowFormatWriter.write(internalRow)) { diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java index f1f3cdd2d076..f426b27cfa81 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractor.java @@ -124,7 +124,7 @@ private SimpleColStats[] extractFromStats( DataType dataType = rowType.getFields().get(colIdx).type(); Object min = convertStatsValue(stat.getMin(), dataType); Object max = convertStatsValue(stat.getMax(), dataType); - if (min != null) { + if (min instanceof Comparable) { if (minValues[colIdx] == null) { minValues[colIdx] = min; } else { @@ -133,7 +133,7 @@ private SimpleColStats[] extractFromStats( } } } - if (max != null) { + if (max instanceof Comparable) { if (maxValues[colIdx] == null) { maxValues[colIdx] = max; } else { @@ -150,7 +150,7 @@ private SimpleColStats[] extractFromStats( SimpleColStatsCollector[] collectors = SimpleColStatsCollector.create(statsCollectors); SimpleColStats[] result = new SimpleColStats[fieldCount]; for (int i = 0; i < fieldCount; i++) { - if (!trackedColumns.contains(i)) { + if (!trackedColumns.contains(i) || !seenColumns.contains(i)) { result[i] = collectors[i].convert(new SimpleColStats(null, null, null)); } else { SimpleColStats fieldStats = diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java index 20edeeeb1e16..05f47acb04b0 
100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java @@ -56,6 +56,7 @@ public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext form @Override public FormatWriter create(PositionOutputStream out, String compression) throws IOException { - return new MosaicRecordsWriter(out, rowType, formatContext, statsColumnNames, numBuckets); + return new MosaicRecordsWriter( + out, rowType, formatContext, statsColumnNames, numBuckets, compression); } } diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java index 89ef34b160b1..8477c5b06540 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicSimpleStatsExtractorTest.java @@ -18,20 +18,38 @@ package org.apache.paimon.format.mosaic; +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; import org.apache.paimon.format.FileFormat; import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.format.SimpleColStats; import org.apache.paimon.format.SimpleColStatsExtractorTest; +import org.apache.paimon.format.SimpleStatsExtractor; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; import org.apache.paimon.options.Options; +import org.apache.paimon.statistics.SimpleColStatsCollector; import org.apache.paimon.types.DataTypes; import org.apache.paimon.types.RowType; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import java.io.IOException; +import java.util.UUID; 
+import java.util.stream.IntStream; + +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assumptions.assumeTrue; /** Tests for {@link MosaicSimpleStatsExtractor}. */ class MosaicSimpleStatsExtractorTest extends SimpleColStatsExtractorTest { + @TempDir java.nio.file.Path statsTestTempDir; + @BeforeAll static void checkNativeLibrary() { assumeTrue(isNativeAvailable(), "Mosaic native library not available"); @@ -70,6 +88,119 @@ protected String fileCompression() { return "zstd"; } + @Test + void testUntrackedColumnsReturnNone() throws IOException { + // stats_columns only tracks f_int, but the table has f_int + f_string + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_string", DataTypes.STRING()) + .build(); + Options options = new Options(); + options.set(MosaicFileFormat.STATS_COLUMNS, "f_int"); + MosaicFileFormat format = + new MosaicFileFormat(new FileFormatFactory.FormatContext(options, 1024, 1024)); + + Path path = new Path(statsTestTempDir.toUri().toString(), UUID.randomUUID() + ".mosaic"); + LocalFileIO fileIO = new LocalFileIO(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + FormatWriter writer = writerFactory.create(fileIO.newOutputStream(path, false), "zstd"); + writer.addElement(GenericRow.of(1, BinaryString.fromString("a"))); + writer.addElement(GenericRow.of(2, BinaryString.fromString("b"))); + writer.close(); + + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, rowType.getFieldCount()) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + SimpleColStats[] stats = extractor.extract(fileIO, path, fileIO.getFileSize(path)); + + // f_int is tracked, should have real stats + assertThat(stats[0].min()).isEqualTo(1); + assertThat(stats[0].max()).isEqualTo(2); + 
assertThat(stats[0].nullCount()).isEqualTo(0L); + // f_string is NOT tracked, should be NONE (null nullCount) + assertThat(stats[1].min()).isNull(); + assertThat(stats[1].max()).isNull(); + assertThat(stats[1].nullCount()).isNull(); + } + + @Test + void testBinaryColumnStatsNoException() throws Exception { + // Binary columns produce byte[] from convertStatsValue, which is not Comparable. + // Verify multi-row-group aggregation doesn't throw ClassCastException. + RowType rowType = + RowType.builder() + .field("f_int", DataTypes.INT()) + .field("f_binary", DataTypes.VARBINARY(100)) + .build(); + // Build a fake MosaicWriterMetadata with binary stats across 2 row groups + java.lang.reflect.Constructor ctor = + org.apache.paimon.mosaic.ColumnStatistics.class.getDeclaredConstructor( + long.class, byte[].class, byte[].class); + ctor.setAccessible(true); + + java.util.Map rg0 = + new java.util.HashMap<>(); + rg0.put( + "f_int", + (org.apache.paimon.mosaic.ColumnStatistics) + ctor.newInstance(0L, intBytes(0), intBytes(100))); + rg0.put( + "f_binary", + (org.apache.paimon.mosaic.ColumnStatistics) + ctor.newInstance(0L, new byte[] {1, 2}, new byte[] {3, 4})); + + java.util.Map rg1 = + new java.util.HashMap<>(); + rg1.put( + "f_int", + (org.apache.paimon.mosaic.ColumnStatistics) + ctor.newInstance(0L, intBytes(50), intBytes(200))); + rg1.put( + "f_binary", + (org.apache.paimon.mosaic.ColumnStatistics) + ctor.newInstance(0L, new byte[] {5, 6}, new byte[] {7, 8})); + + java.util.List> allStats = + java.util.Arrays.asList(rg0, rg1); + MosaicWriterMetadata metadata = + new MosaicWriterMetadata(2, allStats, java.util.Arrays.asList("f_int", "f_binary")); + + // Write a minimal file (only f_int in stats_columns since native rejects binary) + Options options = new Options(); + options.set(MosaicFileFormat.STATS_COLUMNS, "f_int"); + MosaicFileFormat format = + new MosaicFileFormat(new FileFormatFactory.FormatContext(options, 1024, 1024)); + Path path = new 
Path(statsTestTempDir.toUri().toString(), UUID.randomUUID() + ".mosaic"); + LocalFileIO fileIO = new LocalFileIO(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + FormatWriter writer = writerFactory.create(fileIO.newOutputStream(path, false), "zstd"); + writer.addElement(GenericRow.of(1, new byte[] {1})); + writer.close(); + + SimpleColStatsCollector.Factory[] collectors = + IntStream.range(0, rowType.getFieldCount()) + .mapToObj(i -> SimpleColStatsCollector.from("full")) + .toArray(SimpleColStatsCollector.Factory[]::new); + SimpleStatsExtractor extractor = format.createStatsExtractor(rowType, collectors).get(); + // Should not throw ClassCastException + SimpleColStats[] stats = + extractor.extract(fileIO, path, fileIO.getFileSize(path), metadata); + + // f_int aggregated across row groups: min=0, max=200 + assertThat(stats[0].min()).isEqualTo(0); + assertThat(stats[0].max()).isEqualTo(200); + // f_binary min/max should be null (byte[] not Comparable, skipped) + assertThat(stats[1].min()).isNull(); + assertThat(stats[1].max()).isNull(); + } + + private static byte[] intBytes(int value) { + return java.nio.ByteBuffer.allocate(4).putInt(value).array(); + } + private static boolean isNativeAvailable() { try { Class.forName("org.apache.paimon.mosaic.NativeLib"); From 83bb48f79a32df98d2751255d007cc7eb8d6709a Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 21 May 2026 11:15:29 +0800 Subject: [PATCH 09/11] fix --- .../format/mosaic/MosaicFileFormat.java | 6 ++-- .../format/mosaic/MosaicRecordsWriter.java | 34 ++++++------------- .../format/mosaic/MosaicWriterFactory.java | 11 +++--- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java index 306db02352b0..bff850a4e06c 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java +++ 
b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicFileFormat.java @@ -72,10 +72,8 @@ public class MosaicFileFormat extends FileFormat { public static final ConfigOption NUM_BUCKETS = ConfigOptions.key("mosaic.num-buckets") .intType() - .defaultValue(0) - .withDescription( - "Number of column buckets for parallel IO. " - + "0 means auto (defaults to number of columns)."); + .noDefaultValue() + .withDescription("Number of column buckets for parallel IO."); static { System.setProperty("arrow.enable_unsafe_memory_access", "true"); diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java index 7bc454cf0818..fdef0eb3652b 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicRecordsWriter.java @@ -27,6 +27,7 @@ import org.apache.paimon.mosaic.ColumnStatistics; import org.apache.paimon.mosaic.MosaicWriter; import org.apache.paimon.mosaic.WriterOptions; +import org.apache.paimon.options.MemorySize; import org.apache.paimon.types.RowType; import org.apache.arrow.memory.BufferAllocator; @@ -56,8 +57,7 @@ public MosaicRecordsWriter( RowType rowType, FileFormatFactory.FormatContext formatContext, List statsColumnNames, - int numBuckets, - String compression) { + @Nullable Integer numBuckets) { this.statsColumnNames = statsColumnNames; this.allocator = new RootAllocator(); @@ -67,34 +67,22 @@ public MosaicRecordsWriter( this.arrowFormatWriter = new ArrowFormatWriter(rowType, writeBatchSize, true, allocator, writeBatchMemory); - Schema arrowSchema = arrowFormatWriter.getVectorSchemaRoot().getSchema(); - WriterOptions options = - new WriterOptions() - .compression(resolveCompression(compression)) - .zstdLevel(formatContext.zstdLevel()) - .numBuckets(numBuckets) - .rowGroupMaxSize(writeBatchMemory); + WriterOptions options = 
new WriterOptions().zstdLevel(formatContext.zstdLevel()); + if (numBuckets != null) { + options = options.numBuckets(numBuckets); + } + MemorySize blockSize = formatContext.blockSize(); + if (blockSize != null) { + options = options.rowGroupMaxSize(blockSize.getBytes()); + } if (!statsColumnNames.isEmpty()) { options.statsColumns(statsColumnNames.toArray(new String[0])); } + Schema arrowSchema = arrowFormatWriter.getVectorSchemaRoot().getSchema(); this.nativeWriter = new MosaicWriter(outputStream, arrowSchema, options, allocator); } - private static int resolveCompression(String compression) { - if (compression == null) { - return WriterOptions.COMPRESSION_ZSTD; - } - switch (compression) { - case "none": - // return WriterOptions.COMPRESSION_NONE; - return 0; - case "zstd": - default: - return WriterOptions.COMPRESSION_ZSTD; - } - } - @Override public void addElement(InternalRow internalRow) { if (!arrowFormatWriter.write(internalRow)) { diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java index 05f47acb04b0..4d7b2ef2df35 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java @@ -24,7 +24,8 @@ import org.apache.paimon.fs.PositionOutputStream; import org.apache.paimon.types.RowType; -import java.io.IOException; +import javax.annotation.Nullable; + import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -36,7 +37,7 @@ public class MosaicWriterFactory implements FormatWriterFactory { private final RowType rowType; private final FileFormatFactory.FormatContext formatContext; private final List statsColumnNames; - private final int numBuckets; + private final @Nullable Integer numBuckets; public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext formatContext) { 
this.rowType = rowType; @@ -55,8 +56,8 @@ public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext form } @Override - public FormatWriter create(PositionOutputStream out, String compression) throws IOException { - return new MosaicRecordsWriter( - out, rowType, formatContext, statsColumnNames, numBuckets, compression); + public FormatWriter create(PositionOutputStream out, String compression) { + // only support zstd, ignore compression + return new MosaicRecordsWriter(out, rowType, formatContext, statsColumnNames, numBuckets); } } From fe157b568178b3554a58fbeb9ccd5c82a8d5b1e3 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 21 May 2026 11:29:22 +0800 Subject: [PATCH 10/11] fix --- .../paimon/format/mosaic/MosaicWriterFactory.java | 14 +++++++++++++- .../format/mosaic/MosaicReaderWriterTest.java | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java index 4d7b2ef2df35..ca67647f8cab 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicWriterFactory.java @@ -29,6 +29,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Locale; import java.util.stream.Collectors; /** A factory to create Mosaic {@link FormatWriter}. 
*/ @@ -57,7 +58,18 @@ public MosaicWriterFactory(RowType rowType, FileFormatFactory.FormatContext form @Override public FormatWriter create(PositionOutputStream out, String compression) { - // only support zstd, ignore compression + validateCompression(compression); return new MosaicRecordsWriter(out, rowType, formatContext, statsColumnNames, numBuckets); } + + private static void validateCompression(String compression) { + if (compression == null) { + return; + } + String normalized = compression.toLowerCase(Locale.ROOT); + if (!normalized.equals("zstd")) { + throw new UnsupportedOperationException( + "Mosaic format only supports zstd compression, but got: " + compression); + } + } } diff --git a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java index 6f027aafe27c..60efceed08e3 100644 --- a/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java +++ b/paimon-mosaic/src/test/java/org/apache/paimon/format/mosaic/MosaicReaderWriterTest.java @@ -48,6 +48,7 @@ import java.util.UUID; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.jupiter.api.Assumptions.assumeTrue; /** Integration tests for Mosaic reader and writer. 
*/ @@ -264,6 +265,19 @@ void testProjectionAllColumnsMissing() throws IOException { assertThat(result.get(1).isNullAt(1)).isTrue(); } + @Test + void testUnsupportedCompressionThrows() { + RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); + Path path = newPath(); + MosaicFileFormat format = createFormat(); + FormatWriterFactory writerFactory = format.createWriterFactory(rowType); + LocalFileIO fileIO = new LocalFileIO(); + + assertThatThrownBy(() -> writerFactory.create(fileIO.newOutputStream(path, false), "lz4")) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("lz4"); + } + @Test void testReachTargetSize() throws IOException { RowType rowType = DataTypes.ROW(DataTypes.INT(), DataTypes.STRING()); From 9e1d73752bdd4add255ac89ff66ca670e7b03c1a Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 22 May 2026 16:55:58 +0800 Subject: [PATCH 11/11] vectoredReadable --- .../format/mosaic/MosaicInputFileAdapter.java | 124 ++++-------------- 1 file changed, 28 insertions(+), 96 deletions(-) diff --git a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java index df287a3fc347..3a307ea0f296 100644 --- a/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java +++ b/paimon-mosaic/src/main/java/org/apache/paimon/format/mosaic/MosaicInputFileAdapter.java @@ -21,127 +21,59 @@ import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.fs.VectoredReadable; import org.apache.paimon.mosaic.InputFile; import java.io.Closeable; import java.io.EOFException; import java.io.IOException; -import java.util.ArrayDeque; -import java.util.Deque; -import java.util.concurrent.Semaphore; /** * Adapts Paimon's {@link FileIO} to Mosaic's {@link InputFile} interface. * - *

Maintains a pool of up to 8 {@link SeekableInputStream} instances. When all 8 are in use, - * callers block until one is returned. Thread-safe: Mosaic may invoke {@link #readFully} - * concurrently from multiple threads. + *

Maintains a single {@link SeekableInputStream}. If the stream implements {@link + * VectoredReadable}, reads use {@link VectoredReadable#preadFully} which is thread-safe. Otherwise, + * reads are synchronized to protect seek+read sequences. */ public class MosaicInputFileAdapter implements InputFile, Closeable { - private static final int MAX_POOL_SIZE = 8; - - private final FileIO fileIO; private final Path path; - private final Semaphore semaphore; - private final Deque pool; - private boolean closed; + private final SeekableInputStream in; + private final VectoredReadable vectoredReadable; - public MosaicInputFileAdapter(FileIO fileIO, Path path) { - this.fileIO = fileIO; + public MosaicInputFileAdapter(FileIO fileIO, Path path) throws IOException { this.path = path; - this.semaphore = new Semaphore(MAX_POOL_SIZE); - this.pool = new ArrayDeque<>(MAX_POOL_SIZE); + this.in = fileIO.newInputStream(path); + this.vectoredReadable = in instanceof VectoredReadable ? (VectoredReadable) in : null; } @Override public void readFully(long position, byte[] buffer, int offset, int length) throws IOException { - SeekableInputStream in = acquire(); - try { - in.seek(position); - int remaining = length; - int off = offset; - while (remaining > 0) { - int read = in.read(buffer, off, remaining); - if (read < 0) { - throw new EOFException( - "Reached end of file while reading " - + path - + " at position " - + position); + if (vectoredReadable != null) { + vectoredReadable.preadFully(position, buffer, offset, length); + } else { + synchronized (in) { + in.seek(position); + int remaining = length; + int off = offset; + while (remaining > 0) { + int read = in.read(buffer, off, remaining); + if (read < 0) { + throw new EOFException( + "Reached end of file while reading " + + path + + " at position " + + position); + } + off += read; + remaining -= read; } - off += read; - remaining -= read; - } - release(in); - } catch (Throwable t) { - closeQuietly(in); - semaphore.release(); - throw 
t; - } - } - - private SeekableInputStream acquire() throws IOException { - try { - semaphore.acquire(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("Interrupted while waiting for stream", e); - } - synchronized (pool) { - if (closed) { - semaphore.release(); - throw new IOException("MosaicInputFileAdapter is closed"); - } - SeekableInputStream in = pool.pollFirst(); - if (in != null) { - return in; } } - return fileIO.newInputStream(path); - } - - private void release(SeekableInputStream in) { - synchronized (pool) { - if (!closed) { - pool.addLast(in); - semaphore.release(); - return; - } - } - closeQuietly(in); - semaphore.release(); } @Override public void close() throws IOException { - Deque toClose; - synchronized (pool) { - closed = true; - toClose = new ArrayDeque<>(pool); - pool.clear(); - } - IOException firstException = null; - for (SeekableInputStream in : toClose) { - try { - in.close(); - } catch (IOException e) { - if (firstException == null) { - firstException = e; - } else { - firstException.addSuppressed(e); - } - } - } - if (firstException != null) { - throw firstException; - } - } - - private static void closeQuietly(SeekableInputStream in) { - try { - in.close(); - } catch (IOException ignored) { - } + in.close(); } }