Skip to content
Open
63 changes: 63 additions & 0 deletions docs/docs/concepts/spec/fileindex.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,69 @@ Bit-slice index bitmap format (V1)

RangeBitmap only supports the following data types: TinyIntType, SmallIntType, IntType, BigIntType, DateType, TimeType, LocalZonedTimestampType, TimestampType, CharType, VarCharType, StringType, BooleanType, DoubleType, FloatType.

RTree only supports the following data type: ArrayType of DoubleType.


## Index: RTree

RTree file index is a spatial index, used to accelerate point queries and range queries on multi-dimensional data.

Advantage:
1. Efficient for multi-dimensional spatial queries.
2. Supports both point lookup and bounding box range query.

Shortcoming:
1. Only supports the `ARRAY<DOUBLE>` data type.
2. The index structure may consume more space for high-dimensional data.

Options:
* `file-index.rtree.columns`: specify the columns that need rtree index.
* `file-index.rtree.<column_name>.dimensions`: configures the number of dimensions of the spatial data; the default value is 2.
* `file-index.rtree.<column_name>.max-entries`: configures the maximum number of entries per node; the default value is 32.

Tables can use the rtree file index to optimize the `EQUALS` predicate. The literal can be either a point (a double array) or a bounding box for a range query.

<pre>
RTree file index format (V1)
+-------------------------------------------------+-----------------
| dimensions (4 bytes int) |
+-------------------------------------------------+
| max entries (4 bytes int) |
+-------------------------------------------------+
| tree size (4 bytes int) | HEAD
+-------------------------------------------------+-----------------
| node: is leaf (1 byte boolean) |
+-------------------------------------------------+
| node: entry count (4 bytes int) |
+-------------------------------------------------+
| node: bounding box min[0] (8 bytes double) |
+-------------------------------------------------+
| ... |
+-------------------------------------------------+
| node: bounding box min[dimensions-1] |
+-------------------------------------------------+
| node: bounding box max[0] (8 bytes double) |
+-------------------------------------------------+
| ... |
+-------------------------------------------------+
| node: bounding box max[dimensions-1] |
+-------------------------------------------------+-----------------
| if leaf: row id 1 (4 bytes int) |
+-------------------------------------------------+
| if leaf: row id 2 (4 bytes int) | BODY
+-------------------------------------------------+
| ... |
+-------------------------------------------------+-----------------
| if not leaf: child node 1 |
+-------------------------------------------------+
| if not leaf: child node 2 |
+-------------------------------------------------+
| ... |
+-------------------------------------------------+-----------------
</pre>

RTree only supports the following data type: ArrayType of DoubleType.

## Index: Bit-Slice Index Bitmap

:::warning
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.fileindex.rtree;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;

/** Represents an axis-aligned bounding box (AABB) for spatial indexing. */
public class BoundingBox {
private final double[] min;
private final double[] max;
private final int dimensions;

public BoundingBox(double[] min, double[] max) {
if (min.length != max.length) {
throw new IllegalArgumentException("Min and max must have same dimensions");
}
this.min = min.clone();
this.max = max.clone();
this.dimensions = min.length;
}

public BoundingBox(int dimensions) {
this.min = new double[dimensions];
this.max = new double[dimensions];
this.dimensions = dimensions;
java.util.Arrays.fill(min, Double.POSITIVE_INFINITY);
java.util.Arrays.fill(max, Double.NEGATIVE_INFINITY);
}

public static BoundingBox fromPoint(double[] point) {
return new BoundingBox(point, point);
}

public int getDimensions() {
return dimensions;
}

public double[] getMin() {
return min;
}

public double[] getMax() {
return max;
}

public void expand(BoundingBox other) {
for (int i = 0; i < dimensions; i++) {
min[i] = Math.min(min[i], other.min[i]);
max[i] = Math.max(max[i], other.max[i]);
}
}

public void expand(double[] point) {
for (int i = 0; i < dimensions; i++) {
min[i] = Math.min(min[i], point[i]);
max[i] = Math.max(max[i], point[i]);
}
}

public void clear() {
java.util.Arrays.fill(min, Double.POSITIVE_INFINITY);
java.util.Arrays.fill(max, Double.NEGATIVE_INFINITY);
}

public double getArea() {
double area = 1.0;
for (int i = 0; i < dimensions; i++) {
area *= (max[i] - min[i]);
}
return area;
}

public double getMargin() {
double margin = 0.0;
for (int i = 0; i < dimensions; i++) {
margin += 2 * (max[i] - min[i]);
}
return margin;
}

public double getExpansionArea(BoundingBox other) {
double newArea = 1.0;
for (int i = 0; i < dimensions; i++) {
newArea *= (Math.max(max[i], other.max[i]) - Math.min(min[i], other.min[i]));
}
return newArea - getArea();
}

public boolean intersects(BoundingBox other) {
for (int i = 0; i < dimensions; i++) {
if (max[i] < other.min[i] || min[i] > other.max[i]) {
return false;
}
}
return true;
}

public boolean contains(double[] point) {
for (int i = 0; i < dimensions; i++) {
if (point[i] < min[i] || point[i] > max[i]) {
return false;
}
}
return true;
}

public boolean contains(BoundingBox other) {
for (int i = 0; i < dimensions; i++) {
if (other.min[i] < min[i] || other.max[i] > max[i]) {
return false;
}
}
return true;
}

public void serialize(DataOutputStream dos) throws IOException {
for (int i = 0; i < dimensions; i++) {
dos.writeDouble(min[i]);
}
for (int i = 0; i < dimensions; i++) {
dos.writeDouble(max[i]);
}
}

public static BoundingBox deserialize(DataInputStream dis, int dimensions) throws IOException {
double[] min = new double[dimensions];
double[] max = new double[dimensions];
for (int i = 0; i < dimensions; i++) {
min[i] = dis.readDouble();
}
for (int i = 0; i < dimensions; i++) {
max[i] = dis.readDouble();
}
return new BoundingBox(min, max);
}

@Override
public String toString() {
return String.format(
"BoundingBox(min=%s, max=%s)", Arrays.toString(min), Arrays.toString(max));
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
BoundingBox that = (BoundingBox) o;
return Arrays.equals(min, that.min) && Arrays.equals(max, that.max);
}

@Override
public int hashCode() {
int result = Arrays.hashCode(min);
result = 31 * result + Arrays.hashCode(max);
return result;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.fileindex.rtree;

/** Represents a leaf entry in an R-Tree node. */
public class LeafEntry {

    /** Identifier of the row this entry refers to. */
    private final int rowId;

    /** Bounding box associated with the row. */
    private final BoundingBox bbox;

    /**
     * Creates a leaf entry pairing a row id with its bounding box.
     *
     * @param rowId identifier of the row
     * @param bbox bounding box associated with the row; stored by reference, not copied
     */
    public LeafEntry(int rowId, BoundingBox bbox) {
        this.bbox = bbox;
        this.rowId = rowId;
    }

    /** Returns the bounding box given at construction time. */
    public BoundingBox getBbox() {
        return this.bbox;
    }

    /** Returns the row id given at construction time. */
    public int getRowId() {
        return this.rowId;
    }
}
Loading
Loading